In [None]:
import gym
import math
import random
import keras
import os
import tensorflow as tf
import numpy as np
import scipy as sp
import sklearn as sk
import matplotlib as mpl
import matplotlib.pyplot as plt

from tqdm import tqdm
from keras.models import Sequential, load_model
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import *
from keras import backend as K
%matplotlib inline

In [None]:
# Policy-gradient agent and frame pre-processing pipeline

class PG_Agent():
    """Monte-Carlo policy-gradient agent backed by a small Keras CNN.

    The agent samples actions from the softmax output of the network,
    records one (state, gradient, reward) triple per step through
    `observe`, and performs a single batch update over everything recorded
    when `train` is called.
    """

    def __init__(self, num_state, num_action, params):
        """Store the hyper-parameters and build (or load) the policy network.

        Args:
            num_state: shape tuple of one network input; passed to the first
                Conv2D layer as `input_shape`.
            num_action: number of discrete actions (size of the softmax).
            params: dict providing the keys 'gamma' (discount factor),
                'lr' (learning rate) and 'load_model' (load the saved model
                instead of building a fresh one).
        """
        self.params = params
        self.num_state = num_state
        self.num_action = num_action
        self.steps = 0
        self.gamma = params['gamma']
        self.learning_rate = params['lr']
        self.load_model = params['load_model']

        self.model = self._create_model(load = self.load_model)

        # Per-episode buffers, filled by act()/observe() and emptied by train().
        self.states = []
        self.gradients = []
        self.rewards = []
        self.probs = []

    def _create_model(self, load = False):
        """Return the policy network.

        Args:
            load: when true, load the model stored in 'pg-atari.h5';
                otherwise build and compile a new network.

        Returns:
            A compiled Keras model mapping a batch of states to action
            probabilities.
        """
        if not load:
            model = Sequential()

            # Three strided convolutions followed by a dense softmax head.
            model.add(Conv2D(input_shape = self.num_state, filters = 32,
                             kernel_size = 4, strides=2,
                             activation = 'relu'))

            model.add(Conv2D(filters = 64, kernel_size = 4, strides=2,
                             activation = 'relu'))

            model.add(Conv2D(filters = 128, kernel_size = 4, strides=2,
                             activation = 'relu'))

            model.add(Flatten())
            model.add(Dense(units=200, kernel_initializer='glorot_normal',
                            activation = 'relu'))
            model.add(Dense(self.num_action, activation ='softmax'))

            optimizer = Adam(lr = self.learning_rate)
            model.compile(optimizer = optimizer, loss = 'categorical_crossentropy')

            print("Model constructed...", end ="\r", flush=True)
        else:
            model = load_model('pg-atari.h5')
            print("Model loaded...", end ="\r", flush=True)

        return model

    def predict(self, state):
        """Run the network on `state`, adding a batch axis to a single 3-D state."""
        if len(state.shape) == 3:
            state = np.expand_dims(state, axis=0)
        return self.model.predict(state)

    def discounted_rewards(self, rewards):
        """Return the standardised discounted returns for a reward sequence.

        The running return is reset to zero whenever a non-zero reward is
        encountered while walking the sequence backwards (presumably because
        in Pong a non-zero reward ends a rally -- confirm for other games).

        Args:
            rewards: array of per-step rewards, shaped (N,) or (N, 1).

        Returns:
            A float64 array with the same shape as `rewards` holding the
            discounted returns minus their mean, divided by their standard
            deviation. When the standard deviation is zero the centred
            values are returned undivided instead of NaN. An empty input
            yields an empty array.
        """
        # Always accumulate in floating point: zeros_like() on an integer
        # reward array would silently truncate the discounted values.
        rewards = np.asarray(rewards, dtype=np.float64)
        if rewards.size == 0:
            return rewards.copy()

        flat = rewards.reshape(-1)
        discounted = np.zeros(flat.shape, dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(flat.size)):
            if flat[t] != 0:
                running_add = 0.0
            running_add = running_add * self.gamma + flat[t]
            discounted[t] = running_add

        mean = discounted.mean()
        spread = discounted.std()
        centred = discounted - mean
        if spread > 0:
            # Skip the division when every return is identical (std == 0).
            centred = centred / spread

        return centred.reshape(rewards.shape)

    def observe(self, state, action, prob, reward):
        """Record one transition for the next call to `train`.

        Args:
            state: the network input the action was chosen from.
            action: index of the action taken.
            prob: action probabilities the action was sampled from.
            reward: reward received for the action.
        """
        y = np.zeros((self.num_action))
        y[action] = 1
        # One-hot action minus the predicted probabilities.
        self.gradients.append(y - prob)
        self.states.append(state)
        self.rewards.append(reward)
        self.steps +=1

    def act(self, state):
        """Sample an action from the policy for `state`.

        Returns:
            (prob, action): the normalised, squeezed probability vector and
            the sampled action index.
        """
        prob = self.predict(state)
        self.probs.append(prob)
        # In-place renormalisation; this also updates the array just stored
        # in self.probs because it is the same object.
        prob /= np.sum(prob)
        prob = np.squeeze(prob)
        action = np.random.choice(self.num_action, 1, p=prob)[0]
        return prob, action

    def train(self):
        """Run one batch update over the recorded transitions, then clear them.

        The targets passed to the model are the stored gradients scaled by
        the standardised discounted returns and by the learning rate.
        NOTE(review): the original code has "probs +" commented out of the
        target expression; confirm that training on the scaled gradients
        alone is intended.

        Returns:
            Whatever `model.train_on_batch` returns (the batch loss for a
            Keras model), or None when nothing has been observed.
        """
        if not self.rewards:
            # Nothing recorded; np.vstack would raise on the empty lists.
            return None

        gradients = np.vstack(self.gradients)
        rewards = self.discounted_rewards(np.vstack(self.rewards))
        gradients *= rewards

        # Build the batch with explicit shapes. np.squeeze would drop the
        # batch axis for a one-step episode and any unit dimension in the
        # state shape.
        batch = len(self.states)
        X = np.asarray(self.states).reshape((batch,) + tuple(self.num_state))
        Y = self.learning_rate * gradients.reshape(batch, self.num_action)

        loss = self.model.train_on_batch(X, Y)

        # Drop the trajectory that was just used; otherwise the buffers grow
        # without bound and later updates re-train on data gathered under
        # older policies.
        self.states, self.probs, self.gradients, self.rewards = [], [], [], []
        return loss


class input_pipeline():
    """Keeps a sliding window of pre-processed frames stacked on the last axis.

    Attributes:
        history_length: number of frames kept in the window.
        input_x: list of the pre-processed frames, oldest first.
        x: the frames of `input_x` stacked along the last axis.
    """

    def __init__(self, state, history_length=4):
        """Fill the whole window with the pre-processed initial frame.

        Args:
            state: first raw frame, an array indexable as [row, col, channel].
            history_length: number of frames to stack (default 4, the value
                previously hard-coded).

        Raises:
            ValueError: if `history_length` is smaller than 1.
        """
        if history_length < 1:
            raise ValueError("history_length must be at least 1")
        self.history_length = history_length
        self.input_x = [self._preprocess(state) for _ in range(self.history_length)]
        self.x = np.moveaxis(np.array(self.input_x), 0, -1)

    def _preprocess(self, state):
        """Crop, downsample and binarise one raw frame without modifying it.

        Rows 33..195 of channel 0 are kept, every second row and column is
        taken, the values 109 and 144 are mapped to 0 and every other
        non-zero value to 1. (109 and 144 are presumably background colours
        of the game -- confirm against the environment.)

        Returns:
            A new 2-D array; the input frame is left untouched.
        """
        # Copy the crop: basic slicing returns a view, and the masked
        # assignments below would otherwise overwrite the caller's frame.
        frame = np.array(state[33:196:2, ::2, 0])
        frame[frame == 109] = 0
        frame[frame == 144] = 0
        frame[frame != 0] = 1
        return frame

    def update(self, state):
        """Drop the oldest frame, append the pre-processed `state`, restack `x`."""
        self.input_x.pop(0)
        self.input_x.append(self._preprocess(state))
        self.x = np.moveaxis(np.array(self.input_x), 0, -1)
    

In [None]:
def main(train=True):
    """Play Pong for N_EPISODE episodes with a PG_Agent, optionally training it.

    Reads the module-level names `params` (agent hyper-parameters) and
    `N_EPISODE` (number of episodes), which must be defined before the call.

    Args:
        train: when true (default), every step is recorded, the agent is
            updated at the end of each episode and the model is saved to
            'pg-atari.h5' every 10th episode. When false, the policy is only
            played: nothing is recorded, trained or saved.

    Returns:
        (loss, mean_av) as numpy arrays. `loss` holds one entry per episode
        for which `agent.train()` returned a value; `mean_av` is never
        filled and is always empty.
    """
    # Episodes from this index onwards are rendered on screen.
    render_from_episode = 1000

    env = gym.make('PongDeterministic-v4')
    env = gym.wrappers.Monitor(env, './tmp/pong-1', force=True)
    num_action = env.action_space.n

    # (82, 80, 4) is the shape input_pipeline produces for a 160-pixel-wide
    # frame with the default 4-frame history -- assumes standard Pong frames.
    agent = PG_Agent((82,80,4), num_action, params)
    loss, mean_av = [], []

    try:
        for episode in range(N_EPISODE):
            state, step, total_reward, done = env.reset(), 0, 0, False
            pipeline = input_pipeline(state)
            render = episode >= render_from_episode

            while not done:
                if render:
                    env.render()

                state = pipeline.x
                prob, action = agent.act(state)
                next_state, reward, done, _ = env.step(action)
                if train:
                    agent.observe(state, action, prob, reward)
                pipeline.update(next_state)
                total_reward += reward
                step +=1

            if train:
                batch_loss = agent.train()
                if batch_loss is not None:
                    loss.append(batch_loss)

                if episode % 10==0:
                    agent.model.save('pg-atari.h5')

            print('Episode: {}/{}, Step: {}, Iteration: {}, Reward: {}'
                  .format(episode+1, N_EPISODE, step, agent.steps, total_reward),
                  end = '\r')
    finally:
        # Always release the emulator and flush the Monitor's recordings,
        # including when the run is interrupted from the notebook.
        env.close()

    return np.array(loss), np.array(mean_av)

In [None]:
# Hyper-parameters read by PG_Agent: discount factor, learning rate and
# whether to load the saved model instead of building a new one.
params = dict(
    gamma=0.99,
    lr=1e-3,
    load_model=True,
)

In [None]:
# x = np.array(range(150000))
# y = params['eps_min'] + (params['eps_max']-params['eps_min'])*np.exp(-params['decay']*x)
# plt.plot(y)
# plt.title('$\epsilon$ decay rate')
# plt.ylabel('epsilon')
# plt.xlabel('iteration')
# plt.show()

In [None]:
# Episode budget; main() reads this module-level name.
N_EPISODE = 10 ** 6
loss, mean_q = main(train=True)

In [None]:
# plt.figure(figsize=(12,4))
# plt.subplot(1,2,1)
# plt.title('Loss')
# plt.xlabel('iteration')
# plt.ylabel('loss')
# plt.plot(loss)

# plt.subplot(1,2,2)
# plt.plot(mean_q)
# plt.title('Mean Q-Values')
# plt.xlabel('iteration')
# plt.ylabel('mean Q value')
# plt.show()