In [2]:
import gym
import numpy as np
import random
from collections import deque
import MyNN

import matplotlib.pyplot as plt

In [2]:
def prepro(state):
    """Turn a flat observation into a float64 column vector.

    The column length is taken from the observation itself rather than from a
    module-level ``state_size`` variable, so the function can be called before
    the environment cell has run and works for any observation length.

    Args:
        state: array-like observation; all of its elements are kept.

    Returns:
        ``numpy.ndarray`` of shape ``(state.size, 1)`` and dtype ``float64``.
    """
    # -1 lets numpy infer the row count from the number of elements.
    return np.asarray(state).reshape(-1, 1).astype('float64')

In [3]:
class PGAgent:
    """Agent that owns two ``MyNN`` networks.

    * ``model``: three 'Tanh' layers followed by a single 'Sigmoid' unit,
      compiled with 'Cross entropy' / 'Adam'.
    * ``value_func``: two 'Tanh' layers followed by two 'Linear' units,
      compiled with 'MSE' / 'Adam'.

    NOTE(review): ``MyNN`` is a project-local module that is not visible here;
    the layer, loss and optimizer names are passed through as plain strings.
    """

    def __init__(self, state_size, action_size, gamma=0.99, epsilon=1.0,
                 epsilon_min=0.01, epsilon_decay=0.995, learning_rate=0.01,
                 load=False, load_path='/tmp/model.bigl'):
        """Store the hyper-parameters and build both networks.

        Args:
            state_size: input width handed to both networks.
            action_size: stored on the instance; not read by this class.
            gamma: discount rate used by ``train_val_func``.
            epsilon, epsilon_min, epsilon_decay: accepted but not used here.
            learning_rate: step size passed to ``value_func.optimize``.
            load: when true, ``model`` is replaced by ``model.load(load_path)``.
            load_path: path forwarded to ``model.load``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # discount rate
        self.learning_rate = learning_rate
        # The policy network is built before the value network.
        self.model = self._build_model()
        self.value_func = self._build_vf()
        if load:
            self.model = self.model.load(load_path)

    def _build_model(self):
        """Build and compile the single-output sigmoid network."""
        policy_net = MyNN.MyNN(self.state_size)
        for width in (200, 110, 60):
            policy_net.add(width, 'Tanh')
        policy_net.add(1, 'Sigmoid')
        policy_net.compile('Cross entropy', 'Adam')
        return policy_net

    def _build_vf(self):
        """Build and compile the two-output linear network."""
        value_net = MyNN.MyNN(self.state_size)
        for width in (100, 25):
            value_net.add(width, 'Tanh')
        value_net.add(2, 'Linear')
        value_net.compile('MSE', 'Adam')
        return value_net

    def train_val_func(self, minibatch):
        """Run one optimisation step of ``value_func`` per transition.

        Args:
            minibatch: iterable of ``(state, action, reward, next_state)``.
                ``state`` and ``next_state`` are reshaped to
                ``(state_size, 1)``; ``int(action)`` selects which output row
                receives the target.

        The target is ``reward`` when the reward is non-zero; otherwise it is
        ``reward + gamma * max(value_func.forward(next_state)[:, 0])``.
        """
        column_shape = (self.state_size, 1)
        for state, action, reward, next_state in minibatch:
            current = state.reshape(column_shape)
            successor = next_state.reshape(column_shape)
            if reward == 0:
                # Zero reward: bootstrap from the best successor estimate.
                best_next = np.amax(self.value_func.forward(successor)[:, 0])
                target = reward + self.gamma * best_next
            else:
                target = reward
            # Only the entry of the taken action is moved toward the target.
            estimates = self.value_func.forward(current)
            estimates[int(action), 0] = target
            self.value_func.optimize(current, estimates, num_epochs=1,
                                     lr=self.learning_rate, report_cost=False)

In [4]:
def play_game(agent, render=False):
    """Play one episode in the module-level ``env`` and record the trajectory.

    Per-step data is collected in Python lists and converted to arrays once at
    the end, instead of re-copying growing arrays on every frame.

    Args:
        agent: object exposing ``model.forward`` and ``value_func.forward``.
        render: when true, ``env.render()`` is called before every step.

    Returns:
        Tuple ``(states, actions, rewards, values, score)`` where
        * ``states`` stacks the preprocessed observations column-wise,
        * ``actions``, ``rewards`` and ``values`` are float64 arrays of shape
          ``(1, steps)``,
        * ``score`` is ``[count of -1 rewards, count of +1 rewards]``.
    """
    state = env.reset()
    states, actions, rewards, values = [], [], [], []
    score = [0, 0]
    done = False
    while not done:
        if render:
            env.render()
        state = prepro(state)
        states.append(state)
        val = agent.value_func.forward(state)[:, 0]
        prediction = agent.model.forward(state)[0][0]
        # Value estimate: the two value outputs mixed by the policy output.
        values.append(val[0] * (1 - prediction) + val[1] * prediction)
        # Action 3 is chosen with probability `prediction`, otherwise action 2.
        action = 3 if prediction > np.random.rand() else 2
        actions.append(action)
        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        if reward == 1:
            score[1] += 1
        elif reward == -1:
            score[0] += 1

    def _as_row(items):
        # One float64 row vector per recorded quantity.
        return np.array(items, dtype='float64').reshape(1, -1)

    # The loop always runs at least once, so `states` is never empty here.
    return np.hstack(states), _as_row(actions), _as_row(rewards), _as_row(values), score

In [5]:
def running_reward(rewards, st_values, gamma):
    """Compute discounted returns that restart at every non-zero reward.

    Walking the episode backwards, the running sum is cleared whenever a
    non-zero reward is met, so a step is only credited with the next non-zero
    reward that follows it (discounted by ``gamma`` per step).

    Args:
        rewards: array of shape ``(1, steps)``; only row 0 is read.
        st_values: accepted for interface compatibility; not used.
        gamma: per-step discount factor.

    Returns:
        float64 array of shape ``(1, steps)``; ``(1, 0)`` for an empty episode.
    """
    episode = np.asarray(rewards)[0, :]
    discounted = np.zeros(episode.shape[0], dtype='float64')
    run_rew = 0.0
    for idx in range(episode.shape[0] - 1, -1, -1):
        reward = episode[idx]
        if reward != 0:
            # A non-zero reward starts a fresh accumulation.
            run_rew = 0.0
        run_rew = run_rew * gamma + reward
        discounted[idx] = run_rew
    return discounted.reshape(1, -1)

In [6]:
def play_n_games(agent, n=5):
    """Play ``n`` episodes and concatenate their trajectories column-wise.

    All returned per-step arrays have the same number of columns, so column
    ``i`` of ``rewards`` is the reward recorded for column ``i`` of the
    states. (Previously ``rewards`` kept an extra leading placeholder column
    and was shifted by one step relative to the other arrays.)

    Args:
        agent: forwarded to ``play_game``; ``agent.state_size`` and
            ``agent.gamma`` are read here.
        n: number of episodes to play.

    Returns:
        Tuple ``(X_batch, y_batch, rewards, weights, results)``:
        * ``X_batch``: states, shape ``(state_size, steps)``;
        * ``y_batch``: recorded actions minus 2, shape ``(1, steps)``;
        * ``rewards``: per-step rewards, shape ``(1, steps)``;
        * ``weights``: output of ``running_reward`` per episode, concatenated;
        * ``results``: summed episode scores divided by ``n``.
    """
    # Zero-width seeds keep np.hstack valid even when no episode is played.
    state_parts = [np.zeros((agent.state_size, 0))]
    action_parts = [np.zeros((1, 0))]
    reward_parts = [np.zeros((1, 0))]
    weight_parts = [np.zeros((1, 0))]
    results = [0, 0]
    for _ in range(n):
        states, actions, game_rewards, values, score = play_game(agent)
        state_parts.append(states)
        action_parts.append(actions)
        reward_parts.append(game_rewards)
        weight_parts.append(running_reward(game_rewards, values, agent.gamma))
        results[0] += score[0]
        results[1] += score[1]
    X_batch = np.hstack(state_parts)
    # Recorded actions are 2 or 3; shifting by 2 yields 0/1 labels.
    y_batch = np.hstack(action_parts) - 2
    rewards = np.hstack(reward_parts)
    weights = np.hstack(weight_parts)
    return X_batch, y_batch, rewards, weights, np.array(results) / n

In [7]:
# Create the environment; `env` is read as a module-level name by play_game.
env = gym.make('Pong-ram-v0')
# Number of discrete actions reported by the environment's action space.
action_size = env.action_space.n
# Length of the first axis of an observation; also read as a global by prepro.
state_size = env.observation_space.shape[0]
# Agent with default hyper-parameters (no model is loaded).
agent = PGAgent(state_size, action_size)
# Iteration counter printed and incremented by the training loop.
t=1

[2018-01-12 22:00:26,433] Making new env: Pong-ram-v0


In [8]:
# Training loop: play 5 games, fit the value function on a random minibatch of
# transitions, report the average score, then take one policy-network step.
# It stops once the second score entry averages at least 10 per game.
while True:
    X_batch, Y_batch, rewards, weights, results = play_n_games(agent, n=5)

    # Keep only the trailing columns of `rewards` so that column i is the
    # reward recorded for state column i, even if `rewards` carries an extra
    # leading placeholder column.
    n_steps = X_batch.shape[1]
    step_rewards = rewards[:, rewards.shape[1] - n_steps:]

    # A transition needs a successor state, so the last column is excluded.
    # Sampling without replacement cannot draw more items than exist, so the
    # batch size is clamped to the number of available transitions.
    n_transitions = n_steps - 1
    indices = np.random.choice(np.arange(n_transitions),
                               size=min(256, n_transitions), replace=False)
    minibatch = list(zip(X_batch[:, indices].T,
                         Y_batch[0, indices].T,
                         step_rewards[0, indices].T,
                         X_batch[:, indices + 1].T))
    agent.train_val_func(minibatch)

    print(t, 'Score is {} : {}'.format(results[0], results[1]))
    if results[1] >= 10:
        break
    agent.model.optimize(X_batch, Y_batch, weights=weights, lr=0.005, num_epochs=1,report_cost=False)
    t += 1

1 Score is 21.0:0.4
2 Score is 21.0:0.0
3 Score is 21.0:0.0
4 Score is 21.0:0.0
5 Score is 21.0:0.0
6 Score is 21.0:0.0
7 Score is 21.0:0.0
8 Score is 21.0:0.0
9 Score is 21.0:0.0
10 Score is 21.0:0.0
11 Score is 21.0:0.0
12 Score is 21.0:0.0
13 Score is 21.0:0.0
14 Score is 21.0:0.0
15 Score is 21.0:0.0
16 Score is 21.0:0.0
17 Score is 21.0:0.0
18 Score is 21.0:0.0
19 Score is 21.0:0.0
20 Score is 21.0:0.0
21 Score is 21.0:0.0
22 Score is 21.0:0.0
23 Score is 21.0:0.0
24 Score is 21.0:0.0
25 Score is 21.0:0.0
26 Score is 21.0:0.0
27 Score is 21.0:0.4
28 Score is 21.0:0.0
29 Score is 21.0:0.2
30 Score is 21.0:0.4
31 Score is 21.0:0.6
32 Score is 21.0:0.6
33 Score is 21.0:0.0
34 Score is 21.0:0.0
35 Score is 21.0:0.0
36 Score is 21.0:0.0
37 Score is 21.0:0.0
38 Score is 21.0:0.0
39 Score is 21.0:0.0
40 Score is 21.0:0.0
41 Score is 21.0:0.0
42 Score is 21.0:0.0
43 Score is 21.0:0.0
44 Score is 21.0:0.0
45 Score is 21.0:0.0
46 Score is 21.0:0.0
47 Score is 21.0:0.0
48 Score is 21.0:0.0
4

KeyboardInterrupt: 