In [1]:
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np
import gym
import time

Using TensorFlow backend.


In [2]:
# Create the Atari Skiing environment used by every later cell.
env = gym.make('SkiingDeterministic-v4')

# Number of discrete actions the environment exposes.
action_size = env.action_space.n

# Show the "index:meaning" pair of every action on a single line.
print(" ".join(f"{i}:{a}" for i, a in enumerate(env.get_action_meanings())))

0:NOOP 1:RIGHT 2:LEFT


In [4]:
def discount_rewards(r, gamma, reset_on_nonzero=False):
    """Compute normalized discounted returns for one episode.

    Args:
        r: 1D sequence of per-step rewards (ints or floats).
        gamma: discount factor applied once per step.
        reset_on_nonzero: when True, the running sum is restarted at every
            step whose reward is non-zero. That is the Pong convention, where
            a non-zero reward marks the end of a rally. It is off by default
            because in a game that rewards every step the reset would cancel
            the discounting entirely and make ``gamma`` a no-op
            (NOTE(review): Skiing appears to be such a game - confirm).

    Returns:
        A float ndarray with one discounted return per step, shifted to zero
        mean and, when the values are not all identical, scaled to unit
        standard deviation. An empty input yields an empty array.
    """
    # Force a float buffer: an integer reward list would otherwise truncate
    # the discounted sums and make the in-place normalization below fail.
    rewards = np.asarray(r, dtype=float)
    discounted_r = np.zeros_like(rewards)
    if rewards.size == 0:
        return discounted_r

    running_add = 0.0
    # Walk from the last reward to the first so no exponentiation is needed
    # (Horner's method).
    for t in reversed(range(rewards.size)):
        if reset_on_nonzero and rewards[t] != 0:
            running_add = 0.0
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add

    # Normalize; skip the division when every return is identical, since a
    # zero standard deviation would turn all sample weights into NaN.
    discounted_r -= discounted_r.mean()
    spread = discounted_r.std()
    if spread > 0:
        discounted_r /= spread
    return discounted_r

class Agent:
    """Agent that samples actions from a small dense softmax network.

    At every step `action` preprocesses the frame, samples an action from the
    network's output distribution and records the input and the chosen action.
    `save_reward` records the matching reward. `train` then fits the network
    on the recorded actions, weighting each sample by its normalized
    discounted reward.
    """

    def __init__(self, gamma=0.95, action_size=3):
        """Create the network and empty episode buffers.

        Args:
            gamma: discount factor passed to ``discount_rewards`` in `train`.
            action_size: number of discrete actions; sizes the softmax layer,
                the sampling range and the one-hot targets.
        """
        # Must be set before the model is built, which reads action_size.
        self.action_size = action_size
        self.gamma = gamma
        self.episode = 0
        self.model = self._make_model()
        self.restart()

        # When True, `train` adds the episode's total reward to the reward of
        # the final step before discounting.
        self.add_total = True

        # Autosave interval in episodes; None disables autosaving.
        self.autosave = None

    def preprocessFrame(self, I):
        """Turn a raw RGB frame into a binary image.

        Takes the green channel at every second pixel, crops rows 31..102 and
        columns 4..75 of the downsampled image, and maps the values treated as
        background (0, 192, 214, 236) to 0.0 and everything else to 1.0.
        The result is 72x72 provided the frame is large enough for the crop;
        the network input is sized for exactly that.

        The input frame is not modified.
        """
        cropped = I[::2, ::2, 1][31:103, 4:76]
        # Build a new array instead of masking in place: `cropped` is a view
        # of the caller's frame, so in-place writes would corrupt it.
        background = np.isin(cropped, (0, 192, 214, 236))
        return np.where(background, 0.0, 1.0)

    def _make_model(self):
        """Build and compile the network: one hidden ReLU layer + softmax."""
        model = Sequential()
        model.add(Dense(
            units=256,
            input_dim=72*72,
            activation='relu',
        ))
        model.add(Dense(
            units=self.action_size,
            activation='softmax',
        ))
        model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )
        return model

    def restart(self):
        """Clear everything recorded for the current episode."""
        self.x_train = []
        self.y_train = []
        self.rewards = []
        self.last = np.zeros(72*72)
        self.total_reward = 0

    def save_reward(self, reward):
        """Record the reward of the latest step and update the episode total."""
        self.rewards.append(reward)
        self.total_reward += reward

    def action(self, frame):
        """Sample an action for `frame` and record the sample for training.

        Returns:
            The index of the chosen action.
        """
        frame = self.preprocessFrame(frame).flatten()
        x = np.array([frame])
        probs = self.model.predict(x)
        y = np.random.choice(self.action_size, p=probs[0])
        self.x_train.append(x)
        self.y_train.append(to_categorical(y, num_classes=self.action_size))
        # Kept so a frame-difference input (frame - self.last) can be tried.
        self.last = frame
        return y

    def train(self):
        """Fit the network on the episode recorded so far.

        Does nothing when no step has been recorded. Otherwise increments the
        episode counter, optionally adds the episode total to the final
        reward (see `add_total`), fits with the normalized discounted rewards
        as sample weights, and saves the model to "last.h5" every `autosave`
        episodes.
        """
        if not self.x_train:
            # Nothing to fit; np.vstack and rewards[-1] would fail on an
            # empty episode.
            return
        self.episode += 1
        if self.add_total:
            self.rewards[-1] += self.total_reward
        self.model.fit(
            x=np.vstack(self.x_train),
            y=np.vstack(self.y_train),
            verbose=1,
            sample_weight=discount_rewards(self.rewards, self.gamma)
        )
        # Truthiness check also treats an interval of 0 as "disabled" instead
        # of dividing by zero.
        if self.autosave and self.episode % self.autosave == 0:
            self.model.save("last.h5")
            print("Saved!")

    def set_autosave(self, interval):
        """Save the model every `interval` episodes (None or 0 disables)."""
        self.autosave = interval
        

In [5]:
# Build the agent with its default settings and print the layer-by-layer
# summary of its network (shown in the cell output below).
agent = Agent()
agent.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               1327360   
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 771       
Total params: 1,328,131
Trainable params: 1,328,131
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train indefinitely: play one episode at a time, fit the agent at the end of
# each episode, and save the model every 10 episodes. The loop has no exit
# condition, so the cell is meant to be stopped by interrupting the kernel.
agent.set_autosave(10)
observation = env.reset()
agent.restart()
try:
    while True:
        env.render()

        action = agent.action(observation)

        observation, reward, done, _ = env.step(action)

        agent.save_reward(reward)

        if done:
            print(f"Ep: {agent.episode:4}\nTotal reward: {agent.total_reward}")
            agent.train()
            agent.restart()

            observation = env.reset()
finally:
    # Without try/finally this call was unreachable after the endless loop,
    # so an interrupt left the environment open.
    env.close()

Ep:    0
Total reward: -13789.0
Epoch 1/1
Ep:    1
Total reward: -30000.0
Epoch 1/1
Ep:    2
Total reward: -23312.0
Epoch 1/1
Ep:    3
Total reward: -15089.0
Epoch 1/1
Ep:    4
Total reward: -18008.0
Epoch 1/1
Ep:    5
Total reward: -30000.0
Epoch 1/1
Ep:    6
Total reward: -15175.0
Epoch 1/1
Ep:    7
Total reward: -20114.0
Epoch 1/1
Ep:    8
Total reward: -16541.0
Epoch 1/1
Ep:    9
Total reward: -16215.0
Epoch 1/1
Saved!
Ep:   10
Total reward: -30000.0
Epoch 1/1
Ep:   11
Total reward: -18975.0
Epoch 1/1
Ep:   12
Total reward: -23027.0
Epoch 1/1
Ep:   13
Total reward: -22248.0
Epoch 1/1
Ep:   14
Total reward: -30000.0
Epoch 1/1
Ep:   15
Total reward: -30000.0
Epoch 1/1
Ep:   16
Total reward: -30000.0
Epoch 1/1
Ep:   17
Total reward: -13963.0
Epoch 1/1
Ep:   18
Total reward: -15191.0
Epoch 1/1
Ep:   19
Total reward: -18731.0
Epoch 1/1
Saved!
Ep:   20
Total reward: -25713.0
Epoch 1/1
Ep:   21
Total reward: -37885.0
Epoch 1/1
Ep:   22
Total reward: -26985.0
Epoch 1/1
Ep:   23
Total rewar

In [None]:
# Play a single rendered episode with the current agent and report its score.
observation = env.reset()
agent.restart()

done = False
while not done:
    env.render()
    action = agent.action(observation)
    observation, reward, done, _ = env.step(action)
    agent.save_reward(reward)

print(f"Total reward: {agent.total_reward}")
env.close()