In [1]:
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np
import gym
import time
from collections import deque

Using TensorFlow backend.


In [2]:
# Create the Skiing environment and print its actions as "index:name" pairs.
env = gym.make('SkiingDeterministic-v4')
action_size = env.action_space.n
print(*(f"{idx}:{meaning}" for idx, meaning in enumerate(env.get_action_meanings())))

0:NOOP 1:RIGHT 2:LEFT


In [8]:
def discount_rewards(r, gamma, reset_on_nonzero=False):
    """Compute normalised discounted returns for one episode.

    Parameters
    ----------
    r : sequence of numbers
        1-D per-step rewards, in the order they were received.
    gamma : float
        Discount factor applied once per step.
    reset_on_nonzero : bool, optional
        When True, the running sum is cleared at every non-zero reward.
        That suits games such as Pong, where a non-zero reward marks the end
        of a rally, but in environments that hand out a non-zero reward on
        every step it cancels the discounting altogether, so it is off by
        default.

    Returns
    -------
    numpy.ndarray
        float64 array with the same length as ``r`` holding the discounted
        returns, shifted to zero mean and scaled to unit standard deviation.
        If every return is identical (standard deviation of zero) the
        centred values (all zeros) are returned instead of dividing by zero.
        An empty input yields an empty array.
    """
    # Force float so integer rewards neither truncate the discounted values
    # nor make the in-place normalisation below fail on a dtype cast.
    r = np.asarray(r, dtype=np.float64)
    if r.size == 0:
        return r

    discounted_r = np.zeros_like(r)
    running_add = 0.0
    # Walk from the last reward to the first so each return is obtained from
    # the next one with a single multiply-add (Horner's method) instead of
    # explicit powers of gamma.
    for t in reversed(range(r.size)):
        if reset_on_nonzero and r[t] != 0:
            running_add = 0.0
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add

    # Normalise: zero mean, unit standard deviation.
    discounted_r -= discounted_r.mean()
    std = discounted_r.std()
    if std > 0:
        discounted_r /= std
    return discounted_r

class Agent:
    """Policy-gradient agent backed by a small dense Keras network.

    The agent samples actions from the network's softmax output and records,
    for the running episode, every network input, the sampled action (one-hot)
    and the reward received. ``train`` then fits the network on the sampled
    actions, using the normalised discounted rewards as per-sample weights.

    Parameters
    ----------
    gamma : float
        Discount factor handed to ``discount_rewards``.
    action_size : int
        Number of discrete actions, i.e. the width of the softmax layer.
    """

    # Green-channel values that preprocessFrame maps to 0 (background).
    _BACKGROUND_VALUES = (0, 236, 192, 214)

    def __init__(self, gamma=0.95, action_size=3):
        self.episode = 0
        self.action_size = action_size
        self.model = self._make_model()
        self.restart()
        self.gamma = gamma

        # When True, train() adds the episode's total reward to the final step.
        self.add_total = True

        # Save interval in episodes; None (or 0) disables autosaving.
        self.autosave = None

    def preprocessFrame(self, I):
        """Reduce an RGB frame to a binary foreground mask.

        Takes every second pixel of channel 1, crops rows 31:103 and columns
        4:76 (a 72x72 window for a 210x160 frame), and returns a float array
        that is 0.0 where the pixel is one of the background values and 1.0
        everywhere else.

        The input array is left untouched: the mask is built with
        ``np.isin``/``np.where`` rather than by assigning through views of
        the caller's frame.
        """
        I = I[::2, ::2, 1]
        I = I[31:103, 4:76]
        background = np.isin(I, self._BACKGROUND_VALUES)
        return np.where(background, 0.0, 1.0)

    def _make_model(self):
        """Build and compile the policy network (72*72 inputs -> softmax)."""
        model = Sequential()
        model.add(Dense(
            units=256,
            input_dim=72*72,
            activation='relu',
        ))
        model.add(Dense(
            units=self.action_size,
            activation='softmax',
        ))
        model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )
        return model

    def restart(self):
        """Clear everything recorded for the current episode."""
        self.x_train = []
        self.y_train = []
        self.rewards = []
        self.last = np.zeros(72*72)
        self.total_reward = 0

    def save_reward(self, reward):
        """Record the reward of the latest step and update the episode total."""
        self.rewards.append(reward)
        self.total_reward += reward

    def action(self, frame):
        """Sample an action for ``frame`` and remember it for training.

        Returns the index of the sampled action.
        """
        frame = self.preprocessFrame(frame).flatten()
        x = np.array([frame])
        probs = self.model.predict(x)[0]
        # Use the model's own output width so sampling and the one-hot label
        # always agree with the network, whatever its number of actions.
        n_actions = len(probs)
        y = np.random.choice(n_actions, p=probs)
        self.x_train.append(x)
        self.y_train.append(to_categorical(y, num_classes=n_actions))
        self.last = frame
        return y

    def train(self):
        """Fit the network on the recorded episode and advance the counter.

        Does nothing when no reward has been recorded. When ``add_total`` is
        set, the episode's total reward is added to the final step's reward
        on a copy, so calling ``train`` again does not add it twice.
        """
        if not self.rewards:
            return
        rewards = list(self.rewards)
        if self.add_total:
            rewards[-1] += self.total_reward
        self.model.fit(
            x=np.vstack(self.x_train),
            y=np.vstack(self.y_train),
            verbose=1,
            sample_weight=discount_rewards(rewards, self.gamma)
        )
        # A falsy interval (None or 0) disables autosave and avoids a modulo by zero.
        if self.autosave and self.episode % self.autosave == 0:
            self.save("last.h5")
            print("Saved!")
        self.episode += 1

    def set_autosave(self, interval):
        """Save the weights every ``interval`` episodes (None or 0 disables)."""
        self.autosave = interval

    def save(self, name):
        """Write the network weights to the file ``name``."""
        self.model.save_weights(name)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)
        

In [9]:
# Build an agent with the default settings and print its network layout.
agent = Agent()
agent.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 256)               1327360   
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 771       
Total params: 1,328,131
Trainable params: 1,328,131
Non-trainable params: 0
_________________________________________________________________


In [14]:
class RewardHist:
    """Bounded history of episode rewards with a short console report."""

    def __init__(self, maxlen=100):
        # Mean seen by the previous report(); starts at 0.
        self.last = 0
        # Only the most recent `maxlen` rewards are kept.
        self.mem = deque(maxlen=maxlen)

    def add(self, reward):
        """Append one episode reward to the history."""
        self.mem.append(reward)

    def _nparr(self):
        """Return the stored rewards as a NumPy array."""
        return np.array(self.mem)

    def max(self):
        """Largest reward currently in the history."""
        return np.max(self._nparr())

    def mean(self):
        """Average of the rewards currently in the history."""
        return np.mean(self._nparr())

    def report(self):
        """Print the current mean, its change since the last report, and the best reward."""
        current = self.mean()
        delta = current - self.last
        if current > self.last:
            marker = '▲'
        elif current < self.last:
            marker = '▼'
        else:
            marker = '-'
        print(f"Reward AVG: {current:8.2f} | {marker} {delta:8.2f}")
        print(f"Best: {self.max()}")
        self.last = current

In [None]:
# Train until the kernel is interrupted: play an episode, report its reward,
# fit the agent on it, then start the next episode.
agent.set_autosave(10)
observation = env.reset()
hist = RewardHist()
agent.restart()
try:
    while True:
        env.render()

        action = agent.action(observation)

        observation, reward, done, _ = env.step(action)

        agent.save_reward(reward)

        if done:
            print('# - = - = - = - #')
            print(f"Ep: {agent.episode:4}\nTotal reward: {agent.total_reward}")
            hist.add(agent.total_reward)
            hist.report()
            agent.train()
            agent.restart()

            observation = env.reset()
except KeyboardInterrupt:
    # The loop has no exit condition, so interrupting the kernel is the
    # expected way to stop training.
    print("Training interrupted.")
finally:
    # Always release the environment, which a bare `while True` followed by
    # env.close() never did.
    env.close()

# - = - = - = - #
Ep:   18
Total reward: -30000.0
Reward AVG: -30000.00 | ▼ -30000.00
Best: -30000.0
Epoch 1/1
# - = - = - = - #
Ep:   19
Total reward: -30808.0
Reward AVG: -30404.00 | ▼  -404.00
Best: -30000.0
Epoch 1/1
# - = - = - = - #
Ep:   20
Total reward: -30000.0
Reward AVG: -30269.33 | ▲   134.67
Best: -30000.0
Epoch 1/1
Saved!


In [None]:
# Play a single episode with the current policy and print its total reward.
observation = env.reset()
agent.restart()
done = False
while not done:
    env.render()
    action = agent.action(observation)
    observation, reward, done, _ = env.step(action)
    agent.save_reward(reward)

print(f"Total reward: {agent.total_reward}")
env.close()