In [8]:
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np
import gym
import time
from collections import deque

In [9]:
env = gym.make('SkiingDeterministic-v4')
action_size = env.action_space.n
print(" ".join( [f"{i}:{a}" for i,a in enumerate(env.get_action_meanings())]))

0:NOOP 1:RIGHT 2:LEFT


In [46]:
def discount_rewards(r, gamma):
    """ take 1D float array of rewards and compute discounted reward """
    r = np.array(r)
    discounted_r = np.zeros_like(r)
    running_add = 0
    # we go from last reward to first one so we don't have to do exponentiations
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t] # the point here is to use Horner's method to compute those rewards efficiently
        discounted_r[t] = running_add
    #discounted_r -= np.mean(discounted_r) #normalizing the result
    #discounted_r /= np.std(discounted_r) #idem
    return discounted_r

class Agent:
    def __init__(self, gamma=0.95):
        self.episode = 0
        self.frame = 0
        self.model = self._make_model()
        self.restart()
        self.gamma = gamma
        
        self.add_total = True
        
        self.autosave = None
    
    def preprocessFrame(self,I):
        """ 
        Outputs a 72x72 image where background is black
        and important game elements are white.
        Output is [0,1]
        """
        I = I[::2,::2,1]
        I = I[31:103,4:76]
        I[I == 236] = 0
        I[I == 192] = 0
        I[I == 214] = 0
        I[I != 0] = 255
        return I/255
    
    def _make_model(self):
        model = Sequential()
        model.add(Dense(
            units=256,
            input_dim=72*72,
            activation='relu',
            #kernel_initializer='glorot_uniform'
        ))
        model.add(Dense(
            units=128,
            activation='relu'
        ))
        model.add(Dense(
            units=3,
            activation='softmax',
            #kernel_initializer='RandomNormal'
        ))
        model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )
        return model
    
    def restart(self):
        self.x_train = []
        self.y_train = []
        self.rewards = []
        self.last = np.zeros(72*72)
        self.final_reward = 0
        self.frame_counter = 0
        
    def done(self, reward):
        if done:
            time_r = 4507 / self.frame_counter - 1
            flag_r = 20 - (-reward) // 500 if time_r > 0 else -10
            self.final_reward = time_r + flag_r
    
    def action(self, frame):
        frame = self.preprocessFrame(frame).flatten()
        #x = np.array([frame - self.last])
        x = np.array([frame])
        probs = self.model.predict(x)
        y = np.random.choice([0,1,2], p=probs[0])
        # Append flattened frame to x_train 
        self.x_train.append(frame)
        # Append selected action to y_train
        self.y_train.append(to_categorical(y, num_classes=3))
        # Append a 0 to sample weight 
        # Will be updated later
        self.rewards.append(0)
        self.last = frame
        self.frame_counter  += 1 
        return y
        
    def train(self):
        if self.add_total: self.rewards[-1] = self.final_reward
        #print(np.vstack(self.x_train).shape)
        #print(discount_rewards(self.rewards, self.gamma))
        #print(len(self.rewards))
        self.model.fit(
            x=np.vstack(self.x_train),
            y=np.vstack(self.y_train),
            verbose=1,
            sample_weight=discount_rewards(self.rewards, self.gamma)
        )
        if self.autosave is not None and self.episode % self.autosave == 0:
            self.save("last.h5")
            print("Saved!")
        self.episode += 1
        
    def set_autosave(self, interval):
        self.autosave = interval
        
    def save(self, name):
        self.model.save_weights(name)
        
    def load(self, name):
        self.model.load_weights(name)
        

In [47]:
agent = Agent(gamma=0.98)
agent.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 256)               1327360   
_________________________________________________________________
dense_26 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_27 (Dense)             (None, 3)                 387       
Total params: 1,360,643
Trainable params: 1,360,643
Non-trainable params: 0
_________________________________________________________________


In [48]:
class RewardHist:
    def __init__(self, maxlen=100):
        self.mem = deque(maxlen=maxlen)
        self.last = 0
    
    def add(self, reward):
        self.mem.append(reward)
    
    def _nparr(self):
        return np.array(self.mem)
    
    def max(self):
        return self._nparr().max()
    
    def mean(self):
        return self._nparr().mean()
    
    def report(self):
        mean = self.mean()
        symbol = '▲' if mean > self.last else '▼' if mean < self.last else '-'
        print(f"Reward AVG: {mean:8.2f} | {symbol} {(mean - self.last):8.2f}")
        print(f"Best: {self.max()}")
        self.last = mean

In [49]:
agent.set_autosave(10)
observation = env.reset()
hist = RewardHist(25)
agent.restart()
while True:
    env.render()
    
    action = agent.action(observation)
    
    observation, reward, done, _ = env.step(action)
    
    if done:
        agent.done(reward)
        print('# - = - = - = - #')
        print(f"Ep: {agent.episode:4}\nTotal reward: {agent.final_reward:.3f}")
        hist.add(agent.final_reward)
        hist.report()
        agent.train()
        agent.restart()
        
        observation = env.reset()
        #break
        
env.close()

# - = - = - = - #
Ep:    0
Total reward: 3.172
Reward AVG:     3.17 | ▲     3.17
Best: 3.171710063335679
Epoch 1/1
Saved!
# - = - = - = - #
Ep:    1
Total reward: 5.855
Reward AVG:     4.51 | ▲     1.34
Best: 5.855431993156544
Epoch 1/1
# - = - = - = - #
Ep:    2
Total reward: 6.043
Reward AVG:     5.02 | ▲     0.51
Best: 6.0432140445644835
Epoch 1/1
# - = - = - = - #
Ep:    3
Total reward: 8.154
Reward AVG:     5.81 | ▲     0.78
Best: 8.153917050691245
Epoch 1/1
# - = - = - = - #
Ep:    4
Total reward: 6.285
Reward AVG:     5.90 | ▲     0.10
Best: 8.153917050691245
Epoch 1/1
# - = - = - = - #
Ep:    5
Total reward: 1.805
Reward AVG:     5.22 | ▼    -0.68
Best: 8.153917050691245
Epoch 1/1
# - = - = - = - #
Ep:    6
Total reward: 3.222
Reward AVG:     4.93 | ▼    -0.29
Best: 8.153917050691245
Epoch 1/1
# - = - = - = - #
Ep:    7
Total reward: 5.950
Reward AVG:     5.06 | ▲     0.13
Best: 8.153917050691245
Epoch 1/1
# - = - = - = - #
Ep:    8
Total reward: 4.462
Reward AVG:     4.99 | ▼ 

# - = - = - = - #
Ep:   40
Total reward: 15.135
Reward AVG:     4.10 | ▲     0.27
Best: 16.51984877126654
Epoch 1/1
Saved!
# - = - = - = - #
Ep:   41
Total reward: 16.352
Reward AVG:     4.61 | ▲     0.52
Best: 16.51984877126654
Epoch 1/1
# - = - = - = - #
Ep:   42
Total reward: 15.135
Reward AVG:     5.04 | ▲     0.42
Best: 16.51984877126654
Epoch 1/1
# - = - = - = - #
Ep:   43
Total reward: 16.352
Reward AVG:     5.57 | ▲     0.53
Best: 16.51984877126654
Epoch 1/1
# - = - = - = - #
Ep:   44
Total reward: 15.135
Reward AVG:     5.74 | ▲     0.17
Best: 16.51984877126654
Epoch 1/1
# - = - = - = - #
Ep:   45
Total reward: 15.135
Reward AVG:     6.03 | ▲     0.29
Best: 16.51984877126654
Epoch 1/1
# - = - = - = - #
Ep:   46
Total reward: 9.140
Reward AVG:     6.79 | ▲     0.77
Best: 16.51984877126654
Epoch 1/1
# - = - = - = - #
Ep:   47
Total reward: 15.135
Reward AVG:     7.80 | ▲     1.01
Best: 16.51984877126654
Epoch 1/1
# - = - = - = - #
Ep:   48
Total reward: 9.756
Reward AVG:     8.5

# - = - = - = - #
Ep:   80
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
Saved!
# - = - = - = - #
Ep:   81
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:   82
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:   83
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:   84
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:   85
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:   86
Total reward: 16.352
Reward AVG:    15.18 | ▲     0.05
Best: 16.35236541598695
Epoch 1/1
# - = - = - = - #
Ep:   87
Total reward: 15.135
Reward AVG:    15.18 | -     0.00
Best: 16.35236541598695
Epoch 1/1
# - = - = - = - #
Ep:   88
Total reward: 15.135
Reward AVG:

# - = - = - = - #
Ep:  120
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
Saved!
# - = - = - = - #
Ep:  121
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:  122
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:  123
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:  124
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:  125
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:  126
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:  127
Total reward: 15.135
Reward AVG:    15.14 | -     0.00
Best: 15.135379061371841
Epoch 1/1
# - = - = - = - #
Ep:  128
Total reward: 15.135
Reward AV

KeyboardInterrupt: 

In [28]:
env.close()

In [None]:
observation = env.reset()
agent.restart()
while True:
    env.render()
    
    action = agent.action(observation)
        
    observation, reward, done, _ = env.step(action)
    
    agent.save_reward(reward)
    
    if done:
        print(f"Total reward: {agent.total_reward}")
        break
        
        
env.close()