In [8]:
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np
import gym
import time
from collections import deque

In [9]:
env = gym.make('SkiingDeterministic-v4')
action_size = env.action_space.n
print(" ".join( [f"{i}:{a}" for i,a in enumerate(env.get_action_meanings())]))

0:NOOP 1:RIGHT 2:LEFT


In [97]:
def discount_rewards(r, gamma):
    """ take 1D float array of rewards and compute discounted reward """
    r = np.array(r)
    discounted_r = np.zeros_like(r)
    running_add = 0
    # we go from last reward to first one so we don't have to do exponentiations
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t] # the point here is to use Horner's method to compute those rewards efficiently
        discounted_r[t] = running_add
    #discounted_r -= np.mean(discounted_r) #normalizing the result
    #discounted_r /= np.std(discounted_r) #idem
    return discounted_r

class Agent:
    def __init__(self, gamma=0.95, epsilon=1, e_min=0.05, e_decay=0.99):
        self.episode = 0
        self.frame = 0
        self.model = self._make_model()
        self.restart()
        self.gamma = gamma
        
        self.autosave = None
        
        self.epsilon = epsilon
        self.epsilon_min = e_min
        self.epsilon_decay = e_decay
        
    def decay(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    def preprocessFrame(self,I):
        """ 
        Outputs a 72x72 image where background is black
        and important game elements are white.
        Output is [0,1]
        """
        I = I[::2,::2,1]
        I = I[31:103,4:76]
        I[I == 236] = 0
        I[I == 192] = 0
        I[I == 214] = 0
        I[I != 0] = 255
        return I/255
    
    def _make_model(self):
        model = Sequential()
        model.add(Dense(
            units=256,
            input_dim=72*72,
            activation='relu',
            #kernel_initializer='glorot_uniform'
        ))
        model.add(Dense(
            units=128,
            activation='relu'
        ))
        model.add(Dense(
            units=3,
            activation='softmax',
            #kernel_initializer='RandomNormal'
        ))
        model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )
        return model
    
    def restart(self):
        self.x_train = []
        self.y_train = []
        self.rewards = []
        self.last = np.zeros(72*72)
        self.final_reward = 0
        self.frame_counter = 0
        
    def done(self, reward):
        time_r = 4507 / self.frame_counter - 1
        flag_r = 5*(20 - (-reward) // 500) if time_r > 0 else -10
        self.final_reward = time_r + flag_r
    
    def action(self, frame, training=False):
        self.frame_counter  += 1
        frame = self.preprocessFrame(frame).flatten()
        #x = np.array([frame - self.last])
        x = np.array([frame])
        probs = self.model.predict(x)
        y = np.random.choice([0,1,2], p=probs[0])
        
        if not training:
            return y
        else:
            # Explore a bit
            if np.random.rand() <= self.epsilon:
                y = np.random.choice([0,1,2])
                
        # Append flattened frame to x_train 
        self.x_train.append(frame)
        # Append selected action to y_train
        self.y_train.append(to_categorical(y, num_classes=3))
        # Append a 0 to sample weight 
        # Will be updated later
        self.rewards.append(0)
        self.last = frame
        return y
        
    def train(self):
        self.rewards[-1] = self.final_reward
        #print(np.vstack(self.x_train).shape)
        #print(discount_rewards(self.rewards, self.gamma))
        #print(len(self.rewards))
        self.model.fit(
            x=np.vstack(self.x_train),
            y=np.vstack(self.y_train),
            verbose=0,
            sample_weight=discount_rewards(self.rewards, self.gamma)
        )
        if self.autosave is not None and self.episode % self.autosave == 0:
            self.save("last.h5")
            print("Saved!")
        self.episode += 1
        self.decay()
        
    def set_autosave(self, interval):
        self.autosave = interval
        
    def save(self, name):
        self.model.save_weights(name)
        
    def load(self, name):
        self.model.load_weights(name)
        

In [98]:
agent = Agent(gamma=0.95, e_decay=0.95)
agent.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_55 (Dense)             (None, 256)               1327360   
_________________________________________________________________
dense_56 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_57 (Dense)             (None, 3)                 387       
Total params: 1,360,643
Trainable params: 1,360,643
Non-trainable params: 0
_________________________________________________________________


In [99]:
class RewardHist:
    def __init__(self, maxlen=100):
        self.mem = deque(maxlen=maxlen)
        self.last = 0
    
    def add(self, reward):
        self.mem.append(reward)
    
    def _nparr(self):
        return np.array(self.mem)
    
    def max(self):
        return self._nparr().max()
    
    def mean(self):
        return self._nparr().mean()
    
    def report(self):
        mean = self.mean()
        symbol = '▲' if mean > self.last else '▼' if mean < self.last else '-'
        print(f"Reward AVG: {mean:8.2f} | {symbol} {(mean - self.last):8.2f}")
        print(f"Best: {self.max()}")
        self.last = mean

In [100]:
agent.set_autosave(10)
observation = env.reset()
hist = RewardHist(100)
agent.restart()
while True:
    env.render()
    
    action = agent.action(observation, training=True)
    
    observation, reward, done, _ = env.step(action)
    
    if done:
        agent.done(reward)
        hist.add(agent.final_reward)
        if agent.episode % 25 == 0:
            print('# - = - = - = - #')
            print(f"Ep: {agent.episode:4}\nTotal reward: {agent.final_reward:.3f}\nEpsilon: {agent.epsilon:.4f}")
            hist.report()
        agent.train()
        agent.restart()
        
        observation = env.reset()
        #break
        
env.close()

# - = - = - = - #
Ep:    0
Total reward: 27.110
Epsilon: 1.0000
Reward AVG:    27.11 | ▲    27.11
Best: 27.1104209799862
Saved!
Saved!
Saved!
# - = - = - = - #
Ep:   25
Total reward: 6.521
Epsilon: 0.2774
Reward AVG:    18.95 | ▼    -8.16
Best: 58.11598173515982
Saved!
Saved!
# - = - = - = - #
Ep:   50
Total reward: 23.086
Epsilon: 0.0769
Reward AVG:    17.34 | ▼    -1.61
Best: 58.11598173515982
Saved!
Saved!
Saved!
# - = - = - = - #
Ep:   75
Total reward: 17.430
Epsilon: 0.0485
Reward AVG:    14.30 | ▼    -3.05
Best: 58.11598173515982
Saved!
Saved!
# - = - = - = - #
Ep:  100
Total reward: -10.000
Epsilon: 0.0485
Reward AVG:    10.20 | ▼    -4.10
Best: 58.11598173515982
Saved!
Saved!
Saved!
# - = - = - = - #
Ep:  125
Total reward: 18.719
Epsilon: 0.0485
Reward AVG:     7.55 | ▼    -2.65
Best: 32.21011396011396
Saved!
Saved!
# - = - = - = - #
Ep:  150
Total reward: 20.560
Epsilon: 0.0485
Reward AVG:     7.14 | ▼    -0.42
Best: 56.38852459016393
Saved!
Saved!
Saved!
# - = - = - = - #
Ep:

KeyboardInterrupt: 

In [66]:
env.close()

In [95]:
observation = env.reset()
agent.restart()
while True:
    env.render()
    
    action = agent.action(observation)
        
    observation, reward, done, _ = env.step(action)
    
    if done:
        agent.done(reward)
        print(f"Final reward: {agent.final_reward}")
        break
        
        
env.close()

Final reward: 12.33851851851852
