In [1]:
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np
import gym
import time
from collections import deque

Using TensorFlow backend.


In [2]:
# Create the Skiing environment and list the discrete actions it exposes.
env = gym.make('SkiingDeterministic-v4')
action_size = env.action_space.n

# Print every action index next to its human-readable meaning, e.g. "0:NOOP".
action_labels = []
for idx, meaning in enumerate(env.get_action_meanings()):
    action_labels.append(f"{idx}:{meaning}")
print(" ".join(action_labels))

0:NOOP 1:RIGHT 2:LEFT


In [33]:
def discount_rewards(r, gamma):
    """Compute discounted returns for a 1D sequence of per-step rewards.

    The sequence is walked backwards. Every non-zero reward marks the end of
    a game "stage": the running return is reset there, so a reward is only
    propagated (discounted by ``gamma`` per step) to the zero-reward steps
    that precede it, up to the previous non-zero reward.

    Args:
        r: 1D sequence (list or array) of numeric rewards.
        gamma: Discount factor applied per step.

    Returns:
        A float ``numpy.ndarray`` with the same length as ``r`` holding the
        discounted return of every step. No normalisation is applied.
    """
    # Force a float array: with an all-integer input, ``zeros_like`` would
    # produce an integer buffer and silently truncate the discounted values.
    r = np.asarray(r, dtype=float)
    discounted_r = np.zeros_like(r)
    running_add = 0.0
    # Go from the last reward to the first one so no exponentiation is needed
    # (Horner's method).
    for t in reversed(range(r.size)):
        # Value comparison, not identity: ``r[t] is not 0`` is always True for
        # a NumPy scalar, which reset the accumulator on every single step.
        if r[t] != 0:
            running_add = 0.0  # Reset running add for each game "stage"
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

class Agent:
    """Epsilon-exploring policy agent for the Atari Skiing environment.

    The agent feeds a flattened, binarised 72x72 view of each frame into a
    small dense softmax network, samples an action from the predicted
    distribution and, in training mode, records (frame, action) pairs. At the
    end of an episode ``train`` fits the network on those pairs, weighting
    each sample by its discounted reward.
    """

    def __init__(self, gamma=0.95, epsilon=1, e_min=0.05, e_decay=0.99):
        """Build the model and initialise the per-episode buffers.

        Args:
            gamma: Discount factor passed to ``discount_rewards`` in ``train``.
            epsilon: Initial probability of replacing the sampled action with
                a uniformly random one while training.
            e_min: ``epsilon`` stops decaying once it is no longer above this.
            e_decay: Multiplicative decay applied to ``epsilon`` per episode.
        """
        self.episode = 0
        self.frame = 0
        self.model = self._make_model()
        self.restart()
        self.gamma = gamma

        # Autosave interval in episodes; None disables autosaving.
        self.autosave = None

        self.epsilon = epsilon
        self.epsilon_min = e_min
        self.epsilon_decay = e_decay

    def decay(self):
        """Shrink ``epsilon`` by ``epsilon_decay`` while it exceeds ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def preprocessFrame(self, I):
        """Binarise a frame into a float image with values in {0.0, 1.0}.

        The green channel is downsampled by two in both directions and cropped
        to rows 31:103 and columns 4:76 (72x72 for a 210x160 frame). Pixels
        equal to 236, 192, 214 or 0 become 0.0 (treated as background); every
        other pixel becomes 1.0.

        The input frame is NOT modified: the result is a new array.
        """
        crop = I[::2, ::2, 1][31:103, 4:76]
        # Build the mask instead of assigning into ``crop``: ``crop`` is a view
        # of the caller's observation, so in-place writes would corrupt it.
        foreground = (crop != 236) & (crop != 192) & (crop != 214) & (crop != 0)
        return foreground.astype(np.float64)

    def cutoutScore(self, I):
        """Return a copy of the 10x18 green-channel window at rows 30:40, cols 65:83.

        Pixels equal to 236 are replaced by 255 in the returned copy.
        Presumably this window covers the on-screen score/timer - TODO confirm.
        The input frame is NOT modified.
        """
        # Copy first: slicing yields a view of the caller's observation.
        window = I[30:40, 65:83, 1].copy()
        window[window == 236] = 255
        return window

    def _make_model(self):
        """Create and compile the dense policy network.

        Architecture: 72*72 inputs -> Dense(256, relu) -> Dense(128, relu)
        -> Dense(3, softmax), compiled with categorical cross-entropy and Adam.
        """
        model = Sequential()
        model.add(Dense(
            units=256,
            input_dim=72*72,
            activation='relu',
        ))
        model.add(Dense(
            units=128,
            activation='relu'
        ))
        model.add(Dense(
            units=3,
            activation='softmax',
        ))
        model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )
        return model

    def restart(self):
        """Clear all per-episode state (training buffers, rewards, counters)."""
        self.x_train = []
        self.y_train = []
        self.rewards = []
        self.last = np.zeros(72*72)
        self.last_score_window = None
        self.final_reward = 0
        self.frame_counter = 0

    def record(self, obs):
        """Append this step's shaped reward based on the score window.

        Appends 10 when the score window of ``obs`` differs from the one seen
        on the previous call, otherwise 0 (also 0 on the first call after
        ``restart``).
        """
        A = self.cutoutScore(obs)
        if self.last_score_window is not None and not np.array_equal(A, self.last_score_window):
            self.rewards.append(10)
        else:
            self.rewards.append(0)
        self.last_score_window = A

    def done(self, reward):
        """Finalise the episode by storing a time-based terminal reward.

        The terminal reward is ``4507 / frame_counter - 1``, so fewer frames
        give a larger reward. When that value is exactly 0 (the episode took
        4507 frames - presumably the time-out length, TODO confirm) it is
        replaced by a penalty of -100.

        The value is stored in ``final_reward`` and as the last entry of
        ``rewards`` (appended if ``rewards`` is empty, e.g. when ``record``
        was never called during evaluation).

        Args:
            reward: Environment reward of the final step; currently unused.

        Raises:
            ZeroDivisionError: If called before any call to ``action``.
        """
        time_r = 4507 / self.frame_counter - 1
        # Equality, not identity: ``time_r is 0`` is never True for a float,
        # which made the penalty below unreachable.
        if time_r == 0:
            time_r = -100
        self.final_reward = time_r
        if self.rewards:
            self.rewards[-1] = time_r
        else:
            self.rewards.append(time_r)

    def total_reward(self):
        """Return the sum of all rewards recorded in the current episode."""
        return np.array(self.rewards).sum()

    def action(self, frame, training=False):
        """Choose an action for ``frame`` and optionally record it for training.

        Args:
            frame: Raw observation; it is preprocessed with ``preprocessFrame``.
            training: When True, the sampled action is replaced by a uniformly
                random one with probability ``epsilon``, and the flattened
                frame plus the one-hot action are appended to the buffers.

        Returns:
            The chosen action index (0, 1 or 2).
        """
        self.frame_counter += 1
        frame = self.preprocessFrame(frame).flatten()
        x = np.array([frame])
        # The network emits float32 probabilities whose sum can miss 1.0 by
        # more than ``np.random.choice`` tolerates; renormalise in float64.
        probs = np.asarray(self.model.predict(x)[0], dtype=np.float64)
        probs /= probs.sum()
        y = np.random.choice([0, 1, 2], p=probs)

        if not training:
            return y

        # Explore a bit
        if np.random.rand() <= self.epsilon:
            y = np.random.choice([0, 1, 2])

        # Append flattened frame to x_train
        self.x_train.append(frame)
        # Append selected action to y_train
        self.y_train.append(to_categorical(y, num_classes=3))
        self.last = frame
        return y

    def train(self, verbose=0):
        """Fit the model on the recorded episode and advance the episode state.

        Samples are weighted by ``discount_rewards(self.rewards, self.gamma)``,
        so ``rewards`` is expected to hold one entry per recorded training
        sample (one ``record`` call per ``action(..., training=True)`` call).
        Afterwards, weights are saved to "last.h5" when autosave is enabled
        and due, the episode counter is incremented and ``epsilon`` decays.
        """
        self.model.fit(
            x=np.vstack(self.x_train),
            y=np.vstack(self.y_train),
            verbose=verbose,
            sample_weight=discount_rewards(self.rewards, self.gamma)
        )
        if self.autosave is not None and self.episode % self.autosave == 0:
            self.save("last.h5")
            print("Saved!")
        self.episode += 1
        self.decay()

    def set_autosave(self, interval):
        """Save weights every ``interval`` episodes during ``train`` (None disables)."""
        self.autosave = interval

    def save(self, name):
        """Write the model weights to the file ``name``."""
        self.model.save_weights(name)

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)
        

In [34]:
# Build the agent with a faster exploration decay than the class default.
agent = Agent(
    gamma=0.95,
    e_decay=0.95,
)

# Print the layer-by-layer description of the policy network.
agent.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 256)               1327360   
_________________________________________________________________
dense_23 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_24 (Dense)             (None, 3)                 387       
Total params: 1,360,643
Trainable params: 1,360,643
Non-trainable params: 0
_________________________________________________________________


In [1]:
class RewardHist:
    """Bounded history of episode rewards with a simple trend report."""

    def __init__(self, maxlen=100):
        """Keep at most ``maxlen`` most recent rewards; older ones are dropped."""
        # Mean shown by the previous report; 0 until the first report.
        self.last = 0
        self.mem = deque(maxlen=maxlen)

    def add(self, reward):
        """Append ``reward`` to the history."""
        self.mem.append(reward)

    def _nparr(self):
        """Return the stored rewards as a NumPy array."""
        return np.array(self.mem)

    def max(self):
        """Return the largest stored reward."""
        return np.max(self._nparr())

    def mean(self):
        """Return the average of the stored rewards."""
        return np.mean(self._nparr())

    def report(self):
        """Print the current mean, its change since the last report, and the best reward."""
        current = self.mean()
        delta = current - self.last
        if current > self.last:
            arrow = '▲'
        elif current < self.last:
            arrow = '▼'
        else:
            arrow = '-'
        print(f"Reward AVG: {current:8.2f} | {arrow} {delta:8.2f}")
        print(f"Best: {self.max()}")
        # Remember this mean so the next report can show the trend.
        self.last = current

In [36]:
# Training loop: plays episodes forever and fits the agent after each one.
# Stop it by interrupting the kernel; the ``finally`` clause below then still
# closes the environment (previously ``env.close()`` was unreachable).
REPORT_EVERY = 1  # print a progress report every N episodes

agent.set_autosave(10)
observation = env.reset()
hist = RewardHist(100)
agent.restart()
try:
    while True:
        env.render()

        action = agent.action(observation, training=True)
        observation, reward, done, _ = env.step(action)

        # Shaped per-step reward derived from the new observation.
        agent.record(observation)

        if not done:
            continue

        # Episode finished: finalise rewards, report, learn, then start over.
        agent.done(reward)
        total_reward = agent.total_reward()
        hist.add(total_reward)
        if agent.episode % REPORT_EVERY == 0:
            print('# - = - = - = - #')
            print(f"Ep: {agent.episode:4}\nTotal reward: {total_reward:.3f}\nEpsilon: {agent.epsilon:.4f}")
            hist.report()
        agent.train()
        agent.restart()

        observation = env.reset()
finally:
    # Runs on KeyboardInterrupt as well, so the render window is released.
    env.close()

# - = - = - = - #
Ep:    0
Total reward: 32.535
Epsilon: 1.0000
Reward AVG:    32.53 | ▲    32.53
Best: 32.53490196078431
Saved!




# - = - = - = - #
Ep:    1
Total reward: 13.154
Epsilon: 0.9500
Reward AVG:    22.84 | ▼    -9.69
Best: 32.53490196078431
# - = - = - = - #
Ep:    2
Total reward: 44.627
Epsilon: 0.9025
Reward AVG:    30.11 | ▲     7.26
Best: 44.62671660424469
# - = - = - = - #
Ep:    3
Total reward: 62.224
Epsilon: 0.8574
Reward AVG:    38.13 | ▲     8.03
Best: 62.223891273247496
# - = - = - = - #
Ep:    4
Total reward: 42.591
Epsilon: 0.8145
Reward AVG:    39.03 | ▲     0.89
Best: 62.223891273247496
# - = - = - = - #
Ep:    5
Total reward: 52.688
Epsilon: 0.7738
Reward AVG:    41.30 | ▲     2.28
Best: 62.223891273247496
# - = - = - = - #
Ep:    6
Total reward: 32.670
Epsilon: 0.7351
Reward AVG:    40.07 | ▼    -1.23
Best: 62.223891273247496
# - = - = - = - #
Ep:    7
Total reward: 61.959
Epsilon: 0.6983
Reward AVG:    42.81 | ▲     2.74
Best: 62.223891273247496
# - = - = - = - #
Ep:    8
Total reward: 42.516
Epsilon: 0.6634
Reward AVG:    42.77 | ▼    -0.03
Best: 62.223891273247496
# - = - = - = - #


KeyboardInterrupt: 

In [18]:
# Close the environment by hand, e.g. after the training loop above was interrupted.
env.close()

In [95]:
# Evaluation: play a single episode without exploration or training.
observation = env.reset()
agent.restart()
try:
    while True:
        env.render()

        action = agent.action(observation)

        observation, reward, done, _ = env.step(action)

        # Keep the reward buffer populated: ``agent.done`` writes the terminal
        # reward into its last slot and would fail on an empty buffer.
        agent.record(observation)

        if done:
            agent.done(reward)
            # Read the terminal reward from the buffer ``done`` just updated.
            print(f"Final reward: {agent.rewards[-1]}")
            break
finally:
    # Release the render window even if the episode is interrupted.
    env.close()

Final reward: 12.33851851851852
