In [8]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor

import matplotlib
import matplotlib.pyplot as plt

import cv2
import numpy as np
import random, math

from keras import models, layers, optimizers

from collections import deque



In [9]:
# Create the Skiing environment once to inspect its observation/action spaces.
# NOTE(review): this instance is never closed before `env` is rebound by a later cell.
env = gym.make('SkiingDeterministic-v4')

# First dimension of the observation shape (printed as 250 below) and the
# number of discrete actions (printed as 3 below).
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print(state_size, action_size)

# Human-readable names of the discrete actions, in action-index order.
actions = env.unwrapped.get_action_meanings()

print(actions)

# Number of transitions sampled from replay memory per training step.
batch_size = 32

# Number of episodes the training loop will play.
n_episodes = 10000

# Scratch check of np.random.choice; the value is not used anywhere else.
print(np.random.choice([2,3]))

250 3
['NOOP', 'RIGHT', 'LEFT']
2


In [10]:
# Sanity check: play one full episode with uniformly random actions while
# rendering, to confirm the environment and display work.
env = gym.make('SkiingDeterministic-v4')

try:
    observation = env.reset()
    done = False

    while not done:
        env.render()

        # Placeholder policy: pick one of the three actions uniformly at random.
        action = np.random.choice([0, 1, 2])

        observation, reward, done, info = env.step(action)
finally:
    # Release the emulator and render window even if the cell is interrupted
    # or an exception is raised mid-episode.
    env.close()

In [11]:
# Explicitly release the environment currently bound to `env`.
env.close()

In [12]:
def get_frame_reward(I, prev):
    """Score a raw frame by where the player sits relative to a gate.

    Only channel 1 of row 74, columns 8..151, is inspected. Within that strip
    the pixel values are interpreted as: 92 = player, 50 = red flag,
    72 = blue flag. Blue flag pixels take precedence over red ones.

    Args:
        I: frame array indexed as [row, column, channel].
        prev: value to return when the flag pixels do not form a pair.

    Returns:
        0 if the strip contains no flag pixel;
        1 if flag pixels are present but no player pixel is;
        1 / -1 if exactly two flag pixels are present and the mean player
        column lies inside (inclusive) / outside them;
        ``prev`` for any other number of flag pixels.
    """
    strip = I[74:75, 8:152, 1]

    blue_mask = strip == 72
    red_mask = strip == 50
    has_blue = blue_mask.any()

    # No gate on this scanline at all.
    if not (has_blue or red_mask.any()):
        return 0

    flag_cols = np.where(blue_mask if has_blue else red_mask)[1]
    player_cols = np.where(strip == 92)[1]

    if player_cols.size == 0:
        return 1

    # Anything other than a clean pair of flag pixels is inconclusive.
    if flag_cols.size != 2:
        return prev

    centre = player_cols.mean()
    left, right = flag_cols
    return 1 if left <= centre <= right else -1

## Define the Deep Q learning Agent

In [48]:
class DQNAgent:
    """Epsilon-greedy Deep Q-learning agent working on two stacked frames.

    The agent stores single-frame transitions in a replay memory and, when
    sampling, stacks each transition with its predecessor so the network sees
    two consecutive frames. The network input shape is fixed to (72, 72, 2);
    ``state_size`` is stored but not used by the model.
    """

    def __init__(self, state_size, action_size):
        """Set hyper-parameters, create an empty replay memory and build the model.

        Args:
            state_size: stored on the instance; not used to size the network.
            action_size: number of discrete actions (size of the output layer).
        """
        self.state_size = state_size
        self.action_size = action_size

        # Events that are near in time are too correlated and do not give
        # additional information, so training samples random past transitions.
        self.max_memory = 300000
        self.memory = []

        # Discount factor
        self.gamma = 0.99

        # Exploration: epsilon is lowered by a fixed step per training call
        # until it reaches epsilon_min.
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = (1 - self.epsilon_min) / 1000

        print(self.epsilon_decay)

        self.learning_rate = 0.00025

        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras Sequential model mapping a (72, 72, 2) input to
            one linear Q-value per action, trained with MSE loss and RMSprop.
        """
        model = models.Sequential()

        model.add(layers.Conv2D(16, kernel_size = (8,8), strides=(4,4), 
                                padding = 'valid', 
                                kernel_initializer='glorot_uniform', 
                                input_shape=(72, 72, 2)))
        model.add(layers.LeakyReLU(alpha=0.3))
        model.add(layers.Conv2D(32, kernel_size = (4,4), strides=(2,2), 
                                padding = 'valid',
                                kernel_initializer='glorot_uniform'))
        model.add(layers.LeakyReLU(alpha=0.3))

        model.add(layers.Flatten())
        model.add(layers.Dense(256, kernel_initializer='glorot_uniform', 
                               activation='relu'))
        model.add(layers.Dense(self.action_size, 
                               kernel_initializer='glorot_uniform', activation='linear'))

        model.compile(loss='mse', optimizer= optimizers.RMSprop(lr=self.learning_rate, rho=0.95, epsilon=0.01))

        return model

    def remember(self, state, action, reward, next_state, done):
        '''
            Append one transition to the replay memory.

            state, action, reward at current time
            next_state is the state that occurs after the state-action
            done is if the episode ended

            The oldest transition is evicted first so the memory never holds
            more than ``max_memory`` entries.
        '''
        if self.memory and len(self.memory) >= self.max_memory:
            self.memory.pop(0)

        self.memory.append((state, action, reward, next_state, done))

    def action(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: batch-of-one network input, passed straight to
                ``model.predict``.

        Returns:
            The action index as a plain ``int``.
        """
        # Exploration mode
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        # Use what action is predicted by the model as the best choice
        act_values = self.model.predict(state)

        return int(np.argmax(act_values[0]))

    def _stacked_transition(self, i):
        """Combine memory[i] with its predecessor into one two-frame transition."""
        prev_state, _, prev_reward, prev_next, prev_done = self.memory[i - 1]
        state, action, reward, next_state, done = self.memory[i]

        if prev_done:
            # memory[i] opens a new episode: its predecessor belongs to the
            # previous one, so repeat the current frame instead of mixing
            # frames (and rewards) from two different episodes.
            first_frame, first_next = state, state
            stacked_reward = reward
        else:
            first_frame, first_next = prev_state, prev_next
            stacked_reward = np.sum((prev_reward, reward))

        return (
            np.expand_dims(np.stack((first_frame, state), axis=2), axis=0),
            action,
            stacked_reward,
            np.expand_dims(np.stack((first_next, next_state), axis=2), axis=0),
            done,
        )

    def get_batch(self, batch_size):
        """Sample ``batch_size`` distinct two-frame transitions from memory.

        Every transition that has a predecessor (index >= 1) is eligible.

        Returns:
            A list of ``(state, action, reward, next_state, done)`` tuples
            where ``state`` and ``next_state`` have a leading batch axis of 1
            and two stacked frames on the last axis.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are eligible
                (raised by ``random.sample``).
        """
        minibatch = random.sample(range(1, len(self.memory)), batch_size)

        return [self._stacked_transition(i) for i in minibatch]

    def train(self, batch_size):
        """Run one Q-learning update on a random minibatch and decay epsilon.

        The whole minibatch is evaluated and fitted in a single pass (two
        ``predict`` calls and one ``fit``) instead of once per sample.
        """
        batch = self.get_batch(batch_size)

        if batch:
            states = np.concatenate([item[0] for item in batch], axis=0)
            actions = np.array([item[1] for item in batch], dtype=int)
            rewards = np.array([item[2] for item in batch], dtype=float)
            next_states = np.concatenate([item[3] for item in batch], axis=0)
            dones = np.array([item[4] for item in batch], dtype=bool)

            # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise.
            best_next_q = np.amax(self.model.predict(next_states), axis=1)
            targets = np.where(dones, rewards, rewards + self.gamma * best_next_q)

            # Only the Q-value of the action actually taken is moved towards
            # the target; the other outputs keep their current predictions.
            target_f = self.model.predict(states)
            target_f[np.arange(len(batch)), actions] = targets

            self.model.fit(states, target_f, batch_size=len(batch), epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the last step cannot undershoot epsilon_min.
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)

In [49]:
# Build the agent (this also constructs and compiles its Keras model) and
# print the layer-by-layer summary of the network.
agent = DQNAgent(state_size, action_size)
agent.model.summary()

0.0009
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_15 (Conv2D)           (None, 17, 17, 16)        2064      
_________________________________________________________________
leaky_re_lu_15 (LeakyReLU)   (None, 17, 17, 16)        0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 7, 7, 32)          8224      
_________________________________________________________________
leaky_re_lu_16 (LeakyReLU)   (None, 7, 7, 32)          0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 1568)              0         
_________________________________________________________________
dense_15 (Dense)             (None, 256)               401664    
_________________________________________________________________
dense_16 (Dense)             (None, 3)                 771       
Tot

In [50]:
def preprocessFrame(I):
    """Turn a raw frame into a binary foreground mask without modifying it.

    Channel 1 of the frame is downsampled by 2 in both directions and cropped
    to rows 31..102 and columns 4..75 of the downsampled image. Pixels whose
    value is 0, 236, 192 or 214 become 0.0; every other pixel becomes 1.0.

    Args:
        I: frame array indexed as [row, column, channel]. It is left
            untouched (the slices are views, so writing into them in place
            would overwrite the caller's frame).

    Returns:
        A new float array of 0.0 / 1.0 values with the cropped shape.
    """
    crop = I[::2, ::2, 1][31:103, 4:76]
    background = np.isin(crop, (0, 236, 192, 214))
    return np.where(background, 0.0, 1.0)


In [51]:
# Train the agent: play n_episodes episodes, storing every transition in the
# replay memory and running one training step per environment step once
# enough transitions have been collected.
env = gym.make('SkiingDeterministic-v4')

try:
    for e in range(n_episodes):

        state = preprocessFrame(env.reset())
        # Two-frame history fed to the network; at the start of an episode
        # both slots hold the first frame.
        states = deque((state, state), maxlen=2)
        total_reward = 0
        done = False

        while not done:

            env.render()
            states_tensor = np.stack(states, axis=2).reshape((1, 72, 72, 2))

            # Epsilon-greedy action chosen by the agent
            action = agent.action(states_tensor)

            next_state, reward, done, info = env.step(action)
            next_state = preprocessFrame(next_state)

            # Running episode score, used for logging only
            total_reward += reward

            # Store the per-step reward: the Q-learning target adds the
            # discounted future value itself, so storing the cumulative
            # episode score here would count past rewards again and again.
            agent.remember(state, action, reward, next_state, done)

            state = next_state
            states.append(next_state)

            if len(agent.memory) > batch_size + 4:
                agent.train(batch_size)

        if e % 50 == 0:
            print("Episode: {}/{}, score: {}, e: {:.9}, m: {}".format(e, n_episodes, total_reward, agent.epsilon, len(agent.memory)))

        # Checkpoint after every episode (overwrites the previous file)
        agent.save('max_reward_weights.hdf5')

finally:
    env.close()


Instructions for updating:
Use tf.cast instead.


KeyboardInterrupt: 

In [0]:

# Play one rendered episode with the current agent.
env = gym.make('SkiingDeterministic-v4')

try:
    # Feed the network the same input it was trained on: preprocessed frames
    # stacked in pairs, shape (1, 72, 72, 2). Reshaping the raw frame to
    # [1, state_size] cannot work because state_size is only the first
    # dimension of the observation shape.
    state = preprocessFrame(env.reset())
    states = deque((state, state), maxlen=2)

    total_reward = 0
    done = False

    while not done:

        env.render()

        states_tensor = np.stack(states, axis=2).reshape((1, 72, 72, 2))

        # NOTE: agent.action is epsilon-greedy, so a fraction agent.epsilon
        # of the moves is still random.
        action = agent.action(states_tensor)

        next_state, reward, done, info = env.step(action)

        total_reward += reward

        states.append(preprocessFrame(next_state))

finally:
    env.close()
    # show_video is not defined in the visible notebook; call it only if some
    # other cell provided it, so cleanup does not end in a NameError.
    if callable(globals().get("show_video")):
        show_video()