In [1]:
from keras import layers
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np
import gym
import time
from RunHist import RewardHist

Using TensorFlow backend.


In [2]:
env = gym.make('SkiingDeterministic-v4')
action_size = env.action_space.n

In [3]:
def get_frame_reward(I, prev):
    I = I[:, :, 1]
    I = I[74:75, 8:152]  # Jugador 92, bandera roja 50, bandera azul 72
    if 72 not in I and 50 not in I:
        return 0
    if 72 in I:
        flags = np.where(I == 72)
    else:
        flags = np.where(I == 50)

    player = np.where(I == 92)[1]

    if len(player) == 0:
        return 1

    player = player.mean()

    if len(flags[1]) == 2:
        if player >= flags[1][0] and player <= flags[1][1]:
            return 1
        else:
            return -1
    else:
        return prev

# reward discount used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def discount_rewards(r, gamma):
    """ take 1D float array of rewards and compute discounted reward """
    r = np.array(r)
    discounted_r = np.zeros_like(r)
    running_add = 0
    # we go from last reward to first one so we don't have to do exponentiations
    for t in reversed(range(0, r.size)):
        if r[t] != 0:
            # if the game ended (in Pong), reset the reward sum
            running_add = 0
        # the point here is to use Horner's method to compute those rewards efficiently
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    discounted_r -= np.mean(discounted_r)  # normalizing the result
    discounted_r /= np.std(discounted_r)  # idem
    return discounted_r

#other code
def get_pos_player(observe):
  ids = np.where(np.sum(observe == [214, 92, 92], -1) == 3)
  return ids[0].mean(), ids[1].mean()

def get_pos_flags(observe):
  if np.any(np.sum(observe == [184, 50, 50], -1) == 3):
    ids = np.where(np.sum(observe == [184, 50, 50], -1) == 3)
    return ids[0].mean(), ids[1].mean()
  else:
    base = 0
    ids = np.where(np.sum(observe[base:-60] == [66, 72, 200], -1) == 3)
    return ids[0].mean() + base, ids[1].mean()

def get_speed(observe, observe_old):
  min_val = np.inf
  min_idx = 0
  for k in range(0, 7):
    val = np.sum(np.abs(observe[54:-52,8:152] - observe_old[54+k:-52+k,8:152]))
    if min_val > val:
      min_idx = k
      min_val = val
  return min_idx


In [4]:
class Skier:
    def __init__(self, gamma=0.95, epsilon=1, e_min=0.05, e_decay=0.99, ideal_flag_interval=25):
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = e_min
        self.epsilon_decay = e_decay

        self.episode = 0

        self.ideal_flag_interval = ideal_flag_interval

        self.autosave = None

        self.model = self._make_model()

        self.reset()

    def _make_model(self):
        model = Sequential()

        model.add(layers.Conv2D(8, (3, 3), activation='relu', input_shape=(146, 144, 2)))
        model.add(layers.Conv2D(8, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(16, (3, 3), activation='relu'))
        model.add(layers.Conv2D(16, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Flatten())
        model.add(layers.Dense(units=512,activation='relu'))
        model.add(layers.Dense(units=3,activation='softmax'))
        model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
        return model

    def preprocessFrame(self, I):
        """ 
        Outputs a 72x72 image where background is black
        and important game elements are white.
        Output is [0,1]
        """
        I = I[57:203, 8:152, 1]
        return I/255

    def decay(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def reset(self):
        self.last_reward = 0
        self.rewards = []
        self.train_x = []
        self.train_y = []
        self.lr_counter = 0
        self.missing_flags = 20
        self.last_frame = np.zeros((146, 144))

    def action(self, frame, training=False):
        frame = self.preprocessFrame(frame)
        data = np.zeros((146, 144, 2))
        data[:,:,0] = self.last_frame
        data[:,:,0] = frame
        self.last_frame = frame
        probs = self.model.predict(np.expand_dims(data, 0))
        #y = np.random.choice([0, 1, 2], p=probs[0])
        y = np.argmax(probs[0])
        print(probs[0], end='\r')
        
        if float('nan') in probs[0]:
            print("NANANANANANANANANANANANANANANANANANA", probs[0])
            exit()

        # Append flattened frame to x_train
        self.train_x.append(data)
        # Append selected action to y_train
        self.train_y.append(to_categorical(y, num_classes=3))
        return y

    def register_frame(self, frame, rew = 0):
        frame_reward = get_frame_reward(frame, self.last_reward)
        reward = rew
        self.lr_counter += 1
        if frame_reward == 0 and self.last_reward != 0:
            reward += self.last_reward + 2
            reward -= 0.25 * np.tanh((0.05 * (self.lr_counter - self.ideal_flag_interval)))
            self.lr_counter = 0
            self.missing_flags -= 1
            print(reward, end='\r')
        self.last_reward = frame_reward

        self.rewards.append(reward)

    def done(self):
        self.rewards[-1] -= self.missing_flags * 5

    def train(self, verbose=0):
        if self.autosave is not None and self.episode % self.autosave == 0:
            self.save("last_lf.h5")
            print("Saved!")
        
        #print("missed:", self.missing_flags, "flags")
        #self.rewards[-1] -= self.missing_flags * 5
        sample_weights = discount_rewards(self.rewards, self.gamma)
        self.model.fit(
            x=np.array(self.train_x),
            y=np.vstack(self.train_y),
            verbose=verbose,
            sample_weight=sample_weights
        )
        self.episode += 1
        self.decay()

    def total_reward(self):
        return np.array(self.rewards).sum()

    def set_autosave(self, interval):
        self.autosave = interval

    def save(self, name):
        self.model.save_weights(name)

    def load(self, name):
        self.model.load_weights(name)


In [5]:
agent = Skier(gamma=0.99, e_decay=0.995)
agent.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 144, 142, 8)       152       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 142, 140, 8)       584       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 71, 70, 8)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 69, 68, 16)        1168      
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 67, 66, 16)        2320      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 33, 33, 16)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 17424)             0         
__________

In [64]:

#agent.load("last.h5")

agent.set_autosave(10)
observation = env.reset()
hist = RewardHist(100)
agent.reset()

observation_old = observation

#other code

cnt = 0
r_a, c_a = get_pos_player(observation)
r_f, c_f = get_pos_flags(observation)
r_a_old, c_a_old = r_a, c_a
#------------

env_reward = 0

while True:
    env.render()
    #action = agent.action(observation, training=True)
    

    #other code
    # TEACHER
    v_f = np.arctan2(r_f - r_a, c_f - c_a) # direction from player to target
    spd = get_speed(observation, observation_old)
    v_a = np.arctan2(spd, c_a - c_a_old) # speed vector of the player
    r_a_old, c_a_old = r_a, c_a
    
    
    
    frame = agent.preprocessFrame(observation)
    data = np.zeros((146, 144, 2))
    data[:,:,0] = agent.last_frame
    data[:,:,1] = frame
    agent.last_frame = frame
        
    
    observation_old = observation
    if np.random.rand() <= agent.epsilon:
        if spd == 0 and (c_a - c_a_old) == 0:
            cnt += 1
            act_t = np.random.choice(3, 1)[0]
        else:
            cnt = 0
            if v_f - v_a < -0.1:
                act_t = 1
            elif v_f - v_a > 0.1:
                act_t = 2
            else:
                act_t = 0

    else:
        if spd == 0 and (c_a - c_a_old) == 0:
            cnt += 1
            act_t = np.random.choice(3, 1)[0]
        else:
            cnt = 0
            probs = agent.model.predict(np.expand_dims(data, 0))
            act_t = np.argmax(probs[0])

            if float('nan') == probs[0][0]:
                print("NANANANANANANANANANANANANANANANANANA", probs[0])
                exit()


    
    # Append flattened frame to x_train
    agent.train_x.append(data)
    # Append selected action to y_train
    agent.train_y.append(to_categorical(act_t, num_classes=3))
    
    observation, reward, done, info = env.step(act_t)
    r_a, c_a = get_pos_player(observation)
    r_f, c_f = get_pos_flags(observation)
    #------------

    
    
    if cnt > 10:
        done = True
        agent.register_frame(observation,-100.0)
    else:
        agent.register_frame(observation)
    
    if done:
        #agent.done()
        total_reward = agent.total_reward()
        hist.add(total_reward)

        if agent.episode % 1 == 0:
            print('\n# - = - = - = - #')
            print(
                f"Ep: {agent.episode:4}\nTotal reward: {total_reward:.3f}\nEpsilon: {agent.epsilon:.4f}")
            hist.report()
        agent.train(verbose=1)
        agent.reset()

        observation = env.reset()

2.8945252486874986
# - = - = - = - #
Ep:  227
Total reward: 57.330
Epsilon: 0.3205
Reward AVG:    57.33 | ▲    57.33
Best: 57.33012044584222
Epoch 1/1
2.9387703343990728
# - = - = - = - #
Ep:  228
Total reward: 57.389
Epsilon: 0.3189
Reward AVG:    57.36 | ▲     0.03
Best: 57.38923569145897
Epoch 1/1
2.8748699472024413
# - = - = - = - #
Ep:  229
Total reward: -79.597
Epsilon: 0.3173
Reward AVG:    11.71 | ▼   -45.65
Best: 57.38923569145897
Epoch 1/1

# - = - = - = - #
Ep:  230
Total reward: -100.000
Epsilon: 0.3157
Reward AVG:   -16.22 | ▼   -27.93
Best: 57.38923569145897
Saved!


TypeError: Cannot cast ufunc subtract output from dtype('float64') to dtype('int32') with casting rule 'same_kind'

In [65]:
agent.save("last_lf_best.h5")
env.close()

In [58]:

#agent.load("last.h5")
observation = env.reset()



while True:
    env.render()

    
    
    frame = agent.preprocessFrame(observation)
    data = np.zeros((146, 144, 2))
    data[:,:,0] = agent.last_frame
    data[:,:,0] = frame
    agent.last_frame = frame
        
    



    cnt = 0
    probs = agent.model.predict(np.expand_dims(data, 0))
    act_t = np.argmax(probs[0])

    if float('nan') in probs[0]:
        print("NANANANANANANANANANANANANANANANANANA", probs[0])
        exit()
    
    observation, reward, done, info = env.step(act_t)


    
    
    if done:
        break
        
env.close()