In [None]:
# test the environment
import gym
env = gym.make('CarRacing-v0')

for i_episode in range(1):
    observation = env.reset()
    for t in range(200):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, Dropout
from keras.optimizers import Adam, sgd

import random
import numpy as np
from collections import deque

import pickle, os, gzip

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [3]:
state_size = env.observation_space.shape
processed_shape = (84,84,1)
new_shape = (1,)+state_size
new_processed_shape= (1,)+processed_shape
continuous_action_size = env.action_space.shape[0]
action_size = 4

print("State shape : " + str(state_size))
print("New shape : " + str(new_shape))
print("New preprocessed shape : " + str(new_processed_shape))
print("Continuous action shape : " + str(continuous_action_size))
print("Action shape : " + str(action_size))

State shape : (96, 96, 3)
New shape : (1, 96, 96, 3)
New prrocessed shape : (1, 84, 84, 1)
Continuous action shape : 3
Action shape : 4


In [None]:
plt.imshow(observation)
plt.show()

plt.imshow(observation[:84,6:90,1] , cmap = plt.get_cmap('gray'))
plt.show()

In [4]:
def continuous_from_discrete(action):
    if (action==0):
        return [-1, 0, 0]
    if (action==1):
        return [1, 0, 0]
    if (action==2):
        return [0, 0, 0]
    #other actions that might be taken into account [-1, 0, 0.5]  [1, 0, 0.5]  [0, 1, 0]  [0, 0, 0.5]
    return [0, 1, 0]

def discrete_from_continuous(action):
    if (action[0] == -1):
        return 0
    if (action[0] == 1):
        return 1
    if (action[1] == 0):
        return 2
    return 3

def onehot_from_discrete(action):
    return [ 1 if i==action else 0 for i in range(action_size)]

def discrete_from_onehot(onehot):
    for i in range(len(onehot)):
        if (onehot[i]==1):
            return i

In [7]:
def preprocess(state):
    return np.reshape(state[:,:84,6:90,1], new_processed_shape)

In [85]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.3  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.0005
        self.model = self._build_model()

    def _build_model(self):
        # https://towardsdatascience.com/atari-reinforcement-learning-in-depth-part-1-ddqn-ceaa762a546f
        # this paper gave a nn architecture
        model = Sequential()
        model.add(Conv2D(filters = 32, kernel_size=8, strides=4, activation='relu', input_shape=processed_shape))
        model.add(Dropout(0.1))
        model.add(Conv2D(filters = 64, kernel_size=4, strides=2, activation='relu'))
        model.add(Dropout(0.1))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',metrics=['accuracy'], optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, epsilon = -1):
        if np.random.rand() <= epsilon:
            return random.randrange(self.action_size)
        state = preprocess(state)
        act_values = self.model.predict(state)[0]
        return np.argmax(act_values)  # returns action
    
    def in_grass(self, state):
        return state[0,82,42,1]>150

    def replay(self):
        batch_size=100
        states = []
        target_fs = []
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = preprocess(state)
            next_state = preprocess(next_state)
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            states.append(state)
            target_fs.append(target_f)
        history = self.model.fit(np.array(states)[:,0,:,:,:],
                                 np.array(target_fs)[:,0,:],
                                 batch_size=batch_size,
                                 verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return history
            
    def train(self, memory):
        batch_size=100
        states = []
        target_fs = []
        minibatch = random.sample(memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = preprocess(state)
            next_state = preprocess(next_state)
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            states.append(state)
            target_fs.append(target_f)
        history = self.model.fit(np.array(states)[:,0,:,:,:],
                                 np.array(target_fs)[:,0,:],
                                 batch_size=batch_size,
                                 verbose=0)
        return history

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [44]:
class DPAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # https://towardsdatascience.com/atari-reinforcement-learning-in-depth-part-1-ddqn-ceaa762a546f
        model = Sequential()
        model.add(Conv2D(filters = 32, kernel_size=(8,8), strides=(2,2), activation='relu', input_shape=processed_shape))
        model.add(Dropout(0.1))
        model.add(Conv2D(filters = 64, kernel_size=(3,3), strides=(1,1), activation='relu'))
        model.add(Dropout(0.1))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dense(self.action_size, activation='softmax'))
        model.compile(loss='categorical_crossentropy',metrics=['accuracy'], optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model

    def act(self, state):
        state = preprocess(state)
        act_values = self.model.predict(state)[0]
        return np.argmax(act_values)  # returns action
    
    def in_grass(self, state):
        return state[0,82,42,1]>150

    def train(self, memory):
        batch_size=100
        states = []
        actions = []
        minibatch = random.sample(memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = preprocess(state)
            action = onehot_from_discrete(action)
            states.append(state)
            actions.append([action])
        states = np.array(states)[:,0,:,:,:]
        actions = np.array(actions)[:,0,:]
        history = self.model.fit(states, actions, batch_size=100, verbose = 0)
        return history

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [19]:
class DDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.3  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.tau = 0.5
        self.model = self._build_model()
        self.target_model = self._build_model()

    def _build_model(self):
        # https://towardsdatascience.com/atari-reinforcement-learning-in-depth-part-1-ddqn-ceaa762a546f
        model = Sequential()
        model.add(Conv2D(filters = 32, kernel_size=8, strides=4, activation='relu', input_shape=processed_shape))
        model.add(Dropout(0.1))
        model.add(Conv2D(filters = 64, kernel_size=4, strides=2, activation='relu'))
        model.add(Dropout(0.1))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',metrics=['accuracy'], optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, epsilon = -1):
        if np.random.rand() <= epsilon:
            return random.randrange(self.action_size)
        state = preprocess(state)
        act_values = self.model.predict(state)[0]
        return np.argmax(act_values)  # returns action
    
    def in_grass(self, state):
        return state[0,82,42,1]>150

    def replay(self):
        batch_size=100
        states = []
        target_fs = []
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = preprocess(state)
            next_state = preprocess(next_state)
            target = reward
            if not done:
                target_next = self.model.predict(next_state)
                target_val = self.target_model.predict(next_state)
                a = np.argmax(target_next[0])
                target = reward + self.gamma * (target_val[0][a])  
            target_f = self.model.predict(state)
            target_f[0][action] = target
            states.append(state)
            target_fs.append(target_f)
        history = self.model.fit(np.array(states)[:,0,:,:,:],
                                 np.array(target_fs)[:,0,:],
                                 batch_size=batch_size,
                                 verbose=0)
        q_model_theta = self.model.get_weights()
        target_model_theta = self.target_model.get_weights()
        counter = 0
        for q_weight, target_weight in zip(q_model_theta, target_model_theta):
            target_weight = target_weight * (1-self.tau) + q_weight * self.tau
            target_model_theta[counter] = target_weight
            counter += 1
        self.target_model.set_weights(target_model_theta)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return history
            
    def train(self, memory):
        batch_size=100
        states = []
        target_fs = []
        minibatch = random.sample(memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = preprocess(state)
            next_state = preprocess(next_state)
            target = reward
            if not done:
                target_next = self.model.predict(next_state)
                target_val = self.target_model.predict(next_state)
                a = np.argmax(target_next[0])
                target = reward + self.gamma * (target_val[0][a])  
            target_f = self.model.predict(state)
            target_f[0][action] = target
            states.append(state)
            target_fs.append(target_f)
        history = self.model.fit(np.array(states)[:,0,:,:,:],
                                 np.array(target_fs)[:,0,:],
                                 batch_size=batch_size,
                                 verbose=0)
        '''
        # Soft Update
        q_model_theta = self.model.get_weights()
        target_model_theta = self.target_model.get_weights()
        counter = 0
        for q_weight, target_weight in zip(q_model_theta, target_model_theta):
            target_weight = target_weight * (1-self.tau) + q_weight * self.tau
            target_model_theta[counter] = target_weight
            counter += 1
        self.target_model.set_weights(target_model_theta)
        '''
        self.target_model.set_weights(self.model.get_weights())
        return history

    def load(self, name):
        self.model.load_weights(name)
        self.target_model.load_weights("target_"+name)

    def save(self, name):
        self.model.save_weights(name)
        self.target_model.save_weights("target_"+name)

In [None]:
warm_up = 50

In [13]:
def read_data():
    
    #get the data needed from the user gzip
    print("Reading data...")
    with gzip.open('./data/data.pkl.gzip','rb') as f:
        data = pickle.load(f)
    states = data["state"]
    next_states = data["next_state"]
    actions = data["action"]
    rewards = data["reward"]
    dones = data["terminal"]
    
    # put it in the good format, as in the agent memory
    memory = deque(maxlen=100000)
    NB_TRIALS = len(states)
    for ii in range(NB_TRIALS):
        for jj in range(warm_up,len(states[ii])):
            state = states[ii][jj]
            state = np.reshape(state, new_shape)
            action = discrete_from_continuous(actions[ii][jj])
            reward = rewards[ii][jj]
            next_state = next_states[ii][jj]
            next_state = np.reshape(next_state, new_shape)
            done = dones[ii][jj]
            memory.append((state, action, reward, next_state, done))
    
    return memory

In [86]:
agent = DQNAgent(state_size, action_size)
# agent.load("dqn_imitation_learning")

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_41 (Conv2D)           (None, 20, 20, 32)        2080      
_________________________________________________________________
conv2d_42 (Conv2D)           (None, 9, 9, 64)          32832     
_________________________________________________________________
flatten_21 (Flatten)         (None, 5184)              0         
_________________________________________________________________
dense_42 (Dense)             (None, 128)               663680    
_________________________________________________________________
dense_43 (Dense)             (None, 4)                 516       
Total params: 699,108
Trainable params: 699,108
Non-trainable params: 0
_________________________________________________________________


In [87]:
train_memory = read_data()

print("Training...")
for ii in range(1000):
    history = agent.train(train_memory)
    if ii%50 == 0:  # Compute the score on an average of 10 games
        scores = []
        for e in range(10):
            score = 0
            state = env.reset()
            state = np.reshape(state, new_shape)
            for time in range(1000):
                action = agent.act(state) if (time>warm_up) else 3
                next_state, reward, done, _ = env.step(continuous_from_discrete(action))
                next_state = np.reshape(next_state, new_shape)
                score += reward
                state = next_state
                if done:
                    break
            scores.append(score)
        avg = np.mean(scores)
        print(str(ii+1) + " - nn accuracy:" + str(history.history['accuracy'][0]))
        print(str(ii+1) + " - agent score:" + str(avg))
print("Complete")

"""
train_memory = read_data()
history = agent.train(train_memory)
"""

Reading data...
Training...
Track generation: 1221..1530 -> 309-tiles track
Track generation: 1097..1375 -> 278-tiles track
Track generation: 1169..1465 -> 296-tiles track
Track generation: 1044..1309 -> 265-tiles track
Track generation: 973..1227 -> 254-tiles track
Track generation: 1167..1466 -> 299-tiles track
retry to generate track (normal if there are not many of this messages)
Track generation: 1103..1392 -> 289-tiles track
Track generation: 1190..1492 -> 302-tiles track
Track generation: 1223..1533 -> 310-tiles track
Track generation: 1189..1490 -> 301-tiles track
Track generation: 1146..1439 -> 293-tiles track
retry to generate track (normal if there are not many of this messages)
Track generation: 1164..1454 -> 290-tiles track
1:0.59
1:-45.25260655149099
Track generation: 1018..1277 -> 259-tiles track
Track generation: 1143..1433 -> 290-tiles track
Track generation: 1327..1663 -> 336-tiles track
Track generation: 1095..1373 -> 278-tiles track
Track generation: 1216..1524 -> 3

Track generation: 949..1196 -> 247-tiles track
Track generation: 1107..1388 -> 281-tiles track
Track generation: 1219..1528 -> 309-tiles track
Track generation: 1121..1405 -> 284-tiles track
Track generation: 1313..1645 -> 332-tiles track
651:0.75
651:24.436219035875475
Track generation: 1341..1680 -> 339-tiles track
Track generation: 1118..1402 -> 284-tiles track
Track generation: 1005..1266 -> 261-tiles track
Track generation: 1067..1338 -> 271-tiles track
Track generation: 1248..1564 -> 316-tiles track
Track generation: 1207..1513 -> 306-tiles track
Track generation: 1043..1308 -> 265-tiles track
Track generation: 1144..1434 -> 290-tiles track
Track generation: 1177..1476 -> 299-tiles track
Track generation: 1091..1368 -> 277-tiles track
701:0.71
701:67.47341224478603
Track generation: 1220..1529 -> 309-tiles track
Track generation: 1155..1456 -> 301-tiles track
Track generation: 1136..1424 -> 288-tiles track
Track generation: 980..1229 -> 249-tiles track
Track generation: 969..1220

KeyboardInterrupt: 

In [83]:
agent.save("dqn_imitation_learning_v5.h5")

In [84]:
scores = []
for e in range(100):
    score = 0
    state = env.reset()
    state = np.reshape(state, new_shape)
    for time in range(1000):
        action = agent.act(state) if (time>warm_up) else 3
        next_state, reward, done, _ = env.step(continuous_from_discrete(action))
        next_state = np.reshape(next_state, new_shape)
        score += reward
        state = next_state
        if done:
            break
    scores.append(score)
avg = np.mean(scores)
print(str(ii+1) + ":" + str(avg))

Track generation: 1200..1505 -> 305-tiles track
Track generation: 1065..1338 -> 273-tiles track
retry to generate track (normal if there are not many of this messages)
Track generation: 1004..1259 -> 255-tiles track
Track generation: 1236..1548 -> 312-tiles track
Track generation: 1123..1408 -> 285-tiles track
Track generation: 1162..1457 -> 295-tiles track
Track generation: 1009..1265 -> 256-tiles track
Track generation: 1155..1448 -> 293-tiles track
Track generation: 1324..1659 -> 335-tiles track
Track generation: 1183..1483 -> 300-tiles track
Track generation: 1163..1458 -> 295-tiles track
Track generation: 1121..1414 -> 293-tiles track
Track generation: 1249..1565 -> 316-tiles track
Track generation: 1059..1328 -> 269-tiles track
Track generation: 1148..1439 -> 291-tiles track
Track generation: 1091..1369 -> 278-tiles track
Track generation: 1355..1698 -> 343-tiles track
Track generation: 1245..1551 -> 306-tiles track
retry to generate track (normal if there are not many of this me

In [31]:
# test the agent on the environment

for e in range(2):
    score = 0
    state = env.reset()
    state = np.reshape(state, new_shape)
    for time in range(1000):
        env.render()
        action = agent.act(state) if (time>32) else 3
        next_state, reward, done, _ = env.step(continuous_from_discrete(action))
        next_state = np.reshape(next_state, new_shape)
        score += reward
        #if agent.in_grass(next_state) and time>warm_up:
        #    done = True
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, score, agent.epsilon))
            break

Track generation: 1039..1303 -> 264-tiles track
episode: 0/1000, score: 884.7908745246934, e: 0.3
Track generation: 1247..1563 -> 316-tiles track
episode: 1/1000, score: 811.1111111110937, e: 0.3


In [None]:
# To train the agent after the learning from demonstration part (not efficient - the agent unlearn how to turn)
EPISODES = 100
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, new_shape)
    for time in range(1000):
        env.render()
        action = agent.act(state, agent.epsilon) if (time>warm_up) else 3
        next_state, reward, done, _ = env.step(continuous_from_discrete(action))
        next_state = np.reshape(next_state, new_shape)
        agent.memorize(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
    history = agent.replay()
    print("----------- " +str(e)+"/"+str(EPISODES)+ " ------ val_loss : "+str(history.history['accuracy']))
            
agent.save("dqn_imitation_learning_v2.h5")

In [76]:
env.close()