# Experiments on Reinforcement Learning

## The game

Let's imagine a game in which the player has to choose one out of ten boxes. Every game one of the boxes lights up and the player gets one point if he or she chooses that box and zero points otherwise. The player can make 10 choices for each game.

In practice, the environment $\mathcal{E}$ is given by a random number generator that extracts a random number from 1 to 10 according to some probability distribution. The agent chooses one box 10 times, getting the corresponding reward at each attempt.

In [329]:
import numpy as np
import sys
sys.path.insert(0, '../src')
from rl_tools import *
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import random
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import pickle
import pandas as pd

init_notebook_mode(connected=True)

In [72]:
from importlib import reload
import rl_tools
reload(rl_tools)
from rl_tools import *

## Random agent

Initialize environment, a random agent and the game.

In [21]:
env = Environment()
random_agent = Agent(0.2,0, random_only=True)
game = Game(n_actions=500)

Play 1000 rounds with 500 attempts each and plot the results.

In [22]:
# Play `n_rounds` episodes with the random agent and gather the value returned
# by each episode into a NumPy array.
n_rounds = 1000

scores = np.array([
    game.play_one_episode(random_agent, env)
    for _ in tqdm(range(n_rounds))
])




In [23]:
# Scatter plot of the score of each episode; episodes are numbered from 1.
episode_numbers = np.arange(len(scores)) + 1
trace = go.Scatter(x=episode_numbers, y=scores, mode='markers')

layout = go.Layout(
    xaxis={'title': 'Episode number'},
    yaxis={'title': 'Score'},
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [25]:
# Print the mean and standard deviation of the scores, each label on its own
# line followed by the value on the next line.
print('Average score:', scores.mean(), sep='\n')
print('Standard deviation:', scores.std(), sep='\n')

Average score:
49.92
Standard deviation:
6.80540961294763


Plot the distribution of the scores (rewards).

In [26]:
np.histogram(scores, bins=20)

(array([  2,   8,   9,  25,  39,  56,  81,  86, 124, 115, 108,  94,  76,
         66,  50,  25,  19,  11,   3,   3]),
 array([31., 33., 35., 37., 39., 41., 43., 45., 47., 49., 51., 53., 55.,
        57., 59., 61., 63., 65., 67., 69., 71.]))

In [476]:
# Histogram of the scores. The binning range is padded by half a standard
# deviation on each side; the bin width is 1/20 of the observed score range.
half_std = scores.std()/2.0
lowest, highest = scores.min(), scores.max()

trace = go.Histogram(
    x=scores,
    xbins={
        'start': lowest - half_std,
        'end': highest + half_std,
        'size': (highest - lowest)/20.0,
    },
)

data = [trace]
fig = go.Figure(data=data)
iplot(fig)

## Neural network agent

In [229]:
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.losses import mean_squared_error

In [439]:
import rl_tools
reload(rl_tools)
from rl_tools import *

In [417]:
# Network: a single scalar input, one hidden layer of 128 ReLU units, and a
# 10-unit output layer with no activation specified.
model = Sequential([
    Dense(128, input_shape=(1,), activation='relu'),
    Dense(10),  # Previously relu
])

model.compile(optimizer='adam', loss=custom_loss)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_49 (Dense)             (None, 128)               256       
_________________________________________________________________
dense_50 (Dense)             (None, 10)                1290      
Total params: 1,546
Trainable params: 1,546
Non-trainable params: 0
_________________________________________________________________


In [192]:
env = Environment()
nn_agent = Agent(0.2, model, random_only=False)
game = Game(n_actions=500)

In [193]:
# Play `n_episodes` episodes with the neural-network agent, then persist the
# model and the array of per-episode results to disk.
n_episodes = 1000

scores = np.array([
    game.play_one_episode(nn_agent, env)
    for _ in tqdm(range(n_episodes))
])

model.save('../models/last_model.h5')

with open('../scores/'+'last_model'+'_scores.pkl', 'wb') as scores_file:
    pickle.dump(scores, scores_file)




Modified:
- No oversampling
- Minimal $\epsilon$ in `play_one_episode` to 0.1

In [488]:
import rl_tools
reload(rl_tools)
from rl_tools import *

In [501]:
# Agent settings, passed as the first argument to `run_game` in the next cell.
# NOTE(review): the exact meaning of each key is defined in rl_tools (not
# shown here); the remarks below are assumptions to be confirmed there.
#   - 'nodes' / 'activation' / 'optimizer': presumably the hidden-layer width,
#     its activation and the Keras optimizer of the agent's network.
#   - 'batch_size': used further down in this notebook as the number of
#     transitions drawn from the agent's memory with `random.sample`.
#   - 'gamma': presumably the discount factor applied to the next-state
#     Q-value when building the training target.
agent_params = {
    'nodes': 128,
    'activation': 'relu',
    'optimizer': 'adam',
    'memory_size': 10000,
    'batch_size': 10000,
    'epochs': 1,
    'gamma': 0.005
}

# Game settings, passed as the second argument to `run_game`.
# NOTE(review): 'eps_min' is presumably the lower bound of the epsilon-greedy
# exploration rate — confirm in rl_tools.
game_params = {
    'n_episodes': 1000,
    'n_actions': 100,
    'inverse_training_freq': 1,
    'eps_min': 0.01
}

In [502]:
scores, last_agent = run_game(agent_params, game_params)

In [503]:
plot_scores(scores)

In [504]:
scores[scores>50].shape

(89,)

In [505]:
describe_scores(scores)

Average score: 10.134
Standard deviation: 25.547838342998805


In [506]:
plot_model_switch(last_agent.model_memory, last_agent.eps_memory)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



Check history for the last agent.

In [326]:
sum(last_agent.model_memory[16000:])/(len(last_agent.model_memory)-16000)

0.9516190476190476

In [328]:
1.0-last_agent.eps

0.9515054747505769

In [334]:
last_total_memory = pd.DataFrame(np.array(last_agent.total_memory))
last_total_memory.columns = ['initial_state', 'action', 'reward', 'final_state']

In [339]:
last_total_memory['reward'].sum()/len(last_total_memory)

0.08681

In [341]:
last_total_memory[last_total_memory['reward']==1]

Unnamed: 0,initial_state,action,reward,final_state
20,5,5,1,5
22,5,5,1,5
27,5,5,1,5
38,5,5,1,5
42,5,5,1,5
44,5,5,1,5
48,5,5,1,5
52,5,5,1,5
69,5,5,1,5
72,5,5,1,5


In [351]:
list(np.array(last_agent.total_memory)[np.array(last_agent.total_memory)[:,2]==1])

[array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5,

In [352]:
random.sample(list(np.array(last_agent.total_memory)[np.array(last_agent.total_memory)[:,2]==1]), 10)

[array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5]),
 array([5, 5, 1, 5])]

In [363]:
batch = np.array(random.sample(last_agent.memory, agent_params['batch_size']))

In [448]:
for sample in batch[:10]:
    print(sample)

[5 3 0 5]
[5 4 0 5]
[5 6 0 5]
[5 7 0 5]
[5 2 0 5]
[5 3 0 5]
[5 1 0 5]
[5 3 0 5]
[5 6 0 5]
[5 7 0 5]


In [451]:
sample[3].reshape(1,1)

array([[5]])

In [384]:
batch[:,2].sum()/len(batch)

0.0627

In [403]:
batch[batch[:,2]==1]

array([[5, 5, 1, 5],
       [5, 5, 1, 5],
       [5, 5, 1, 5],
       ...,
       [5, 5, 1, 5],
       [5, 5, 1, 5],
       [5, 5, 1, 5]])

In [415]:
# Append `oversampling_factor` extra copies of the rewarded transitions
# (third column equal to 1) to the batch, then print the fraction of rewarded
# rows in the enlarged batch.
oversampling_factor = 1

rewarded = batch[batch[:,2]==1]
oversampled_batch = np.concatenate(
    [batch] + [rewarded]*oversampling_factor, axis=0
)

print(oversampled_batch[:,2].sum()/oversampled_batch.shape[0])

0.21109334231125326


Create a new folder under `../special-agents/` corresponding to the new agent, set `model_dir` to the name of that folder (with a trailing `/`), then uncomment and run the cell below to save the last model, run history etc.

In [507]:
# SPECIAL_MODELS_DIR = '../special-agents/'
# model_dir = 'almost_trained_even_smaller_gamma/'

# with open(SPECIAL_MODELS_DIR+model_dir+'agent_params.pkl', 'wb') as f:
#     pickle.dump(agent_params, f)
# with open(SPECIAL_MODELS_DIR+model_dir+'game_params.pkl', 'wb') as f:
#     pickle.dump(game_params, f)
# with open(SPECIAL_MODELS_DIR+model_dir+'scores.pkl', 'wb') as f:
#     pickle.dump(scores, f)
# with open(SPECIAL_MODELS_DIR+model_dir+'memory.pkl', 'wb') as f:
#     pickle.dump(last_agent.memory, f)
# with open(SPECIAL_MODELS_DIR+model_dir+'total_memory.pkl', 'wb') as f:
#     pickle.dump(last_agent.total_memory, f)
# with open(SPECIAL_MODELS_DIR+model_dir+'model_memory.pkl', 'wb') as f:
#     pickle.dump(last_agent.model_memory, f)
# with open(SPECIAL_MODELS_DIR+model_dir+'eps_memory.pkl', 'wb') as f:
#     pickle.dump(last_agent.eps_memory, f)
# with open(SPECIAL_MODELS_DIR+model_dir+'training_memory.pkl', 'wb') as f:
#     pickle.dump(last_agent.training_memory, f)
# last_agent.model.save(SPECIAL_MODELS_DIR+model_dir+'model.h5')

Analyse performance of the last agent without further training.

In [256]:
last_agent_memory = last_agent.memory
last_agent_total_memory = last_agent.total_memory
last_agent_model_memory = last_agent.model_memory
last_agent_eps_memory = last_agent.eps_memory

In [257]:
# Play 100 further episodes with the last agent, passing `training=False`,
# and collect the returned values into a NumPy array.
new_scores = np.array([
    game.play_one_episode(last_agent, env, training=False)
    for _ in tqdm(range(100))
])




In [258]:
plot_scores(new_scores)

### Optimization possible on:
- Memory size
- Batch size
- Number of epochs (passages on the same batch)
- Attempts/episode
- Number of episodes
- Choice of optimizer
- $\epsilon$-greedy strategy parameter ($\epsilon$)
- Architecture of the neural network

## Running a game through a function

In [93]:
from joblib import Parallel, delayed

In [143]:
params = {
    'nodes': 128,
    'activation': 'sigmoid',
    'optimizer': 'sgd'
}

In [144]:
scores = game_run(params)

In [145]:
plot_scores(scores)

In [None]:
args = [params]

# Run two games in parallel threads and keep the list of results in
# `scores_list`, which the following cell inspects. Without the assignment
# the results are only displayed and `scores_list` is never bound.
scores_list = Parallel(n_jobs=2, backend='threading')(
    delayed(game_run)(params) for _ in range(2)
)
scores_list

In [125]:
scores_list

[array([14, 12,  8, 11,  8]), array([13,  6, 11, 10,  6])]

In [126]:
def square(x):
    """Return *x* raised to the second power."""
    return pow(x, 2)

In [146]:
Parallel(n_jobs=2, verbose=100)(delayed(game_run)(params) for i in range(2))

KeyboardInterrupt: 

# Building the training routine

Training: building a training batch and computing the target variables.

In [None]:
# Draw a tenth of the agent's memory (sampled without replacement) as the
# training batch. `nn_agent` is used for consistency with the cells below,
# which query the same agent for Q-values, gamma and the model.
training_batch = np.array(
    random.sample(nn_agent.memory, len(nn_agent.memory)//10)
)
training_batch

In [None]:
training_batch.shape

In [None]:
training_batch[:,3].shape

In [None]:
training_batch[:,3]

Compute the Q-values (each of which is a 10-component array) for each final state in the transitions.

In [None]:
np.argmax(nn_agent.compute_q(training_batch[:,3]), axis=1)

Compute the target variable ($y_i$) for each of the transitions in the training batch.

In [None]:
np.amax(nn_agent.compute_q(training_batch[:,3]), axis=1)+1

In [None]:
# Target for each transition: third column of the batch plus gamma times the
# largest Q-value computed from the fourth column
# (reward + gamma * max_a Q(final_state, a), assuming the column layout
# initial_state, action, reward, final_state used for the memory above).
next_state_q = nn_agent.compute_q(training_batch[:,3])
Y_target = training_batch[:,2] + nn_agent.gamma * np.amax(next_state_q, axis=1)

In [None]:
Y_target.shape

Compute the "prediction" from the model, based on the initial state and action taken. This implies computing the Q-value (array) associated with the initial state and then selecting the component of each array according to which action the agent performed in each transition.

In [None]:
# For every transition, pick the Q-value of the action that was actually
# taken: row i of the Q-matrix, column given by the action of transition i.
# `np.take` without `axis` indexes the *flattened* matrix, so with action
# indices below the row length it would read every value from the first row
# instead of from each transition's own row.
# Assumes the action column holds 0-based integer indices into the Q-value
# array — TODO confirm against rl_tools.
q_initial = np.asarray(nn_agent.compute_q(training_batch[:,0]))
Y_pred = q_initial[np.arange(len(training_batch)), training_batch[:,1]]

In [None]:
Y_pred.shape

Optimization is done on the MSE between Y_pred and Y_target.

In [None]:
# NOTE(review): assuming `nn_agent.model` is a Keras model, `fit` takes
# (inputs, targets) in that order. Here the targets `Y_target` are passed as
# the inputs and the model's own predictions `Y_pred` as the targets.
# Presumably the inputs should be the initial states `training_batch[:,0]`
# and the targets should be built from `Y_target` — confirm against the loss
# the model was compiled with before relying on this cell.
nn_agent.model.fit(Y_target, Y_pred, epochs=1)