# Experiments on Reinforcement Learning

## The game

Let's imagine a game in which the player has to choose one out of ten boxes. Every game one of the boxes lights up and the player gets one point if he or she chooses that box and zero points otherwise. The player can make 10 choices for each game.

In practice, the environment $\mathcal{E}$ is given by a random number generator that draws a random number from 1 to 10 according to some probability distribution. The agent chooses one box 10 times, getting the corresponding reward at each attempt.

In [308]:
import numpy as np
import sys
sys.path.insert(0, '../src')
from rl_tools import *
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import random
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# Set up plotly for inline rendering in this notebook (connected=True:
# plotly.js is loaded from the CDN instead of being embedded in the notebook).
init_notebook_mode(connected=True)

In [302]:
# Re-execute the rl_tools module (presumably after editing its source) so the
# notebook sees the new definitions without a kernel restart.
from importlib import reload
import rl_tools
reload(rl_tools)
# The star import has to be repeated: names bound by the earlier
# `from rl_tools import *` still refer to the pre-reload objects.
from rl_tools import *

## Random agent

Initialize environment, a random agent and the game.

In [223]:
# Build the three pieces of the experiment (all three classes come from
# rl_tools).
env = Environment()
# NOTE(review): the second positional argument is where the neural-network cell
# passes its Keras model, so 0 looks like a placeholder for the random agent.
# The meaning of 0.2 is not visible in this notebook — confirm in rl_tools.
agent = Agent(0.2,0)
game = Game()

Play 500 rounds with 10 attempts each and plot the results.

In [283]:
# Play a fixed number of episodes with the random agent and keep the value
# returned by each episode as its score.
# NOTE(review): the third argument (1.0 here, 0.0 for the network agent) looks
# like an exploration probability — confirm against rl_tools.
n_rounds = 500
scores = np.array([
    game.play_one_episode(agent, env, 1.0)
    for _ in range(n_rounds)
])

In [284]:
# Scatter plot of the per-episode score against the 1-based episode number.
trace = go.Scatter(x=np.arange(1, len(scores) + 1), y=scores, mode='markers')

layout = go.Layout(
    xaxis={'title': 'Episode number'},
    yaxis={'title': 'Score'},
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [285]:
# Summary statistics of the per-episode scores.
# scores.std() is the standard deviation of the scores themselves, not the
# standard error of the mean (which would be std / sqrt(n)), so the label
# says exactly that.
print('Average score:')
print(scores.mean())
print('Standard deviation:')
print(scores.std())

Average score:
1.01
Standard deviation of the mean:
0.9065870063044142


## Neural network agent

In [287]:
from keras.models import Sequential
from keras.layers import Dense
from keras.losses import mean_squared_error

In [294]:
def custom_loss(Y_target, Y_pred):
    """Loss passed to ``model.compile``: Keras' mean squared error.

    Thin wrapper that forwards both arguments unchanged to
    ``mean_squared_error`` and returns its result.
    """
    loss = mean_squared_error(Y_target, Y_pred)
    return loss

In [303]:
# Q-network: one scalar input (the state) and a 10-component output, one
# Q-value estimate per box.
model = Sequential()
model.add(Dense(32, input_shape=(1,), activation='relu'))
# Linear output: Q-values are regression targets. A ReLU on the output layer
# clamps estimates at zero and passes no gradient through any unit whose
# pre-activation goes negative, so that action could stop learning.
model.add(Dense(10, activation='linear'))

model.compile(
    optimizer='adam',
    loss=custom_loss
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_47 (Dense)             (None, 32)                64        
_________________________________________________________________
dense_48 (Dense)             (None, 10)                330       
Total params: 394
Trainable params: 394
Non-trainable params: 0
_________________________________________________________________


In [304]:
# Fresh environment and game for the network-driven agent.
env = Environment()
# The agent receives the compiled Keras model; random_only=False presumably
# makes it use the model's Q-values to pick actions — confirm in rl_tools.
nn_agent = Agent(0.2, model, random_only=False)
game = Game()

In [309]:
# Play a fixed number of episodes with the network agent, showing a progress
# bar, and keep the value returned by each episode as its score.
# NOTE(review): the third argument (0.0 here, 1.0 for the random agent) looks
# like an exploration probability — confirm against rl_tools.
n_rounds = 500
scores = np.array([
    game.play_one_episode(nn_agent, env, 0.0)
    for _ in tqdm(range(n_rounds))
])




Exception ignored in: <bound method tqdm.__del__ of  54%|█████▎    | 268/500 [52:02<45:03, 11.65s/it]>
Traceback (most recent call last):
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 882, in __del__
    self.close()
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 1087, in close
    self._decr_instances(self)
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 452, in _decr_instances
    cls.monitor.exit()
  File "/opt/anaconda/envs/Python3/lib/python3.6/site-packages/tqdm/_monitor.py", line 50, in exit
    self.join()
  File "/opt/anaconda/envs/Python3/lib/python3.6/threading.py", line 1053, in join
    raise RuntimeError("cannot join current thread")
RuntimeError: cannot join current thread


In [310]:
# Scatter plot of the per-episode score against the 1-based episode number.
trace = go.Scatter(x=np.arange(1, len(scores) + 1), y=scores, mode='markers')

layout = go.Layout(
    xaxis={'title': 'Episode number'},
    yaxis={'title': 'Score'},
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [312]:
# Report the mean score over the late episodes (index 350 onwards) and the
# standard deviation of the scores.
# NOTE(review): the mean uses scores[350:] while the standard deviation uses
# every episode — confirm that mixing the two windows is intended.
print('Average score:', scores[350:].mean(), sep='\n')
print('Standard deviation:', scores.std(), sep='\n')

Average score:
0.68
Standard deviation:
0.8544565524355232


## Building the training routine

Training: building a training batch and computing the target variables.

In [192]:
# Draw one tenth of the agent's stored transitions at random (without
# replacement) and stack them into an array, one transition per row.
training_batch = np.array(
    random.sample(agent.memory, len(agent.memory) // 10)
)
training_batch

array([[10,  1,  0,  6],
       [ 8,  2,  0,  6],
       [ 1,  5,  0,  5],
       ...,
       [ 8,  5,  0,  9],
       [ 4, 10,  0,  1],
       [ 4,  4,  1,  8]])

In [32]:
# Batch dimensions: (sampled transitions, values stored per transition).
training_batch.shape

(1000, 4)

In [60]:
# Column 3 is the one later fed to compute_q as the transition's final state;
# slicing it out gives a 1-D array with one entry per transition.
training_batch[:,3].shape

(1000,)

In [62]:
# Inspect the raw column-3 (final state) values of the batch.
training_batch[:,3]

array([ 5,  6,  9,  9,  6,  2,  5,  6,  4,  8,  7,  9,  1,  9,  6,  8,  5,
        3, 10,  4,  8,  4,  8,  3,  5,  6,  4,  7,  9,  8,  9,  4,  1, 10,
        2,  1,  5,  1, 10,  1,  8,  9,  4,  4, 10, 10,  5,  8,  6,  1,  8,
        9,  7,  3, 10,  7, 10,  7,  1,  4,  9,  2,  2,  5,  8,  2,  4,  9,
        5,  7,  4, 10,  3,  1,  8,  6,  8,  6,  7,  3,  8,  4,  7,  8,  5,
        6,  7,  5,  8,  2,  9,  4,  3,  1,  7,  2,  5,  5,  5,  7,  2,  4,
        2,  6,  1,  6,  3,  9,  9,  1,  8,  9,  8,  6,  6,  5,  7,  9, 10,
        1,  4,  7,  6,  6,  9, 10,  7,  2, 10,  3,  2,  7,  6,  4,  3,  9,
        3,  1,  7,  6,  9,  1,  3,  2,  8,  2,  1,  4,  8,  7,  1,  2,  8,
        2,  7,  5,  4,  2,  8,  1,  7,  8,  9,  7,  2,  4,  4,  8,  2,  7,
        6,  9,  8,  1,  4,  5,  3,  7,  8,  3,  1,  7,  4,  8,  4, 10,  4,
        5,  9, 10,  4,  7,  2,  9,  7,  5,  1,  6,  6,  2,  2,  8,  9, 10,
        2,  8,  3,  5, 10,  1,  9,  5,  3,  8,  1,  2,  2,  4,  3,  3, 10,
       10,  8,  2, 10,  8

Compute the Q-values (each of which is a 10-component array) for each final state in the transitions.

In [69]:
# For every transition, the position of the largest component in the Q array
# that compute_q returns for the final state (argmax along axis 1).
np.argmax(nn_agent.compute_q(training_batch[:,3]), axis=1)

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

Compute the target variable ($y_i$) for each of the transitions in the training batch.

In [267]:
# Largest Q-value of each final state, plus a constant 1.
# NOTE(review): the `+1` looks like a stand-in for the reward term; the target
# built in the following cell uses training_batch[:,2] and nn_agent.gamma
# instead — confirm this cell is only exploratory.
np.amax(nn_agent.compute_q(training_batch[:,3]), axis=1)+1

array([7415410.], dtype=float32)

In [268]:
# Target for every transition: column 2 of the batch (the reward) plus gamma
# times the largest Q-value of the final state (column 3).
rewards = training_batch[:, 2]
best_final_q = np.amax(nn_agent.compute_q(training_batch[:, 3]), axis=1)
Y_target = rewards + nn_agent.gamma * best_final_q

In [89]:
# One target value per transition in the batch.
Y_target.shape

(1000,)

Compute the "prediction" from the model, based on the initial state and action taken. This implies computing the Q-value (array) associated with the initial state and then selecting the component of each array according to which action the agent performed in each transition.

In [98]:
# Q-value the network currently assigns to the action actually taken in each
# transition: row i of the Q array, column given by that row's action.
# np.take without `axis` indexes the *flattened* array, so it would read only
# the first few entries of the whole Q array instead of one entry per row.
q_initial = np.asarray(nn_agent.compute_q(training_batch[:, 0]))
# Stored actions are box numbers 1..10 while the network has outputs 0..9.
# NOTE(review): assumes box k maps to output k-1 — confirm against rl_tools.
action_idx = training_batch[:, 1].astype(int) - 1
Y_pred = q_initial[np.arange(len(action_idx)), action_idx]

In [99]:
# One predicted Q-value per transition, matching Y_target's length.
Y_pred.shape

(1000,)

Optimization is done on the MSE between Y_pred and Y_target.

In [138]:
# Keras fit() takes (inputs, targets). The network's input is the initial
# state and its output is the 10-component Q array, so the regression target
# is the current Q array with the taken action's entry replaced by Y_target.
# The untouched entries contribute zero error, so the MSE only pushes the
# Q-value of the action that was actually taken towards its target.
states = training_batch[:, 0]
# Stored actions are box numbers 1..10 while the network has outputs 0..9.
# NOTE(review): assumes box k maps to output k-1 — confirm against rl_tools.
action_idx = training_batch[:, 1].astype(int) - 1
# Copy so the array returned by compute_q is never modified in place.
q_fit_target = np.array(nn_agent.compute_q(states), copy=True)
q_fit_target[np.arange(len(action_idx)), action_idx] = Y_target
# The model was declared with input_shape=(1,), hence the explicit column shape.
nn_agent.model.fit(states.reshape(-1, 1), q_fit_target, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x7fbb103b5940>