# Experiments on Reinforcement Learning

## The game

Let's imagine a game in which the player has to choose one out of ten boxes. Every game one of the boxes lights up and the player gets one point if he or she chooses that box and zero points otherwise. The player can make 10 choices for each game.

In practice, the environment $\mathcal{E}$ is given by a random number generator that extracts a random number from 1 to 10 according to some probability distribution. The agent choose one box for 10 times, getting the corresponding reward at each attempt.

In [593]:
import numpy as np
import sys
sys.path.insert(0, '../src')
from rl_tools import *
from pprint import pprint
import random
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

In [575]:
from importlib import reload
import rl_tools
reload(rl_tools)
from rl_tools import *

## Random agent

Initialize environment, a random agent and the game.

In [537]:
env = Environment()
agent = Agent(0)
game = Game()

Play 500 rounds with 10 attempts each and plot the results.

In [538]:
n_rounds = 500
scores = []

for _ in range(n_rounds):
    scores.append(game.play_one_round(agent, env, 1.0))
    
scores = np.array(scores)

In [539]:
trace = go.Scatter(
    x = np.arange(1, len(scores)+1),
    y = scores,
    mode='markers'
)

layout = go.Layout(
    xaxis = dict(
        title='Episode number'
    ),
    yaxis = dict(
        title='Score'
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [540]:
print('Average score:')
print(scores.mean())
print('Standard deviation of the mean:')
print(scores.std()/np.sqrt(len(scores)))

Average score:
1.012
Standard deviation of the mean:
0.04137284133341581


## Neural network agent

In [576]:
from keras.models import Sequential
from keras.layers import Dense

In [596]:
model = Sequential()
model.add(Dense(32, input_shape=(1,), activation='relu'))
model.add(Dense(10, activation='relu'))

model.compile(
    optimizer='sgd',
    loss='mse'
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 32)                64        
_________________________________________________________________
dense_14 (Dense)             (None, 10)                330       
Total params: 394
Trainable params: 394
Non-trainable params: 0
_________________________________________________________________


In [597]:
env = Environment()
nn_agent = Agent(model, random_only=False)
game = Game()

In [598]:
n_rounds = 500
scores = []

for _ in range(n_rounds):
    scores.append(game.play_one_round(agent, env, 0.0))
    
scores = np.array(scores)

In [599]:
trace = go.Scatter(
    x = np.arange(1, len(scores)+1),
    y = scores,
    mode='markers'
)

layout = go.Layout(
    xaxis = dict(
        title='Episode number'
    ),
    yaxis = dict(
        title='Score'
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [600]:
print('Average score:')
print(scores.mean())
print('Standard deviation of the mean:')
print(scores.std()/np.sqrt(len(scores)))

Average score:
0.986
Standard deviation of the mean:
0.04385895575592287


In [605]:
random.sample(agent.memory,int(len(agent.memory)/10))

[(5, 10, 0, 8),
 (8, 9, 0, 5),
 (7, 5, 0, 10),
 (9, 10, 0, 3),
 (10, 7, 0, 9),
 (2, 4, 0, 10),
 (1, 4, 0, 5),
 (9, 1, 0, 7),
 (1, 4, 0, 10),
 (7, 4, 0, 8),
 (8, 3, 0, 10),
 (5, 5, 1, 1),
 (3, 4, 0, 1),
 (8, 6, 0, 6),
 (9, 7, 0, 3),
 (10, 5, 0, 4),
 (6, 10, 0, 1),
 (9, 5, 0, 1),
 (6, 3, 0, 8),
 (3, 1, 0, 3),
 (7, 1, 0, 8),
 (10, 1, 0, 6),
 (2, 9, 0, 4),
 (7, 1, 0, 5),
 (10, 1, 0, 4),
 (10, 2, 0, 8),
 (4, 6, 0, 8),
 (2, 2, 1, 9),
 (10, 8, 0, 9),
 (8, 9, 0, 4),
 (6, 9, 0, 9),
 (3, 8, 0, 6),
 (5, 5, 1, 9),
 (3, 3, 1, 1),
 (8, 9, 0, 6),
 (9, 2, 0, 9),
 (10, 9, 0, 5),
 (1, 9, 0, 10),
 (10, 6, 0, 6),
 (2, 10, 0, 4),
 (9, 3, 0, 3),
 (7, 4, 0, 1),
 (9, 10, 0, 4),
 (10, 10, 1, 4),
 (9, 10, 0, 7),
 (10, 10, 1, 3),
 (2, 6, 0, 1),
 (10, 8, 0, 1),
 (6, 5, 0, 9),
 (6, 8, 0, 10),
 (2, 10, 0, 5),
 (4, 4, 1, 6),
 (9, 1, 0, 9),
 (6, 8, 0, 6),
 (6, 10, 0, 9),
 (10, 8, 0, 1),
 (2, 3, 0, 5),
 (3, 5, 0, 5),
 (10, 1, 0, 7),
 (3, 4, 0, 5),
 (3, 10, 0, 10),
 (2, 6, 0, 4),
 (8, 9, 0, 10),
 (10, 2, 0, 8),
 (3, 8,