# Requisites

In [None]:
%load_ext autoreload
%autoreload 2

! git clone https://github.com/gaudel/recommender_system.git
! mkdir /content/recommender_system/bandits/data
import os
os.chdir("./recommender_system/bandits")

In [None]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

# An Arm

In [None]:
# Build one arm, configured with mean=0.2
from arm import MyBeta
arm = MyBeta(mean=0.2)

# sample the arm three times and show each value
for _ in range(3):
    print(arm.draw())

# expected value reported by the arm itself
print(arm.mean())

# collect `n_samples` draws, to be compared with the expected value
n_samples = 1000
rewards = np.empty(n_samples)  # every slot is overwritten by the loop below
for k in range(n_samples):
    rewards[k] = arm.draw()

# XXX TO DO XXX       print empirical mean

# Bandit Setting: One Game (Played by a Human)

## Environment

In [None]:
# The environment is a list of three arms, built with means 0.8, 0.2 and 0.4
environment = [MyBeta(mean=m) for m in (0.8, 0.2, 0.4)]

# report the expected value of every arm, one line per arm
for i, arm in enumerate(environment):
    print(f"expected value for arm {i!s}: {arm.mean()!s}")

## Game

In [None]:
# number of rounds played by the human
n_iter = 5

# play a game: at each round, ask which arm to play and display the reward
for t in range(n_iter):
    print("===")
    print("iteration ", t)
    # the typed answer is converted to an integer arm index
    i_arm = int(input("Which arm do you want to play? "))
    reward = 0 # XXX TO DO XXX       draw the chosen arm and store the result in reward
    print("Reward:", reward)

## Cumulative Gain

In [None]:
# number of rounds played by the human
n_iter = 5

# one slot per round, meant to hold the reward obtained at that round
rewards = np.zeros(n_iter)

# play a game
for t in range(n_iter):
    print("===")
    print("iteration ", t)
    # the typed answer is converted to an integer and used to index `environment`
    i_arm = int(input("Which arm do you want to play? "))
    reward = environment[i_arm].draw()
    # XXX TO DO XXX       store the reward in `rewards`
    print("Reward:", reward)


# XXX TO DO XXX       print total reward at time `n_iter`


# XXX TO DO XXX       print total reward at each timestep

# Cumulative Regret

In [None]:
# number of rounds played by the human
n_iter = 5

# expected reward of the arm played at each round
expected_rewards = np.zeros(n_iter)

# expected reward of the best arm at each round
best_expected_rewards = np.zeros(n_iter)

# The best expected reward only depends on the environment, so it is computed
# once here rather than at every round.
# NOTE(review): assumes `arm.mean()` returns a fixed value per arm -- confirm.
best_mean = max(arm.mean() for arm in environment)


# play a game
for t in range(n_iter):
    print("===")
    print("iteration ", t)
    # the typed answer is converted to an integer and used to index `environment`
    i_arm = int(input("Which arm do you want to play? "))
    reward = environment[i_arm].draw()
    print("Reward:", reward)
    # to compute the regret
    expected_rewards[t] = environment[i_arm].mean()
    best_expected_rewards[t] = best_mean


# XXX TO DO XXX       print instantaneous regret


# XXX TO DO XXX       print cumulative regret

## Let's Plot!

In [None]:
# Cumulative regret of the human player: running sum of the per-round gap
# between the best expected reward and the expected reward of the played arm.
plt.plot(np.cumsum(best_expected_rewards - expected_rewards), "--", label='Human Intelligence')
plt.grid(True)
plt.ylabel('Cumulative Regret')
plt.xlabel('Time')
plt.legend()
plt.show()

# AI Player

## Choose an Arm

In [None]:
from player import EpsilonNGreedy

# a fresh player over the three arms of the environment, with parameter c=5
player = EpsilonNGreedy(nb_arms=3, c=5)

# ask the player for 30 successive choices, before it has received any reward
for chosen in (player.choose_next_arm() for _ in range(30)):
    print("Chosen arm:", chosen)


## Learn

In [None]:
n_rep = 10

# feed the player `n_rep` rewards drawn from arm 2
i_arm = 2
played_arm = environment[i_arm]
for _rep in range(n_rep):
    reward = played_arm.draw()
    player.update(i_arm, reward)


# XXX TO DO XXX       play arm 0 and arm 1 `n_rep` times



# Which arms does the player choose now?
for chosen in (player.choose_next_arm() for _ in range(30)):
    print("Chosen arm:", chosen)

# Let's Play!

In [None]:
# number of rounds played by the artificial player
n_iter = 1000

# !!! Do not cheat: restart the player !!!
player.restart()


# expected reward of the played arm / of the best arm, one slot per round
expected_rewards = np.zeros(n_iter)
best_expected_rewards = np.zeros(n_iter)

# The best expected reward only depends on the environment, so it is computed
# once here rather than at each of the `n_iter` rounds.
# NOTE(review): assumes `arm.mean()` returns a fixed value per arm -- confirm.
best_mean = max(arm.mean() for arm in environment)


# play a game
for t in range(n_iter):
    i_arm = 0 # XXX TO DO XXX       let the artificial player choose the arm
    reward = environment[i_arm].draw()
    # XXX TO DO XXX       tell the artificial player which reward was obtained
    # to compute the regret
    expected_rewards[t] = environment[i_arm].mean()
    best_expected_rewards[t] = best_mean


# plot the results
plt.plot((best_expected_rewards - expected_rewards).cumsum(), "--", label='EG c=5')
plt.xlabel('Time')
plt.ylabel('Cumulative Regret')
plt.legend()
plt.grid(True)
plt.show()

# Multiple Games

In [None]:
# number of rounds per game, and number of independent games
n_iter = 1000
n_games = 5

# one column per game, one row per round
expected_rewards = np.zeros((n_iter, n_games))
best_expected_rewards = np.zeros((n_iter, n_games))

# The best expected reward only depends on the environment, so it is computed
# once here rather than inside the two nested loops.
# NOTE(review): assumes `arm.mean()` returns a fixed value per arm -- confirm.
best_mean = max(arm.mean() for arm in environment)


for i_game in range(n_games):
    # !!! Do not cheat: restart the player !!!
    player.restart()

    # play a game
    for t in range(n_iter):
        i_arm = player.choose_next_arm()
        reward = environment[i_arm].draw()
        player.update(i_arm, reward)
        # to compute the regret
        expected_rewards[t, i_game] = environment[i_arm].mean()
        best_expected_rewards[t, i_game] = best_mean


# plot the results
# per-round regret of every game, computed once and reused by all the curves
regret = best_expected_rewards - expected_rewards
for i_game in range(n_games):
    plt.plot(regret[:, i_game].cumsum(), "--", label=f"game {i_game}")
# thick black curve: cumulative sum of the regret averaged over the games
plt.plot(regret.mean(axis=1).cumsum(), "-", label='average', lw=3, color='black')
plt.xlabel('Time')
plt.ylabel('Cumulative Regret')
plt.legend()
plt.grid(True)
plt.show()

# Let `play_games.py` do the Job 

In [None]:
# run
%run -t play_games.py 200 10 --Random --Ber 0.4 0.2 0.8
%run -t play_games.py 200 10 --Oracle --Ber 0.4 0.2 0.8
%run -t play_games.py 200 10 --EtC 20 --Ber 0.4 0.2 0.8
%run -t play_games.py 200 10 --eGreedy 1 --Ber 0.4 0.2 0.8
# XXX TO DO XXX       run also with c=10 and c=100
%run -t play_games.py 200 10 --TS --Ber 0.4 0.2 0.8

# load
from tools import retrieve_data_from_zip
logs = []
logs.extend(retrieve_data_from_zip("data/Ber0.4_0.2_0.8__Random__nb_trials_200__nb_games_10.gz"))
logs.extend(retrieve_data_from_zip("data/Ber0.4_0.2_0.8__Oracle__nb_trials_200__nb_games_10.gz"))
logs.extend(retrieve_data_from_zip("data/Ber0.4_0.2_0.8__EtC__m_20__nb_trials_200__nb_games_10.gz"))
logs.extend(retrieve_data_from_zip("data/Ber0.4_0.2_0.8__eGreedy__c_1__nb_trials_200__nb_games_10.gz"))
# XXX TO DO XXX       plot also with c=10 and c=100
logs.extend(retrieve_data_from_zip("data/Ber0.4_0.2_0.8__TS__nb_trials_200__nb_games_10.gz"))

# plot
from exp import plot_exp
plot_exp(logs)