In [1]:
import pandas as pd
import numpy as np

In [34]:
# Load the ratings dataset; the CSV's first column ('Unnamed: 0') holds the
# saved row index, so it is used as the DataFrame index.
dataset_path = '../Data/dataset_1000_minimum_reviews.csv'
data = pd.read_csv(dataset_path, index_col='Unnamed: 0')
data

Unnamed: 0,user_id,movie_id,rating,genre,gender,age,occupation,time,binary_rating
0,3669,1197,4,Action|Adventure|Comedy|Romance,M,25,14,0,0
1,3621,780,2,Action|Sci-Fi|War,M,18,4,1,0
2,4917,1234,3,Comedy|Crime,M,45,0,2,0
3,1147,1544,2,Action|Adventure|Sci-Fi|Thriller,M,25,20,3,0
4,262,3175,5,Adventure|Comedy|Sci-Fi,F,25,1,4,1
...,...,...,...,...,...,...,...,...,...
284267,2240,1544,5,Action|Adventure|Sci-Fi|Thriller,M,45,12,284267,1
284268,5550,1089,4,Crime|Thriller,M,35,15,284268,0
284269,5988,1197,4,Action|Adventure|Comedy|Romance,M,25,15,284269,0
284270,3764,923,5,Drama,M,25,1,284270,1


### User features Data

In [48]:
# Build the user feature table: one row per row of `data`, with gender and
# occupation one-hot encoded as 0/1 integer columns.
user_features = data[['user_id', 'gender', 'age', 'occupation']]

# One-hot encode the occupation code. Column names ('occup_<code>') are derived
# from the codes actually present in the data instead of a hard-coded
# range(21), so a missing or non-contiguous code can neither raise a
# length-mismatch error nor silently mislabel a column.
dummies_occup = pd.get_dummies(user_features['occupation'], prefix='occup', dtype=int)

# Kept as a standalone name because other cells may rely on it.
occupations_columns = list(dummies_occup.columns)

# One-hot encode the gender; columns are named after the raw values (e.g. 'F', 'M').
dummies_genre = pd.get_dummies(user_features['gender'], dtype=int)

# Replace the raw categorical columns by their one-hot encodings.
user_features = pd.concat([user_features.drop(['gender','occupation'], axis = 1),
                           dummies_genre, dummies_occup, ], axis=1)


Unnamed: 0,user_id,age,F,M,occup_0,occup_1,occup_2,occup_3,occup_4,occup_5,...,occup_11,occup_12,occup_13,occup_14,occup_15,occup_16,occup_17,occup_18,occup_19,occup_20
0,3669,25,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,3621,18,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,4917,45,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1147,25,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,262,25,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284267,2240,45,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
284268,5550,35,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
284269,5988,25,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
284270,3764,25,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# We first get the columns necessary to compute our vector x of the data of users
user_features = data[['user_id', 'gender', 'age', 'occupation']]

# We create a dataframe where the different occupations are the columns and each row is a user
# In this dataframe, a cell returns the bool True if an user as the occupation_i and False else
dummies_occup = pd.get_dummies(user_features['occupation'])
# We replace the bools by 1s and 0s
dummies_occup = dummies_occup.astype(int)

# Same for the genders
occupations_columns = [f'occup_{i}' for i in range(21)]
dummies_occup.columns = occupations_columns

dummies_genre = pd.get_dummies(user_features['gender'])
dummies_genre = dummies_genre.astype(int)

# We drop the previous columns genders and occupation and concat our new dataframe
user_features = pd.concat([user_features.drop(['gender','occupation'], axis = 1),
                           dummies_genre, dummies_occup, ], axis=1)

# We center and reduce 'age'
user_features['age'] = (user_features['age']- user_features['age'].mean()) / user_features['age']

# Here we can indeed see that new columns were defined for the 'gender_F' and 'gender_M' as well as the 20 occupations
user_features.head() # That's the collection of our 'x' vectors for all 'time'

Unnamed: 0,user_id,age,gender_F,gender_M,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,...,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20
0,3669,-0.208675,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3621,-0.678715,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4917,0.328514,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1147,-0.208675,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,262,-0.208675,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Stream of Data

In [31]:
# As usual, we only keep our features of interess
df = data[['time', 'movie_id', 'user_id', 'binary_rating']]
df.head()

Unnamed: 0,time,movie_id,user_id,binary_rating
0,0,1197,3669,0
1,1,780,3621,0
2,2,1234,4917,0
3,3,1544,1147,0
4,4,3175,262,1


In [None]:
def policy_evaluator_disjoint_lin_ucb(dataframe, alpha):
    """Placeholder: the body visible here is only `...`, so calling it returns None.

    NOTE(review): judging by the name, this is presumably meant to evaluate a
    disjoint LinUCB policy on a stream of interactions, with `alpha` as the
    exploration parameter -- TODO confirm once implemented.

    Parameters
    ----------
    dataframe :
        Not used yet. Presumably the interaction stream (time, movie_id,
        user_id, binary_rating) -- verify against the caller.
    alpha :
        Not used yet. Presumably the LinUCB exploration coefficient -- TODO confirm.
    """
    
    ...