In [1]:
import pandas as pd

## Data Stream

In this section we will work on a much bigger data set, of 25M ratings. This data set can be found on https://grouplens.org/datasets/movielens/.

Note that in this data set, there are no features on the users, meaning that **we cannot apply the Disjoint Linear UCB algorithm there**.

In [2]:
data_stream = pd.read_csv('../data/ml-25m/ratings.csv')
data_stream

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [3]:
# We rename the columns userId and movieId to adapt them to our existing functions
data = data_stream.rename(columns={"userId": "user_id", "movieId": 'movie_id'}).copy()

# Same thing, we only keep the movies that have more than 20000 reviews
# We chose 20000 because it reduces the number of different movies to 202, where for a bound of 1000,
# there are 3790 different movies and for a bound of 10000, there is still 588.
# Reducing to 202 movies greatly improve our performences as the number of arms gets lower, while 
# still keeping more that 6.5 millions observations which is 25 times more than with our small database of work
movie_counts = data['movie_id'].value_counts()
selected_movies = movie_counts[movie_counts > 20000].index
data = data[data['movie_id'].isin(selected_movies)].reset_index(drop=True)

# We resample
data = data.sample(frac=1).reset_index(drop=True)

# Add time columns
data['time'] = data.index

# Add the binary rating
data['binary_rating'] = data['rating'].apply(lambda x: 1 if x == 5 else 0)

# We reorder the columns and get rid of timestamp
df = data[['time', 'movie_id', 'user_id', 'rating', 'binary_rating']].copy()

# We compute our new rating system to keep it between 0 and 1 included
df['exp_rating'] = (df['rating']-1)/4

df

Unnamed: 0,time,movie_id,user_id,rating,binary_rating,exp_rating
0,0,2571,117346,4.0,0,0.750
1,1,1196,23499,5.0,1,1.000
2,2,2987,100963,3.0,0,0.500
3,3,2,10060,4.0,0,0.750
4,4,344,155856,5.0,1,1.000
...,...,...,...,...,...,...
6589710,6589710,1,100548,4.0,0,0.750
6589711,6589711,2959,11782,4.5,0,0.875
6589712,6589712,919,41781,4.0,0,0.750
6589713,6589713,778,101863,5.0,1,1.000


In [4]:
df['binary_rating'].sum()/len(df) # We can check the proportion of ratings 5

0.2197729340343247

In [5]:
df.to_csv('../Data/ml-25m/big_dataset_20000_minimum_reviews.csv')