In [343]:
import pandas as pd
import numpy as np

## First, we read the datasets

In [344]:
# The three raw tables use '::' as a field separator; a multi-character
# separator is handled by the python parsing engine. Column names are supplied
# explicitly through `names`.
users = pd.read_csv('../Data/users.dat', sep='::', engine='python', header=None,
                    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'])

ratings = pd.read_csv('../Data/ratings.dat', sep='::', engine='python', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'])

# The movies file is decoded as latin1.
movies = pd.read_csv('../Data/movies.dat', sep='::', engine='python', encoding='latin1',
                     names=['movie_id','title','genre'])

In [345]:
# Preview the first rows of the users table
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [346]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [347]:
# Preview the first rows of the movies table
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [348]:
# Attach the movie metadata to every rating (join on movie_id),
# then order the rows by user.
df = ratings.merge(movies, on='movie_id').sort_values('user_id')

In [349]:
# Attach the user attributes to every rating (join on user_id)
df = df.merge(users, on='user_id')

In [350]:
# Display the merged ratings / movies / users frame
df

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,gender,age,occupation,zip_code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,48,5,978824351,Pocahontas (1995),Animation|Children's|Musical|Romance,F,1,10,48067
2,1,938,4,978301752,Gigi (1958),Musical,F,1,10,48067
3,1,1207,4,978300719,To Kill a Mockingbird (1962),Drama,F,1,10,48067
4,1,1721,4,978300055,Titanic (1997),Drama|Romance,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,2641,2,956716343,Superman II (1980),Action|Adventure|Sci-Fi,M,25,6,11106
1000205,6040,1947,4,997454036,West Side Story (1961),Musical|Romance,M,25,6,11106
1000206,6040,904,4,956716845,Rear Window (1954),Mystery|Thriller,M,25,6,11106
1000207,6040,2664,4,957717463,Invasion of the Body Snatchers (1956),Horror|Sci-Fi,M,25,6,11106


We remove columns that will not be useful for us

In [351]:
# Discard the columns that are not used in the rest of the notebook
df = df.drop(columns=['timestamp', 'title', 'genre', 'zip_code', 'occupation'])
df

Unnamed: 0,user_id,movie_id,rating,gender,age
0,1,1193,5,F,1
1,1,48,5,F,1
2,1,938,4,F,1
3,1,1207,4,F,1
4,1,1721,4,F,1
...,...,...,...,...,...
1000204,6040,2641,2,M,25
1000205,6040,1947,4,M,25
1000206,6040,904,4,M,25
1000207,6040,2664,4,M,25


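## A few users have an age value of 1; we remove them because it is probably a mistake (caveat to confirm: if this is the MovieLens 1M dataset, its README defines age as a bucket code where 1 means "Under 18", so this filter actually drops the under-18 users)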

In [352]:
# Keep only the rows whose age value is strictly greater than 1 and
# renumber the index from 0.
# NOTE(review): age may be a bucket code rather than a literal age — confirm
# against the dataset documentation before relying on this filter.
is_age_above_one = df['age'].gt(1)
df = df.loc[is_age_above_one].reset_index(drop=True)

In [353]:
# Display the frame after removing the rows with age <= 1
df

Unnamed: 0,user_id,movie_id,rating,gender,age
0,2,235,3,M,56
1,2,2278,3,M,56
2,2,647,3,M,56
3,2,3147,5,M,56
4,2,1955,4,M,56
...,...,...,...,...,...
972993,6040,2641,2,M,25
972994,6040,1947,4,M,25
972995,6040,904,4,M,25
972996,6040,2664,4,M,25


In [354]:
# Number of ratings per movie_id, most-rated movies first
df['movie_id'].value_counts()

movie_id
2858    3336
1196    2898
260     2890
1210    2783
480     2583
        ... 
3277       1
706        1
3458       1
2226       1
2909       1
Name: count, Length: 3702, dtype: int64

There are 3702 different movies rated, but some movies have only 1 rating. We therefore keep only the movies with more than M ratings (the filter below is strict: a movie with exactly M ratings is dropped).

In [355]:
def keep_M_movies(dataframe, M):
    """Return the rows of `dataframe` whose movie has strictly more than M rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a 'movie_id' column, one row per rating.
    M : int
        Threshold on the number of rows per movie. The comparison is strict:
        a movie with exactly M rows is dropped.

    Returns
    -------
    pandas.DataFrame
        The filtered rows in their original order, with a fresh 0..n-1 index.
    """
    # How many rows each movie_id has
    ratings_per_movie = dataframe['movie_id'].value_counts()

    # Identifiers of the movies that exceed the threshold
    frequent_ids = ratings_per_movie.index[(ratings_per_movie > M).to_numpy()]

    # Boolean mask over the input rows, then renumber the surviving rows
    is_frequent = dataframe['movie_id'].isin(frequent_ids)
    return dataframe.loc[is_frequent].reset_index(drop=True)


In [356]:
# Build one dataset per popularity threshold (100, 500 and 1000)
df_100, df_500, df_1000 = (
    keep_M_movies(df, threshold) for threshold in (100, 500, 1000)
)

In [358]:
# Row counts of the three filtered datasets
len(df_100), len(df_500), len(df_1000)

(912924, 567888, 284272)

In [359]:
# Display the dataset built with the 500 threshold
df_500

Unnamed: 0,user_id,movie_id,rating,gender,age
0,2,235,3,M,56
1,2,2278,3,M,56
2,2,3147,5,M,56
3,2,2006,3,M,56
4,2,2916,3,M,56
...,...,...,...,...,...
567883,6040,3504,4,M,25
567884,6040,2641,2,M,25
567885,6040,1947,4,M,25
567886,6040,904,4,M,25


Since we are going to simulate an exploration phase, we shuffle the dataframes so that the batches we add iteratively are not biased by the original ordering (the rows were sorted by user).

In [360]:
# Fixed seed so that the shuffle is reproducible: the row order determines the
# `time` column and the exported CSV files, so an unseeded shuffle would give
# different datasets on every run.
SHUFFLE_SEED = 42

# frac=1 samples every row without replacement, i.e. a full permutation;
# the index is then renumbered from 0.
df_100 = df_100.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_500 = df_500.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_1000 = df_1000.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [361]:
# Display the shuffled df_100
df_100

Unnamed: 0,user_id,movie_id,rating,gender,age
0,200,2779,2,F,18
1,3195,1387,3,M,18
2,1517,714,4,M,18
3,4215,296,5,M,25
4,3475,11,4,M,25
...,...,...,...,...,...
912919,5552,3638,3,M,50
912920,4872,2028,3,M,45
912921,4422,2123,4,M,25
912922,444,1358,4,M,56


We also add a time column for clarity (optional, as we could just use the index), again because we are going to simulate data flows during the exploration phases.

In [363]:
# Copy each frame's index into an explicit `time` column (modifies the frames in place)
for dataframe in (df_100, df_500, df_1000):
    dataframe['time'] = dataframe.index

In [364]:
# Rows of df_1000 whose time value is at most 100
df_1000.loc[df_1000.time <= 100]

Unnamed: 0,user_id,movie_id,rating,gender,age,time
0,5277,3114,3,M,25,0
1,5924,2640,4,M,35,1
2,4899,3471,1,M,25,2
3,2557,2571,2,F,18,3
4,1489,2324,3,F,35,4
...,...,...,...,...,...,...
96,5978,736,3,M,35,96
97,5365,2571,5,M,18,97
98,4918,1617,3,F,18,98
99,3999,3256,4,M,35,99


In [366]:
# Number of rows for each rating value in df_100
df_100['rating'].value_counts()

rating
4    323136
3    237005
5    212122
2     94543
1     46118
Name: count, dtype: int64

We transform our ratings into binary ratings to make the reward simpler and more interpretable.

In [367]:
# Share of each rating value in df_100 (counts divided by the number of rows)
df_100['rating'].value_counts() / len(df_100)

rating
4    0.353957
3    0.259611
5    0.232355
2    0.103561
1    0.050517
Name: count, dtype: float64

If we treat ratings of 4 and 5 as good (binary rating = 1), about 59% of the ratings are good (0.354 + 0.232). We therefore choose to count only a rating of 5 as positive.

In [368]:
# Binary reward: 1 when the rating is exactly 5, 0 otherwise.
# The comparison is vectorized over the whole column instead of calling a
# Python lambda once per row; the result is an int64 column of 0/1 values.
for dataframe in [df_100, df_500, df_1000]:
    dataframe['binary_rating'] = dataframe['rating'].eq(5).astype('int64')

In [369]:
# Counts and shares of the binary rating in df_100
df_100['binary_rating'].value_counts(), df_100['binary_rating'].value_counts() / len(df_100)

(binary_rating
 0    700802
 1    212122
 Name: count, dtype: int64,
 binary_rating
 0    0.767645
 1    0.232355
 Name: count, dtype: float64)

In [370]:
# Counts and shares of the binary rating in df_500
df_500['binary_rating'].value_counts(), df_500['binary_rating'].value_counts() / len(df_500)

(binary_rating
 0    410416
 1    157472
 Name: count, dtype: int64,
 binary_rating
 0    0.722706
 1    0.277294
 Name: count, dtype: float64)

In [371]:
# Counts and shares of the binary rating in df_1000
df_1000['binary_rating'].value_counts(), df_1000['binary_rating'].value_counts() / len(df_1000)

(binary_rating
 0    187541
 1     96731
 Name: count, dtype: int64,
 binary_rating
 0    0.659724
 1    0.340276
 Name: count, dtype: float64)

By counting only 5 as a good rating for our binary transformation, we get roughly 23% to 34% good ratings depending on the dataset, which looks much better.

We now export our three datasets. We will work with all three to check how much the minimum number of reviews per movie matters.

In [374]:
# Write each dataset to its own CSV file. to_csv is called with its defaults,
# so the index is written as well.
for frame, csv_path in (
    (df_100, '../Data/dataset_100_minimum_reviews.csv'),
    (df_500, '../Data/dataset_500_minimum_reviews.csv'),
    (df_1000, '../Data/dataset_1000_minimum_reviews.csv'),
):
    frame.to_csv(csv_path)