In [2]:
import pandas as pd
import numpy as np

## First, we read the datasets

In [3]:
# The three .dat files use '::' as field separator and are read with explicit
# column names. A multi-character separator needs the python parsing engine.
users = pd.read_csv(
    '../Data/users.dat',
    sep='::',
    header=None,
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
    engine='python',
)

ratings = pd.read_csv(
    '../Data/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
)

# The movies file is decoded as latin1.
movies = pd.read_csv(
    '../Data/movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'title', 'genre'],
    engine='python',
    encoding='latin1',
)

In [4]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Attach the movie columns to every rating, then order the rows by user.
df = ratings.merge(movies, on='movie_id').sort_values('user_id')

In [8]:
# Attach the user columns to every rating row, matching on user_id.
df = df.merge(users, on='user_id')

In [9]:
df

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,gender,age,occupation,zip_code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,48,5,978824351,Pocahontas (1995),Animation|Children's|Musical|Romance,F,1,10,48067
2,1,938,4,978301752,Gigi (1958),Musical,F,1,10,48067
3,1,1207,4,978300719,To Kill a Mockingbird (1962),Drama,F,1,10,48067
4,1,1721,4,978300055,Titanic (1997),Drama|Romance,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,2641,2,956716343,Superman II (1980),Action|Adventure|Sci-Fi,M,25,6,11106
1000205,6040,1947,4,997454036,West Side Story (1961),Musical|Romance,M,25,6,11106
1000206,6040,904,4,956716845,Rear Window (1954),Mystery|Thriller,M,25,6,11106
1000207,6040,2664,4,957717463,Invasion of the Body Snatchers (1956),Horror|Sci-Fi,M,25,6,11106


We remove columns that will not be useful for us

In [10]:
# Columns the rest of the analysis does not need.
df = df.drop(columns=['timestamp', 'title', 'zip_code'])
df

Unnamed: 0,user_id,movie_id,rating,genre,gender,age,occupation
0,1,1193,5,Drama,F,1,10
1,1,48,5,Animation|Children's|Musical|Romance,F,1,10
2,1,938,4,Musical,F,1,10
3,1,1207,4,Drama,F,1,10
4,1,1721,4,Drama|Romance,F,1,10
...,...,...,...,...,...,...,...
1000204,6040,2641,2,Action|Adventure|Sci-Fi,M,25,6
1000205,6040,1947,4,Musical|Romance,M,25,6
1000206,6040,904,4,Mystery|Thriller,M,25,6
1000207,6040,2664,4,Horror|Sci-Fi,M,25,6


## A few users are listed with an age of 1; we remove them, as this is probably a mistake

In [11]:
# NOTE(review): if this is the MovieLens 1M data, `age` is a bucket code and
# 1 stands for "Under 18" rather than a literal age - confirm that dropping
# these users is intended.
age_above_one = df['age'].gt(1)
df = df.loc[age_above_one].reset_index(drop=True)

In [12]:
df

Unnamed: 0,user_id,movie_id,rating,genre,gender,age,occupation
0,2,235,3,Comedy|Drama,M,56,16
1,2,2278,3,Action|Crime|Thriller,M,56,16
2,2,647,3,Drama|War,M,56,16
3,2,3147,5,Drama|Thriller,M,56,16
4,2,1955,4,Drama,M,56,16
...,...,...,...,...,...,...,...
972993,6040,2641,2,Action|Adventure|Sci-Fi,M,25,6
972994,6040,1947,4,Musical|Romance,M,25,6
972995,6040,904,4,Mystery|Thriller,M,25,6
972996,6040,2664,4,Horror|Sci-Fi,M,25,6


In [16]:
df['movie_id'].value_counts()

movie_id
2858    3336
1196    2898
260     2890
1210    2783
480     2583
        ... 
3277       1
706        1
3458       1
2226       1
2909       1
Name: count, Length: 3702, dtype: int64

There are 3702 different movies rated, but some movies have only 1 rating. We thus choose to keep only the M most popular movies.

In [24]:
def keep_M_popular_movies(dataframe: pd.DataFrame, M: int) -> pd.DataFrame:
    """Return the rows of ``dataframe`` belonging to the ``M`` most-rated movies.

    Popularity is the number of rows in which a ``movie_id`` appears.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame with a ``movie_id`` column, one row per rating.
    M : int
        Number of distinct movies to keep. ``0`` yields an empty frame; a value
        larger than the number of distinct movies keeps every row.

    Returns
    -------
    pd.DataFrame
        A new frame with the selected rows in their original order and a fresh
        0..n-1 index. The input frame is not modified.

    Raises
    ------
    ValueError
        If ``M`` is negative. A negative slice bound would silently drop the
        ``|M|`` least popular movies instead of keeping the most popular ones.
    """
    if M is not None and M < 0:
        raise ValueError(f"M must be non-negative, got {M}")

    # value_counts() sorts by frequency in descending order, so the first M
    # index labels are the M most frequently rated movie ids.
    selected_movies = dataframe['movie_id'].value_counts().index[:M]

    is_selected = dataframe['movie_id'].isin(selected_movies)
    return dataframe[is_selected].reset_index(drop=True)


In [25]:
# Two subsets of the ratings: the 100 and the 50 most-rated movies.
df_100, df_50 = (keep_M_popular_movies(df, top_m) for top_m in (100, 50))

In [26]:
len(df_100), len(df_50)

(178030, 106656)

In [30]:
df_50.head()

Unnamed: 0,user_id,movie_id,rating,genre,gender,age,occupation
0,2,2916,3,Action|Adventure|Sci-Fi|Thriller,M,56,16
1,2,2028,4,Action|Drama|War,M,56,16
2,2,356,5,Comedy|Romance|War,M,56,16
3,2,1210,4,Action|Adventure|Romance|Sci-Fi|War,M,56,16
4,2,2396,4,Comedy|Romance,M,56,16


Since we are going to simulate an exploration phase, we shuffle the dataframes so that the batches we add iteratively aren't biased.

In [31]:
# Fixed seed so that a Restart & Run All reproduces the same row order,
# and therefore the same exported datasets.
SHUFFLE_SEED = 42
df_100 = df_100.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_50 = df_50.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [32]:
df_100

Unnamed: 0,user_id,movie_id,rating,genre,gender,age,occupation
0,2465,2997,5,Comedy,M,50,6
1,2334,590,5,Adventure|Drama|Western,F,56,7
2,1958,3175,4,Adventure|Comedy|Sci-Fi,F,25,1
3,3161,1210,5,Action|Adventure|Romance|Sci-Fi|War,M,25,7
4,4978,1230,3,Comedy|Romance,M,35,14
...,...,...,...,...,...,...,...
178025,4701,1079,4,Comedy,M,45,17
178026,5491,50,4,Crime|Thriller,M,18,7
178027,3224,1230,5,Comedy|Romance,F,25,14
178028,2414,260,5,Action|Adventure|Fantasy|Sci-Fi,M,25,1


We also add a time column (optional, as we could just use the index) for clarity, and again because we're going to deal with exploration phases to simulate data flows.

In [33]:
# Store each row's index value in an explicit `time` column.
for frame in (df_100, df_50):
    frame['time'] = frame.index

In [34]:
df_100['rating'].value_counts()

rating
5    67483
4    65758
3    32361
2     9138
1     3290
Name: count, dtype: int64

We transform our ratings into binary ratings to make the reward easier and more interpretable.

In [35]:
df_100['rating'].value_counts() / len(df_100)

rating
5    0.379054
4    0.369365
3    0.181773
2    0.051328
1    0.018480
Name: count, dtype: float64

If we take 4 and 5 as good ratings (rating = 1), around 75% of the ratings will be good. We therefore choose to treat only a rating of 5 as a positive rating.

In [36]:
for dataframe in [df_100, df_50]:
    # 1 when the rating is exactly 5, otherwise 0. A vectorised comparison
    # replaces the per-row Python lambda and yields the same int64 column.
    dataframe['binary_rating'] = dataframe['rating'].eq(5).astype('int64')

In [37]:
df_100['binary_rating'].value_counts(), df_100['binary_rating'].value_counts() / len(df_100)

(binary_rating
 0    110547
 1     67483
 Name: count, dtype: int64,
 binary_rating
 0    0.620946
 1    0.379054
 Name: count, dtype: float64)

In [38]:
df_50['binary_rating'].value_counts(), df_50['binary_rating'].value_counts() / len(df_50)

(binary_rating
 0    60960
 1    45696
 Name: count, dtype: int64,
 binary_rating
 0    0.571557
 1    0.428443
 Name: count, dtype: float64)

Taking only 5 as a good rating for our binary transformation gives around 38% of good ratings for the 100-movie dataset and around 43% for the 50-movie dataset, which looks much better.

We now export our two datasets. We are going to work with both of them to check whether the number of movies taken into account matters that much.

In [40]:
# Write both prepared frames to CSV. The file names are kept exactly as they
# are (including the differing "popular" / "populars" spelling) so that any
# consumer of these paths keeps working. With default arguments to_csv also
# writes the index as the first column.
exports = {
    '../Data/dataset_100_most_popular.csv': df_100,
    '../Data/dataset_50_most_populars.csv': df_50,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path)