# Project: Movie Paring Recommender System

## Data Collection and Preprocessing

### Downloading the MovieLens and IMDb datasets

In [1]:
import pandas as pd
import os
import re

#### Download the MovieLens dataset and merge them

In [2]:
if not os.path.exists('ml-1m'):
    !wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
    !unzip ml-1m.zip
    !rm ml-1m.zip

--2024-07-07 15:07:50--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2024-07-07 15:07:50 (12.0 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [3]:
movies_MovieLens = pd.read_csv('ml-1m/movies.dat', sep='::', engine='python', header=None, names=['movieId', 'title', 'genres'], encoding='ISO-8859-1')
ratings_MovieLens = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python', header=None, names=['userId', 'movieId', 'rating', 'timestamp'], encoding='ISO-8859-1')
print("The movies MovieLens dataset:")
print(movies_MovieLens.head())
print(f"The number of rows of the movies Movielens dataset is {movies_MovieLens.shape[0]}")
print("The ratings MovieLens dataset:")
print(ratings_MovieLens.head())
print(f"The number of rows of the ratings Movielens dataset is {ratings_MovieLens.shape[0]}")

The movies MovieLens dataset:
   movieId                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
The number of rows of the movies Movielens dataset is 3883
The ratings MovieLens dataset:
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
The number of rows of the ratings Movielens dataset is 1000209


Merge the ratings and movies MovieLens datasets on the column movieId

In [4]:
merge_MovieLens = pd.merge(ratings_MovieLens, movies_MovieLens, on='movieId')
print("The merge MovieLens dataset:")
print(merge_MovieLens.head())
print(f"The number of rows of the merge Movielens dataset is {merge_MovieLens.shape[0]}")

The merge MovieLens dataset:
   userId  movieId  rating  timestamp                                   title  \
0       1     1193       5  978300760  One Flew Over the Cuckoo's Nest (1975)   
1       2     1193       5  978298413  One Flew Over the Cuckoo's Nest (1975)   
2      12     1193       4  978220179  One Flew Over the Cuckoo's Nest (1975)   
3      15     1193       4  978199279  One Flew Over the Cuckoo's Nest (1975)   
4      17     1193       5  978158471  One Flew Over the Cuckoo's Nest (1975)   

  genres  
0  Drama  
1  Drama  
2  Drama  
3  Drama  
4  Drama  
The number of rows of the merge Movielens dataset is 1000209


Delete the ratings and movies MovieLens datasets to make some memory space

In [5]:
del movies_MovieLens
del ratings_MovieLens

Extract the year from the title column and remove it from the title column and create a year column

In [6]:
year_regex = re.compile(r'\((\d{4})\)')
merge_MovieLens['year'] = merge_MovieLens['title'].str.extract(r'\((\d{4})\)').astype(int)
merge_MovieLens['title'] = merge_MovieLens['title'].apply(lambda x: year_regex.sub("", x).strip())
print("The merge MovieLens dataset with the column year:")
print(merge_MovieLens.head())
print(f"The number of rows of the merge Movielens dataset is {merge_MovieLens.shape[0]}")

The merge MovieLens dataset with the column year:
   userId  movieId  rating  timestamp                            title genres  \
0       1     1193       5  978300760  One Flew Over the Cuckoo's Nest  Drama   
1       2     1193       5  978298413  One Flew Over the Cuckoo's Nest  Drama   
2      12     1193       4  978220179  One Flew Over the Cuckoo's Nest  Drama   
3      15     1193       4  978199279  One Flew Over the Cuckoo's Nest  Drama   
4      17     1193       5  978158471  One Flew Over the Cuckoo's Nest  Drama   

   year  
0  1975  
1  1975  
2  1975  
3  1975  
4  1975  
The number of rows of the merge Movielens dataset is 1000209


##### Feature Engineering

Remove the timestamp of the review column

In [7]:
merge_MovieLens.drop('timestamp', axis=1, inplace=True)
print("The merge MovieLens dataset without the column timestamp:")
print(merge_MovieLens.head())
print(f"The number of rows of the merge Movielens dataset is {merge_MovieLens.shape[0]}")

The merge MovieLens dataset without the column timestamp:
   userId  movieId  rating                            title genres  year
0       1     1193       5  One Flew Over the Cuckoo's Nest  Drama  1975
1       2     1193       5  One Flew Over the Cuckoo's Nest  Drama  1975
2      12     1193       4  One Flew Over the Cuckoo's Nest  Drama  1975
3      15     1193       4  One Flew Over the Cuckoo's Nest  Drama  1975
4      17     1193       5  One Flew Over the Cuckoo's Nest  Drama  1975
The number of rows of the merge Movielens dataset is 1000209


Change the genres from string to list of string and order the list

In [8]:
merge_MovieLens['genres'] = merge_MovieLens['genres'].str.split('|')
merge_MovieLens['genres'] = merge_MovieLens['genres'].apply(lambda x: sorted(x))
print(merge_MovieLens.head())

   userId  movieId  rating                            title   genres  year
0       1     1193       5  One Flew Over the Cuckoo's Nest  [Drama]  1975
1       2     1193       5  One Flew Over the Cuckoo's Nest  [Drama]  1975
2      12     1193       4  One Flew Over the Cuckoo's Nest  [Drama]  1975
3      15     1193       4  One Flew Over the Cuckoo's Nest  [Drama]  1975
4      17     1193       5  One Flew Over the Cuckoo's Nest  [Drama]  1975


#### Download the IMDb datasets and merge them

In [9]:
if not os.path.exists('title.basics.tsv.gz'):
    !wget https://datasets.imdbws.com/title.basics.tsv.gz

--2024-07-07 15:08:25--  https://datasets.imdbws.com/title.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.224.14.12, 13.224.14.33, 13.224.14.11, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.224.14.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191394068 (183M) [binary/octet-stream]
Saving to: ‘title.basics.tsv.gz’


2024-07-07 15:08:26 (215 MB/s) - ‘title.basics.tsv.gz’ saved [191394068/191394068]



In [10]:
if not os.path.exists('title.ratings.tsv.gz'):
    !wget https://datasets.imdbws.com/title.ratings.tsv.gz

--2024-07-07 15:08:26--  https://datasets.imdbws.com/title.ratings.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.224.14.12, 13.224.14.33, 13.224.14.11, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.224.14.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7308316 (7.0M) [binary/octet-stream]
Saving to: ‘title.ratings.tsv.gz’


2024-07-07 15:08:26 (75.0 MB/s) - ‘title.ratings.tsv.gz’ saved [7308316/7308316]



In [11]:
if not os.path.exists('title.crew.tsv.gz'):
    !wget https://datasets.imdbws.com/title.crew.tsv.gz

--2024-07-07 15:08:26--  https://datasets.imdbws.com/title.crew.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.224.14.12, 13.224.14.33, 13.224.14.11, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.224.14.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71369593 (68M) [binary/octet-stream]
Saving to: ‘title.crew.tsv.gz’


2024-07-07 15:08:26 (183 MB/s) - ‘title.crew.tsv.gz’ saved [71369593/71369593]



Load the dataset movies IMDb with only the columns: tconst, titleType, primaryTitle, startYear, endYear, and genres

In [12]:
only_columns = ['tconst', 'titleType', 'primaryTitle', 'startYear', 'genres']
movies_IMDb = pd.read_csv('title.basics.tsv.gz', sep='\t', usecols=only_columns,compression='gzip')
print("The movies IMDb dataset:")
print(movies_IMDb.head())
print(f"The number of rows of the movies IMDb dataset is {movies_IMDb.shape[0]}")

The movies IMDb dataset:
      tconst titleType            primaryTitle startYear  \
0  tt0000001     short              Carmencita      1894   
1  tt0000002     short  Le clown et ses chiens      1892   
2  tt0000003     short          Pauvre Pierrot      1892   
3  tt0000004     short             Un bon bock      1892   
4  tt0000005     short        Blacksmith Scene      1893   

                     genres  
0         Documentary,Short  
1           Animation,Short  
2  Animation,Comedy,Romance  
3           Animation,Short  
4              Comedy,Short  
The number of rows of the movies IMDb dataset is 10911804


Remove series and short from the dataset

In [13]:
movies_IMDb = movies_IMDb[movies_IMDb['titleType'] == 'movie']
print("The movies IMDb dataset:")
print(movies_IMDb.head())
print(f"The number of rows of the movies IMDb dataset is {movies_IMDb.shape[0]}")

The movies IMDb dataset:
        tconst titleType                   primaryTitle startYear  \
8    tt0000009     movie                     Miss Jerry      1894   
144  tt0000147     movie  The Corbett-Fitzsimmons Fight      1897   
498  tt0000502     movie                       Bohemios      1905   
570  tt0000574     movie    The Story of the Kelly Gang      1906   
587  tt0000591     movie               The Prodigal Son      1907   

                         genres  
8                       Romance  
144      Documentary,News,Sport  
498                          \N  
570  Action,Adventure,Biography  
587                       Drama  
The number of rows of the movies IMDb dataset is 685593


Removing the columns: titleType to make the dataset lighter for the following merges

In [14]:
movies_IMDb.drop('titleType', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_IMDb.drop('titleType', axis=1, inplace=True)


Add the ratings with only the columns: tconst, averageRating to the movies IMDb dataset

In [15]:
only_columns = ['tconst', 'averageRating']
movies_IMDb = pd.merge(movies_IMDb, pd.read_csv('title.ratings.tsv.gz', sep='\t', usecols=only_columns, compression='gzip'), on='tconst', how='left')
print("The movies IMDb dataset:")
print(movies_IMDb.head())
print(f"The number of rows of the movies IMDb dataset is {movies_IMDb.shape[0]}")

The movies IMDb dataset:
      tconst                   primaryTitle startYear  \
0  tt0000009                     Miss Jerry      1894   
1  tt0000147  The Corbett-Fitzsimmons Fight      1897   
2  tt0000502                       Bohemios      1905   
3  tt0000574    The Story of the Kelly Gang      1906   
4  tt0000591               The Prodigal Son      1907   

                       genres  averageRating  
0                     Romance            5.4  
1      Documentary,News,Sport            5.2  
2                          \N            4.2  
3  Action,Adventure,Biography            6.0  
4                       Drama            5.5  
The number of rows of the movies IMDb dataset is 685593


Add the crew to the movies IMDb dataset

In [16]:
movies_IMDb = pd.merge(movies_IMDb, pd.read_csv('title.crew.tsv.gz', sep='\t', compression='gzip'), on='tconst', how='left')
print("The movies IMDb dataset:")
print(movies_IMDb.head())
print(f"The number of rows of the movies IMDb dataset is {movies_IMDb.shape[0]}")

The movies IMDb dataset:
      tconst                   primaryTitle startYear  \
0  tt0000009                     Miss Jerry      1894   
1  tt0000147  The Corbett-Fitzsimmons Fight      1897   
2  tt0000502                       Bohemios      1905   
3  tt0000574    The Story of the Kelly Gang      1906   
4  tt0000591               The Prodigal Son      1907   

                       genres  averageRating  directors  \
0                     Romance            5.4  nm0085156   
1      Documentary,News,Sport            5.2  nm0714557   
2                          \N            4.2  nm0063413   
3  Action,Adventure,Biography            6.0  nm0846879   
4                       Drama            5.5  nm0141150   

                         writers  
0                      nm0085156  
1                             \N  
2  nm0063413,nm0657268,nm0675388  
3                      nm0846879  
4                      nm0141150  
The number of rows of the movies IMDb dataset is 685593


Rename the column primaryTitle to title

In [17]:
movies_IMDb.rename(columns={'primaryTitle': 'title', 'startYear': 'year'}, inplace=True)
print("The movies IMDb dataset:")
print(movies_IMDb.head())
print(f"The number of rows of the movies IMDb dataset is {movies_IMDb.shape[0]}")

The movies IMDb dataset:
      tconst                          title  year                      genres  \
0  tt0000009                     Miss Jerry  1894                     Romance   
1  tt0000147  The Corbett-Fitzsimmons Fight  1897      Documentary,News,Sport   
2  tt0000502                       Bohemios  1905                          \N   
3  tt0000574    The Story of the Kelly Gang  1906  Action,Adventure,Biography   
4  tt0000591               The Prodigal Son  1907                       Drama   

   averageRating  directors                        writers  
0            5.4  nm0085156                      nm0085156  
1            5.2  nm0714557                             \N  
2            4.2  nm0063413  nm0063413,nm0657268,nm0675388  
3            6.0  nm0846879                      nm0846879  
4            5.5  nm0141150                      nm0141150  
The number of rows of the movies IMDb dataset is 685593


Change the genres from string to list of string and order the list

In [18]:
movies_IMDb['genres'] = movies_IMDb['genres'].str.split(',')
movies_IMDb['genres'] = movies_IMDb['genres'].apply(lambda x: [] if x == ['\\N'] else x)
movies_IMDb['genres'] = movies_IMDb['genres'].apply(lambda x: sorted(x))
print(movies_IMDb.head())
print(f"The number of rows of the movies IMDb dataset is {movies_IMDb.shape[0]}")

      tconst                          title  year  \
0  tt0000009                     Miss Jerry  1894   
1  tt0000147  The Corbett-Fitzsimmons Fight  1897   
2  tt0000502                       Bohemios  1905   
3  tt0000574    The Story of the Kelly Gang  1906   
4  tt0000591               The Prodigal Son  1907   

                           genres  averageRating  directors  \
0                       [Romance]            5.4  nm0085156   
1      [Documentary, News, Sport]            5.2  nm0714557   
2                              []            4.2  nm0063413   
3  [Action, Adventure, Biography]            6.0  nm0846879   
4                         [Drama]            5.5  nm0141150   

                         writers  
0                      nm0085156  
1                             \N  
2  nm0063413,nm0657268,nm0675388  
3                      nm0846879  
4                      nm0141150  
The number of rows of the movies IMDb dataset is 685593


Change the string writers to list of string writers and order the list

In [19]:
movies_IMDb['writers'] = movies_IMDb['writers'].str.split(',')
movies_IMDb['writers'] = movies_IMDb['writers'].apply(lambda x: [] if x == ['\\N'] else x)
print(movies_IMDb.head())

      tconst                          title  year  \
0  tt0000009                     Miss Jerry  1894   
1  tt0000147  The Corbett-Fitzsimmons Fight  1897   
2  tt0000502                       Bohemios  1905   
3  tt0000574    The Story of the Kelly Gang  1906   
4  tt0000591               The Prodigal Son  1907   

                           genres  averageRating  directors  \
0                       [Romance]            5.4  nm0085156   
1      [Documentary, News, Sport]            5.2  nm0714557   
2                              []            4.2  nm0063413   
3  [Action, Adventure, Biography]            6.0  nm0846879   
4                         [Drama]            5.5  nm0141150   

                             writers  
0                        [nm0085156]  
1                                 []  
2  [nm0063413, nm0657268, nm0675388]  
3                        [nm0846879]  
4                        [nm0141150]  


Change the string directors to list of string directors and order the list

In [20]:
movies_IMDb['directors'] = movies_IMDb['directors'].str.split(',')
movies_IMDb['directors'] = movies_IMDb['directors'].apply(lambda x: [] if x == ['\\N'] else x)
print(movies_IMDb.head())

      tconst                          title  year  \
0  tt0000009                     Miss Jerry  1894   
1  tt0000147  The Corbett-Fitzsimmons Fight  1897   
2  tt0000502                       Bohemios  1905   
3  tt0000574    The Story of the Kelly Gang  1906   
4  tt0000591               The Prodigal Son  1907   

                           genres  averageRating    directors  \
0                       [Romance]            5.4  [nm0085156]   
1      [Documentary, News, Sport]            5.2  [nm0714557]   
2                              []            4.2  [nm0063413]   
3  [Action, Adventure, Biography]            6.0  [nm0846879]   
4                         [Drama]            5.5  [nm0141150]   

                             writers  
0                        [nm0085156]  
1                                 []  
2  [nm0063413, nm0657268, nm0675388]  
3                        [nm0846879]  
4                        [nm0141150]  


### Convert the year column to integer

In [21]:
movies_IMDb['year'] = pd.to_numeric(movies_IMDb['year'], errors='coerce').fillna(0).astype(int)
print(movies_IMDb.dtypes)

tconst            object
title             object
year               int64
genres            object
averageRating    float64
directors         object
writers           object
dtype: object


### Merge datasets to include movie ratings, genres, and metadata

In [22]:
merge_dataset = pd.merge(merge_MovieLens, movies_IMDb, on=['title', 'year'], how='inner')

In [23]:
print("The merge dataset:")
print(merge_dataset.head())
print(f"The number of rows of the merge dataset is {merge_dataset.shape[0]}")

The merge dataset:
   userId  movieId  rating                            title genres_x  year  \
0       1     1193       5  One Flew Over the Cuckoo's Nest  [Drama]  1975   
1       2     1193       5  One Flew Over the Cuckoo's Nest  [Drama]  1975   
2      12     1193       4  One Flew Over the Cuckoo's Nest  [Drama]  1975   
3      15     1193       4  One Flew Over the Cuckoo's Nest  [Drama]  1975   
4      17     1193       5  One Flew Over the Cuckoo's Nest  [Drama]  1975   

      tconst genres_y  averageRating    directors  \
0  tt0073486  [Drama]            8.7  [nm0001232]   
1  tt0073486  [Drama]            8.7  [nm0001232]   
2  tt0073486  [Drama]            8.7  [nm0001232]   
3  tt0073486  [Drama]            8.7  [nm0001232]   
4  tt0073486  [Drama]            8.7  [nm0001232]   

                                        writers  
0  [nm0369142, nm0325743, nm0450181, nm0913670]  
1  [nm0369142, nm0325743, nm0450181, nm0913670]  
2  [nm0369142, nm0325743, nm0450181, nm0913

## Feature Engineering

### Combine genres from both datasets

In [24]:
# combine the genres columns
def combine_genres(genres_x, genres_y):
    """
    Combine the genres of the two datasets
    """
    return list(set(genres_x + genres_y))
merge_dataset['genres'] = merge_dataset.apply(lambda row: combine_genres(row['genres_x'], row['genres_y']), axis=1)
# remove the genres_x and genres_y columns
merge_dataset.drop(['genres_x', 'genres_y'], axis=1, inplace=True)
print(merge_dataset.head())
print(f"The number of rows of the merge dataset is {merge_dataset.shape[0]}")

   userId  movieId  rating                            title  year     tconst  \
0       1     1193       5  One Flew Over the Cuckoo's Nest  1975  tt0073486   
1       2     1193       5  One Flew Over the Cuckoo's Nest  1975  tt0073486   
2      12     1193       4  One Flew Over the Cuckoo's Nest  1975  tt0073486   
3      15     1193       4  One Flew Over the Cuckoo's Nest  1975  tt0073486   
4      17     1193       5  One Flew Over the Cuckoo's Nest  1975  tt0073486   

   averageRating    directors                                       writers  \
0            8.7  [nm0001232]  [nm0369142, nm0325743, nm0450181, nm0913670]   
1            8.7  [nm0001232]  [nm0369142, nm0325743, nm0450181, nm0913670]   
2            8.7  [nm0001232]  [nm0369142, nm0325743, nm0450181, nm0913670]   
3            8.7  [nm0001232]  [nm0369142, nm0325743, nm0450181, nm0913670]   
4            8.7  [nm0001232]  [nm0369142, nm0325743, nm0450181, nm0913670]   

    genres  
0  [Drama]  
1  [Drama]  
2  [D

In [25]:
def combine_cast(writers, directors):
    """
    Combine the writers and directors columns to create cast column
    """
    return list(set(writers + directors))

In [26]:
merge_dataset['cast'] = merge_dataset.apply(lambda row: combine_cast(row['writers'], row['directors']), axis=1)

In [27]:
merge_dataset.drop(['writers', 'directors'], axis=1, inplace=True)
print(merge_dataset.head())

   userId  movieId  rating                            title  year     tconst  \
0       1     1193       5  One Flew Over the Cuckoo's Nest  1975  tt0073486   
1       2     1193       5  One Flew Over the Cuckoo's Nest  1975  tt0073486   
2      12     1193       4  One Flew Over the Cuckoo's Nest  1975  tt0073486   
3      15     1193       4  One Flew Over the Cuckoo's Nest  1975  tt0073486   
4      17     1193       5  One Flew Over the Cuckoo's Nest  1975  tt0073486   

   averageRating   genres                                               cast  
0            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  
1            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  
2            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  
3            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  
4            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  


### Convert the rating column to float

In [28]:
merge_dataset['rating'] = merge_dataset['rating'].astype(float)
print(merge_dataset.dtypes)

userId             int64
movieId            int64
rating           float64
title             object
year               int64
tconst            object
averageRating    float64
genres            object
cast              object
dtype: object


### Remove tconst column to only keep the movieId column

We do that because we don't need 2 id columns for the same movie

In [29]:
merge_dataset.drop('tconst', axis=1, inplace=True)
print(merge_dataset.head())

   userId  movieId  rating                            title  year  \
0       1     1193     5.0  One Flew Over the Cuckoo's Nest  1975   
1       2     1193     5.0  One Flew Over the Cuckoo's Nest  1975   
2      12     1193     4.0  One Flew Over the Cuckoo's Nest  1975   
3      15     1193     4.0  One Flew Over the Cuckoo's Nest  1975   
4      17     1193     5.0  One Flew Over the Cuckoo's Nest  1975   

   averageRating   genres                                               cast  
0            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  
1            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  
2            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  
3            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  
4            8.7  [Drama]  [nm0913670, nm0450181, nm0369142, nm0325743, n...  


### Create a metadata column

In [30]:
merge_dataset['metadata'] = merge_dataset['genres'].apply(lambda x: ' '.join(x)) + ' ' + merge_dataset['cast'].apply(lambda x: ' '.join(x)) + ' ' + merge_dataset['year'].astype(str)

### Create a user-item interaction matrix for collaborative filtering

In [31]:
user_item_matrix = merge_dataset.pivot_table(index='userId', columns='movieId', values='rating')
print(user_item_matrix.head())

movieId  1     2     3     4     5     6     7     8     9     10    ...  \
userId                                                               ...   
1         5.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   NaN   NaN  ...   

movieId  3942  3943  3944  3945  3946  3947  3948  3949  3950  3951  
userId                                                               
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5         NaN   NaN   NaN   NaN   NaN   NaN   N

In [32]:
user_item_matrix_sparse = user_item_matrix.fillna(0)

### Understand how to combine user data to get data for the couple of users

#### Get the number of reviews per user and per movie

In [33]:
usercount = merge_dataset['userId'].value_counts()
print(usercount.head())

userId
4169    1503
1680    1275
4277    1147
1941    1107
1181    1026
Name: count, dtype: int64


#### Get the number of reviews per movie

In [34]:
filmcount = merge_dataset['movieId'].value_counts()
print(filmcount.head())

movieId
2858    3428
260     2991
1196    2990
1210    2883
480     2672
Name: count, dtype: int64


## Model Development

### Implement a recommender system algotithm to predict the rating of a movie by a couple of users

In [35]:
import numpy as np
import pandas as pd

In [36]:
R = user_item_matrix_sparse.values
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

#### Collaborative filtering using Singular Value Decomposition (SVD)

In [37]:
from scipy.sparse.linalg import svds
def svd_decomposition(R, k=50):
    """
    Perform Singular Value Decomposition (SVD) on the user-item interaction matrix
    """
    U, sigma, Vt = svds(R, k=k)
    sigma = np.diag(sigma)
    return U, sigma, Vt

In [38]:
U, sigma, Vt = svd_decomposition(R_demeaned, k=50)

In [39]:
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_item_matrix_sparse.columns, index=user_item_matrix_sparse.index)

In [40]:
def combine_predictions(user1_id, user2_id, predicted_ratings_df):
    """
    Combine the predictions of two users
    """
    user1_predictions = predicted_ratings_df.loc[user1_id]
    user2_predictions = predicted_ratings_df.loc[user2_id]
    return (user1_predictions + user2_predictions) / 2

## Recommendation Algorithm

#### Content-based filtering using KMeans

In [41]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
count_vectorizer = CountVectorizer()
metadata_matrix = count_vectorizer.fit_transform(merge_dataset['metadata'].values)

In [43]:
kmeans = KMeans(n_clusters=10, random_state=42)
merge_dataset['clusterId'] = kmeans.fit_predict(metadata_matrix)



In [44]:
def get_recommendations_kmeans(movieId, merge_dataset, n_recommendations=5):
    """
    Get the recommendations of a movie using KMeans
    """
    movie_cluster = merge_dataset[merge_dataset['movieId'] == movieId]['clusterId'].values[0]
    recommendations = merge_dataset[merge_dataset['clusterId'] == movie_cluster].sort_values('averageRating', ascending=False).head(n_recommendations)
    return recommendations[['movieId', 'title', 'genres', 'cast', 'year']]

### Combine Collaborative and content-based filtering to create a hybrid recommender system to predict the rating of a movie by a couple of users

### Develop an algorithm to suggest one movie that might be likes by the couple of users

In [45]:
from sklearn.preprocessing import MinMaxScaler

In [46]:
def normalize_ratings(ratings, min_rating=0.5, max_rating=5):
    """
    Normalize the ratings to be between 0.5 and 5
    """
    scaler = MinMaxScaler(feature_range=(min_rating, max_rating))
    ratings_array = np.array([rating for _, rating in ratings]).reshape(-1, 1)
    normalized_ratings = scaler.fit_transform(ratings_array).flatten()
    return [(movieId, rating) for (movieId, _), rating in zip(ratings, normalized_ratings)]

In [47]:
def calculate_couple_score_hybrid(user1_id, user2_id, predicted_ratings_df, merge_dataset):
    """
    Calculate the score of a couple of users using the hybrid recommender system
    """
    combined_ratings = combine_predictions(user1_id, user2_id, predicted_ratings_df)

    movie_ratings = combined_ratings.to_dict()

    for movieId in movie_ratings.keys():
        metadata_recommendations = get_recommendations_kmeans(movieId, merge_dataset)
        for _, row in metadata_recommendations.iterrows():
            recommended_movie = row['movieId']
            if recommended_movie in movie_ratings:
                movie_ratings[recommended_movie] += movie_ratings[movieId] * 0.1

    sorted_movie_ratings = sorted(movie_ratings.items(), key=lambda x: x[1], reverse=True)
    normalized_sorted_movie_ratings = normalize_ratings(sorted_movie_ratings)
    return normalized_sorted_movie_ratings

In [48]:
def get_score_of_movie_hybrid(user1_id, user2_id, movieId, predicted_ratings_df, merge_dataset):
    """
    Get the score of a movie for a couple of users using the hybrid recommender system
    """
    sorted_movie_scores = calculate_couple_score_hybrid(user1_id, user2_id, predicted_ratings_df, merge_dataset)
    for movie_score in sorted_movie_scores:
        if movie_score[0] == movieId:
            return movie_score[1]
    return 0

In [49]:
print(get_score_of_movie_hybrid(1, 2, 1, predicted_ratings_df, merge_dataset))

2.579240506672697


In [50]:
def recommend_one_movie_for_couple(user1_id, user2_id, predicted_ratings_df, merge_dataset):
    """
    Recommend one movie for a couple of users
    """
    sorted_movie_scores = calculate_couple_score_hybrid(user1_id, user2_id, predicted_ratings_df, merge_dataset)
    recommended_movie = merge_dataset[merge_dataset['movieId'] == sorted_movie_scores[0][0]]
    return recommended_movie[['movieId', 'title', 'genres', 'cast', 'year']].values[0]

In [51]:
print(recommend_one_movie_for_couple(1, 2, predicted_ratings_df, merge_dataset))

[527 "Schindler's List" list(['History', 'Biography', 'Drama', 'War'])
 list(['nm0001873', 'nm0000229', 'nm0447745']) 1993]


## Evaluation

### Slit the data into training and testing sets

In [52]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(merge_dataset, test_size=0.2)
print(f"The number of rows of the train data is {train_data.shape[0]}")
print(f"The number of rows of the test data is {test_data.shape[0]}")

The number of rows of the train data is 556352
The number of rows of the test data is 139089


### Evaluate the model

In [53]:
def make_couple_from_data(data):
    """
    Make couples of users from the data
    """
    couples = []
    user_ids = data['userId'].unique()
    for i, user1_id in enumerate(user_ids):
        for user2_id in user_ids[i + 1:]:
            couples.append((user1_id, user2_id))
    return couples

In [54]:
import random
from sklearn.metrics import mean_squared_error
def calculate_rmse(test_data, predicted_ratings_df):
    """
    Calculate the Root Mean Squared Error (RMSE) of the model
    """

    actual_ratings = []
    predicted_ratings = []

    couples = make_couple_from_data(test_data)

    # take only 10 couples otherwise this is too big
    couples = random.sample(couples, 10)

    for user1_id, user2_id in couples:
        user1_test_data = test_data[test_data['userId'] == user1_id]
        user2_test_data = test_data[test_data['userId'] == user2_id]

        # Find movies rated by both users
        common_movies = pd.merge(user1_test_data, user2_test_data, on='movieId')

        for _, row in common_movies.iterrows():
            movie_id = row['movieId']
            actual_rating_user1 = row['rating_x']
            actual_rating_user2 = row['rating_y']
            actual_combined_rating = (actual_rating_user1 + actual_rating_user2) / 2

            try:
                combined_prediction = combine_predictions(user1_id, user2_id, predicted_ratings_df).loc[movie_id]
                actual_ratings.append(actual_combined_rating)
                predicted_ratings.append(combined_prediction)
            except KeyError:
                continue

    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    return rmse

In [55]:
print(calculate_rmse(test_data, predicted_ratings_df))

1.696809018327508
