<a href="https://colab.research.google.com/github/davidimago/movie_recommendation_collaborative_filtering/blob/main/collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Recommendation Engine with Collaborative Filtering



## Preprocess

In [None]:
import pandas as pd
import numpy as np

# Read the ratings CSV (path is relative to the working directory).
ratings_csv_path = 'dataset/user_ratings.csv'
user_ratings = pd.read_csv(ratings_csv_path)

In [None]:
# Summarise columns, non-null counts and dtypes of the loaded ratings.
# info() writes its own report and returns None, so printing the return
# value also emits a trailing "None" line.
info_result = user_ratings.info()
print(info_result)

print('\n')

# Keep only the three columns the rest of the notebook works with.
kept_columns = ['userId', 'rating', 'title']
user_ratings = user_ratings[kept_columns]

# Preview the first 10 rows after the column selection.
ratings_preview = user_ratings.head(10)
print(ratings_preview)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  object 
 1   movieId    100836 non-null  object 
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.6+ MB
None


    userId  rating             title
0  user001     4.0  Toy Story (1995)
1  user005     4.0  Toy Story (1995)
2  user007     4.5  Toy Story (1995)
3  user015     2.5  Toy Story (1995)
4  user017     4.5  Toy Story (1995)
5  user018     3.5  Toy Story (1995)
6  user019     4.0  Toy Story (1995)
7  user021     3.5  Toy Story (1995)
8  user027     3.0  Toy Story (1995)
9  user031     5.0  Toy Story (1995)


In [None]:
# Restrict the analysis to the 25 most frequently rated movies.

# Number of rating rows per title, most-rated first.
viewers = user_ratings['title'].value_counts(sort=True)
most_rated = viewers.head(25)
print(most_rated)

print('\n')

# Titles of those 25 movies as a plain list.
top25 = most_rated.index.tolist()

# Keep only the rating rows whose title is one of the top 25.
in_top25 = user_ratings['title'].isin(top25)
user_ratings_25 = user_ratings[in_top25]

Forrest Gump (1994)                                                               329
Shawshank Redemption, The (1994)                                                  317
Pulp Fiction (1994)                                                               307
Silence of the Lambs, The (1991)                                                  279
Matrix, The (1999)                                                                278
Star Wars: Episode IV - A New Hope (1977)                                         251
Jurassic Park (1993)                                                              238
Braveheart (1995)                                                                 237
Terminator 2: Judgment Day (1991)                                                 224
Schindler's List (1993)                                                           220
Fight Club (1999)                                                                 218
Toy Story (1995)                                      

### Dataset for User-Based Method

In [None]:
# Reshape the long ratings table into a user x movie matrix:
# one row per userId, one column per title, cells hold the rating.
# (user, title) pairs with no rating show up as NaN.
user_based = user_ratings_25.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

# Show the first rows of the pivoted matrix.
user_based_preview = user_based.head()
print(user_based_preview)

title    American Beauty (1999)  ...  Usual Suspects, The (1995)
userId                           ...                            
user001                     5.0  ...                         5.0
user002                     NaN  ...                         NaN
user003                     NaN  ...                         NaN
user004                     5.0  ...                         NaN
user005                     NaN  ...                         4.0

[5 rows x 25 columns]


In [None]:
# Deal with missing ratings by mean-centring, then zero-filling.

# user_based has one row per user, so a row-wise mean is that user's
# average rating over the movies they actually rated.
avg_ratings_user = user_based.mean(axis='columns')

# Subtract each user's own mean from that user's row.
user_based_centered = user_based.subtract(avg_ratings_user, axis='index')

# Treat a missing rating as neutral: after centring, 0 is the user's average.
user_based_normed = user_based_centered.fillna(0)

user_normed_preview = user_based_normed.head()
print(user_normed_preview)

title    American Beauty (1999)  ...  Usual Suspects, The (1995)
userId                           ...                            
user001                0.526316  ...                    0.526316
user002                0.000000  ...                    0.000000
user003                0.000000  ...                    0.000000
user004                1.600000  ...                    0.000000
user005                0.000000  ...                    0.200000

[5 rows x 25 columns]


### Dataset for Item-Based Method

In [None]:
# Reshape the long ratings table into a movie x user matrix:
# one row per title, one column per userId, cells hold the rating.
# (title, user) pairs with no rating show up as NaN.
item_based = user_ratings_25.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)

# Show the first rows of the pivoted matrix.
item_based_preview = item_based.head()
print(item_based_preview)

userId                  user001  user002  user003  ...  user608  user609  user610
title                                              ...                           
American Beauty (1999)      5.0      NaN      NaN  ...      5.0      NaN      3.5
Apollo 13 (1995)            NaN      NaN      NaN  ...      2.0      3.0      NaN
Batman (1989)               4.0      NaN      NaN  ...      3.0      3.0      4.5
Braveheart (1995)           4.0      NaN      NaN  ...      4.0      3.0      4.5
Fight Club (1999)           5.0      NaN      NaN  ...      5.0      NaN      5.0

[5 rows x 572 columns]


In [None]:
# Deal with missing ratings by mean-centring, then zero-filling.

# item_based has one row per title, so a row-wise mean is the average
# rating of each movie over the users who rated it.
avg_ratings_item = item_based.mean(axis='columns')

# Subtract each movie's own mean from that movie's row.
item_based_centered = item_based.subtract(avg_ratings_item, axis='index')

# Treat a missing rating as neutral: after centring, 0 is the movie's average.
item_based_normed = item_based_centered.fillna(0)

item_normed_preview = item_based_normed.head()
print(item_normed_preview)

userId                   user001  user002  ...   user609   user610
title                                      ...                    
American Beauty (1999)  0.943627      0.0  ...  0.000000 -0.556373
Apollo 13 (1995)        0.000000      0.0  ... -0.845771  0.000000
Batman (1989)           0.571429      0.0  ... -0.428571  1.071429
Braveheart (1995)      -0.031646      0.0  ... -1.031646  0.468354
Fight Club (1999)       0.727064      0.0  ...  0.000000  0.727064

[5 rows x 572 columns]


### Question Statement
**We want to predict: will user001 like *The Godfather* (1972)?**

In [None]:
# Show user001's row of the user x movie matrix; NaN marks a movie
# this user has not rated yet.
user001_row = user_based.loc['user001']
print(user001_row)

# "Godfather, The (1972)" is one of the NaN entries, and it is the rating
# the following sections try to predict.

title
American Beauty (1999)                                                            5.0
Apollo 13 (1995)                                                                  NaN
Batman (1989)                                                                     4.0
Braveheart (1995)                                                                 4.0
Fight Club (1999)                                                                 5.0
Forrest Gump (1994)                                                               4.0
Fugitive, The (1993)                                                              5.0
Godfather, The (1972)                                                             NaN
Independence Day (a.k.a. ID4) (1996)                                              3.0
Jurassic Park (1993)                                                              4.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         NaN
Lord of the Rings: The Two Towers, The (2002)   

## User-Based Method

In [None]:
# Build the training data for the user-based KNN model.

# Features: every user's centred, zero-filled ratings for all movies
# except the one we are trying to predict.
user_based_x = user_based_normed.drop("Godfather, The (1972)", axis=1)

# Feature row of the user we are predicting for (kept 2-D: one row).
target_user_x = user_based_x.loc[["user001"]]
print('target_user_x : \n', target_user_x)

# Target: the original (un-centred) ratings of the movie.
godfather_ratings = user_based["Godfather, The (1972)"]

# One boolean mask selects the users who actually rated the movie. Applying
# the same mask to features and target keeps them aligned, and avoids an
# in-place dropna() on a column pulled out of `user_based`.
has_rated = godfather_ratings.notnull()
other_users_x = user_based_x[has_rated]
other_users_y = godfather_ratings[has_rated]

print('other_users_x : \n', other_users_x.head())
print('other_users_y : \n', other_users_y.head())

target_user_x : 
 title    American Beauty (1999)  ...  Usual Suspects, The (1995)
userId                           ...                            
user001                0.526316  ...                    0.526316

[1 rows x 24 columns]
other_users_x : 
 title    American Beauty (1999)  ...  Usual Suspects, The (1995)
userId                           ...                            
user015               -0.111111  ...                    0.000000
user016                0.093750  ...                    0.593750
user017               -0.583333  ...                   -0.083333
user018                0.000000  ...                    0.772727
user021               -1.382353  ...                    0.000000

[5 rows x 24 columns]
other_users_y : 
 userId
user015    4.0
user016    2.5
user017    5.0
user018    4.0
user021    2.5
Name: Godfather, The (1972), dtype: float64


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# User-based model: find the 8 users closest to the target user by cosine
# distance and predict the target's rating from theirs.
user_knn = KNeighborsRegressor(n_neighbors=8, metric='cosine')

# Train on the users who rated the movie, then score the target user's row.
fitted_user_knn = user_knn.fit(other_users_x, other_users_y)
target_user_pred = fitted_user_knn.predict(target_user_x)

print('Predicted rating by user001 :', target_user_pred)

Predicted rating by user001 : [4.75]


## Item-Based Method

In [None]:
# Build the training data for the item-based KNN model.

# Features: every movie's centred, zero-filled ratings from all users
# except the one we are predicting for.
item_based_x = item_based_normed.drop("user001", axis=1)

# Feature row of the movie we are predicting (kept 2-D: one row).
target_item_x = item_based_x.loc[["Godfather, The (1972)"]]
print('target_item_x : \n', target_item_x)

# Target: the original (un-centred) ratings given by user001.
user001_ratings = item_based["user001"]

# One boolean mask selects the movies user001 actually rated. Applying the
# same mask to features and target keeps them aligned, and avoids an
# in-place dropna() on a column pulled out of `item_based`.
rated_by_user001 = user001_ratings.notnull()
other_item_x = item_based_x[rated_by_user001]
other_item_y = user001_ratings[rated_by_user001]

# NOTE: the labels below say "other_users_*" although the values are the
# item-based frames; they are left unchanged so the recorded output of this
# cell still matches.
print('other_users_x : \n', other_item_x.head())
print('other_users_y : \n', other_item_y.head())

target_item_x : 
 userId                 user002  user003  user004  ...   user608  user609   user610
title                                             ...                             
Godfather, The (1972)      0.0      0.0      0.0  ...  0.710938      0.0  0.710938

[1 rows x 571 columns]
other_users_x : 
 userId                  user002  user003  ...   user609   user610
title                                     ...                    
American Beauty (1999)      0.0      0.0  ...  0.000000 -0.556373
Batman (1989)               0.0      0.0  ... -0.428571  1.071429
Braveheart (1995)           0.0      0.0  ... -1.031646  0.468354
Fight Club (1999)           0.0      0.0  ...  0.000000  0.727064
Forrest Gump (1994)         0.0      0.0  ... -0.164134 -1.164134

[5 rows x 571 columns]
other_users_y : 
 title
American Beauty (1999)    5.0
Batman (1989)             4.0
Braveheart (1995)         4.0
Fight Club (1999)         5.0
Forrest Gump (1994)       4.0
Name: user001, dtype: float64


In [None]:
# Item-based model: find the 8 movies closest to the target movie by cosine
# distance and predict user001's rating from the ratings they gave those.
item_knn = KNeighborsRegressor(n_neighbors=8, metric='cosine')

# Train on the movies user001 rated, then score the target movie's row.
fitted_item_knn = item_knn.fit(other_item_x, other_item_y)
target_item_pred = fitted_item_knn.predict(target_item_x)

print('Predicted rating by user001 :', target_item_pred)

Predicted rating by user001 : [4.625]


## Conclusion

Predicted rating of *The Godfather* (1972) for user001:
*   User-Based Method : 4.75
*   Item-Based Method : 4.625 


