# Movie Recommender System
## (with multiple models)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import SVD, Dataset, Reader, accuracy
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings; warnings.simplefilter("ignore")
from IPython.display import display

In [2]:
movies = pd.read_csv("movies_metadata.csv")
display(movies.head())

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
# Returns the top 3% of movies based on vote counts

m = movies.vote_count.quantile(0.97)
print(m)

movies_97 = movies[movies.vote_count > m]
display(movies_97.vote_count.head())
print(len(movies_97))
print(len(movies))

821.2299999999959


0     5415.0
1     2413.0
5     1886.0
9     1194.0
15    1343.0
Name: vote_count, dtype: float64

1364
45466


In [4]:
# Prints the average vote for the top 3% of movies

C = np.mean(movies_97.vote_average)
print(C)

6.737243401759528


In [5]:
v = movies_97.vote_count
R = movies_97.vote_average

#
## Movie weitghed rating formula: (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C 
### where: 
- R is average for the movie
- v is number of votes for the movie 
- m is minimum vote count required
- C is the average vote for the top 3% of movies by vote count (in this case)

In [6]:
# Now we create a weighted rating for the movie where v = vote count, m = minimum vote count requirement, R = avg rating, C = avg vote across whole report

movies_97["weighted_rating"] = ((v/(v + m) * R) + (m/(v + m) * C))

# Here are the top 15 movies based on their Weighted Rating

display(movies_97[["title", "genres", "weighted_rating"]].sort_values("weighted_rating", ascending = False).head(15).reset_index())

Unnamed: 0,index,title,genres,weighted_rating
0,314,The Shawshank Redemption,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",8.342293
1,834,The Godfather,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",8.28852
2,12481,The Dark Knight,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",8.201959
3,2843,Fight Club,"[{'id': 18, 'name': 'Drama'}]",8.177764
4,292,Pulp Fiction,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",8.164782
5,351,Forrest Gump,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",8.066054
6,522,Schindler's List,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",8.055882
7,23673,Whiplash,"[{'id': 18, 'name': 'Drama'}]",8.053064
8,5481,Spirited Away,"[{'id': 14, 'name': 'Fantasy'}, {'id': 12, 'na...",8.032027
9,15480,Inception,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",8.024871


In [7]:


# Create a new column for the extracted genre
movies_97['extracted_genre'] = ""

# Iterate through each row in the DataFrame
for index, row in movies_97.iterrows():
    # Parse the "genres" column
    genres_list = eval(row['genres'])  # Assuming the column is stored as a string, use eval() to convert it to a list of dictionaries
    
    # Extract the genre value from each dictionary
    genres = [genre_dict['name'] for genre_dict in genres_list]
    
    # Store the genre value in the new column as a comma-separated string
    movies_97.at[index, 'extracted_genre'] = ', '.join(genres)

display(movies_97[["title", "extracted_genre", "weighted_rating"]].sort_values("weighted_rating", ascending = False).head(15).reset_index())


Unnamed: 0,index,title,extracted_genre,weighted_rating
0,314,The Shawshank Redemption,"Drama, Crime",8.342293
1,834,The Godfather,"Drama, Crime",8.28852
2,12481,The Dark Knight,"Drama, Action, Crime, Thriller",8.201959
3,2843,Fight Club,Drama,8.177764
4,292,Pulp Fiction,"Thriller, Crime",8.164782
5,351,Forrest Gump,"Comedy, Drama, Romance",8.066054
6,522,Schindler's List,"Drama, History, War",8.055882
7,23673,Whiplash,Drama,8.053064
8,5481,Spirited Away,"Fantasy, Adventure, Animation, Family",8.032027
9,15480,Inception,"Action, Thriller, Science Fiction, Mystery, Ad...",8.024871


In [14]:
# We can make recommendations to people based on their preferred genre:

# Top 10 Comedy movies based on Weighted Rating:

Comedy_movies = movies_97[movies_97.extracted_genre.str.contains("Comedy")].sort_values("weighted_rating", ascending = False)

display(Comedy_movies[["title", "extracted_genre", "weighted_rating"]].head(10).reset_index())

Unnamed: 0,index,title,extracted_genre,weighted_rating
0,351,Forrest Gump,"Comedy, Drama, Romance",8.066054
1,2211,Life Is Beautiful,"Comedy, Drama",8.012519
2,18465,The Intouchables,"Drama, Comedy",8.00722
3,1225,Back to the Future,"Adventure, Comedy, Science Fiction, Family",7.853119
4,22841,The Grand Budapest Hotel,"Comedy, Drama",7.810253
5,22131,The Wolf of Wall Street,"Crime, Drama, Comedy",7.774178
6,30315,Inside Out,"Drama, Comedy, Animation, Family",7.773662
7,40882,La La Land,"Comedy, Drama, Music, Romance",7.728449
8,13724,Up,"Animation, Comedy, Family, Adventure",7.689091
9,24455,Big Hero 6,"Adventure, Family, Animation, Action, Comedy",7.677252


In [9]:
# Next I will use surprise SVD on a different dataset to make recommendations:

ratings = pd.read_csv("ratings_small.csv")
display(ratings.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


#
## Recommender using Surprise library (SVD, KNNBasic)

In [10]:
from surprise import SVD, KNNBasic, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split, cross_validate

reader = Reader()
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

trainset, testset = train_test_split(data, test_size = 0.2, random_state = 1)

svd = SVD()

svd.fit(trainset)

predictions_svd = svd.test(testset)

print(accuracy.rmse(predictions_svd))
print(accuracy.mae(predictions_svd))

RMSE: 0.9021
0.9021053840029488
MAE:  0.6959
0.6959109140468974


In [21]:
# Lets make a prediction on userId 12 for movie 2031, which userId 1 had a 2.5 rating:

predict_u2_m31 = svd.predict(12, 2031)

print(predict_u2_m31)

# Estimation: 2.66 rating for userId 12

user: 12         item: 2031       r_ui = None   est = 2.66   {'was_impossible': False}


In [22]:
# Now Lets fit a KNNBasic model:

knn = KNNBasic()

knn.fit(trainset)

predictions_knn = knn.test(testset)

print(accuracy.rmse(predictions_knn))
print(accuracy.mae(predictions_knn))

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9614
0.9614329145192568
MAE:  0.7402
0.7401741295897455


In [23]:
# Now lets make a prediction on userId 2, movie 31:

knn_predict_u2_m31 = knn.predict(2, 31)

print(knn_predict_u2_m31)

user: 2          item: 31         r_ui = None   est = 3.24   {'actual_k': 26, 'was_impossible': False}
