In [76]:
# Mount Google Drive at /content/drive so files stored under MyDrive are
# reachable from the Colab runtime (this cell only works inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
# Switch to the project folder so the relative 'data/...' paths used when
# loading the CSV files resolve. NOTE(review): absolute Colab-specific path.
%cd /content/drive/MyDrive/movie_rec

/content/drive/MyDrive/movie_rec


In [78]:
!pip install fuzzywuzzy



# Importing necessary libraries

In [79]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

# Data Modelling

## Read 2 dataframes

In [80]:
# Movie metadata table (movieId, title, genres, ...) produced by an earlier
# preprocessing step, and the raw ratings table (userId, movieId, rating, timestamp).
movie_data_df = pd.read_csv('data/preprocessed_data.csv')
ratings_df = pd.read_csv('data/ratings_small.csv')

## Data preprocessing

Our goal is to build a user-item matrix based on the movie id and ratings from each user.

In [81]:
# Preview the first ten metadata rows to check the columns that were loaded.
movie_data_df.head(10)

Unnamed: 0,movieId,title,genres,production_companies,production_countries,release_date,budget,revenue,runtime,vote_average,vote_count
0,1,Toy Story,"Animation, Comedy, Family",Pixar Animation Studios,United States of America,1995-10-30,30000000,373554033.0,81.0,7.7,5415.0
1,2,Jumanji,"Adventure, Fantasy, Family","TriStar Pictures, Teitler Film, Interscope Com...",United States of America,1995-12-15,65000000,262797249.0,104.0,6.9,2413.0
2,3,Grumpier Old Men,"Romance, Comedy","Warner Bros., Lancaster Gate",United States of America,1995-12-22,0,0.0,101.0,6.5,92.0
3,4,Waiting to Exhale,"Comedy, Drama, Romance",Twentieth Century Fox Film Corporation,United States of America,1995-12-22,16000000,81452156.0,127.0,6.1,34.0
4,5,Father of the Bride Part II,Comedy,"Sandollar Productions, Touchstone Pictures",United States of America,1995-02-10,0,76578911.0,106.0,5.7,173.0
5,6,Heat,"Action, Crime, Drama, Thriller","Regency Enterprises, Forward Pass, Warner Bros.",United States of America,1995-12-15,60000000,187436818.0,170.0,7.7,1886.0
6,7,Sabrina,"Comedy, Romance","Paramount Pictures, Scott Rudin Productions, M...","Germany, United States of America",1995-12-15,58000000,0.0,127.0,6.2,141.0
7,8,Tom and Huck,"Action, Adventure, Drama, Family",Walt Disney Pictures,United States of America,1995-12-22,0,0.0,97.0,5.4,45.0
8,9,Sudden Death,"Action, Adventure, Thriller","Universal Pictures, Imperial Entertainment, Si...",United States of America,1995-12-22,35000000,64350171.0,106.0,5.5,174.0
9,10,GoldenEye,"Adventure, Action, Thriller","United Artists, Eon Productions","United Kingdom, United States of America",1995-11-16,58000000,352194034.0,130.0,6.6,1194.0


In [82]:
# Attach the movie metadata to every rating. The default inner join keeps only
# ratings whose movieId also appears in the metadata table.
merged_movie_data = pd.merge(ratings_df, movie_data_df, on='movieId')
merged_movie_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,production_companies,production_countries,release_date,budget,revenue,runtime,vote_average,vote_count
0,1,31,2.5,1260759144,Dangerous Minds,"Drama, Crime","Hollywood Pictures, Via Rosa Productions, Don ...",United States of America,1995-08-11,0,180000000.0,99.0,6.4,249.0
1,1,1029,3.0,1260759179,Dumbo,"Animation, Family","RKO Radio Pictures, Walt Disney Productions",United States of America,1941-10-22,812000,1600000.0,64.0,6.8,1206.0
2,1,1061,3.0,1260759182,Sleepers,"Crime, Drama, Thriller","Propaganda Films, Warner Bros., Baltimore Pict...",United States of America,1996-10-18,44000000,165615285.0,147.0,7.3,729.0
3,1,1129,2.0,1260759185,Escape from New York,"Science Fiction, Action","AVCO Embassy Pictures, Goldcrest Films Interna...",United States of America,1981-05-22,6000000,50244700.0,99.0,6.9,720.0
4,1,1172,4.0,1260759205,Cinema Paradiso,"Drama, Romance",Rai Tre Radiotelevisione Italiana,"Italy, France",1988-11-17,0,11990401.0,124.0,8.2,834.0


In [83]:
# Collaborative filtering only needs who rated what and how highly, so keep
# just those three columns.
user_item = merged_movie_data.loc[:, ["userId", "movieId", "rating"]]
user_item

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
...,...,...,...
99845,671,6268,2.5
99846,671,6269,4.0
99847,671,6365,4.0
99848,671,6385,2.5


In [84]:
# Collapse repeated (userId, movieId) pairs into their mean rating so every
# pair is unique, which the pivot into a matrix requires.
user_item = user_item.groupby(by=['userId', 'movieId'], as_index=False)['rating'].mean()

In [85]:
# Reshape to a wide matrix: one row per user, one column per movie.
# Movies a user has not rated are encoded as 0.
user_item_matrix = (
    user_item
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(value=0)
)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,160718,161084,161155,161594,161830,161918,161944,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# Flip the matrix so that each ROW is a movie (indexed by movieId) and each
# column is a user. Despite its name, this is the item-by-user matrix that the
# item-based nearest-neighbour search operates on.
user_rating_matrix = user_item_matrix.transpose()
user_rating_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


To build the recommender we use item-based collaborative filtering: a k-nearest-neighbours search, with cosine distance, over the movie-by-user rating matrix.

In [87]:
# Exhaustive (brute-force) nearest-neighbour search using cosine distance.
# n_jobs=-1 lets the neighbour queries run in parallel.
cf_knn_model = NearestNeighbors(
    n_neighbors=10,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)

# The model is deliberately not fitted in this cell:
# movie_recommender_engine() calls fit() on whichever matrix it is given.

Create a function to recommend movies based on the movie name.

In [149]:
def movie_recommender_engine(movie_name, matrix, cf_model, n_recs):
    """Recommend the movies whose rating vectors are closest to ``movie_name``.

    The model is (re)fitted on ``matrix`` on every call and then queried with
    the row belonging to the requested movie.

    Parameters
    ----------
    movie_name : str
        Exact title, matched against ``movie_data_df['title']``. When several
        movies share the title, the first match is used.
    matrix : pandas.DataFrame
        One row per movie, indexed by ``movieId``; each row is the vector the
        model compares.
    cf_model : object
        Estimator exposing ``fit(X)`` and ``kneighbors(X, n_neighbors=...)``
        (e.g. ``sklearn.neighbors.NearestNeighbors``).
    n_recs : int
        Number of neighbours requested from the model. The query movie itself
        occupies one of those slots, so at most ``n_recs - 1`` recommendations
        are returned. Values larger than the number of rows in ``matrix`` are
        clamped.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title`` and ``Distance``, ordered from nearest to farthest,
        with a 1-based index.

    Raises
    ------
    ValueError
        If ``n_recs`` is smaller than 1.
    IndexError
        If no movie in ``movie_data_df`` has the title ``movie_name``.
    KeyError
        If the movie exists in ``movie_data_df`` but has no row in ``matrix``.
    """
    if n_recs < 1:
        raise ValueError(f"n_recs must be a positive integer, got {n_recs!r}")

    cf_model.fit(matrix)

    # Resolve the title to a movieId, failing with a readable message.
    title_matches = movie_data_df.loc[movie_data_df['title'] == movie_name, 'movieId']
    if title_matches.empty:
        raise IndexError(f"No movie titled {movie_name!r} found in movie_data_df")
    movie_id = title_matches.iloc[0]
    if movie_id not in matrix.index:
        raise KeyError(f"Movie {movie_name!r} (movieId={movie_id}) has no row in the rating matrix")

    # Never ask for more neighbours than there are rows to choose from.
    n_neighbors = min(n_recs, len(matrix))
    query = matrix.loc[movie_id].values.reshape(1, -1)
    distances, indices = cf_model.kneighbors(query, n_neighbors=n_neighbors)

    # ravel() (unlike squeeze()) still yields a list when only one neighbour is returned.
    neighbours = sorted(
        zip(indices.ravel().tolist(), distances.ravel().tolist()),
        key=lambda pair: pair[1],
    )

    # Remove the query movie by identity instead of assuming it is the first
    # hit: a movie with an identical rating vector ties at the same distance
    # and may be listed ahead of it.
    others = [(pos, dist) for pos, dist in neighbours if matrix.index[pos] != movie_id]
    if len(others) == len(neighbours):
        # The query was pushed out of the result by exact ties; drop the
        # farthest hit instead so the result size stays consistent.
        others = others[:-1]

    titles = [
        movie_data_df.loc[movie_data_df['movieId'] == matrix.index[pos], 'title'].iloc[0]
        for pos, _ in others
    ]
    dists = [dist for _, dist in others]

    # Index is derived from the actual number of rows, not from n_recs.
    return pd.DataFrame({'Title': titles, 'Distance': dists}, index=range(1, len(others) + 1))

In [150]:
# Query the 10 nearest neighbours of 'Batman'. The movie itself takes one of
# those slots, so nine other titles are listed.
n_recs = 10
batman_df = movie_recommender_engine(
    movie_name='Batman',
    matrix=user_rating_matrix,
    cf_model=cf_knn_model,
    n_recs=n_recs,
)
batman_df

Unnamed: 0,Title,Distance
1,True Lies,0.31049
2,Batman Forever,0.32548
3,The Fugitive,0.349277
4,Dances with Wolves,0.373423
5,Jurassic Park,0.374453
6,Ace Ventura: Pet Detective,0.39213
7,Die Hard: With a Vengeance,0.397594
8,Apollo 13,0.401325
9,Aladdin,0.40241


The recommendations are mostly popular early-to-mid-1990s releases, many of them action or adventure films; notably, the sequel Batman Forever is in second place.

In [151]:
# Same query for 'Toy Story': 10 neighbours requested, one of which is the
# movie itself, leaving nine recommendations.
n_recs = 10
toystory_df = movie_recommender_engine(
    movie_name='Toy Story',
    matrix=user_rating_matrix,
    cf_model=cf_knn_model,
    n_recs=n_recs,
)
toystory_df

Unnamed: 0,Title,Distance
1,Toy Story 2,0.40529
2,Star Wars,0.423812
3,Forrest Gump,0.435466
4,Independence Day,0.437054
5,Groundhog Day,0.451977
6,Back to the Future,0.4633
7,Jurassic Park,0.464803
8,Shrek,0.467315
9,Return of the Jedi,0.470666


We can see that Toy Story 2 is the top recommendation.