# Importing necessary libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
# from fuzzywuzzy import process

# Data Modelling

## Read the two dataframes (movie metadata and user ratings)

In [2]:
# Load the two input tables from the local data/ directory:
# the preprocessed movie table and the per-user ratings table.
movie_data_df = pd.read_csv(filepath_or_buffer='data/preprocessed_data.csv')
ratings_df = pd.read_csv(filepath_or_buffer='data/ratings_small.csv')

## Data preprocessing

Our goal is to build a user-item matrix based on the movie id and ratings from each user.

In [3]:
# Preview the first ten rows of the movie table to check which columns are available.
movie_data_df.head(n=10)

Unnamed: 0,movieId,title,genres,production_companies,production_countries,release_date,budget,revenue,runtime,vote_average,vote_count
0,1,Toy Story,"Animation, Comedy, Family",Pixar Animation Studios,United States of America,1995-10-30,30000000,373554033.0,81.0,7.7,5415.0
1,2,Jumanji,"Adventure, Fantasy, Family","TriStar Pictures, Teitler Film, Interscope Com...",United States of America,1995-12-15,65000000,262797249.0,104.0,6.9,2413.0
2,3,Grumpier Old Men,"Romance, Comedy","Warner Bros., Lancaster Gate",United States of America,1995-12-22,0,0.0,101.0,6.5,92.0
3,4,Waiting to Exhale,"Comedy, Drama, Romance",Twentieth Century Fox Film Corporation,United States of America,1995-12-22,16000000,81452156.0,127.0,6.1,34.0
4,5,Father of the Bride Part II,Comedy,"Sandollar Productions, Touchstone Pictures",United States of America,1995-02-10,0,76578911.0,106.0,5.7,173.0
5,6,Heat,"Action, Crime, Drama, Thriller","Regency Enterprises, Forward Pass, Warner Bros.",United States of America,1995-12-15,60000000,187436818.0,170.0,7.7,1886.0
6,7,Sabrina,"Comedy, Romance","Paramount Pictures, Scott Rudin Productions, M...","Germany, United States of America",1995-12-15,58000000,0.0,127.0,6.2,141.0
7,8,Tom and Huck,"Action, Adventure, Drama, Family",Walt Disney Pictures,United States of America,1995-12-22,0,0.0,97.0,5.4,45.0
8,9,Sudden Death,"Action, Adventure, Thriller","Universal Pictures, Imperial Entertainment, Si...",United States of America,1995-12-22,35000000,64350171.0,106.0,5.5,174.0
9,10,GoldenEye,"Adventure, Action, Thriller","United Artists, Eon Productions","United Kingdom, United States of America",1995-11-16,58000000,352194034.0,130.0,6.6,1194.0


In [4]:
# Attach the movie columns to every rating row by joining on movieId.
# how='inner' is the pandas default, spelled out here on purpose: ratings whose
# movieId has no matching row in movie_data_df are dropped from the result.
merged_movie_data = pd.merge(ratings_df, movie_data_df, how='inner', on='movieId')
merged_movie_data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,production_companies,production_countries,release_date,budget,revenue,runtime,vote_average,vote_count
0,1,31,2.5,1260759144,Dangerous Minds,"Drama, Crime","Hollywood Pictures, Via Rosa Productions, Don ...",United States of America,1995-08-11,0,180000000.0,99.0,6.4,249.0
1,7,31,3.0,851868750,Dangerous Minds,"Drama, Crime","Hollywood Pictures, Via Rosa Productions, Don ...",United States of America,1995-08-11,0,180000000.0,99.0,6.4,249.0
2,31,31,4.0,1273541953,Dangerous Minds,"Drama, Crime","Hollywood Pictures, Via Rosa Productions, Don ...",United States of America,1995-08-11,0,180000000.0,99.0,6.4,249.0
3,32,31,4.0,834828440,Dangerous Minds,"Drama, Crime","Hollywood Pictures, Via Rosa Productions, Don ...",United States of America,1995-08-11,0,180000000.0,99.0,6.4,249.0
4,36,31,3.0,847057202,Dangerous Minds,"Drama, Crime","Hollywood Pictures, Via Rosa Productions, Don ...",United States of America,1995-08-11,0,180000000.0,99.0,6.4,249.0


In [5]:
# Keep only the three columns needed to build the user-item matrix.
user_item = merged_movie_data.loc[:, ["userId", "movieId", "rating"]]
user_item

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,7,31,3.0
2,31,31,4.0
3,32,31,4.0
4,36,31,3.0
...,...,...,...
99845,664,64997,2.5
99846,664,72380,3.5
99847,665,129,3.0
99848,665,4736,1.0


In [6]:
# Collapse repeated (userId, movieId) pairs into a single row holding their mean
# rating, so every pair appears at most once; as_index=False keeps the two keys
# as ordinary columns.
user_item = user_item.groupby(by=['userId', 'movieId'], as_index=False).agg('mean')

In [None]:
# Reshape the long (userId, movieId, rating) table into a wide matrix with one
# row per userId and one column per movieId. Pairs with no rating come out of
# the pivot as NaN and are replaced by 0.
user_item_matrix = pd.pivot(
    user_item, index='userId', columns='movieId', values='rating'
).fillna(value=0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,160718,161084,161155,161594,161830,161918,161944,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Transpose the user-item matrix so that each row is a movieId and each column
# is a userId.
user_rating_matrix = user_item_matrix.T

# Persist the transposed matrix; index=True writes the movieId row labels as the
# first CSV column so they can be restored when the file is read back.
user_rating_matrix.to_csv("data/user_rating.csv", index=True)

# End the cell with the frame itself instead of print(): the notebook then shows
# its rich (HTML) table rendering rather than a plain-text dump.
user_rating_matrix

userId   1    2    3    4    5    6    7    8    9    10   ...  662  663  664  \
movieId                                                    ...                  
1        0.0  0.0  0.0  0.0  0.0  0.0  3.0  0.0  4.0  0.0  ...  0.0  4.0  3.5   
2        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  5.0  0.0  0.0   
3        0.0  0.0  0.0  0.0  4.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
5        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
161918   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
161944   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
162542   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
162672   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
163949   0.0  0.0  0.0  0.0 

In [27]:
# Read the saved matrix back from disk, using the first CSV column as the row index.
# NOTE(review): labels parsed from a CSV header normally come back as strings, so
# the column labels of `x` may no longer be integers like those of
# user_rating_matrix — confirm before selecting columns by userId.
x = pd.read_csv("data/user_rating.csv", index_col=0)
x

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
