In [1]:
import pandas as pd
import scipy
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

from surprise import KNNWithMeans, Reader, Dataset
from surprise.model_selection import GridSearchCV

In [2]:
ratings = pd.read_csv(r'userDatas.csv')
movies = pd.read_csv(r'netflix_titles.csv')
ratings = pd.merge(movies, ratings).drop(['date_added', 'release_year', 'rating', 'duration'], axis=1)
ratings.head()

Unnamed: 0,show_id,type,title,director,cast,country,listed_in,description,user_id,fav_genre,fav_cast,user_rating
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentaries,"As her father nears the end of his life, filmm...",u5,Documentaries,,2
1,s33,TV Show,Sex Education,,"Asa Butterfield, Gillian Anderson, Ncuti Gatwa...",United Kingdom,"British TV Shows, International TV Shows, TV C...",Insecure Otis has all the answers when it come...,u9,Comedies,"Joel Courtney,Stephen Jennings",5
2,s46,Movie,My Heroes Were Cowboys,Tyler Greco,,,Documentaries,Robin Wiltshire's painful childhood was rescue...,u5,Documentaries,,5
3,s47,Movie,Safe House,Daniel Espinosa,"Denzel Washington, Ryan Reynolds, Vera Farmiga...","South Africa, United States, Japan",Action & Adventure,Young CIA operative Matt Weston must get a dan...,u3,Action & Adventure,"Chris Hemsworth,Rain,Matt Damon,Jason Statham,...",3
4,s83,TV Show,Lucifer,,"Tom Ellis, Lauren German, Kevin Alejandro, D.B...",United States,"Crime TV Shows, TV Comedies, TV Dramas","Bored with being the Lord of Hell, the devil r...",u9,Comedies,"Joel Courtney,Stephen Jennings",5


In [3]:
ratings['cast'] = ratings['cast'].fillna(' ')
ratings['fav_cast'] = ratings['fav_cast'].fillna(' ')
ratings['director'] = ratings['director'].fillna(' ')
ratings['country'] = ratings['country'].fillna(' ')

In [4]:
reader = Reader(rating_scale=(0, 5), line_format = 'user item rating')
gs_data = Dataset.load_from_df(ratings[['user_id', 'title', 'user_rating']], reader)

In [5]:
sim_options = {
    "name": ["cosine", "pearson"],
    "min_support": [3, 4, 5, 6],
}

param_grid = {
    "k": [10, 20, 30],
    "min_k": [1, 2, 3],
    "sim_options": sim_options
}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=10)
gs_res = gs.fit(gs_data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [6]:
print(gs.best_params["rmse"])
print('rmse:', gs.best_score["rmse"])

{'k': 10, 'min_k': 1, 'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': True}}
rmse: 1.0825187833285916


In [7]:
new_user1 = [("ONE PIECE",5),("Sword Art Online",2),("Sex Education",4),("Hunter X Hunter (2011)",5),("Attack on Titan",4)]
new_ratings = ratings[['user_id', 'title', 'user_rating']]

for movie,rating in new_user1:
    new_ratings = new_ratings.append({'user_id': 'x1', 'title': movie, 'user_rating': rating}, ignore_index=True)
    
data = Dataset.load_from_df(new_ratings, reader)
trainingSet = data.build_full_trainset()

# To use parms from GridSearchCV
sim_options = {
    "name": "cosine",
    "min_support": 3,
    "user_based": True
}

algo = KNNWithMeans(sim_options=sim_options, k=10, min_k=1)
predictions = algo.fit(trainingSet)

pred_movies = []
for title in ratings['title']:
    if title in list(map(lambda n: n[0], new_user1)):
        continue
    pred = algo.predict('x1', title)
    pred_movies.append(pred)

pred_analysis = pd.DataFrame(pred_movies).sort_values('est', ascending=False)[['iid', 'est']]
pred_analysis.drop_duplicates(subset=["iid"]).head(15)

Computing the cosine similarity matrix...
Done computing similarity matrix.


Unnamed: 0,iid,est
15,Record of Ragnarok,5.0
52,The Devil Is a Part-Timer!,5.0
41,Death Note,5.0
20,Durarara!!,4.181818
0,Dick Johnson Is Dead,4.0
31,Cops and Robbers,4.0
32,Teen Mom 2,4.0
33,60 Days In,4.0
34,The Impossible,4.0
35,The American Barbecue Showdown,4.0
