In [1]:
import pandas as pd
import scipy
import numpy as np
import re
import nltk
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

## Retrieving data from CSV

In [2]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
#ratings = pd.merge(movies, ratings).drop(['date_added', 'release_year', 'rating', 'duration'], axis=1)
ratings = pd.merge(movies, ratings)

ratings.drop('timestamp', axis = 1, inplace=True)
ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


# USER-USER Collaboration filtering Recommendation System

## Pre-processing of data

#### Filling non value data as 0

In [3]:
ratings['rating'] = ratings['rating'].fillna('0')
ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


#### Converting strings into lower case, removing of stop words and splitting of words

In [4]:
ratings['clean_genre'] = ratings['genres'].str.lower()
ratings['clean_genre'] = ratings['clean_genre'].apply(lambda x: re.sub(' ', '', x))
ratings['clean_genre'] = ratings['clean_genre'].apply(lambda x: re.sub('\s+', ' ', x))
ratings['clean_genre'] = ratings['clean_genre'].apply(lambda x: x.split('|'))
#pd.set_option('display.max_rows', 100)
#ratings['clean_genre']
ratings

Unnamed: 0,movieId,title,genres,userId,rating,clean_genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,"[adventure, animation, children, comedy, fantasy]"
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,"[adventure, animation, children, comedy, fantasy]"
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,"[adventure, animation, children, comedy, fantasy]"
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,"[adventure, animation, children, comedy, fantasy]"
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,"[adventure, animation, children, comedy, fantasy]"
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,"[action, animation, comedy, fantasy]"
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,"[animation, comedy, fantasy]"
100833,193585,Flint (2017),Drama,184,3.5,[drama]
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,"[action, animation]"


In [5]:
genre_ratings = ratings[['title', 'userId', 'rating', 'clean_genre']].copy()
genre_ratings.head()

Unnamed: 0,title,userId,rating,clean_genre
0,Toy Story (1995),1,4.0,"[adventure, animation, children, comedy, fantasy]"
1,Toy Story (1995),5,4.0,"[adventure, animation, children, comedy, fantasy]"
2,Toy Story (1995),7,4.5,"[adventure, animation, children, comedy, fantasy]"
3,Toy Story (1995),15,2.5,"[adventure, animation, children, comedy, fantasy]"
4,Toy Story (1995),17,4.5,"[adventure, animation, children, comedy, fantasy]"


## Creating functions for recommending movies of similar genres

#### Function that searches for genre type movies and returns the dataframe of movies

In [6]:
def get_similar_genre(genre):
    #similar_genre = genre_ratings['user_id', 'clean_genre']
    temp = []
    df = pd.DataFrame()
    #print(genre_ratings['clean_genre'][0][0])
    for i, x in enumerate(genre_ratings['clean_genre']):
        for y in x:
            if(y in genre):
                temp.append(i)
                     
    #temp = genre_ratings.loc[genre_ratings['clean_genre'][0][0] == ([genre])]
    #temp = genre_ratings.loc[genre_ratings['clean_genre'].str.contains(genre)]
    temp = set(temp)
    temp = list(temp)
    df = genre_ratings.iloc[temp]
    return df

Getting a dataframe of children genre

In [7]:
df_genre = get_similar_genre(["children"])
df_genre
#df_genre.drop_duplicates(inplace=True)
#get_similar_genre(["adventure"])

Unnamed: 0,title,userId,rating,clean_genre
0,Toy Story (1995),1,4.0,"[adventure, animation, children, comedy, fantasy]"
1,Toy Story (1995),5,4.0,"[adventure, animation, children, comedy, fantasy]"
2,Toy Story (1995),7,4.5,"[adventure, animation, children, comedy, fantasy]"
3,Toy Story (1995),15,2.5,"[adventure, animation, children, comedy, fantasy]"
4,Toy Story (1995),17,4.5,"[adventure, animation, children, comedy, fantasy]"
...,...,...,...,...
65507,Snow Dogs (2002),288,3.0,"[adventure, children, comedy]"
65508,Snow Dogs (2002),380,2.0,"[adventure, children, comedy]"
6510,"Goofy Movie, A (1995)",136,1.0,"[animation, children, comedy, romance]"
32766,Hercules (1997),19,4.0,"[adventure, animation, children, comedy, musical]"


Creates a table that shows different user and their ratings to different movie titles

In [8]:
recc_ratings = df_genre.pivot_table(index=['userId'],columns=['title'],values='rating')
recc_ratings.fillna(0, inplace=True)
recc_ratings

title,*batteries not included (1987),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),101 Dalmatians II: Patch's London Adventure (2003),102 Dalmatians (2000),1776 (1972),3 Ninjas (1992),3 Ninjas Kick Back (1994),3 Ninjas Knuckle Up (1995),3 Ninjas: High Noon On Mega Mountain (1998),...,Wizards of Waverly Place: The Movie (2009),Wow! A Talking Fish! (1983),"Yearling, The (1946)",Yogi Bear (2010),Yongary: Monster from the Deep (1967),Young Sherlock Holmes (1985),"Yours, Mine and Ours (1968)",Zathura (2005),Zeus and Roxanne (1997),Zootopia (2016)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Sum the total ratings from all users for each movie title

In [9]:
recc_columns = recc_ratings.sum(axis=0).sort_values(ascending=False)
#recc_columns = recc_ratings.pivot_table(columns=[''])
recc_columns = pd.DataFrame(recc_columns)
recc_columns['sum'] = recc_ratings.sum(axis=0).sort_values(ascending=False)
#recc_columns.(columns=recc_columns[0], axis=1, inplace=True)
del recc_columns[recc_columns.columns[0]]
recc_columns

Unnamed: 0_level_0,sum
title,Unnamed: 1_level_1
Toy Story (1995),843.0
Aladdin (1992),694.0
"Lion King, The (1994)",678.0
Shrek (2001),657.5
Finding Nemo (2003),558.5
...,...
The Star Wars Holiday Special (1978),0.5
Born to Be Wild (1995),0.5
Tooth Fairy 2 (2012),0.5
Arthur Christmas (2011),0.5


Count the number of users that have rated the movie

In [10]:
recc_normalization = df_genre['title'].value_counts()
recc_normalization = pd.DataFrame(recc_normalization)
recc_normalization.rename(columns={'title':'count'}, inplace=True)
recc_normalization

Unnamed: 0,count
Toy Story (1995),215
Aladdin (1992),183
"Lion King, The (1994)",172
Shrek (2001),170
Beauty and the Beast (1991),146
...,...
On the Trail of the Bremen Town Musicians (1973),1
Gena the Crocodile (1969),1
"Little Drummer Boy, The (1968)",1
Gulliver's Travels (1996),1


In [11]:
#recc_merge = pd.join(recc_columns, recc_normalization)
recc_merge = recc_columns.join(recc_normalization)
recc_merge

Unnamed: 0_level_0,sum,count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),843.0,215
Aladdin (1992),694.0,183
"Lion King, The (1994)",678.0,172
Shrek (2001),657.5,170
Finding Nemo (2003),558.5,141
...,...,...
The Star Wars Holiday Special (1978),0.5,1
Born to Be Wild (1995),0.5,1
Tooth Fairy 2 (2012),0.5,1
Arthur Christmas (2011),0.5,1


## Normalizing + removing count values <= 100

In [12]:
recc_merge['normalize'] = (recc_merge['sum']/recc_merge['count'])
recc_merge['count'].dtypes
#recc_merge = recc_merge.drop(recc_merge[recc_merge['count'] < 100].index, inplace=True)
recc_merge = recc_merge[(recc_merge['count'] > 100)]
recc_merge.sort_values(by='count', ascending=False).head(10)

Unnamed: 0_level_0,sum,count,normalize
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story (1995),843.0,215,3.92093
Aladdin (1992),694.0,183,3.79235
"Lion King, The (1994)",678.0,172,3.94186
Shrek (2001),657.5,170,3.867647
Beauty and the Beast (1991),550.5,146,3.770548
Finding Nemo (2003),558.5,141,3.960993
"Monsters, Inc. (2001)",511.0,132,3.871212
Babe (1995),467.5,128,3.652344
"Incredibles, The (2004)",479.5,125,3.836
E.T. the Extra-Terrestrial (1982),459.5,122,3.766393


# NETFLIX DATA 

In [13]:
netflix_ratings = pd.read_csv('userDatas.csv')
netflix_movies = pd.read_csv('netflix_titles.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

#movies

Pre-processing of IMDB movie data

In [14]:
movies.title = movies.title.str.replace('\d+', '')
movies.title = movies.title.str.replace('(', '')
movies.title = movies.title.str.replace(')', '')
#movies['title'] = movies['title'].apply(lambda x: re.sub(' ', '', x))
movies['title'] = movies['title'].str.lower()
movies['title'] = movies['title'].str.strip()
movies

  movies.title = movies.title.str.replace('\d+', '')
  movies.title = movies.title.str.replace('(', '')
  movies.title = movies.title.str.replace(')', '')


Unnamed: 0,movieId,title,genres
0,1,toy story,Adventure|Animation|Children|Comedy|Fantasy
1,2,jumanji,Adventure|Children|Fantasy
2,3,grumpier old men,Comedy|Romance
3,4,waiting to exhale,Comedy|Drama|Romance
4,5,father of the bride part ii,Comedy
...,...,...,...
9737,193581,black butler: book of the atlantic,Action|Animation|Comedy|Fantasy
9738,193583,no game no life: zero,Animation|Comedy|Fantasy
9739,193585,flint,Drama
9740,193587,bungo stray dogs: dead apple,Action|Animation


Pre-processing of netflix data

In [15]:
netflix_movies['title'] = netflix_movies['title'].str.lower()
netflix_movies['title'] = netflix_movies['title'].apply(lambda x: re.sub(' ', '', x))
netflix_movies

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,dickjohnsonisdead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,blood&water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,jailbirdsneworleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,kotafactory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,zombiedumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


Merge IMDB data and Netflix data to retrieve ratings for netflix shows for ratings

In [20]:
overall_movies = movies.merge(netflix_movies, on='title', how='left')
overall_movies

Unnamed: 0,movieId,title,genres,show_id,type,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,1,toy story,Adventure|Animation|Children|Comedy|Fantasy,,,,,,,,,,,
1,2,jumanji,Adventure|Children|Fantasy,,,,,,,,,,,
2,3,grumpier old men,Comedy|Romance,,,,,,,,,,,
3,4,waiting to exhale,Comedy|Drama|Romance,,,,,,,,,,,
4,5,father of the bride part ii,Comedy,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,black butler: book of the atlantic,Action|Animation|Comedy|Fantasy,,,,,,,,,,,
9738,193583,no game no life: zero,Animation|Comedy|Fantasy,,,,,,,,,,,
9739,193585,flint,Drama,,,,,,,,,,,
9740,193587,bungo stray dogs: dead apple,Action|Animation,,,,,,,,,,,


In [21]:
overall_movies = overall_movies.dropna()

In [22]:
overall_movies = overall_movies[['movieId', 'title', 'genres']]
overall_movies = overall_movies.merge(ratings)
overall_movies = overall_movies.drop('timestamp', axis='columns')
overall_movies

Unnamed: 0,movieId,title,genres,userId,rating
0,7,sabrina,Comedy|Romance,6,4.0
1,7,sabrina,Comedy|Romance,14,3.0
2,7,sabrina,Comedy|Romance,19,2.0
3,7,sabrina,Comedy|Romance,31,4.0
4,7,sabrina,Comedy|Romance,32,4.0
...,...,...,...,...,...
3575,182823,bright,Action|Crime|Fantasy,212,4.0
3576,182823,bright,Action|Crime|Fantasy,249,3.5
3577,182823,bright,Action|Crime|Fantasy,380,3.0
3578,182823,bright,Action|Crime|Fantasy,567,1.0


Clean and split the genre in the merged dataframe

In [23]:
overall_movies['clean_genre'] = overall_movies['genres'].str.lower()
overall_movies['clean_genre'] = overall_movies['clean_genre'].apply(lambda x: re.sub(' ', '', x))
overall_movies['clean_genre'] = overall_movies['clean_genre'].apply(lambda x: re.sub('\s+', ' ', x))
overall_movies['clean_genre'] = overall_movies['clean_genre'].apply(lambda x: x.split('|'))
overall_movies

Unnamed: 0,movieId,title,genres,userId,rating,clean_genre
0,7,sabrina,Comedy|Romance,6,4.0,"[comedy, romance]"
1,7,sabrina,Comedy|Romance,14,3.0,"[comedy, romance]"
2,7,sabrina,Comedy|Romance,19,2.0,"[comedy, romance]"
3,7,sabrina,Comedy|Romance,31,4.0,"[comedy, romance]"
4,7,sabrina,Comedy|Romance,32,4.0,"[comedy, romance]"
...,...,...,...,...,...,...
3575,182823,bright,Action|Crime|Fantasy,212,4.0,"[action, crime, fantasy]"
3576,182823,bright,Action|Crime|Fantasy,249,3.5,"[action, crime, fantasy]"
3577,182823,bright,Action|Crime|Fantasy,380,3.0,"[action, crime, fantasy]"
3578,182823,bright,Action|Crime|Fantasy,567,1.0,"[action, crime, fantasy]"


In [24]:
def get_similar_netflix_genre(genre):
    #similar_genre = genre_ratings['user_id', 'clean_genre']
    temp = []
    df = pd.DataFrame()
    #print(genre_ratings['clean_genre'][0][0])
    for i, x in enumerate(overall_movies['clean_genre']):
        for y in x:
            if(y in genre):
                temp.append(i)
                     
    #temp = genre_ratings.loc[genre_ratings['clean_genre'][0][0] == ([genre])]
    #temp = genre_ratings.loc[genre_ratings['clean_genre'].str.contains(genre)]
    temp = set(temp)
    temp = list(temp)
    df = overall_movies.iloc[temp]
    return df

In [25]:
df_netflix_genre = get_similar_netflix_genre(["children"])
df_netflix_genre

Unnamed: 0,movieId,title,genres,userId,rating,clean_genre
3239,84944,rango,Action|Adventure|Animation|Children|Comedy|Wes...,89,5.0,"[action, adventure, animation, children, comed..."
3240,84944,rango,Action|Adventure|Animation|Children|Comedy|Wes...,103,4.0,"[action, adventure, animation, children, comed..."
3241,84944,rango,Action|Adventure|Animation|Children|Comedy|Wes...,177,2.5,"[action, adventure, animation, children, comed..."
3242,84944,rango,Action|Adventure|Animation|Children|Comedy|Wes...,246,5.0,"[action, adventure, animation, children, comed..."
3243,84944,rango,Action|Adventure|Animation|Children|Comedy|Wes...,249,3.5,"[action, adventure, animation, children, comed..."
...,...,...,...,...,...,...
2529,40851,zathura,Action|Adventure|Children|Fantasy,232,2.5,"[action, adventure, children, fantasy]"
2530,40851,zathura,Action|Adventure|Children|Fantasy,380,4.0,"[action, adventure, children, fantasy]"
2531,40851,zathura,Action|Adventure|Children|Fantasy,408,4.0,"[action, adventure, children, fantasy]"
2532,40851,zathura,Action|Adventure|Children|Fantasy,605,3.0,"[action, adventure, children, fantasy]"


In [26]:
recc_netflix_ratings = df_netflix_genre.pivot_table(index=['userId'],columns=['title'],values='rating')
recc_netflix_ratings.fillna(0, inplace=True)
recc_netflix_ratings

title,balto,beethoven,benji,bolt,g-force,golmaal,home,hop,hugo,rango,tarzan,turbo,zathura
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
20,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
21,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
599,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


In [27]:
recc_netflix_columns = recc_netflix_ratings.sum(axis=0).sort_values(ascending=False)
#recc_columns = recc_ratings.pivot_table(columns=[''])
recc_netflix_columns = pd.DataFrame(recc_netflix_columns)
recc_netflix_columns['sum'] = recc_netflix_ratings.sum(axis=0).sort_values(ascending=False)
#recc_columns.(columns=recc_columns[0], axis=1, inplace=True)
del recc_netflix_columns[recc_netflix_columns.columns[0]]
recc_netflix_columns

Unnamed: 0_level_0,sum
title,Unnamed: 1_level_1
tarzan,83.0
hugo,67.5
rango,61.5
bolt,61.0
home,26.0
balto,25.0
beethoven,19.5
benji,19.0
zathura,13.5
hop,5.5


In [28]:
recc_netflix_normalization = df_netflix_genre['title'].value_counts()
recc_netflix_normalization = pd.DataFrame(recc_netflix_normalization)
recc_netflix_normalization.rename(columns={'title':'count'}, inplace=True)
recc_netflix_normalization

Unnamed: 0,count
tarzan,24
hugo,18
bolt,18
rango,17
beethoven,11
balto,8
home,8
benji,5
zathura,4
hop,2


In [29]:
recc_netflix_merge = recc_netflix_columns.join(recc_netflix_normalization)
recc_netflix_merge

Unnamed: 0_level_0,sum,count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
tarzan,83.0,24
hugo,67.5,18
rango,61.5,17
bolt,61.0,18
home,26.0,8
balto,25.0,8
beethoven,19.5,11
benji,19.0,5
zathura,13.5,4
hop,5.5,2


In [30]:
recc_netflix_merge['normalize'] = (recc_netflix_merge['sum']/recc_netflix_merge['count'])
recc_netflix_merge['count'].dtypes
#recc_merge = recc_merge.drop(recc_merge[recc_merge['count'] < 100].index, inplace=True)
recc_netflix_merge = recc_netflix_merge[(recc_netflix_merge['count'] > 10)]
recc_netflix_merge.sort_values(by='count', ascending=False).head(10)

Unnamed: 0_level_0,sum,count,normalize
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tarzan,83.0,24,3.458333
hugo,67.5,18,3.75
bolt,61.0,18,3.388889
rango,61.5,17,3.617647
beethoven,19.5,11,1.772727


# USER-ITEM Collaborative Filtering Recommendation System

In [38]:
ratings = pd.read_csv("userDatas.csv")
netflixData = pd.read_csv("netflix_titles.csv")
ratings = ratings.merge(netflixData)

In [39]:
ratings['cast'] = ratings['cast'].fillna(' ')
ratings['fav_cast'] = ratings['fav_cast'].fillna(' ')
ratings['director'] = ratings['director'].fillna(' ')
ratings['country'] = ratings['country'].fillna(' ')

ratings.head(61)

Unnamed: 0,user_id,fav_genre,show_id,fav_cast,user_rating,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,u1,"Action & Adventure,Scifi & Fantasy",s144,"Tobey Maguire,James Franco",3,Movie,Green Lantern,Martin Campbell,"Ryan Reynolds, Blake Lively, Peter Sarsgaard, ...",United States,"September 1, 2021",2011,PG-13,114 min,"Action & Adventure, Sci-Fi & Fantasy",Test pilot Hal Jordan harnesses glowing new po...
1,u1,"Action & Adventure,Scifi & Fantasy",s6201,"Tobey Maguire,James Franco",5,Movie,Avengers: Infinity War,"Anthony Russo, Joe Russo","Robert Downey Jr., Josh Brolin, Mark Ruffalo, ...",United States,"December 25, 2018",2018,PG-13,150 min,"Action & Adventure, Sci-Fi & Fantasy",Superheroes amass to stop intergalactic sociop...
2,u3,Action & Adventure,s6201,"Chris Hemsworth,Rain,Matt Damon,Jason Statham,...",5,Movie,Avengers: Infinity War,"Anthony Russo, Joe Russo","Robert Downey Jr., Josh Brolin, Mark Ruffalo, ...",United States,"December 25, 2018",2018,PG-13,150 min,"Action & Adventure, Sci-Fi & Fantasy",Superheroes amass to stop intergalactic sociop...
3,u1,"Action & Adventure,Scifi & Fantasy",s8068,"Tobey Maguire,James Franco",4,Movie,Spider-Man 3,Sam Raimi,"Tobey Maguire, Kirsten Dunst, James Franco, Th...",United States,"November 1, 2019",2007,PG-13,139 min,"Action & Adventure, Sci-Fi & Fantasy",The seemingly invincible Spider-Man goes up ag...
4,u1,"Action & Adventure,Scifi & Fantasy",s8069,"Tobey Maguire,James Franco",5,Movie,Spider-Man: Into the Spider-Verse,"Peter Ramsey, Rodney Rothman, Bob Persichetti","Shameik Moore, Jake Johnson, Hailee Steinfeld,...",United States,"June 26, 2019",2018,PG,117 min,"Action & Adventure, Comedies","After being bitten by a radioactive spider, Br..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,u10,Dramas,s113,"Naomi Watts,Miachel Keaton,Zaria,Amy Ryan",5,Movie,Worth,Sara Colangelo,"Michael Keaton, Stanley Tucci, Amy Ryan, Shuno...",,"September 3, 2021",2021,PG-13,119 min,Dramas,"In the wake of the Sept. 11 attacks, a lawyer ..."
57,u10,Dramas,s1090,"Naomi Watts,Miachel Keaton,Zaria,Amy Ryan",3,Movie,Two Distant Strangers,"Travon Free, Martin Desmond Roe","Joey Bada$$, Andrew Howard, Zaria",United States,"April 9, 2021",2021,TV-MA,32 min,Dramas,"In this Oscar-nominated short film, a man tryi..."
58,u10,Dramas,s1485,"Naomi Watts,Miachel Keaton,Zaria,Amy Ryan",2,Movie,Cops and Robbers,"Arnon Manor, Timothy Ware-Hill",Timothy Ware-Hill,United States,"December 28, 2020",2020,PG-13,8 min,Dramas,Animation and activism unite in this multimedi...
59,u10,Dramas,s1436,"Naomi Watts,Miachel Keaton,Zaria,Amy Ryan",5,Movie,Pieces of a Woman,Kornél Mundruczó,"Vanessa Kirby, Shia LaBeouf, Ellen Burstyn, Mo...","Canada, Hungary, United States","January 7, 2021",2020,R,128 min,Dramas,A heartbreaking home birth leaves a woman grap...


#### Create a table that shows user's rating of different netflix titles

In [40]:
userRatings = ratings.pivot_table(index=['user_id'],columns=['title'],values='user_rating')
print("Before: ",userRatings.shape)
#userRatings = userRatings.dropna(how='all').fillna(0)
userRatings.fillna(0, inplace=True)
print("After: ",userRatings.shape)
userRatings.head(10)

Before:  (10, 58)
After:  (10, 58)


title,60 Days In,Akame ga Kill!,Attack on Titan,Avengers: Infinity War,Blade Runner: The Final Cut,Bling Empire,"Bob Ross: Happy Accidents, Betrayal & Greed",Cops and Robbers,Cosmic Sin,Creating an Army of the Dead,...,The Kissing Booth 3,The Secret Life of Pets 2,The Seven Deadly Sins,The Seventh Day,The Show Must Go On: The Queen + Adam Lambert Story,Tinker Bell and the Legend of the NeverBeast,Truth or Dare,Two Distant Strangers,Why Did You Kill Me?,Worth
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u1,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0
u2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
u3,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u4,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u5,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
u6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0
u7,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u8,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
u9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Movie Correlation data using pearson correlations

In [41]:
corrMatrix = userRatings.corr(method='pearson')
corrMatrix.head(10)

title,60 Days In,Akame ga Kill!,Attack on Titan,Avengers: Infinity War,Blade Runner: The Final Cut,Bling Empire,"Bob Ross: Happy Accidents, Betrayal & Greed",Cops and Robbers,Cosmic Sin,Creating an Army of the Dead,...,The Kissing Booth 3,The Secret Life of Pets 2,The Seven Deadly Sins,The Seventh Day,The Show Must Go On: The Queen + Adam Lambert Story,Tinker Bell and the Legend of the NeverBeast,Truth or Dare,Two Distant Strangers,Why Did You Kill Me?,Worth
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60 Days In,1.0,-0.111111,-0.111111,-0.166667,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Akame ga Kill!,-0.111111,1.0,1.0,-0.166667,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Attack on Titan,-0.111111,1.0,1.0,-0.166667,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Avengers: Infinity War,-0.166667,-0.166667,-0.166667,1.0,-0.166667,-0.166667,-0.166667,-0.166667,0.666667,-0.166667,...,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667
Blade Runner: The Final Cut,-0.111111,-0.111111,-0.111111,-0.166667,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Bling Empire,1.0,-0.111111,-0.111111,-0.166667,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
"Bob Ross: Happy Accidents, Betrayal & Greed",-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,1.0,-0.111111,-0.111111,1.0,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111
Cops and Robbers,-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,1.0
Cosmic Sin,-0.111111,-0.111111,-0.111111,0.666667,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Creating an Army of the Dead,-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,1.0,-0.111111,-0.111111,1.0,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111


In [42]:
def standardize(row):
    new_row = (row - row.mean())/(row.max()-row.min())
    return new_row

df_temp = userRatings
df_std = df_temp.apply(standardize)

user_similarity = cosine_similarity(df_std.T)
user_similarity

array([[ 1.        , -0.11111111, -0.11111111, ..., -0.11111111,
        -0.11111111, -0.11111111],
       [-0.11111111,  1.        ,  1.        , ..., -0.11111111,
        -0.11111111, -0.11111111],
       [-0.11111111,  1.        ,  1.        , ..., -0.11111111,
        -0.11111111, -0.11111111],
       ...,
       [-0.11111111, -0.11111111, -0.11111111, ...,  1.        ,
        -0.11111111,  1.        ],
       [-0.11111111, -0.11111111, -0.11111111, ..., -0.11111111,
         1.        , -0.11111111],
       [-0.11111111, -0.11111111, -0.11111111, ...,  1.        ,
        -0.11111111,  1.        ]])

In [43]:
from sklearn.metrics.pairwise import euclidean_distances
euclidean_dist = euclidean_distances(df_std.T)
euclidean_dist

array([[0.        , 1.41421356, 1.41421356, ..., 1.41421356, 1.41421356,
        1.41421356],
       [1.41421356, 0.        , 0.        , ..., 1.41421356, 1.41421356,
        1.41421356],
       [1.41421356, 0.        , 0.        , ..., 1.41421356, 1.41421356,
        1.41421356],
       ...,
       [1.41421356, 1.41421356, 1.41421356, ..., 0.        , 1.41421356,
        0.        ],
       [1.41421356, 1.41421356, 1.41421356, ..., 1.41421356, 0.        ,
        1.41421356],
       [1.41421356, 1.41421356, 1.41421356, ..., 0.        , 1.41421356,
        0.        ]])

In [44]:
user_similarity_df = pd.DataFrame(user_similarity,index=userRatings.columns,columns=userRatings.columns)
user_similarity_df

title,60 Days In,Akame ga Kill!,Attack on Titan,Avengers: Infinity War,Blade Runner: The Final Cut,Bling Empire,"Bob Ross: Happy Accidents, Betrayal & Greed",Cops and Robbers,Cosmic Sin,Creating an Army of the Dead,...,The Kissing Booth 3,The Secret Life of Pets 2,The Seven Deadly Sins,The Seventh Day,The Show Must Go On: The Queen + Adam Lambert Story,Tinker Bell and the Legend of the NeverBeast,Truth or Dare,Two Distant Strangers,Why Did You Kill Me?,Worth
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60 Days In,1.0,-0.111111,-0.111111,-0.166667,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Akame ga Kill!,-0.111111,1.0,1.0,-0.166667,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Attack on Titan,-0.111111,1.0,1.0,-0.166667,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Avengers: Infinity War,-0.166667,-0.166667,-0.166667,1.0,-0.166667,-0.166667,-0.166667,-0.166667,0.666667,-0.166667,...,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667
Blade Runner: The Final Cut,-0.111111,-0.111111,-0.111111,-0.166667,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Bling Empire,1.0,-0.111111,-0.111111,-0.166667,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
"Bob Ross: Happy Accidents, Betrayal & Greed",-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,1.0,-0.111111,-0.111111,1.0,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111
Cops and Robbers,-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,1.0
Cosmic Sin,-0.111111,-0.111111,-0.111111,0.666667,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Creating an Army of the Dead,-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,1.0,-0.111111,-0.111111,1.0,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111


In [45]:
def get_similar_cosine(movie_name, user_rating):
    similar_score = user_similarity_df[movie_name]
    similar_score = similar_score.sort_values(ascending=False)
    #print(type(similar_ratings))
    return similar_score

In [62]:
new_user1 = [("ONE PIECE",5),("Sword Art Online",2),("Sex Education",4),("Hunter X Hunter (2011)",5),("Attack on Titan",4)]
similar_movies = pd.DataFrame()
for movie,rating in new_user1:
    similar_movies = similar_movies.append(get_similar_cosine(movie,rating),ignore_index = True)

for (x,_) in new_user1:
    del similar_movies[x]
    #print("drop "+ x)
similar_movies.head()

  similar_movies = similar_movies.append(get_similar_cosine(movie,rating),ignore_index = True)
  similar_movies = similar_movies.append(get_similar_cosine(movie,rating),ignore_index = True)
  similar_movies = similar_movies.append(get_similar_cosine(movie,rating),ignore_index = True)
  similar_movies = similar_movies.append(get_similar_cosine(movie,rating),ignore_index = True)
  similar_movies = similar_movies.append(get_similar_cosine(movie,rating),ignore_index = True)


title,The Devil Is a Part-Timer!,DOTA: Dragon's Blood,The Seven Deadly Sins,Akame ga Kill!,Durarara!!,Death Note,Record of Ragnarok,The Karate Kid Part II,The Karate Kid,Blade Runner: The Final Cut,...,Spider-Man 3,Rogue Warfare: Death of a Nation,Pieces of a Woman,Ninja Assassin,Creating an Army of the Dead,Cosmic Sin,Cops and Robbers,Worth,Jiu Jitsu,Avengers: Infinity War
0,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.164581,-0.166667
1,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.164581,-0.166667
2,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.164581,-0.166667
3,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.164581,-0.166667
4,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.164581,-0.166667


Top 10 netflix recommendations

In [47]:
similar_movies.sum().sort_values(ascending=False).head(10)

title
The Devil Is a Part-Timer!    3.888889
Durarara!!                    3.888889
Death Note                    3.888889
DOTA: Dragon's Blood          3.888889
Akame ga Kill!                3.888889
The Seven Deadly Sins         3.888889
Record of Ragnarok            2.500000
The Kissing Booth 3           0.555556
Lucifer                       0.555556
Friends                       0.555556
dtype: float64

In [48]:
corrMatrix = userRatings.corr(method='pearson')
corrMatrix.head(10)

title,60 Days In,Akame ga Kill!,Attack on Titan,Avengers: Infinity War,Blade Runner: The Final Cut,Bling Empire,"Bob Ross: Happy Accidents, Betrayal & Greed",Cops and Robbers,Cosmic Sin,Creating an Army of the Dead,...,The Kissing Booth 3,The Secret Life of Pets 2,The Seven Deadly Sins,The Seventh Day,The Show Must Go On: The Queen + Adam Lambert Story,Tinker Bell and the Legend of the NeverBeast,Truth or Dare,Two Distant Strangers,Why Did You Kill Me?,Worth
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60 Days In,1.0,-0.111111,-0.111111,-0.166667,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Akame ga Kill!,-0.111111,1.0,1.0,-0.166667,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Attack on Titan,-0.111111,1.0,1.0,-0.166667,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Avengers: Infinity War,-0.166667,-0.166667,-0.166667,1.0,-0.166667,-0.166667,-0.166667,-0.166667,0.666667,-0.166667,...,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667
Blade Runner: The Final Cut,-0.111111,-0.111111,-0.111111,-0.166667,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Bling Empire,1.0,-0.111111,-0.111111,-0.166667,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
"Bob Ross: Happy Accidents, Betrayal & Greed",-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,1.0,-0.111111,-0.111111,1.0,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111
Cops and Robbers,-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,-0.111111,1.0,-0.111111,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,1.0
Cosmic Sin,-0.111111,-0.111111,-0.111111,0.666667,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111
Creating an Army of the Dead,-0.111111,-0.111111,-0.111111,-0.166667,-0.111111,-0.111111,1.0,-0.111111,-0.111111,1.0,...,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,1.0,-0.111111


In [49]:
def get_similar_pearson(movie_name, user_rating):
    similar_score = corrMatrix[movie_name]*(user_rating-2.5)
    similar_score = similar_score.sort_values(ascending=False)
    #print(type(similar_ratings))
    return similar_score

In [50]:
new_user1 = [("ONE PIECE",5),("Sword Art Online",2),("Sex Education",4),("Hunter X Hunter (2011)",5),("Attack on Titan",4)]
similar_movies = pd.DataFrame()
for movie,rating in new_user1:
    similar_movies = similar_movies.append(get_similar_pearson(movie,rating),ignore_index = True)

for (x,_) in new_user1:
    del similar_movies[x]
    #print("drop "+ x)
similar_movies.head()

  similar_movies = similar_movies.append(get_similar_pearson(movie,rating),ignore_index = True)
  similar_movies = similar_movies.append(get_similar_pearson(movie,rating),ignore_index = True)
  similar_movies = similar_movies.append(get_similar_pearson(movie,rating),ignore_index = True)
  similar_movies = similar_movies.append(get_similar_pearson(movie,rating),ignore_index = True)
  similar_movies = similar_movies.append(get_similar_pearson(movie,rating),ignore_index = True)


title,Akame ga Kill!,Durarara!!,Death Note,The Devil Is a Part-Timer!,The Seven Deadly Sins,DOTA: Dragon's Blood,Record of Ragnarok,The Impossible,Tarzan,Friends,...,Creating an Army of the Dead,The Seventh Day,Truth or Dare,Silent Hill: Revelation,The Final Destination,The Conjuring,Scream 2,Seaspiracy,Jiu Jitsu,Avengers: Infinity War
0,2.5,2.5,2.5,2.5,2.5,2.5,1.666667,-0.277778,-0.277778,-0.277778,...,-0.277778,-0.277778,-0.277778,-0.277778,-0.277778,-0.277778,-0.277778,-0.277778,-0.411452,-0.416667
1,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.333333,0.055556,0.055556,0.055556,...,0.055556,0.055556,0.055556,0.055556,0.055556,0.055556,0.055556,0.055556,0.08229,0.083333
2,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.25,-0.166667,-0.166667,1.5,...,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.246871,-0.25
3,2.5,2.5,2.5,2.5,2.5,2.5,1.666667,-0.277778,-0.277778,-0.277778,...,-0.277778,-0.277778,-0.277778,-0.277778,-0.277778,-0.277778,-0.277778,-0.277778,-0.411452,-0.416667
4,1.5,1.5,1.5,1.5,1.5,1.5,1.0,-0.166667,-0.166667,-0.166667,...,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.246871,-0.25


In [51]:
similar_movies.sum().sort_values(ascending=False).head(10)

title
Akame ga Kill!                5.833333
The Devil Is a Part-Timer!    5.833333
Durarara!!                    5.833333
Death Note                    5.833333
DOTA: Dragon's Blood          5.833333
The Seven Deadly Sins         5.833333
Record of Ragnarok            3.750000
The Kissing Booth 3           0.833333
Friends                       0.833333
Lucifer                       0.833333
dtype: float64

In [60]:
# action_lover = [("The Amazing Spider-Man (2012)",5),("Mission: Impossible III (2006)",4),("Toy Story 3 (2010)",2),("2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",4)]
# similar_movies = pd.DataFrame()
# for movie,rating in action_lover:
#     similar_movies = similar_movies.append(get_similar_pearson(movie,rating),ignore_index = True)

# similar_movies.head(10)
# similar_movies.sum().sort_values(ascending=False).head(20)

#### Evalaution Metric : RMSE ( Root Mean Squared Error )

In [58]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [59]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    return sqrt(mean_squared_error(pred, actual))

# Predict ratings on the training data with both similarity score
user_prediction = predict(euclidean_dist, corrMatrix, type='user')
item_prediction = predict(euclidean_dist, corrMatrix, type='item')
# RMSE on the train data
print('User-based CF RMSE: ' + str(rmse(user_prediction, corrMatrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, corrMatrix)))

User-based CF RMSE: 1.4075937705492223
Item-based CF RMSE: 0.623415600864975
