In [1]:
import pandas as pd
import scipy
import numpy as np
import re
import nltk
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

## Retrieving data from CSV

In [2]:
# Read the rating events and the movie catalogue from disk.
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Join each rating to its movie row on the columns the two files share,
# then discard the timestamp column.
ratings = movies.merge(ratings).drop(columns='timestamp')
ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


# USER-USER Collaborative Filtering Recommendation System

## Pre-processing of data

#### Filling missing rating values with 0

In [3]:
# Replace missing ratings with the number 0. Filling with the string '0' would
# turn the column into mixed object dtype and break numeric aggregation.
ratings['rating'] = ratings['rating'].fillna(0)
ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


#### Converting genre strings to lower case, removing whitespace, and splitting them into lists of genres

In [4]:
# Turn the pipe-separated genre string into a list of lower-case genre names.
# The whitespace pattern is a raw string: '\s' in a plain string literal is an
# invalid escape sequence.
ratings['clean_genre'] = (
    ratings['genres']
    .str.lower()
    .str.replace(' ', '', regex=False)      # drop plain spaces
    .str.replace(r'\s+', ' ', regex=True)   # collapse any other whitespace run
    .str.split('|')
)
ratings

Unnamed: 0,movieId,title,genres,userId,rating,clean_genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,"[adventure, animation, children, comedy, fantasy]"
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,"[adventure, animation, children, comedy, fantasy]"
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,"[adventure, animation, children, comedy, fantasy]"
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,"[adventure, animation, children, comedy, fantasy]"
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,"[adventure, animation, children, comedy, fantasy]"
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,"[action, animation, comedy, fantasy]"
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,"[animation, comedy, fantasy]"
100833,193585,Flint (2017),Drama,184,3.5,[drama]
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,"[action, animation]"


In [5]:
# Independent copy holding only the columns the genre recommender reads.
genre_ratings = ratings.loc[:, ['title', 'userId', 'rating', 'clean_genre']].copy()
genre_ratings.head()

Unnamed: 0,title,userId,rating,clean_genre
0,Toy Story (1995),1,4.0,"[adventure, animation, children, comedy, fantasy]"
1,Toy Story (1995),5,4.0,"[adventure, animation, children, comedy, fantasy]"
2,Toy Story (1995),7,4.5,"[adventure, animation, children, comedy, fantasy]"
3,Toy Story (1995),15,2.5,"[adventure, animation, children, comedy, fantasy]"
4,Toy Story (1995),17,4.5,"[adventure, animation, children, comedy, fantasy]"


## Creating functions for recommending movies of similar genres

#### Function that searches for genre type movies and returns the dataframe of movies

In [6]:
def get_similar_genre(genre, frame=None):
    """Return the rows whose movie carries every requested genre.

    Parameters
    ----------
    genre : list of str or str
        Genre names in the cleaned (lower-case) form stored in ``clean_genre``.
        A single string is treated as a one-element list. An empty list
        matches every row.
    frame : pandas.DataFrame, optional
        Frame to search. It must have a ``clean_genre`` column holding one
        list of genre names per row. Defaults to the notebook-level
        ``genre_ratings``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, selected by position, in their original order.
    """
    if frame is None:
        frame = genre_ratings
    if isinstance(genre, str):
        genre = [genre]
    wanted = set(genre)

    # A row qualifies only when all requested genres appear in its genre list.
    positions = [
        pos
        for pos, row_genres in enumerate(frame['clean_genre'])
        if wanted.issubset(row_genres)
    ]
    return frame.iloc[positions]

Getting a dataframe of children genre

In [16]:
def get_recommendation(genre, xList, min_count=30, top_n=5):
    """Recommend the movies with the largest rating totals for the given genres.

    Movies must carry every genre in ``genre`` (see ``get_similar_genre``) and
    have more than ``min_count`` ratings. When fewer than ``top_n`` movies
    qualify, the last genre is dropped and the search is repeated with the
    broader query; the result of that broader search is what gets returned.

    Parameters
    ----------
    genre : list of str
        Cleaned genre names, most important first (the last one is dropped
        first when the query has to be relaxed).
    xList : list
        Fallback titles, used only when no movie qualifies even with an empty
        genre list.
    min_count : int, optional
        A title needs strictly more ratings than this to be considered.
    top_n : int, optional
        Maximum number of titles returned.

    Returns
    -------
    list
        Up to ``top_n`` titles, ordered by descending rating total.
    """
    df_genre = get_similar_genre(genre)

    # Per title: total of all ratings ('sum'), number of ratings ('count')
    # and their mean ('normalize').
    recc_merge = df_genre.groupby('title')['rating'].agg(['sum', 'count'])
    recc_merge['normalize'] = recc_merge['sum'] / recc_merge['count']

    # Keep sufficiently rated titles only, ranked by rating total.
    recc_merge = recc_merge[recc_merge['count'] > min_count]
    recc_merge = recc_merge.sort_values(by='sum', ascending=False)

    if len(recc_merge) < top_n and len(genre) > 0:
        # Too few candidates: relax the query and hand back what it finds.
        # Stopping at an empty genre list keeps the recursion finite.
        return get_recommendation(genre[:-1], xList, min_count, top_n)

    index_list = recc_merge.index.tolist()
    if not index_list:
        # Nothing qualifies at all; fall back to the caller-supplied titles.
        index_list = list(xList)

    print("Top Recommendations: ")
    return index_list[:top_n]

In [8]:
def get_sat(list):
    """Ask, title by title, whether each recommendation was satisfactory.

    Parameters
    ----------
    list : iterable of str
        Titles to ask about, in prompt order.

    Returns
    -------
    dict
        Maps each title to the lower-cased text typed at its prompt.
    """
    return {
        title: input("Are you satisfied with " + title + " recommendation? [Y/N]").lower()
        for title in list
    }

In [17]:
# Recommendations for movies tagged with both "crime" and "drama". The second
# argument is the list whose first five entries are returned when fewer than
# five titles qualify.
get_recommendation(["crime", "drama"], [])

Top Recommendations: 
['Shawshank Redemption, The (1994)']


['Shawshank Redemption, The (1994)',
 'Pulp Fiction (1994)',
 'Fight Club (1999)',
 'Godfather, The (1972)',
 'Fargo (1996)']

In [10]:
# Recommendations for movies tagged with "children", "action" and "adventure"
# together; the empty list is the fallback passed as xList.
get_recommendation(["children", "action", "adventure"], [])

Top Recommendations: 


[['Incredibles, The (2004)',
  'Goonies, The (1985)',
  'Kung Fu Panda (2008)',
  'Zootopia (2016)',
  'The Lego Movie (2014)']]

In [18]:
# Fetch recommendations for "children" + "action" and prompt for Y/N feedback
# on each returned title.
get_sat(get_recommendation(["children", "action"], []))

Top Recommendations: 
['Incredibles, The (2004)']
Are you satisfied with Incredibles, The (2004) recommendation? [Y/N]Y
Are you satisfied with Goonies, The (1985) recommendation? [Y/N]Y
Are you satisfied with Kung Fu Panda (2008) recommendation? [Y/N]N
Are you satisfied with Zootopia (2016) recommendation? [Y/N]N
Are you satisfied with The Lego Movie (2014) recommendation? [Y/N]N


{'Incredibles, The (2004)': 'y',
 'Goonies, The (1985)': 'y',
 'Kung Fu Panda (2008)': 'n',
 'Zootopia (2016)': 'n',
 'The Lego Movie (2014)': 'n'}

# NETFLIX DATA 

In [None]:
# Load the four CSV inputs used by this section.
netflix_ratings = pd.read_csv('userDatas.csv')
netflix_movies = pd.read_csv('netflix_titles.csv')
# NOTE: the next two reads rebind `movies` and `ratings`, replacing the frames
# of the same names built earlier in the notebook.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

#movies

Pre-processing of IMDB movie data

In [None]:
# Normalise movie titles for matching against the Netflix catalogue:
# drop the trailing "(YYYY)" release year, remove any remaining parentheses,
# lower-case and trim.
# - `regex=` is given explicitly because the default of Series.str.replace
#   differs between pandas versions.
# - Only the trailing year is removed. Stripping every digit would turn
#   "Toy Story 2 (1999)" into "toy story" and make sequels collide with the
#   first film when merging on title.
movies['title'] = (
    movies['title']
    .str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.lower()
    .str.strip()
)
movies

Pre-processing of netflix data

In [None]:
# Lower-case and trim the Netflix titles so they line up with the cleaned movie
# titles. Inner spaces are kept: deleting them would turn "toy story" into
# "toystory", which can never equal the space-separated titles on the other
# side of the merge. The .str accessor also passes missing titles through
# instead of raising.
netflix_movies['title'] = netflix_movies['title'].str.lower().str.strip()
netflix_movies

Merge the IMDB data with the Netflix data to retrieve ratings for Netflix shows

In [None]:
# Left join on the cleaned title: every movie row is kept, and the Netflix
# columns are filled in where a title matches.
overall_movies = pd.merge(movies, netflix_movies, how='left', on='title')
overall_movies

In [None]:
# Keep only the movies that found a Netflix match. After the left merge an
# unmatched movie has every Netflix-side column missing, so drop the rows where
# all of those columns are NaN. A bare dropna() would also discard matched
# titles that merely lack one optional field (e.g. no director listed).
netflix_side_columns = list(overall_movies.columns.difference(movies.columns))
if netflix_side_columns:
    overall_movies = overall_movies.dropna(subset=netflix_side_columns, how='all')

In [None]:
# Reduce to the movie identity columns, attach the ratings on the column(s)
# shared with `ratings`, and drop the rating timestamp.
overall_movies = (
    overall_movies[['movieId', 'title', 'genres']]
    .merge(ratings)
    .drop(columns='timestamp')
)
overall_movies

Clean and split the genre in the merged dataframe

In [None]:
# Turn the pipe-separated genre string into a list of lower-case genre names.
# The whitespace pattern is a raw string: '\s' in a plain string literal is an
# invalid escape sequence.
overall_movies['clean_genre'] = (
    overall_movies['genres']
    .str.lower()
    .str.replace(' ', '', regex=False)      # drop plain spaces
    .str.replace(r'\s+', ' ', regex=True)   # collapse any other whitespace run
    .str.split('|')
)
overall_movies

In [None]:
def get_similar_netflix_genre(genre, frame=None):
    """Return the rows whose movie carries at least one of the requested genres.

    Parameters
    ----------
    genre : list of str or str
        Genre names in the cleaned (lower-case) form stored in ``clean_genre``.
        A single string is treated as a one-element list.
    frame : pandas.DataFrame, optional
        Frame to search. It must have a ``clean_genre`` column holding one
        list of genre names per row. Defaults to the notebook-level
        ``overall_movies``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, selected by position, each row once and in its
        original order.
    """
    if frame is None:
        frame = overall_movies
    if isinstance(genre, str):
        genre = [genre]

    # One entry per matching row, so no de-duplication pass is needed.
    positions = [
        pos
        for pos, row_genres in enumerate(frame['clean_genre'])
        if any(name in genre for name in row_genres)
    ]
    return frame.iloc[positions]

In [None]:
# Rows of overall_movies whose genre list contains "children".
df_netflix_genre = get_similar_netflix_genre(["children"])
df_netflix_genre

In [None]:
# userId x title table of ratings; combinations with no rating become 0.
recc_netflix_ratings = df_netflix_genre.pivot_table(
    index=['userId'],
    columns=['title'],
    values='rating',
).fillna(0)
recc_netflix_ratings

In [None]:
# Rating total per title, largest first, as a one-column frame named 'sum'.
recc_netflix_columns = (
    recc_netflix_ratings
    .sum(axis=0)
    .sort_values(ascending=False)
    .to_frame('sum')
)
recc_netflix_columns

In [None]:
# Number of rating rows per title, as a one-column frame named 'count'.
recc_netflix_normalization = (
    df_netflix_genre['title']
    .value_counts()
    .rename('count')
    .to_frame()
)
recc_netflix_normalization

In [None]:
# Put each title's rating total and rating count side by side (matched on the title index).
recc_netflix_merge = recc_netflix_columns.join(other=recc_netflix_normalization, how='left')
recc_netflix_merge

In [None]:
# Mean rating per title ('normalize'), restricted to titles with more than 10
# ratings; the ten most-rated titles are shown.
recc_netflix_merge = recc_netflix_merge.assign(
    normalize=recc_netflix_merge['sum'] / recc_netflix_merge['count']
)
recc_netflix_merge = recc_netflix_merge.loc[recc_netflix_merge['count'] > 10]
recc_netflix_merge.sort_values(by='count', ascending=False).head(10)

# USER-ITEM Collaborative Filtering Recommendation System

In [None]:
# Read the per-user ratings and the Netflix catalogue, then join them on the
# columns the two files share. `ratings` is rebound to the joined frame.
ratings = pd.read_csv("userDatas.csv")
netflixData = pd.read_csv("netflix_titles.csv")
ratings = pd.merge(ratings, netflixData)

In [None]:
# Replace missing values in the free-text columns with a single space.
for text_column in ('cast', 'fav_cast', 'director', 'country'):
    ratings[text_column] = ratings[text_column].fillna(' ')

ratings.head(61)

#### Create a table that shows user's rating of different netflix titles

In [None]:
# user_id x title table of ratings. The shape is printed before and after
# replacing the missing combinations with 0.
userRatings = ratings.pivot_table(
    index=['user_id'],
    columns=['title'],
    values='user_rating',
)
print("Before: ", userRatings.shape)
userRatings = userRatings.fillna(0)
print("After: ", userRatings.shape)
userRatings.head(10)

Movie Correlation data using pearson correlations

In [None]:
# Pairwise Pearson correlation between the columns (titles) of userRatings.
corrMatrix = userRatings.corr(method='pearson')
corrMatrix.head(10)

In [None]:
def standardize(row):
    """Centre a Series on its mean and scale it by its range (max - min).

    Parameters
    ----------
    row : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(row - mean) / (max - min)``. When every value is identical the
        range is zero; the centred values (all zeros) are returned unscaled
        instead of dividing by zero and producing NaN.
    """
    centred = row - row.mean()
    spread = row.max() - row.min()
    if spread == 0:
        return centred
    return centred / spread

# standardize is applied to each column of userRatings (one column per title).
# The transpose then puts titles on the rows, so the cosine similarity computed
# here is title-to-title, despite the variable name.
df_temp = userRatings
df_std = df_temp.apply(standardize, axis=0)

user_similarity = cosine_similarity(df_std.transpose())
user_similarity

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Pairwise Euclidean distances between the rows of df_std.T.
euclidean_dist = euclidean_distances(df_std.T)
euclidean_dist

In [None]:
# Label both axes of the similarity matrix with the titles from userRatings.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=userRatings.columns,
    columns=userRatings.columns,
)
user_similarity_df

In [None]:
def get_similar_cosine(movie_name, user_rating):
    """Score every title by cosine similarity to ``movie_name``, weighted by the rating.

    Parameters
    ----------
    movie_name : str
        Column label in ``user_similarity_df``.
    user_rating : float
        The rating the user gave ``movie_name``.

    Returns
    -------
    pandas.Series
        One score per title, sorted from highest to lowest.
    """
    # Centre the rating on 2.5, the same weighting get_similar_pearson uses, so
    # that titles similar to a low-rated movie are pushed down instead of being
    # promoted as if the rating had not been given.
    similar_score = user_similarity_df[movie_name] * (user_rating - 2.5)
    return similar_score.sort_values(ascending=False)

In [None]:
new_user1 = [("ONE PIECE",5),("Sword Art Online",2),("Sex Education",4),("Hunter X Hunter (2011)",5),("Attack on Titan",4)]

# One row of scores per title the user has rated. The rows are collected first
# and assembled once, because DataFrame.append was removed in pandas 2.0.
similar_movies = pd.DataFrame(
    [get_similar_cosine(movie, rating) for movie, rating in new_user1]
).reset_index(drop=True)

# Titles the user has already rated are not candidates for recommendation.
similar_movies = similar_movies.drop(columns=[title for title, _ in new_user1])
similar_movies.head()

Top 10 netflix recommendations

In [None]:
# Total score per title across the rows of similar_movies; show the ten largest.
similar_movies.sum().sort_values(ascending=False).head(10)

In [None]:
# Pairwise Pearson correlation between the columns (titles) of userRatings.
# NOTE(review): an earlier cell already assigns corrMatrix with this exact
# expression; this cell recomputes it.
corrMatrix = userRatings.corr(method='pearson')
corrMatrix.head(10)

In [None]:
def get_similar_pearson(movie_name, user_rating):
    """Score every title by its Pearson correlation with ``movie_name``.

    The correlation column for ``movie_name`` is multiplied by
    ``user_rating - 2.5`` and returned sorted from highest to lowest score.
    """
    weight = user_rating - 2.5
    weighted_scores = corrMatrix[movie_name] * weight
    return weighted_scores.sort_values(ascending=False)

In [None]:
new_user1 = [("ONE PIECE",5),("Sword Art Online",2),("Sex Education",4),("Hunter X Hunter (2011)",5),("Attack on Titan",4)]

# One row of scores per title the user has rated. The rows are collected first
# and assembled once, because DataFrame.append was removed in pandas 2.0.
similar_movies = pd.DataFrame(
    [get_similar_pearson(movie, rating) for movie, rating in new_user1]
).reset_index(drop=True)

# Titles the user has already rated are not candidates for recommendation.
similar_movies = similar_movies.drop(columns=[title for title, _ in new_user1])
similar_movies.head()

In [None]:
# Total score per title across the rows of similar_movies; show the ten largest.
similar_movies.sum().sort_values(ascending=False).head(10)

In [None]:
# action_lover = [("The Amazing Spider-Man (2012)",5),("Mission: Impossible III (2006)",4),("Toy Story 3 (2010)",2),("2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",4)]
# similar_movies = pd.DataFrame()
# for movie,rating in action_lover:
#     similar_movies = similar_movies.append(get_similar_pearson(movie,rating),ignore_index = True)

# similar_movies.head(10)
# similar_movies.sum().sort_values(ascending=False).head(20)

#### Evaluation Metric: RMSE (Root Mean Squared Error)

In [None]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted average.

    Parameters
    ----------
    ratings : array-like, shape (n_users, n_items)
        Rating matrix. DataFrames and nested lists are accepted and converted
        to a float ndarray.
    similarity : array-like
        Square similarity matrix: (n_users, n_users) for ``type='user'``,
        (n_items, n_items) for ``type='item'``.
    type : {'user', 'item'}
        ``'user'`` adds the similarity-weighted deviations from each user's
        mean rating to that mean; ``'item'`` takes the similarity-weighted
        average of the ratings directly.

    Returns
    -------
    numpy.ndarray
        Predicted ratings, same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    # Work on plain arrays so the broadcasting below behaves the same for
    # ndarray and pandas inputs (a Series cannot be indexed with [:, np.newaxis]).
    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)

    if type == 'user':
        # keepdims gives the mean a column shape so it broadcasts across items.
        mean_user_rating = ratings.mean(axis=1, keepdims=True)
        ratings_diff = ratings - mean_user_rating
        weight_totals = np.abs(similarity).sum(axis=1, keepdims=True)
        pred = mean_user_rating + similarity.dot(ratings_diff) / weight_totals
    elif type == 'item':
        weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        pred = ratings.dot(similarity) / weight_totals
    else:
        raise ValueError("type must be 'user' or 'item', got " + repr(type))
    return pred

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    """Return the square root of the mean squared error between ``pred`` and ``actual``."""
    mse = mean_squared_error(pred, actual)
    return sqrt(mse)

# Predict ratings on the training data with both similarity scores.
# The rating matrix passed to predict() is the user x title table itself, and
# the predictions are scored against those same ratings. (Passing the
# Euclidean distance matrix as "ratings" and scoring against the correlation
# matrix does not measure rating error.)
eval_ratings = userRatings.to_numpy(dtype=float)

# Title-title similarity: Pearson correlation between rating columns.
# Undefined correlations (constant columns) become 0, and the diagonal is set
# to 1 so every row has a non-zero weight total.
eval_item_similarity = corrMatrix.fillna(0).to_numpy(dtype=float, copy=True)
np.fill_diagonal(eval_item_similarity, 1.0)

# User-user similarity: Pearson correlation between users' rating rows.
eval_user_similarity = userRatings.T.corr(method='pearson').fillna(0).to_numpy(dtype=float, copy=True)
np.fill_diagonal(eval_user_similarity, 1.0)

user_prediction = np.asarray(predict(eval_ratings, eval_user_similarity, type='user'))
item_prediction = np.asarray(predict(eval_ratings, eval_item_similarity, type='item'))

# RMSE on the train data, scored only where a rating exists
# (0 in userRatings marks "not rated").
rated = eval_ratings.nonzero()
print('User-based CF RMSE: ' + str(rmse(user_prediction[rated], eval_ratings[rated])))
print('Item-based CF RMSE: ' + str(rmse(item_prediction[rated], eval_ratings[rated])))