In [165]:
#import packages
import pandas as pd
from math import sqrt
import numpy as np
from scipy.stats import pearsonr

In [166]:
#unzip dataset and read .csv files into python 
!unzip -o -j moviedataset.zip 
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

Archive:  moviedataset.zip
  inflating: links.csv               
  inflating: movies.csv              
  inflating: ratings.csv             
  inflating: README.txt              
  inflating: tags.csv                


In [167]:
#move (####) from the title column to a separate column labeled as year in movies dataset
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
movies['year'] = movies.year.str.extract('(\d\d\d\d)',expand=False)
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')
movies['title'] = movies['title'].apply(lambda x: x.strip())

#separate the list of genres into separate columns in moves dataset, delete genres column 
movies['genres'] = movies.genres.str.split('|')
for index, row in movies.iterrows():
    for genre in row['genres']:
        movies.at[index, genre] = 1
movies = movies.fillna(0).drop('genres', 1)

#view movies dataset
movies.shape
movies.head()

Unnamed: 0,movieId,title,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,1995,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,1995,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,1995,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,1995,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,1995,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
#clean by dropping timestap column and view ratings dataset
ratings = ratings.drop('timestamp', 1)
ratings.shape
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [171]:
#create content based recommender system based on the movie ratings of the user
class ContentBasedRec:
    def __init__(self, title, rating):
        self.title=title
        self.rating=rating

    def GetUserInput(self):
        #create dataframe of user's inputted movies and ratings 
        UserInput = pd.DataFrame(columns=['title', 'rating'])
        for i, j in zip(self.title, self.rating):
            UserInput = UserInput.append({'title': i, 'rating':j}, ignore_index=True)
        UserInput = pd.merge(UserInput, movies[['movieId', 'title', 'year']], on=['title'])
        UserInput = UserInput[['movieId', 'title', 'year', 'rating']].sort_values(by=['movieId'], ascending=True)
        #create dataframe encoding the genres of user's inputted movies
        UserMovies = movies[movies['movieId'].isin(UserInput['movieId'].tolist())]
        UserMovies = UserMovies.reset_index(drop=True).drop(['movieId', 'title', 'year'], 1)
        self.UserInput = UserInput
        self.UserMovies = UserMovies
        return UserInput

    def GetUserProfile(self):
        #create a weighted profile of the user's genre preferences by multiplying the UsersMovies dataset with user's ratiings 
        UserProfile = self.UserMovies.transpose().dot(self.UserInput['rating'])
        self.UserProfile = UserProfile 
        return UserProfile
        
    def GetRecommendationTable(self):
        #create dataframe encoding genres of all movies in dataset
        AllMovies= movies.set_index(movies['movieId']).drop(['movieId', 'title', 'year'], 1) 
        #multiply the genres by weighted preferences of user and take weighted average to create Preference Score
        RecTable = pd.DataFrame({'PreferenceScore':(AllMovies*self.UserProfile).sum(axis=1)/(self.UserProfile.sum())}).reset_index()
        #clean dataset and show the top 10 movie recommendations to user (10 movies with the highest Preference Score) 
        RecTable = pd.merge(RecTable, movies[['movieId', 'title', 'year']], on=['movieId'])
        RecTable = RecTable.rename(columns={'movieId':'MovieID', 'title':'Title', 'year':'Year'})
        RecTable = RecTable.set_index(RecTable['MovieID']).drop('MovieID', 1).sort_values(by=['PreferenceScore'], ascending=False)
        return RecTable.head(20)

In [209]:
title = ['Sense and Sensibility','Nixon', 'Amityville: A New Generation', 'Mrs. Brown (a.k.a. Her Majesty, Mrs. Brown)',
         'Washington Square', 'Walk on the Moon, A', 'Matrix, The', 'Never Been Kissed', 'Carol']
rating = [5, 4.5, 1.5, 5, 4, 5, 1, 3, 4]

Recs = ContentBasedRec(title, rating)
Recs.GetUserInput()

In [201]:
Recs.GetUserProfile()

Adventure                0
Animation                0
Children                 0
Comedy                   3
Fantasy                  0
Romance               21.5
Drama                 27.5
Action                   1
Crime                    0
Thriller                 1
Horror                 1.5
Mystery                  0
Sci-Fi                   1
IMAX                     0
Documentary              0
War                      0
Musical                  0
Western                  0
Film-Noir                0
(no genres listed)       0
dtype: object

In [202]:
Recs.GetRecommendationTable()

Unnamed: 0_level_0,PreferenceScore,Title,Year
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
113428,0.964602,Maniacts,2001
27781,0.964602,Svidd Neger,2003
27658,0.964602,Love Object,2003
127341,0.955752,Longshot,2001
76153,0.955752,Lupin III: First Contact (Rupan Sansei: Faasut...,2002
4956,0.955752,"Stunt Man, The",1980
83266,0.955752,Kaho Naa... Pyaar Hai,2000
4719,0.955752,Osmosis Jones,2001
75408,0.955752,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...,2008
118258,0.946903,It's a Wonderful Afterlife,2010


In [203]:
#create collaborative filtering recommender system based on the movie ratings of the user as well as movie ratings of other users in ratings data frame
class CollaborativeFiltering():
    def __init__(self, title, rating):
        self.title=title
        self.rating=rating
        
    def GetUserInput(self):
        #create dataframe of user's inputted movies and ratings 
        UserInput = pd.DataFrame(columns=['title', 'rating'])
        for i, j in zip(self.title, self.rating):
            UserInput = UserInput.append({'title': i, 'rating':j}, ignore_index=True)
        UserInput = pd.merge(UserInput, movies[['movieId', 'title', 'year']], on=['title'])
        UserInput = UserInput[['movieId', 'title', 'year', 'rating']].sort_values(by=['movieId'], ascending=True)
        self.UserInput = UserInput
        return UserInput
    
    def GetCorrelatedUsers(self):
        #create list of sub dataframes for individual users who have rated the same movies as the inputed movies  
        RatingSubset = ratings[ratings['movieId'].isin(self.UserInput['movieId'].tolist())].groupby(['userId'])
        #sorting by the number of movies users have in common with the inputed movies  
        RatingSubset = sorted(RatingSubset,  key=lambda x: len(x[1]), reverse=True)
        #subsetting so that the top 100 users who have the most number of movies in common with inputted movies are used for futher analysis 
        RatingSub = RatingSubset[0:100]
        #determine how similar each users' movie ratings are to the inputted movie ratings using Pearsons Correlation Coefficient
        #comparing users' movie ratings with inputted movie ratings for common movie 
        pearsonCorrelationDict = {}
        for name, group in RatingSub:
            group = group.sort_values(by='movieId')
            UserInput = self.UserInput.sort_values(by='movieId')
            nRatings = len(group)
            temp_df = UserInput[UserInput['movieId'].isin(group['movieId'].tolist())]
            tempRatingList = temp_df['rating'].tolist()
            tempGroupList = group['rating'].tolist()
            pearsonCorrelationDict[name] = pearsonr(tempRatingList, tempGroupList)
        #clean CorrelatedUsers datadet by concentrating on users who show statstically significant (p-value of 0.1 or less) positive correlation with input user  
        CorrelatedUsers = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
        CorrelatedUsers.columns = ['similarityIndex', 'pValue']
        CorrelatedUsers['userId'] = CorrelatedUsers.index
        CorrelatedUsers = CorrelatedUsers[['userId', 'similarityIndex', 'pValue']]
        CorrelatedUsers = CorrelatedUsers.drop(CorrelatedUsers[(CorrelatedUsers['similarityIndex'] < 0)].index)
        CorrelatedUsers = CorrelatedUsers.drop(CorrelatedUsers[(CorrelatedUsers['pValue'] > 0.1)].index) 
        self.CorrelatedUsers = CorrelatedUsers
        return CorrelatedUsers.sort_values(by=['similarityIndex'], ascending=False)
    
    def GetRecommendationTable (self): 
        #get users' weighted ratings by multiplying correlation coefficient with users' rating for each movie 
        CorrelatedUserRatings=pd.merge(self.CorrelatedUsers, ratings, on=['userId'], how='inner')
        CorrelatedUserRatings['weightedRating'] = CorrelatedUserRatings['similarityIndex']*CorrelatedUserRatings['rating']
        #group by movie and get sum of the weighted ratings among all users in CorrelatedUsers dataframe
        SumMovieRatings = CorrelatedUserRatings.groupby('movieId').sum()[['similarityIndex','weightedRating']]
        SumMovieRatings.columns = ['SumSimilarityIndex','SumWeightedRating']
        #create RecomendationScore by taking weighted average of all users' ratings 
        #clean RecTable and display top 20 movie recommendations (the 20 movies with the highest weighted average ratings)
        RecTable = pd.DataFrame()
        RecTable['RecommendationScore'] = SumMovieRatings['SumWeightedRating']/SumMovieRatings['SumSimilarityIndex']
        RecTable = pd.merge(RecTable, movies, on=['movieId'])
        RecTable = RecTable.rename(columns={'movieId':'MovieID', 'title':'Title', 'year':'Year'})
        RecTable = RecTable[['MovieID', 'Title', 'Year', 'RecommendationScore']]
        return RecTable.sort_values(by=['RecommendationScore'], ascending=False).head(20)

In [204]:
Rec2 = CollaborativeFiltering(title, rating)
Rec2.GetUserInput()

In [205]:
Rec2.GetCorrelatedUsers()

Unnamed: 0,userId,similarityIndex,pValue
40579,40579,0.951734,0.012637
9583,9583,0.891343,0.042288
55229,55229,0.890476,0.017336
107704,107704,0.879131,0.021031
33723,33723,0.868599,0.056038
18962,18962,0.835067,0.078388
189940,189940,0.830952,0.040451
109082,109082,0.820783,0.023653
7783,7783,0.8095,0.050979
94853,94853,0.8095,0.050979


In [206]:
Rec2.GetRecommendationTable()

Unnamed: 0,MovieID,Title,Year,RecommendationScore
3636,7669,Pride and Prejudice,1995,5.0
3552,7039,Thunderheart,1992,5.0
1892,2612,Mildred Pierce,1945,5.0
3264,5890,Elling,2001,5.0
3798,30803,3-Iron (Bin-jip),2004,5.0
3542,7001,Invasion of the Body Snatchers,1978,5.0
3540,6992,Guarding Tess,1994,5.0
566,712,Captives,1994,5.0
3569,7111,Ryan's Daughter,1970,5.0
3830,32170,Chronicles (Crónicas),2004,5.0
