In [1]:
#import packages
import pandas as pd
from math import sqrt
import numpy as np
from scipy.stats import pearsonr

In [2]:
# Unzip the dataset archive into the working directory and read the CSV files into pandas.
# `-o` overwrites existing files without prompting and `-j` discards the directory paths
# stored in the archive; this line needs the `unzip` command-line tool to be available.
!unzip -o -j moviedataset.zip 
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

Archive:  moviedataset.zip
  inflating: links.csv               
  inflating: movies.csv              
  inflating: ratings.csv             
  inflating: README.txt              
  inflating: tags.csv                


In [4]:
# Move the "(YYYY)" release year out of the title column into a separate `year` column.
# Raw strings avoid invalid-escape warnings, and regex=True is passed explicitly because
# pandas >= 2.0 treats the str.replace pattern as a literal string by default (the year
# would then silently stay in the title).
movies['year'] = movies.title.str.extract(r'\((\d\d\d\d)\)', expand=False)
movies['title'] = movies.title.str.replace(r'\(\d\d\d\d\)', '', regex=True).str.strip()

# Split the pipe-separated genres into one 0/1 indicator column per genre and drop the
# original `genres` column. Columns are kept in order of first appearance.
genre_lists = movies.genres.str.split('|')
genre_order = list(dict.fromkeys(genre for genres in genre_lists for genre in genres))
genre_flags = (
    movies.genres.str.get_dummies(sep='|')
    .reindex(columns=genre_order, fill_value=0)
    .astype(float)
)
# fillna(0) is applied to the whole frame, so a missing `year` also becomes 0.
movies = pd.concat([movies.drop(columns='genres'), genre_flags], axis=1).fillna(0)

# View the cleaned movies dataset.
movies.head()

Unnamed: 0,movieId,title,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,1995,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,1995,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,1995,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,1995,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,1995,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Clean the ratings dataset by dropping the timestamp column, then preview it.
# `columns=` replaces the positional axis argument (removed in pandas 2.0), and
# errors='ignore' keeps the cell re-runnable once the column is already gone.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [84]:
class CollaborativeFiltering():
    """Item-based movie recommender using Pearson correlation between rating vectors.

    The class reads two module-level DataFrames defined earlier in the notebook:
    ``movies`` (uses columns ``movieId``, ``title``, ``year``) and ``ratings``
    (uses columns ``userId``, ``movieId``, ``rating``).

    Attributes:
        title: list of movie titles supplied by the user.
        corrlatedMovieDict: cache mapping an input movieId to a dict
            ``{other movieId: (correlation, p-value)}``.
        UserInput: DataFrame produced by ``GetUserInput`` (``None`` until then).
    """

    def __init__(self, title):
        self.title = title
        self.corrlatedMovieDict = {}
        self.UserInput = None

    def GetUserInput(self):
        """Build, store and return a DataFrame of the user's movies.

        The titles are inner-joined onto ``movies`` by exact ``title`` match, so
        titles that are not present in ``movies`` are dropped. Rows are sorted
        by ``movieId``.
        """
        UserInput = pd.DataFrame(self.title, columns=['title'])
        UserInput = pd.merge(
            UserInput, movies[['movieId', 'title', 'year']], on=['title']
        ).sort_values(by=['movieId'], ascending=True)
        self.UserInput = UserInput
        return UserInput

    def CalculateCorrelatedMovieDict(self):
        """Fill the correlation cache for every movie in ``movies`` and return it.

        DO NOT RUN casually: this calls ``CalculateSingleMovie`` once per movie
        and takes a very long time on the full dataset.
        """
        for movieID in movies['movieId']:
            self.CalculateSingleMovie(movieID)
        # Return the cache itself; the per-movie dict is local to CalculateSingleMovie.
        return self.corrlatedMovieDict

    def CalculateSingleMovie(self, movieID):
        """Correlate one movie with the (up to) 20 movies most co-rated with it.

        Stores every computed ``(correlation, p-value)`` pair in
        ``self.corrlatedMovieDict[movieID]`` and returns a DataFrame with columns
        ``movieId``, ``correlationIndex`` and ``pValue`` from which negatively
        correlated movies and movies with a p-value above 0.1 are removed.
        """
        # All ratings of the given movie, ordered by user.
        UserInput = ratings[ratings['movieId'] == movieID].sort_values(by='userId')
        # Every rating made by those users, grouped per movie. A scalar key is used
        # because grouping by a one-element list yields 1-tuple keys on pandas >= 2.0.
        RatingSubset = ratings[
            ratings['userId'].isin(UserInput['userId'].tolist())
        ].groupby('movieId')
        # The 20 movies these users rated most often.
        TopGroups = sorted(RatingSubset, key=lambda x: len(x[1]), reverse=True)[0:20]

        pearsonCorrelationDict = {}
        for name, group in TopGroups:
            group = group.sort_values(by='userId')
            # Ratings of the input movie by the users who also rated this movie;
            # both sides are sorted by userId so the two lists line up pairwise.
            temp_df = UserInput[UserInput['userId'].isin(group['userId'].tolist())]
            tempGroupList = group['rating'].tolist()
            tempRatingList = temp_df['rating'].tolist()
            # pearsonr needs at least two paired observations; skip movies without them.
            if len(tempRatingList) < 2 or len(tempGroupList) < 2:
                continue
            correlation, pValue = pearsonr(tempRatingList, tempGroupList)
            pearsonCorrelationDict[name] = (correlation, pValue)
        self.corrlatedMovieDict[movieID] = pearsonCorrelationDict

        # Tabulate the correlations; passing `columns` keeps this valid for an empty dict.
        CorrelatedMovies = pd.DataFrame.from_dict(
            pearsonCorrelationDict, orient='index', columns=['correlationIndex', 'pValue']
        )
        CorrelatedMovies['movieId'] = CorrelatedMovies.index
        CorrelatedMovies = CorrelatedMovies[['movieId', 'correlationIndex', 'pValue']]
        # Drop negatively correlated movies and statistically insignificant ones (p > 0.1).
        keep = ~(CorrelatedMovies['correlationIndex'] < 0) & ~(CorrelatedMovies['pValue'] > 0.1)
        return CorrelatedMovies[keep]

    def GetRecommendation(self):
        """Return up to 20 recommended movies ranked by summed correlation.

        For each input movie the cached (or freshly computed) correlations are
        collected, summed per candidate movie, joined onto ``movies`` and the
        input movies themselves are removed. The result has columns ``MovieID``,
        ``Title``, ``Year`` and ``SumCorrelation``.
        """
        # Resolve the user's titles on demand so a prior GetUserInput() call is optional.
        if getattr(self, 'UserInput', None) is None:
            self.GetUserInput()

        # Collect rows in a list and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and appending inside a loop is quadratic.
        records = []
        for movieID in self.UserInput['movieId']:
            if movieID not in self.corrlatedMovieDict:
                self.CalculateSingleMovie(movieID)
            # NOTE(review): the cache holds every correlation, including negative and
            # insignificant ones; only the frame returned by CalculateSingleMovie is
            # filtered. Confirm whether the sums should use the filtered set.
            for movie, stats in self.corrlatedMovieDict[movieID].items():
                records.append({'movieId': movie, 'correlationIndex': stats[0]})

        if not records:
            return pd.DataFrame(columns=['MovieID', 'Title', 'Year', 'SumCorrelation'])

        Recommendation = pd.DataFrame(records, columns=['movieId', 'correlationIndex'])
        # Sum the correlations per candidate movie and rank from highest to lowest.
        RecTable = (
            Recommendation.groupby('movieId', as_index=False)['correlationIndex']
            .sum()
            .sort_values(by='correlationIndex', ascending=False)
        )
        # Attach title/year and remove the movies the user supplied.
        RecTable = pd.merge(RecTable, movies, on=['movieId'])
        RecTable = RecTable[~RecTable['movieId'].isin(self.UserInput['movieId'])]
        RecTable = RecTable.rename(columns={'movieId': 'MovieID', 'title': 'Title',
                                            'year': 'Year', 'correlationIndex': 'SumCorrelation'})
        RecTable = RecTable[['MovieID', 'Title', 'Year', 'SumCorrelation']]
        return RecTable.head(20)

In [85]:
# Movies the user supplies as input; each title is matched exactly against the
# `title` column of `movies` (i.e. without the "(YYYY)" suffix).
title = [
    'Sense and Sensibility',
    'Nixon',
    'Amityville: A New Generation',
    'Mrs. Brown (a.k.a. Her Majesty, Mrs. Brown)',
    'Washington Square',
    'Walk on the Moon, A',
    'Matrix, The',
    'Never Been Kissed',
    'Carol',
]
# Recommender instance for this list of titles.
Rec3 = CollaborativeFiltering(title)

In [86]:
# Look up the supplied titles in `movies`; the result is also stored on Rec3.UserInput,
# which GetRecommendation iterates over.
Rec3.GetUserInput()

Unnamed: 0,title,movieId,year
1,Nixon,14,1995
0,Sense and Sensibility,17,1995
2,Amityville: A New Generation,1325,1993
3,"Mrs. Brown (a.k.a. Her Majesty, Mrs. Brown)",1643,1997
4,Washington Square,1650,1997
5,"Walk on the Moon, A",2570,1999
6,"Matrix, The",2571,1999
7,Never Been Kissed,2581,1999
8,Carol,133645,2015


In [87]:
# Show the recommendations (at most 20 rows), ranked by the summed Pearson correlation
# with the input movies.
Rec3.GetRecommendation()

Unnamed: 0,MovieID,Title,Year,SumCorrelation
1,260.0,Star Wars: Episode IV - A New Hope,1977,1.296299
3,318.0,"Shawshank Redemption, The",1994,1.26707
4,593.0,"Silence of the Lambs, The",1991,1.164076
5,356.0,Forrest Gump,1994,1.111057
13,608.0,Fargo,1996,0.929011
14,2396.0,Shakespeare in Love,1998,0.913877
15,2858.0,American Beauty,1999,0.874887
16,480.0,Jurassic Park,1993,0.743729
17,296.0,Pulp Fiction,1994,0.709251
18,357.0,Four Weddings and a Funeral,1994,0.691824


In [88]:
# Inspect the cached correlations: {input movieId: {other movieId: (correlation, p-value)}}.
Rec3.corrlatedMovieDict

{14: {14: (0.9999999999999996, 0.0),
  780: (0.011906970338995711, 0.4244482686583827),
  608: (0.18333339216034472, 1.3364653482416569e-34),
  32: (0.1623622080265153, 3.659884721986378e-27),
  648: (0.1574303224956689, 4.521612691700496e-24),
  25: (0.25184715931312635, 1.1676051689057797e-59),
  736: (0.0436771366315455, 0.005675756492495774),
  36: (0.25508696320543045, 5.081978399084292e-60),
  1: (0.11079453310216043, 8.616112421788876e-12),
  260: (0.10187218650041392, 5.162920690889074e-10),
  141: (0.17270058062209556, 2.539672895687414e-25),
  6: (0.17504636188451744, 1.1136218616924531e-25),
  296: (0.149691873811593, 4.579304792125458e-19),
  733: (0.07058922089151273, 2.804444141220169e-05),
  62: (0.16676362219887278, 3.58563946782854e-23),
  95: (0.1253951230290652, 3.0915221536181497e-13),
  593: (0.14709418786324044, 1.6685639159915794e-17),
  318: (0.1293528313552187, 1.3247417243338766e-13),
  150: (0.17812439068446098, 1.4387076409948016e-24),
  356: (0.113294529836