###### Based on learning materials from the course '*Machine Learning, Data Science and Deep Learning with Python*' taught by Frank Kane on Udemy.com
___

Using the MovieLens 100K data set from GroupLens.org

In [1]:
import pandas as pd
import numpy as np

r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, usecols=range(3), encoding="ISO-8859-1")

m_cols = ['movie_id', 'title']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(2), encoding="ISO-8859-1")

# The datasets are merged so that Movie Titles and Ratings are available in one comprehensive dataset
ratings = pd.merge(movies, ratings)
ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


# User-based collaborative filtering

In [2]:
# movieRatings is a sparse matrix of users and the movies they rated - with NaN indicating movies that a user did not watch
movieRatings = ratings.pivot_table(index=['user_id'], columns=['title'], values='rating')
movieRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# starWarsRatings is a series of users who rated the movie 'Star Wars (1977)'
starWarsRatings = movieRatings['Star Wars (1977)']
starWarsRatings.head()

user_id
0    5.0
1    5.0
2    5.0
3    NaN
4    5.0
Name: Star Wars (1977), dtype: float64

In [4]:
# similarMovies holds the correlation coefficient of 'Star Wars (1977)' with every move in the dataset
similarMovies = movieRatings.corrwith(starWarsRatings).dropna()
similarMovies.head()

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


title
'Til There Was You (1997)    0.872872
1-900 (1994)                -0.645497
101 Dalmatians (1996)        0.211132
12 Angry Men (1957)          0.184289
187 (1997)                   0.027398
dtype: float64

In [5]:
# movieStats counts up how many ratings exist for each movie, and also the average rating for each
movieStats = ratings.groupby('title').agg({'rating': [np.size, np.mean]})
movieStats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
187 (1997),41,3.02439


In [6]:
# To counter spuriousness introduced by movies with low number of ratings, a cutoff of 250 reviews 
# is used to obtain popular movies
popularMovies = movieStats['rating']['size'] >= 250
movieStats[popularMovies].sort_values([('rating', 'mean')], ascending=False)[:5]

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Schindler's List (1993),298,4.466443
"Shawshank Redemption, The (1994)",283,4.44523
"Usual Suspects, The (1995)",267,4.385768
Star Wars (1977),584,4.359589
One Flew Over the Cuckoo's Nest (1975),264,4.291667


In [7]:
# recMovies holds all the above-mentioned popular movies along with each movies similarity to 'Star Wars (1977)'
recMovies = movieStats[popularMovies].join(pd.DataFrame(similarMovies, columns=['similarity']))
recMovies = recMovies.sort_values(['similarity'], ascending=False)
recMovies.iloc[1:].head(10)



Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Empire Strikes Back, The (1980)",368,4.206522,0.748353
Return of the Jedi (1983),507,4.00789,0.672556
Raiders of the Lost Ark (1981),420,4.252381,0.536117
Indiana Jones and the Last Crusade (1989),331,3.930514,0.350107
L.A. Confidential (1997),297,4.161616,0.319065
E.T. the Extra-Terrestrial (1982),300,3.833333,0.303619
Back to the Future (1985),350,3.834286,0.274839
Jaws (1975),280,3.775,0.265459
"Terminator, The (1984)",301,3.933555,0.262255
"Princess Bride, The (1987)",324,4.17284,0.259711


---
# Item-based collaborative filtering

Using the MovieLens 100K data set from GroupLens.org

In [8]:
# userRatings is a sparse matrix of users and the movies they rated - with NaN indicating movies that a user did not watch
userRatings = ratings.pivot_table(index=['user_id'], columns=['title'], values='rating')
userRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# corrMatrix is a correlation matrix for userRatings and holds the correlation score between every pair of movies (where at least one user rated both movies; NaN if not).
# minPeriods is set to 150 to filter out results where fewer than 150 users rated a given movie pair
corrMatrix = userRatings.corr(method='pearson', min_periods=150)
corrMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,,,,,,,,,,...,,,,,,,,,,
12 Angry Men (1957),,,,,,,,,,,...,,,,,,,,,,
187 (1997),,,,,,,,,,,...,,,,,,,,,,


In [10]:
# myRatings holds the ratings for the first user in the dataset
myRatings = userRatings.loc[0].dropna()
myRatings

title
Empire Strikes Back, The (1980)    5.0
Gone with the Wind (1939)          1.0
Star Wars (1977)                   5.0
Name: 0, dtype: float64

Go through each movie the user rated one at a time, and build up a list of possible recommendations based on the movies similar to the ones that were rated.

For each rated movie, retrieve a list of similar movies from the correlation matrix (corrMatrix). The correlation scores are scaled by by how well the user rated a movie in their list

In [11]:
simCandidates = pd.Series()
for i in range(0, len(myRatings.index)):
    print ("Adding sims for " + myRatings.index[i] + "...")
    
    # Retrieve similar movies to this one that I rated
    sims = corrMatrix[myRatings.index[i]].dropna()
    
    # Now scale its similarity by how well I rated this movie
    sims = sims.map(lambda x: (x * myRatings[i] * myRatings[i]) - 5) 

    # Add the score to the list of similarity candidates
    simCandidates = simCandidates.append(sims)
    
print ("sorting...")
simCandidates.sort_values(inplace = True, ascending = False)
print (simCandidates.head(10))

Adding sims for Empire Strikes Back, The (1980)...
Adding sims for Gone with the Wind (1939)...
Adding sims for Star Wars (1977)...
sorting...
Empire Strikes Back, The (1980)              20.000000
Star Wars (1977)                             20.000000
Empire Strikes Back, The (1980)              13.708815
Star Wars (1977)                             13.708815
Return of the Jedi (1983)                    13.030729
Return of the Jedi (1983)                    11.813896
Raiders of the Lost Ark (1981)                8.466487
Raiders of the Lost Ark (1981)                8.402928
Sting, The (1973)                             4.188459
Indiana Jones and the Last Crusade (1989)     3.752674
dtype: float64


Some movies have multiple entries as they are similar to more than one movie tha the user rated. These are aggregated by constructing a consolidated similarity score by summing the scores of each occurrance

In [12]:
simCandidates = simCandidates.groupby(simCandidates.index).sum()
simCandidates.sort_values(inplace = True, ascending = False)
simCandidates.head(10)

Empire Strikes Back, The (1980)              33.708815
Star Wars (1977)                             28.838023
Return of the Jedi (1983)                    24.844626
Raiders of the Lost Ark (1981)               12.015232
Indiana Jones and the Last Crusade (1989)     6.583583
Sting, The (1973)                             6.048135
Back to the Future (1985)                     5.503112
Field of Dreams (1989)                        5.342542
Star Trek: The Wrath of Khan (1982)           4.840398
Batman (1989)                                 4.737828
dtype: float64

Movies which the user has already rated are also filtered out before diplaying the final list

In [13]:
filteredSims = simCandidates.drop(myRatings.index)
filteredSims.head(10)

Return of the Jedi (1983)                    24.844626
Raiders of the Lost Ark (1981)               12.015232
Indiana Jones and the Last Crusade (1989)     6.583583
Sting, The (1973)                             6.048135
Back to the Future (1985)                     5.503112
Field of Dreams (1989)                        5.342542
Star Trek: The Wrath of Khan (1982)           4.840398
Batman (1989)                                 4.737828
Jaws (1975)                                   4.014673
Wizard of Oz, The (1939)                      3.850244
dtype: float64