# Simple Movie Recommender

https://www.youtube.com/watch?v=-8BrRnFzq_Y

Building a movie recommender system using the MovieLens.org user ratings and movie datasets

In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Load the datasets

# Load movie dataset into a dataFrame
movies = pd.read_csv('./datasets/ml-latest-small/movies.csv')

# Load the movie ratings into a dataFrame
ratings = pd.read_csv('./datasets/ml-latest-small/ratings.csv')
ratings.drop(['timestamp'], axis = 1, inplace = True)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(9125, 3)

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [6]:
ratings.shape

(100004, 3)

In [7]:
# Create a function that replaces the movie IDs with the movie names
def replace_name(x):
    """Replace the movie IDs with movie name"""
    return movies[movies['movieId'] == x].title.values[0]

In [8]:
# Create new column in the ratings dataFrame that shows the movie names instead of movie IDs by applying the 
# function across the dataset
ratings.movieId = ratings.movieId.map(replace_name)

In [9]:
# Show updated dataFrame
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,Dangerous Minds (1995),2.5
1,1,Dumbo (1941),3.0
2,1,Sleepers (1996),3.0
3,1,Escape from New York (1981),2.0
4,1,Cinema Paradiso (Nuovo cinema Paradiso) (1989),4.0


In [10]:
# Create a matrix of users (down) by movies (across) by implementing a pivot table
M = ratings.pivot_table(index = ['userId'], columns = ['movieId'], values = 'rating')

In [11]:
M.shape

(671, 9064)

In [12]:
M

movieId,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,4.0,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


Pearson's r will return a value of either:
    **1** for strongly correlated data
    **0** for no correlation
    **-1** for strongly negative correlated data

In [13]:
# Pearson's r
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation"""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c**2) * np.sum(s2_c**2))

Using the pearson function, compare movies from the movie matrix

In [14]:
pearson(M['\'burbs, The (1989)'], M['10 Things I Hate About You (1999)'])

0.08098559808642845

In [15]:
pearson(M['Harry Potter and the Sorcerer\'s Stone (a.k.a. Harry Potter and the Philosopher\'s Stone) (2001)'], 
        M['Harry Potter and the Half-Blood Prince (2009)'])

0.2699875367833873

In [16]:
pearson(M['Mission: Impossible II (2000)'], M['Erin Brockovich (2000)'])

0.07018573611021599

In [17]:
pearson(M['Clerks (1994)'], M['Mallrats (1995)'])

0.26948637162292466

In [18]:
# Create a function that identifies the top num of recommendations from the movie matrix
def get_recs(movie_name, M, num):
    """Uses a movie name, the movie matrix, and num to provide recommendations from the movie matrix"""
    import numpy as np
    
    reviews = []
    for title in M.columns:
        if title == movie_name:
            continue
        cor = pearson(M[movie_name], M[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((title, cor))
    
    reviews.sort(key = lambda tup:tup[1], reverse = True)
    return reviews[:num]

In [19]:
# Find the top 10 movies that correlate well with the movie Clerks
recs = get_recs('Clerks (1994)', M, 10)

  


In [20]:
recs[:10]

[('Go Fish (1994)', 0.3873045237464075),
 ('Chasing Amy (1997)', 0.37484047898675804),
 ('Audrey Rose (1977)', 0.3658659721521296),
 ("Razor's Edge, The (1984)", 0.36586597215212957),
 ('Trekkies (1997)', 0.35608001316844984),
 ('Everything Must Go (2010)', 0.34925991438868215),
 ('Out of Sight (1998)', 0.3380949339270231),
 ('Haunting, The (1963)', 0.33177831773335),
 ('Barefoot in the Park (1967)', 0.33153641267942063),
 ('Flawless (1999)', 0.32978207127880155)]

In [21]:
anti_recs = get_recs('Clerks (1994)', M, 8551)

  


In [22]:
anti_recs[-10:]

[('Day for Night (La Nuit Américaine) (1973)', -0.2830706145397209),
 ('Desert Blue (1998)', -0.28307061453972104),
 ('Grey Gardens (1975)', -0.2830706145397212),
 ('And Then There Were None (1945)', -0.29707942091523587),
 ('In Dreams (1999)', -0.3168161467363099),
 ('Love Stinks (1999)', -0.3168492262640342),
 ('Sheltering Sky, The (1990)', -0.3168492262640342),
 ('Vanishing Point (1971)', -0.3168492262640342),
 ('Behind the Candelabra (2013)', -0.3237223892232911),
 ('In Her Shoes (2005)', -0.32845943832631025)]

______________________________________________________________________________________________________________________

### Testing the recommender with randomly provided inputs

In [23]:
# Test #1 | Little Women
recs = get_recs('Little Women (1994)', M, 10)

  


In [24]:
recs[:10]

[("Wes Craven's New Nightmare (Nightmare on Elm Street Part 7: Freddy's Finale, A) (1994)",
  0.3761832265917786),
 ('Operation Dumbo Drop (1995)', 0.3595558819785322),
 ('Poison Ivy II (1996)', 0.3528514129511247),
 ("What's Love Got to Do with It? (1993)", 0.2705875516595477),
 ('Oliver & Company (1988)', 0.27054975217863203),
 ('Lord of Illusions (1995)', 0.2533862784050302),
 ('Nell (1994)', 0.2532009963757705),
 ('License to Drive (1988)', 0.2516137422614463),
 ('Lost Horizon (1937)', 0.24950362684899507),
 ('Stepfather, The (1987)', 0.24950362684899507)]

The top 10 results for Little Women seemed VERY unusual and were unexpected. None of the results were interesting to the collaborator.

In [25]:
# Test #2 | Pretty Woman
recs = get_recs('Pretty Woman (1990)', M, 10)

  


In [26]:
recs[:10]

[('Ghost (1990)', 0.34785077211136006),
 ('Sleepless in Seattle (1993)', 0.2971463712522635),
 ('Santa Clause, The (1994)', 0.2876370194750539),
 ('Dirty Dancing (1987)', 0.27420718592790605),
 ('Speed (1994)', 0.264521870059504),
 ('Cliffhanger (1993)', 0.2512140803207609),
 ('Net, The (1995)', 0.25093902178192046),
 ('Twister (1996)', 0.24723910588447418),
 ('Grease (1978)', 0.24536467037352408),
 ('Antitrust (2001)', 0.239002265294573)]

The top 10 results for Pretty Woman was much better than the other test. The collaborator was familiar with or liked approximately 70% of the results

In [27]:
# Test #3 | The Fugitive
recs = get_recs('Fugitive, The (1993)', M, 10)

  


In [28]:
recs[:10]

[('Clear and Present Danger (1994)', 0.2504742504508632),
 ('Black Mirror (2011)', 0.25021423264005643),
 ('Star Trek: Generations (1994)', 0.24617794892411568),
 ('Speed (1994)', 0.22949057068291115),
 ('Outbreak (1995)', 0.2254315005401342),
 ('Young Guns (1988)', 0.21717514891005466),
 ('Cliffhanger (1993)', 0.21403232011773446),
 ('Firm, The (1993)', 0.21190066316676953),
 ('Hand That Rocks the Cradle, The (1992)', 0.2058032818455775),
 ('Few Good Men, A (1992)', 0.20253834186502964)]

In [29]:
# Test #4 | Coming to America
recs = get_recs('Coming to America (1988)', M, 10)

  


In [30]:
recs[:10]

[('San Andreas (2015)', 0.46204893901856736),
 ('Timecrimes (Cronocrímenes, Los) (2007)', 0.4496855850283207),
 ('Duets (2000)', 0.38313051408846055),
 ('Dawn of the Planet of the Apes (2014)', 0.3714793963277432),
 ('State of Play (2009)', 0.3710013253330292),
 ('Femme Fatale (2002)', 0.365167847808038),
 ('National Treasure (2004)', 0.3650009369518367),
 ('Funny Thing Happened on the Way to the Forum, A (1966)',
  0.36346951968922964),
 ('W. (2008)', 0.3594508213460654),
 ('Gamer (2009)', 0.35348936619581645)]

In [36]:
# Test #5 | The Purge
recs = get_recs('Purge: Anarchy, The (2014)', M, 10)

  


In [37]:
recs[:10]

[('Bad Words (2013)', 1.0),
 ('Grey, The (2012)', 1.0),
 ('Observe and Report (2009)', 1.0),
 ('Salvation, The (2014)', 1.0),
 ('The Drop (2014)', 1.0),
 ('Super (2010)', 0.944911182523068),
 ('Beastmaster, The (1982)', 0.8660254037844387),
 ('Best Exotic Marigold Hotel, The (2011)', 0.8660254037844387),
 ('47 Ronin (2013)', 0.8660254037844385),
 ('Around the World in 80 Days (2004)', 0.6546536707079772)]

In [33]:
# Test #6 | Wizard of Oz
recs = get_recs('Wizard of Oz, The (1939)', M, 10)

  


In [34]:
recs[:10]

[('Pinocchio (1940)', 0.33732579403640356),
 ('Sleeping Beauty (1959)', 0.30246082729908763),
 ('Paper Moon (1973)', 0.2836080247849341),
 ('Fantasia (1940)', 0.2818187244500789),
 ('Crocodile Dundee (1986)', 0.2755476524643166),
 ('Tom and Huck (1995)', 0.2639864154623991),
 ('Christmas Story, A (1983)', 0.262805424607835),
 ('Bicycle Thieves (a.k.a. The Bicycle Thief) (a.k.a. The Bicycle Thieves) (Ladri di biciclette) (1948)',
  0.25933209951171704),
 ('All About Eve (1950)', 0.24800527840999217),
 ('Super Size Me (2004)', 0.2460084426292644)]