In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Render floats with three decimal places in all pandas output
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
# The ratings file is tab-separated; column names are supplied here rather
# than read from the file.
colnames = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', header=None, names=colnames)

df.head(n=2)

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949


In [3]:
# Lookup table pairing each item_id with its movie title
movie_titles = pd.read_csv(filepath_or_buffer="Movie_Id_Titles")
movie_titles.head(n=2)

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)


In [4]:
# Attach the movie title to every rating row.
# Selecting the original rating columns first keeps this cell safe to re-run:
# without it, a second run would merge onto the already-merged frame and
# produce `title_x` / `title_y` columns instead of a single `title` column.
df = df[colnames].merge(movie_titles, on='item_id', how='inner')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


Our dataframe is currently a list of records, with each containing a user, a movie, and a rating. We need to rearrange this data so the columns represent movies and the rows represent users. To do this, we'll use `pd.pivot_table()`.

In [5]:
# Reshape to a user x movie matrix: one row per user, one column per title,
# each cell holding that user's (mean) rating for the movie and NaN where
# the user has not rated it.
ratings = df.pivot_table(index='user_id', columns='title', values='rating')

ratings.head(10)

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,
6,,,,4.0,,,,5.0,,,...,,,,4.0,,,,,,
7,,,,4.0,,,5.0,5.0,,4.0,...,,,,5.0,3.0,,3.0,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,4.0,...,,,,,,,,,,


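We have a lot of missing values. We're going to treat each one as if the user had given that movie their own average rating. We do this in two steps: first we mark missing values with 0 (a placeholder meaning "not rated"), then we convert the real ratings to z-scores, where a user's average rating is exactly 0.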

First we'll **transpose** the dataframe. This will turn it on its side so columns are now rows and vice-versa.

The quickest way to transpose a dataframe is to access its `.T` attribute.

In [7]:
# One row per movie, one column per user; 0 marks "not rated"
users = ratings.transpose().fillna(value=0)

Now we're going to convert ratings into **z-scores**. A z-score represents how many standard deviations a rating is from the user's average rating. It's a great way to normalize data; the intuition is that a "5" means a lot more coming from a user whose average rating is a 2, than one whose average rating is a 4.

In [8]:
# Convert each user's ratings to z-scores, using only the movies they rated.
# `users` has one row per movie and one column per user; a 0 marks "not rated".
rated = users != 0
observed = users.where(rated)  # unrated cells become NaN so mean/std skip them

user_mean = observed.mean()
# A user's std is 0 (all ratings identical) or NaN (a single rating) when there
# is no spread to scale by. Map 0 to NaN so those users fall through to the
# fillna below instead of producing NaN/inf via division by zero.
user_std = observed.std().replace(0, np.nan)

# Staying in pandas (rather than np.where) keeps both the movie-title index
# and the user_id column labels. Unrated cells, and users without a usable
# std, end up at 0, i.e. "the user's average".
users = (
    observed
    .sub(user_mean, axis='columns')
    .div(user_std, axis='columns')
    .fillna(0)
)

# Highest rated movies first
users.mean(axis='columns').sort_values(ascending=False).head(10)

title
Star Wars (1977)                   0.462
Godfather, The (1972)              0.300
Fargo (1996)                       0.293
Raiders of the Lost Ark (1981)     0.283
Silence of the Lambs, The (1991)   0.274
Schindler's List (1993)            0.268
Titanic (1997)                     0.254
Shawshank Redemption, The (1994)   0.244
Empire Strikes Back, The (1980)    0.226
Return of the Jedi (1983)          0.220
dtype: float64

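This ranking favors movies with lots of ratings: every user who hasn't rated a movie contributes a 0 to its average, so a title needs many above-average ratings to score highly. It's not a perfect system, but it'll work for our purposes.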

Now we're going to rebuild the ratings dataframe by transposing back our users' data.

In [9]:
# Flip back to one row per user with the movie titles as columns
ratings = pd.DataFrame(data=users.transpose(), columns=users.index)

ratings.head(n=5)

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,-1.271,1.104,0.0,0.0,-0.479,0.313,0.0,0.0,...,0.0,0.0,0.0,1.104,-0.479,0.0,0.0,0.0,0.313,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.605,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,-0.635,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


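The `.corrwith()` method computes the correlation between every column of a dataframe and a single column (a Series). That's exactly what our recommender system needs: how closely each movie's ratings track those of one chosen movie.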

In [10]:
# Correlate every movie's ratings column with Ace Ventura's, strongest first
(
    ratings
    .corrwith(ratings['Ace Ventura: Pet Detective (1994)'])
    .sort_values(ascending=False)
)

title
Ace Ventura: Pet Detective (1994)                      1.000
Naked Gun 33 1/3: The Final Insult (1994)              0.260
Basic Instinct (1992)                                  0.256
City Slickers II: The Legend of Curly's Gold (1994)    0.250
Batman Forever (1995)                                  0.226
Junior (1994)                                          0.219
Dumb & Dumber (1994)                                   0.217
Beverly Hillbillies, The (1993)                        0.216
Young Guns II (1990)                                   0.214
Dirty Dancing (1987)                                   0.212
Fog, The (1980)                                        0.200
Hot Shots! Part Deux (1993)                            0.199
War, The (1994)                                        0.199
Escape from New York (1981)                            0.192
Alien 3 (1992)                                         0.188
Gumby: The Movie (1995)                                0.184
Ready to Wear (Pre

In [11]:
# We might as well make a function out of it. So here it is:

def recommended_movies(movie, min_corr=0.2, data=None):
    """Return the movies whose ratings correlate most strongly with ``movie``.

    Parameters
    ----------
    movie : str
        Column label (title) of the movie to base the recommendations on.
    min_corr : float, default 0.2
        Only movies whose correlation is strictly greater than this are kept.
    data : pandas.DataFrame, optional
        Ratings table with one column per movie. When omitted, the
        notebook-level ``ratings`` dataframe is used.

    Returns
    -------
    pandas.Series
        Correlations indexed by movie title, sorted from highest to lowest,
        with ``movie`` itself excluded.

    Raises
    ------
    KeyError
        If ``movie`` is not a column of the ratings table.
    """
    table = ratings if data is None else data

    recs = table.corrwith(table[movie])

    # Remove the movie itself by label. A numeric cut-off such as `< .99`
    # would also throw away any *other* title that happens to correlate
    # (almost) perfectly with it.
    recs = recs.drop(labels=movie, errors='ignore')

    # Only return movies with a decently high correlation
    # (NaN correlations compare False and are dropped here too).
    recs = recs[recs > min_corr]

    return recs.sort_values(ascending=False)

recommended_movies(movie='Back to the Future (1985)')

title
Indiana Jones and the Last Crusade (1989)   0.344
Raiders of the Lost Ark (1981)              0.344
Terminator 2: Judgment Day (1991)           0.280
Speed (1994)                                0.262
Fugitive, The (1993)                        0.260
Terminator, The (1984)                      0.258
Empire Strikes Back, The (1980)             0.244
Die Hard (1988)                             0.240
Return of the Jedi (1983)                   0.238
Field of Dreams (1989)                      0.228
Jurassic Park (1993)                        0.227
Aladdin (1992)                              0.216
Apollo 13 (1995)                            0.215
Star Trek: The Wrath of Khan (1982)         0.211
Blues Brothers, The (1980)                  0.203
dtype: float64