# Finding Similar Movies

In [62]:
import pandas as pd
import numpy as np

In [63]:
# Read only the first two pipe-separated fields of u.item (the id and the
# title) and use the id as the row index so titles can be looked up by id.
m_cols = ['movie_id', 'title']
movies = pd.read_csv(
    './ml-100k/u.item',
    sep='|',
    encoding="ISO-8859-1",
    names=m_cols,
    usecols=range(2),
    index_col='movie_id',
)
movies.head()

Unnamed: 0_level_0,title
movie_id,Unnamed: 1_level_1
1,Toy Story (1995)
2,GoldenEye (1995)
3,Four Rooms (1995)
4,Get Shorty (1995)
5,Copycat (1995)


In [64]:
# Index label (movie_id) of the first row whose title is exactly 'Star Wars (1977)'.
movies.loc[movies['title'].eq('Star Wars (1977)')].index[0]

50

In [65]:
# u.data is tab-separated; keep only its first three fields
# (user, movie, rating) and give them readable column names.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    './ml-100k/u.data',
    sep='\t',
    encoding="ISO-8859-1",
    names=r_cols,
    usecols=range(3),
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


Now the amazing pivot_table function on a DataFrame will construct a user / movie rating matrix. Note how NaN indicates missing data - movies that specific users didn't rate.

In [66]:
# One row per user, one column per movie, cell = that user's rating.
# User/movie pairs with no rating come out as NaN.
movieRatings = pd.pivot_table(
    ratings,
    values='rating',
    index=['user_id'],
    columns=['movie_id'],
)
movieRatings.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


Let's extract a Series of users who rated Star Wars:

In [67]:
# Column 50 is the movie_id found above for 'Star Wars (1977)'; the result is
# a Series indexed by user_id, NaN where a user did not rate the movie.
starWarsRatings = movieRatings.loc[:, 50]
starWarsRatings.head()

user_id
0    5.0
1    5.0
2    5.0
3    NaN
4    5.0
Name: 50, dtype: float64

Pandas' corrwith function makes it really easy to compute the pairwise correlation of Star Wars' vector of user ratings with every other movie! After that, we'll drop any results that have no data, and construct a new DataFrame of movies and their correlation score (similarity) to Star Wars:

In [68]:
# Correlate every movie's column of user ratings with the Star Wars column,
# then drop the movies whose correlation came back as NaN.
similarMovies = movieRatings.corrwith(starWarsRatings).dropna()
df = pd.DataFrame(similarMovies, columns=['Similarity'])
# Relabel rows from movie_id to title. The title is selected by column label:
# the previous `movies.loc[i][0]` treated the integer 0 as a position on a
# label-indexed Series, which recent pandas deprecates.
df.index = df.index.map(lambda movie_id: movies.loc[movie_id, 'title'])
df.head()

Unnamed: 0,Similarity
Toy Story (1995),0.18002
GoldenEye (1995),0.131348
Four Rooms (1995),0.036925
Get Shorty (1995),0.259182
Copycat (1995),0.031792


In [69]:
# Highest correlation with Star Wars first.
df.sort_values(by='Similarity', ascending=False)

Unnamed: 0,Similarity
Man of the Year (1995),1.000000
Commandments (1997),1.000000
Stripes (1981),1.000000
Hollow Reed (1996),1.000000
Cosi (1996),1.000000
No Escape (1994),1.000000
Star Wars (1977),1.000000
"Beans of Egypt, Maine, The (1994)",1.000000
"Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)",1.000000
Full Speed (1996),1.000000


Our results are probably getting messed up by movies that have only been viewed by a handful of people who also happened to like Star Wars. So we need to get rid of movies that were watched by only a few people, since they are producing spurious results. Let's construct a new DataFrame that counts up how many ratings exist for each movie, and also the average rating while we're at it - that could also come in handy later.

In [70]:
# Per-movie rating count and average rating. The aggregations are named by
# string: passing the NumPy callables (np.size, np.mean) to groupby.agg emits
# a FutureWarning in recent pandas, while the strings produce the same
# ('rating', 'size') and ('rating', 'mean') columns.
movieStats = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieStats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


Let's get rid of any movies rated by fewer than 100 people, and check the top-rated ones that are left:

In [71]:
# Boolean mask (indexed by movie_id): True for movies with at least 100 ratings.
popularMovies = movieStats[('rating', 'size')] >= 100
# Keep only those movies and order them by average rating, best first.
df = movieStats[popularMovies].sort_values([('rating', 'mean')], ascending=False)
# Relabel rows from movie_id to title, selecting the title by column label
# rather than by the deprecated positional `[0]` on a label-indexed Series.
df.index = df.index.map(lambda movie_id: movies.loc[movie_id, 'title'])
df.head(10)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
"Close Shave, A (1995)",112,4.491071
Schindler's List (1993),298,4.466443
"Wrong Trousers, The (1993)",118,4.466102
Casablanca (1942),243,4.45679
"Shawshank Redemption, The (1994)",283,4.44523
Rear Window (1954),209,4.38756
"Usual Suspects, The (1995)",267,4.385768
Star Wars (1977),584,4.359589
12 Angry Men (1957),125,4.344
Citizen Kane (1941),198,4.292929


In [74]:
# movieStats has two-level columns (('rating', 'size'), ('rating', 'mean'))
# while the similarity frame has a single level. pandas 2.x refuses to join
# frames whose column indexes differ in depth, so flatten the stats columns
# to plain tuple labels first; the joined frame keeps the same column labels.
popularStats = movieStats[popularMovies]
popularStats.columns = popularStats.columns.to_flat_index()
df = popularStats.join(pd.DataFrame(similarMovies, columns=['Similarity']))
# Relabel rows from movie_id to title, selecting the title by column label
# rather than by the deprecated positional `[0]` on a label-indexed Series.
df.index = df.index.map(lambda movie_id: movies.loc[movie_id, 'title'])
df.head(10)

Unnamed: 0,"(rating, size)","(rating, mean)",Similarity
Toy Story (1995),452,3.878319,0.18002
GoldenEye (1995),131,3.206107,0.131348
Get Shorty (1995),209,3.550239,0.259182
Twelve Monkeys (1995),392,3.798469,0.15553
Babe (1995),219,3.995434,0.184512
Dead Man Walking (1995),299,3.896321,-0.034158
Seven (Se7en) (1995),236,3.847458,0.006096
"Usual Suspects, The (1995)",267,4.385768,0.211075
Mighty Aphrodite (1995),184,3.418478,0.130148
"Postino, Il (1994)",183,3.967213,0.08597


In [73]:
# Ten popular movies most correlated with Star Wars.
df.sort_values(by='Similarity', ascending=False).iloc[:10]

Unnamed: 0,"(rating, size)","(rating, mean)",Similarity
Star Wars (1977),584,4.359589,1.0
"Empire Strikes Back, The (1980)",368,4.206522,0.748353
Return of the Jedi (1983),507,4.00789,0.672556
Raiders of the Lost Ark (1981),420,4.252381,0.536117
Austin Powers: International Man of Mystery (1997),130,3.246154,0.377433
"Sting, The (1973)",241,4.058091,0.367538
Indiana Jones and the Last Crusade (1989),331,3.930514,0.350107
Pinocchio (1940),101,3.673267,0.347868
"Frighteners, The (1996)",115,3.234783,0.332729
L.A. Confidential (1997),297,4.161616,0.319065


In [75]:
# Movie-by-movie correlation matrix over the user-rating columns, with the
# library defaults written out (Pearson, a single shared observation suffices).
corrMatrix = movieRatings.corr(method='pearson', min_periods=1)
corrMatrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.221784,0.175794,0.103135,0.386475,0.529401,0.159248,0.247367,0.090176,0.187502,...,,,,,,,,,,
2,0.221784,1.0,0.230536,0.244556,0.217559,-0.158114,0.175445,0.340525,-0.227599,0.199083,...,,,,,,,,,,
3,0.175794,0.230536,1.0,-0.201969,0.184612,0.806226,0.071509,-0.118587,0.016742,0.071563,...,,,,,,,,,,
4,0.103135,0.244556,-0.201969,1.0,-0.23735,0.066625,0.152733,0.28127,0.208459,0.231931,...,,,,,,,,,,
5,0.386475,0.217559,0.184612,-0.23735,1.0,1.0,0.180211,0.205114,0.065169,-0.843661,...,,,,,,,,,,


In [76]:
# Recompute the matrix, leaving NaN for any pair of movies with fewer than
# 100 observations in common.
corrMatrix = movieRatings.corr(
    min_periods=100,
    method='pearson',
)
corrMatrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.221784,,0.103135,,,0.159248,0.247367,0.090176,,...,,,,,,,,,,
2,0.221784,1.0,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,0.103135,,,1.0,,,0.152733,0.28127,0.208459,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [135]:
# Ratings given by user 0, restricted to the movies that user actually rated,
# as a Series named 'rating' indexed by movie_id.
myRatings = movieRatings.loc[0].dropna().rename('rating')
# Show each rating next to the corresponding movie title.
pd.concat(
    [myRatings, movies.loc[myRatings.index]],
    axis='columns',
)

Unnamed: 0_level_0,rating,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
50,5.0,Star Wars (1977)
133,1.0,Gone with the Wind (1939)
172,5.0,"Empire Strikes Back, The (1980)"


In [120]:
# movie_id of the first entry in myRatings.
myRatings.keys()[0]

50

In [133]:
# For every movie the user rated, take that movie's column of the correlation
# matrix (dropping pairs with no score) and weight each correlation by the
# rating the user gave.
scaledSims = []
for movie_id, rating in myRatings.items():
    sims = corrMatrix[movie_id].dropna()
    scaledSims.append(sims * rating)

# Series.append was removed in pandas 2.0, so collect the pieces and
# concatenate once. The explicit empty float Series covers a user with no
# ratings, where pd.concat would raise on an empty list.
simCandidates = pd.concat(scaledSims) if scaledSims else pd.Series(dtype='float64')
simCandidates.name = 'cumulative rating'
# A movie appears once per rated movie it correlates with, so the index
# still contains duplicate movie_ids at this point.
simCandidates = simCandidates.sort_values(ascending=False)
simCandidates.head(10)

50     5.000000
172    5.000000
50     3.741763
172    3.741763
181    3.606146
181    3.362779
174    2.693297
174    2.680586
249    1.887164
194    1.837692
Name: cumulative rating, dtype: float64

In [134]:
# Collapse duplicate movie_ids by summing their weighted scores, then rank
# from highest to lowest total.
simCandidates = (
    simCandidates
    .groupby(simCandidates.index)
    .sum()
    .sort_values(ascending=False)
)
# Top ten candidates shown with their titles. Movies the user already rated
# are not filtered out here.
pd.concat(
    [simCandidates.head(10), movies.loc[simCandidates.head(10).index]],
    axis=1,
)

Unnamed: 0,cumulative rating,title
172,8.87745,"Empire Strikes Back, The (1980)"
50,8.870971,Star Wars (1977)
181,7.178172,Return of the Jedi (1983)
174,5.5197,Raiders of the Lost Ark (1981)
210,3.488028,Indiana Jones and the Last Crusade (1989)
199,3.366616,"Bridge on the River Kwai, The (1957)"
204,3.357941,Back to the Future (1985)
194,3.329843,"Sting, The (1973)"
418,3.245412,Cinderella (1950)
215,3.222311,Field of Dreams (1989)
