In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# Read the movie, rating and tag tables from the CSV files in the working directory,
# then preview the first rows of the movie table.
movies, ratings, tags = map(pd.read_csv, ('movies.csv', 'ratings.csv', 'tags.csv'))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Preview the first five rating records.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [4]:
# Preview the first five tag records.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [5]:
# Size of the full ratings table as (rows, columns).
ratings.shape

(20000263, 4)

In [6]:
# Number of distinct users that appear in the ratings table.
ratings['userId'].nunique(dropna=False)

138493

In [7]:
# Keep only users who have submitted more than 100 ratings.
# rename_axis() + reset_index(name=...) label both columns explicitly, so the
# result does not depend on the default names value_counts() gives its output
# (those defaults differ between pandas 1.x and pandas 2.x, which would otherwise
# make the rename below silently label the wrong column).
x = (
    ratings['userId']
    .value_counts()
    .rename_axis('User ID')
    .reset_index(name='No. of Ratings')
)
x = x.loc[x['No. of Ratings'] > 100]
x

Unnamed: 0,User ID,No. of Ratings
0,118205,9254
1,8405,7515
2,82418,5646
3,121535,5520
4,125794,5491
...,...,...
51864,565,101
51865,84809,101
51866,63209,101
51867,29682,101


In [8]:
# Inner-join each rating with its user's row in x. x only holds users with more
# than 100 ratings, so ratings from every other user are dropped by the join.
ratings_refined = pd.merge(
    ratings,
    x,
    how='inner',
    left_on='userId',
    right_on='User ID',
)
ratings_refined

Unnamed: 0,userId,movieId,rating,timestamp,User ID,No. of Ratings
0,1,2,3.5,1112486027,1,175
1,1,29,3.5,1112484676,1,175
2,1,32,3.5,1112484819,1,175
3,1,47,3.5,1112484727,1,175
4,1,50,3.5,1112484580,1,175
...,...,...,...,...,...,...
15970199,138493,68954,4.5,1258126920,138493,373
15970200,138493,69526,4.5,1259865108,138493,373
15970201,138493,69644,3.0,1260209457,138493,373
15970202,138493,70286,5.0,1258126944,138493,373


In [9]:
# Keep only the columns needed downstream; this discards 'timestamp' and the two
# helper columns ('User ID', 'No. of Ratings') that came from the merge with x.
# Selecting the kept columns (instead of dropping the unwanted ones) makes the
# cell safe to re-run: a second run no longer raises KeyError for columns that
# were already removed.
ratings_refined = ratings_refined[['userId', 'movieId', 'rating']]

In [10]:
# Attach title and genres to every retained rating by joining on movieId
# (inner join: only movies that have at least one retained rating remain).
movie_details = pd.merge(movies, ratings_refined, how='inner', on='movieId')
movie_details

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22,3.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,23,4.0
...,...,...,...,...,...
15970199,131254,Kein Bund für's Leben (2007),Comedy,79570,4.0
15970200,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,79570,4.0
15970201,131258,The Pirates (2014),Adventure,28906,2.5
15970202,131260,Rentun Ruusu (2001),(no genres listed),65409,3.0


In [11]:
# Keep only movies that received more than 25 ratings in movie_details.
# rename_axis() + reset_index(name=...) label both columns explicitly, so the
# result does not depend on the default names value_counts() gives its output
# (those defaults differ between pandas 1.x and pandas 2.x, which would otherwise
# make the rename below silently label the wrong column).
x = (
    movie_details['movieId']
    .value_counts()
    .rename_axis('Movie ID')
    .reset_index(name='No. of Ratings')
)
x = x.loc[x['No. of Ratings'] > 25]
x

Unnamed: 0,Movie ID,No. of Ratings
0,356,37657
1,296,37016
2,593,35723
3,480,35204
4,2571,34515
...,...,...
12232,96510,26
12233,33847,26
12234,62925,26
12235,60992,26


In [12]:
# Keep only the rows of movie_details whose movie is listed in x (movies with
# more than 25 ratings). A boolean mask gives the same rows, in the same order,
# as an inner merge followed by dropping the joined columns, but avoids
# materialising the two extra columns on every row first.
# reset_index(drop=True) restores the 0..n-1 index an inner merge would produce.
movie_details_refined = (
    movie_details[movie_details['movieId'].isin(x['Movie ID'])]
    .reset_index(drop=True)
)
movie_details_refined

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22,3.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,23,4.0
...,...,...,...,...,...
15887081,127098,Louis C.K.: Live at The Comedy Store (2015),Comedy,115134,5.0
15887082,127098,Louis C.K.: Live at The Comedy Store (2015),Comedy,117144,3.5
15887083,127098,Louis C.K.: Live at The Comedy Store (2015),Comedy,121631,3.5
15887084,127098,Louis C.K.: Live at The Comedy Store (2015),Comedy,134567,3.5


In [13]:
# Keep a single row per (title, userId) pair -- the first one encountered -- and
# report the resulting table size.
movie_details_refined = movie_details_refined.drop_duplicates(
    subset=['title', 'userId'],
    keep='first',
)
movie_details_refined.shape

(15886788, 5)

In [14]:
# Build the title-by-user rating table: one row per title, one column per user,
# cell = that user's rating of that title (NaN where the user gave no rating).
movie_pivot = pd.pivot_table(
    movie_details_refined,
    values='rating',
    index='title',
    columns='userId',
)
movie_pivot

userId,1,3,7,11,14,18,21,22,23,24,...,138471,138472,138474,138475,138477,138483,138484,138486,138490,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",,,,,,,,,,,...,,,,,,,,,,
$9.99 (2008),,,,,,,,,,,...,,,,,,,,,,
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies (2006),,,,,,,,,,,...,,,,,,,,,,
xXx (2002),,,,,,,,,,,...,,,,,1.0,,,,,
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,
¡Three Amigos! (1986),,,2.0,,,,,,,,...,,,,,,,,,,


In [15]:
# Dimensions of the pivot table: (number of titles, number of users).
movie_pivot.shape

(12234, 51869)

In [16]:
# Replace every NaN cell (a title/user pair with no rating) with 0, modifying
# movie_pivot in place, then display the filled table.
# NOTE(review): after this step 0 stands for "not rated"; that is only
# unambiguous if 0 is never a real rating value -- confirm against the data.
movie_pivot.fillna(0,inplace=True)
movie_pivot

userId,1,3,7,11,14,18,21,22,23,24,...,138471,138472,138474,138475,138477,138483,138484,138486,138490,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies (2006),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
¡Three Amigos! (1986),0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Store the rating table as a CSR sparse matrix: explicit zeros are not kept,
# which saves memory, and they do not affect the results of the computation.
from scipy.sparse import csr_matrix

movie_pivot_sparse = csr_matrix(movie_pivot.to_numpy())

In [18]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search with cosine distance; each query
# returns 7 neighbours by default.
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=7)

# Fit on the sparse title-by-user matrix (one sample per title). The call is
# left as the cell's last expression so the fitted estimator is displayed.
model.fit(movie_pivot_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=7)

In [19]:
def movie_recommend(moviename):
    """Return the titles of the nearest neighbours of ``moviename``.

    The title is looked up in the index of the global ``movie_pivot`` table,
    that movie's rating row is passed to the fitted global ``model``, and the
    neighbour positions returned by ``model.kneighbors`` are mapped back to
    titles.

    Parameters
    ----------
    moviename : str
        Title exactly as it appears in ``movie_pivot.index``,
        e.g. ``"Spider (2002)"``.

    Returns
    -------
    pandas.Index
        Neighbour titles in the order returned by ``model.kneighbors``. The
        result is not filtered, so the queried title itself may be part of it.

    Raises
    ------
    IndexError
        If ``moviename`` does not occur in ``movie_pivot.index``.
    """
    # Vectorised lookup of the row position; the first match is used, which is
    # what a top-to-bottom scan of the index would find.
    positions = np.flatnonzero(movie_pivot.index == moviename)
    if positions.size == 0:
        raise IndexError(f"movie {moviename!r} not found in movie_pivot.index")
    ratings_row = movie_pivot.iloc[positions[0], :]

    # kneighbors expects a 2D array, hence reshape(1, -1) for the single query row.
    _, suggestions = model.kneighbors(ratings_row.values.reshape(1, -1))

    # One query row in -> one row of neighbour positions out.
    return movie_pivot.index[suggestions[0]]

In [20]:
# Look up the neighbours of one example title.
recommendation = movie_recommend(moviename="Spider (2002)")

In [21]:
# Print the recommended titles, one per line.
for title in recommendation:
    print(title)

Spider (2002)
Videodrome (1983)
Naked Lunch (1991)
Man Who Wasn't There, The (2001)
Mulholland Drive (2001)
Elephant (2003)
History of Violence, A (2005)
