In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_df = pd.read_csv("E:datamining-master/movies.csv", usecols=['movieId','title'], 
                        sep=';',dtype={'movieId': 'int32', 'title': 'str'})

In [3]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
movies_df.shape

(9742, 2)

In [5]:
rating_df = pd.read_csv("E:datamining-master/ratings.csv", usecols=['userId','movieId','rating'], 
                        sep=';',dtype={'userId': 'int32', 'movieId': 'int32','rating': 'float32'})

In [6]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
rating_df.shape

(100836, 3)

In [8]:
df = pd.merge(rating_df,movies_df, on = 'movieId')
df

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)
...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997)
100832,610,160527,4.5,Sympathy for the Underdog (1971)
100833,610,160836,3.0,Hazard (2005)
100834,610,163937,3.5,Blair Witch (2016)


In [9]:
combine_movie_rating = df.dropna(axis = 0, subset = ['title'])
movie_ratingCount = (combine_movie_rating.groupby(by = ['title'])['rating'].count().reset_index().rename(columns = {'rating': 'totalRatingCount'})[['title','totalRatingCount']])
movie_ratingCount

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [10]:
rating_with_totalRatingCount = combine_movie_rating.merge(movie_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [11]:
pd.set_option('display.float.format', lambda x: '%.3f' %x)
print(movie_ratingCount['totalRatingCount'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [12]:
popularity_treshold = 300
rating_popular_movie = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_treshold')
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
1819,1,296,3.0,Pulp Fiction (1994),307
1820,4,296,1.0,Pulp Fiction (1994),307
1821,5,296,5.0,Pulp Fiction (1994),307
1822,6,296,2.0,Pulp Fiction (1994),307
1823,8,296,4.0,Pulp Fiction (1994),307


In [13]:
rating_popular_movie.shape

(953, 5)

In [14]:
## first lets create a pivot matrix

movie_features_df = rating_popular_movie.pivot_table(index = 'title', columns = 'userId', values = 'rating').fillna(0)
movie_features_df.head()

userId,1,2,4,5,6,7,8,10,11,14,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Forrest Gump (1994),4.0,0.0,0.0,0.0,5.0,5.0,3.0,3.5,5.0,4.0,...,0.0,3.0,3.0,0.0,3.0,4.0,0.0,3.0,4.0,3.0
Pulp Fiction (1994),3.0,0.0,1.0,5.0,2.0,0.0,4.0,1.0,0.0,3.0,...,0.0,5.0,5.0,5.0,2.0,5.0,3.0,5.0,4.0,5.0
Shawshank Redemption The (1994),0.0,3.0,0.0,3.0,5.0,0.0,5.0,0.0,4.0,3.0,...,5.0,5.0,0.0,0.0,0.0,3.5,5.0,4.5,4.0,3.0


In [15]:
from scipy.sparse import csr_matrix
movie_features_df_matrix = csr_matrix(movie_features_df.values)

from sklearn.neighbors import NearestNeighbors
model_kkn = NearestNeighbors(metric = 'cosine', algorithm= 'brute')
model_kkn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [16]:
movie_features_df.head()

userId,1,2,4,5,6,7,8,10,11,14,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Forrest Gump (1994),4.0,0.0,0.0,0.0,5.0,5.0,3.0,3.5,5.0,4.0,...,0.0,3.0,3.0,0.0,3.0,4.0,0.0,3.0,4.0,3.0
Pulp Fiction (1994),3.0,0.0,1.0,5.0,2.0,0.0,4.0,1.0,0.0,3.0,...,0.0,5.0,5.0,5.0,2.0,5.0,3.0,5.0,4.0,5.0
Shawshank Redemption The (1994),0.0,3.0,0.0,3.0,5.0,0.0,5.0,0.0,4.0,3.0,...,5.0,5.0,0.0,0.0,0.0,3.5,5.0,4.5,4.0,3.0


In [52]:
query_index = np.random.choice(movie_features_df.shape[0])
print(query_index)
distance, indices = model_kkn.kneighbors(movie_features_df.iloc[query_index,:].values.reshape(1,-1), n_neighbors = 12)

2


ValueError: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 12

In [49]:
movie_features_df.head()

userId,1,2,4,5,6,7,8,10,11,14,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Forrest Gump (1994),4.0,0.0,0.0,0.0,5.0,5.0,3.0,3.5,5.0,4.0,...,0.0,3.0,3.0,0.0,3.0,4.0,0.0,3.0,4.0,3.0
Pulp Fiction (1994),3.0,0.0,1.0,5.0,2.0,0.0,4.0,1.0,0.0,3.0,...,0.0,5.0,5.0,5.0,2.0,5.0,3.0,5.0,4.0,5.0
Shawshank Redemption The (1994),0.0,3.0,0.0,3.0,5.0,0.0,5.0,0.0,4.0,3.0,...,5.0,5.0,0.0,0.0,0.0,3.5,5.0,4.5,4.0,3.0


In [51]:
for i in range(0, len(distance.flatten())):
    if i == 0: 
        print('Recommendation for {0}:\n'.format(movie_features_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}: '.format(i,movie_features_df.index[indices.flatten()[i]], distance.flatten()[i]))

Recommendation for Forrest Gump (1994):

1: Shawshank Redemption The (1994), with distance of 0.2870066165924072: 
