# KNN

In [22]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [23]:
movies_df = pd.read_csv("C:/Users/exman/datasets/ml-latest-small/movies.csv", usecols=['movieId', 'title'])
ratings_df = pd.read_csv("C:/Users/exman/datasets/ml-latest-small/ratings.csv", usecols=['userId', 'movieId', 'rating'])

In [24]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [25]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [26]:
print('Unique users count: {}'.format(len(ratings_df['userId'].unique())))
print('Unique movies count: {}'.format(len(ratings_df['movieId'].unique())))
print('movies_df shape: {}'.format(movies_df.shape))
print('ratings_df shape: {}'.format(ratings_df.shape))

Unique users count: 610
Unique movies count: 9724
movies_df shape: (9742, 2)
ratings_df shape: (100836, 3)


In [27]:
# movies_df.drop(['genres'], axis=1, inplace=True)
# ratings_df.drop(['timestamp'], axis=1, inplace=True)

In [28]:
sparse_matrix = ratings_df.pivot(index='movieId', columns='userId', values='rating').fillna(0)
sparse_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
sparse_matrix.shape

(9724, 610)

In [30]:
users_votes = ratings_df.groupby('userId')['rating'].agg('count')
movies_votes = ratings_df.groupby('movieId')['rating'].agg('count')
print(users_votes.shape)
print(users_votes)
print(movies_votes.shape)
print(movies_votes)

(610,)
userId
1       232
2        29
3        39
4       216
5        44
       ... 
606    1115
607     187
608     831
609      37
610    1302
Name: rating, Length: 610, dtype: int64
(9724,)
movieId
1         215
2         110
3          52
4           7
5          49
         ... 
193581      1
193583      1
193585      1
193587      1
193609      1
Name: rating, Length: 9724, dtype: int64


In [31]:
user_mask = users_votes[users_votes > 4].index
movie_mask = movies_votes[movies_votes > 10].index
print(user_mask)
print(movie_mask)

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            601, 602, 603, 604, 605, 606, 607, 608, 609, 610],
           dtype='int64', name='userId', length=610)
Int64Index([     1,      2,      3,      5,      6,      7,      9,     10,
                11,     12,
            ...
            159093, 164179, 166528, 168250, 168252, 174055, 176371, 177765,
            179819, 187593],
           dtype='int64', name='movieId', length=2121)


In [32]:
sparse_matrix = sparse_matrix.loc[movie_mask, user_mask]
sparse_matrix.shape

(2121, 610)

In [33]:
csr_data = csr_matrix(sparse_matrix.values)
print(csr_data[:6,:6])

  (0, 0)	4.0
  (0, 4)	4.0
  (1, 5)	4.0
  (2, 0)	4.0
  (2, 5)	5.0
  (3, 5)	5.0
  (4, 0)	4.0
  (4, 5)	4.0
  (5, 5)	4.0


In [34]:
sparse_matrix = sparse_matrix.rename_axis(None, axis=1).reset_index()

In [35]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [36]:
import pickle

knnPickle = open('C:/Users/exman/datasets/ml-latest-small/knnpickle_file', 'wb') 
pickle.dump(knn, knnPickle)  
knnPickle.close()

In [37]:
recommendations = 10
search_word = 'Matrix'

In [38]:
movie_search = movies_df[movies_df['title'].str.contains(search_word)]
movie_search

Unnamed: 0,movieId,title
1939,2571,"Matrix, The (1999)"
4351,6365,"Matrix Reloaded, The (2003)"
4639,6934,"Matrix Revolutions, The (2003)"


In [39]:
movie_id = movie_search.iloc[0]['movieId']
print(movie_id)

2571


In [40]:
sparse_matrix

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
0,1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
2,3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2116,174055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2117,176371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2118,177765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2119,179819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
movie_id = sparse_matrix[sparse_matrix['movieId'] == movie_id].index[0]
print(movie_id)

901


In [42]:
distances, indices = knn.kneighbors(csr_data[movie_id], n_neighbors=recommendations + 1)
print(distances)
print(indices)

[[3.77475828e-15 2.86062573e-01 2.99065234e-01 3.20385030e-01
  3.36553254e-01 3.39015510e-01 3.45872411e-01 3.48692977e-01
  3.59797288e-01 3.68893092e-01 3.73917268e-01]]
[[ 901 1002  442  735  124  454 1362  954 1536  444 1157]]


In [43]:
indices = indices.squeeze().tolist()
distances = distances.squeeze().tolist()

In [44]:
indices_distances = list(zip(indices, distances))
indices_distances = sorted(indices_distances, key=lambda x: x[1]) #, reverse=True)
indices_distances = indices_distances[1:] # indices_distances[:-1]
indices_distances

[(1002, 0.2860625732802825),
 (442, 0.29906523379388084),
 (735, 0.3203850301290394),
 (124, 0.3365532537559339),
 (454, 0.3390155103982786),
 (1362, 0.3458724112424836),
 (954, 0.34869297680727607),
 (1536, 0.35979728836323155),
 (444, 0.3688930918516805),
 (1157, 0.37391726808726444)]

In [45]:
links_df = pd.read_csv("C:/Users/exman/datasets/ml-latest-small/links.csv", usecols=['movieId', 'imdbId'])

In [46]:
recom_list = []
for elem in indices_distances:
    matrix_movie_id = sparse_matrix.iloc[elem[0]]['movieId']
    id = movies_df[movies_df['movieId'] == matrix_movie_id].index
    title = movies_df.iloc[id]['title'].values[0]
    ref = links_df.iloc[id]['imdbId'].values[0]
    ref = "https://www.imdb.com/title/tt" + str(ref) + '/'
    sim = 1 - elem[1]
    recom_list.append({'movieId': id[0], 'title': title, 'Similarity': sim, 'Ref': ref})

In [47]:
recom_df = pd.DataFrame(recom_list, index=range(1, recommendations+1))
recom_df

Unnamed: 0,movieId,title,Similarity,Ref
1,2226,Fight Club (1999),0.713937,https://www.imdb.com/title/tt137523/
2,898,Star Wars: Episode V - The Empire Strikes Back...,0.700935,https://www.imdb.com/title/tt80684/
3,1503,Saving Private Ryan (1998),0.679615,https://www.imdb.com/title/tt120815/
4,224,Star Wars: Episode IV - A New Hope (1977),0.663447,https://www.imdb.com/title/tt76759/
5,911,Star Wars: Episode VI - Return of the Jedi (1983),0.660984,https://www.imdb.com/title/tt86190/
6,3638,"Lord of the Rings: The Fellowship of the Ring,...",0.654128,https://www.imdb.com/title/tt120737/
7,2078,"Sixth Sense, The (1999)",0.651307,https://www.imdb.com/title/tt167404/
8,4800,"Lord of the Rings: The Return of the King, The...",0.640203,https://www.imdb.com/title/tt167260/
9,900,Raiders of the Lost Ark (Indiana Jones and the...,0.631107,https://www.imdb.com/title/tt82971/
10,2674,Gladiator (2000),0.626083,https://www.imdb.com/title/tt172495/


# Correlation

In [25]:
movies_df = pd.read_csv("C:/Users/exman/datasets/ml-latest-small/movies.csv", usecols=['movieId', 'title'])
ratings_df = pd.read_csv("C:/Users/exman/datasets/ml-latest-small/ratings.csv", usecols=['userId', 'movieId', 'rating'])

In [26]:
sparse_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')
print(sparse_matrix.shape)
sparse_matrix.head()

(610, 9724)


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [27]:
search_word = 'Matrix'
movie_search = movies_df[movies_df['title'].str.contains(search_word)]
movie_search

Unnamed: 0,movieId,title
1939,2571,"Matrix, The (1999)"
4351,6365,"Matrix Reloaded, The (2003)"
4639,6934,"Matrix Revolutions, The (2003)"


In [28]:
movie_id = movie_search.iloc[0]['movieId']
print(movie_id)
#movie_id = sparse_matrix[sparse_matrix['movieId'] == movie_id].index[0]
#print(movie_id)

2571


In [29]:
film_rating_vector = sparse_matrix[movie_id]
#film_rating_vector = film_rating_vector.drop(film_rating_vector.columns[0], axis=1)
print(film_rating_vector)

userId
1      5.0
2      NaN
3      NaN
4      1.0
5      NaN
      ... 
606    5.0
607    5.0
608    5.0
609    NaN
610    5.0
Name: 2571, Length: 610, dtype: float64


In [30]:
similar_movies = sparse_matrix.corrwith(film_rating_vector)
similar_movies = similar_movies.dropna()
similar_movies.head(10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


movieId
1     0.158384
2     0.183638
3     0.249013
5     0.014900
6     0.288204
7     0.002858
9    -0.674200
10    0.172542
11    0.100030
12    0.149941
dtype: float64

In [31]:
similar_movies_df = pd.DataFrame(similar_movies, columns=['Correlation'])
similar_movies_df.head()

Unnamed: 0_level_0,Correlation
movieId,Unnamed: 1_level_1
1,0.158384
2,0.183638
3,0.249013
5,0.0149
6,0.288204


In [32]:
similar_movies_df.sort_values(['Correlation'], ascending=False, inplace=True)
similar_movies_df.head(10)

Unnamed: 0_level_0,Correlation
movieId,Unnamed: 1_level_1
2624,1.0
77798,1.0
2660,1.0
135518,1.0
6554,1.0
42191,1.0
96815,1.0
37545,1.0
1442,1.0
66511,1.0


In [33]:
ratings_df['Num of ratings'] = pd.DataFrame(ratings_df.groupby('movieId')['rating'].agg('count'))
ratings_df.head()
#similar_movies_df = similar_movies_df.join(ratings_df['num of ratings'])
#similar_movies_df.head()

Unnamed: 0,userId,movieId,rating,Num of ratings
0,1,1,4.0,
1,1,3,4.0,215.0
2,1,6,4.0,110.0
3,1,47,5.0,52.0
4,1,50,5.0,7.0


In [34]:
similar_movies_df = similar_movies_df.join(ratings_df['Num of ratings'])
similar_movies_df

Unnamed: 0_level_0,Correlation,Num of ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2624,1.0,2.0
77798,1.0,2.0
2660,1.0,4.0
135518,1.0,
6554,1.0,2.0
...,...,...
8506,-1.0,2.0
80139,-1.0,3.0
8482,-1.0,2.0
8327,-1.0,2.0


In [35]:
similar_movies_df = similar_movies_df[similar_movies_df['Num of ratings'] > 50]

similar_movies_df.head()

Unnamed: 0_level_0,Correlation,Num of ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2571,1.0,278.0
63082,0.613839,71.0
59784,0.612549,54.0
266,0.567155,68.0
91529,0.557125,76.0


In [36]:
similar_movies_df.reset_index(inplace=True)

In [37]:
similar_movies_df

Unnamed: 0,movieId,Correlation,Num of ratings
0,2571,1.000000,278.0
1,63082,0.613839,71.0
2,59784,0.612549,54.0
3,266,0.567155,68.0
4,91529,0.557125,76.0
...,...,...,...
426,1282,-0.241073,53.0
427,2599,-0.247055,56.0
428,168,-0.268982,54.0
429,432,-0.293258,55.0


In [38]:
final = similar_movies_df.merge(movies_df)

In [39]:
final.head(10)

Unnamed: 0,movieId,Correlation,Num of ratings,title
0,2571,1.0,278.0,"Matrix, The (1999)"
1,63082,0.613839,71.0,Slumdog Millionaire (2008)
2,59784,0.612549,54.0,Kung Fu Panda (2008)
3,266,0.567155,68.0,Legends of the Fall (1994)
4,91529,0.557125,76.0,"Dark Knight Rises, The (2012)"
5,1036,0.544466,145.0,Die Hard (1988)
6,6365,0.522551,96.0,"Matrix Reloaded, The (2003)"
7,79132,0.514767,143.0,Inception (2010)
8,509,0.497214,61.0,"Piano, The (1993)"
9,110,0.496045,237.0,Braveheart (1995)


In [40]:
final.drop(0, axis=0, inplace=True)
final.drop('Num of ratings', axis=1, inplace=True)
final.head(15)

Unnamed: 0,movieId,Correlation,title
1,63082,0.613839,Slumdog Millionaire (2008)
2,59784,0.612549,Kung Fu Panda (2008)
3,266,0.567155,Legends of the Fall (1994)
4,91529,0.557125,"Dark Knight Rises, The (2012)"
5,1036,0.544466,Die Hard (1988)
6,6365,0.522551,"Matrix Reloaded, The (2003)"
7,79132,0.514767,Inception (2010)
8,509,0.497214,"Piano, The (1993)"
9,110,0.496045,Braveheart (1995)
10,72998,0.493241,Avatar (2009)


In [41]:
final_new = final[0:10]
final_new

Unnamed: 0,movieId,Correlation,title
1,63082,0.613839,Slumdog Millionaire (2008)
2,59784,0.612549,Kung Fu Panda (2008)
3,266,0.567155,Legends of the Fall (1994)
4,91529,0.557125,"Dark Knight Rises, The (2012)"
5,1036,0.544466,Die Hard (1988)
6,6365,0.522551,"Matrix Reloaded, The (2003)"
7,79132,0.514767,Inception (2010)
8,509,0.497214,"Piano, The (1993)"
9,110,0.496045,Braveheart (1995)
10,72998,0.493241,Avatar (2009)


In [42]:
recom_df_new = recom_df.reindex(columns=['movieId', 'Similarity', 'title'])
recom_df_new

Unnamed: 0,movieId,Similarity,title
1,2226,0.713937,Fight Club (1999)
2,898,0.700935,Star Wars: Episode V - The Empire Strikes Back...
3,1503,0.679615,Saving Private Ryan (1998)
4,224,0.663447,Star Wars: Episode IV - A New Hope (1977)
5,911,0.660984,Star Wars: Episode VI - Return of the Jedi (1983)
6,3638,0.654128,"Lord of the Rings: The Fellowship of the Ring,..."
7,2078,0.651307,"Sixth Sense, The (1999)"
8,4800,0.640203,"Lord of the Rings: The Return of the King, The..."
9,900,0.631107,Raiders of the Lost Ark (Indiana Jones and the...
10,2674,0.626083,Gladiator (2000)


# Merge

In [43]:
recomm = final_new.merge(recom_df_new, how='outer')
recomm.fillna(0, inplace=True)
recomm['Sum'] = [0] * 20
for i in range(len(recomm)):
    recomm['Sum'][i] += recomm['Correlation'][i] * 0.4 + recomm['Similarity'][i] * 0.6
    
recomm.sort_values(['Sum'], ascending=False, inplace=True)
recomm.drop(['Correlation', 'Similarity'], axis=1, inplace=True)
recomm

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recomm['Sum'][i] += recomm['Correlation'][i] * 0.4 + recomm['Similarity'][i] * 0.6


Unnamed: 0,movieId,title,Sum
10,2226,Fight Club (1999),0.428362
11,898,Star Wars: Episode V - The Empire Strikes Back...,0.420561
12,1503,Saving Private Ryan (1998),0.407769
13,224,Star Wars: Episode IV - A New Hope (1977),0.398068
14,911,Star Wars: Episode VI - Return of the Jedi (1983),0.396591
15,3638,"Lord of the Rings: The Fellowship of the Ring,...",0.392477
16,2078,"Sixth Sense, The (1999)",0.390784
17,4800,"Lord of the Rings: The Return of the King, The...",0.384122
18,900,Raiders of the Lost Ark (Indiana Jones and the...,0.378664
19,2674,Gladiator (2000),0.37565
