# This is a collaborative based filtering using nearest neighbours

In [18]:
import pandas as pd
import numpy as np

In [19]:
df_movies = pd.read_csv('movies.csv',usecols = ['movieId','title'],dtype = {'movieId':'int32','title':'str'})

In [20]:
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [21]:
df_ratings = pd.read_csv("ratings.csv",usecols = ['userId','movieId','rating'],dtype ={'userId':'int32','movieId':'int32','rating':'float32'})
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [22]:
df = pd.merge(df_ratings,df_movies, on = 'movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [23]:
combine_rating = df.dropna(axis = 0, subset = ['title'])
movie_rating_count = (combine_rating.groupby(by = ['title'])['rating'].count().reset_index().rename(columns={'rating':'Totalratingcount'})
[['title','Totalratingcount']])

movie_rating_count.head()

Unnamed: 0,title,Totalratingcount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


# left inner join

In [24]:
ratings_df = combine_rating.merge(movie_rating_count, left_on = 'title',right_on = 'title',how = 'left')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,title,Totalratingcount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [25]:
# adjusting the display format for total ratingscount and getting the descriptive function
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(ratings_df['Totalratingcount'].describe())

count   100836.000
mean        58.759
std         61.965
min          1.000
25%         13.000
50%         39.000
75%         84.000
max        329.000
Name: Totalratingcount, dtype: float64


# here we are considering a parameter for selecting the movie the minimum ratinh should be above 40 for the movie to be considered


In [26]:
popularity_threshold = 40
rating_popular_movie = ratings_df.query('Totalratingcount >= @popularity_threshold')
rating_popular_movie.tail(30)

Unnamed: 0,userId,movieId,rating,title,Totalratingcount
82280,156,6218,4.5,Bend It Like Beckham (2002),40
82281,169,6218,4.0,Bend It Like Beckham (2002),40
82282,199,6218,3.5,Bend It Like Beckham (2002),40
82283,200,6218,4.0,Bend It Like Beckham (2002),40
82284,226,6218,2.5,Bend It Like Beckham (2002),40
82285,254,6218,1.5,Bend It Like Beckham (2002),40
82286,263,6218,4.0,Bend It Like Beckham (2002),40
82287,275,6218,5.0,Bend It Like Beckham (2002),40
82288,280,6218,3.0,Bend It Like Beckham (2002),40
82289,286,6218,3.0,Bend It Like Beckham (2002),40


In [27]:
rating_popular_movie.shape

(49630, 5)

# creating the pivot matrix for better understanding 

In [29]:
# there wil be null as as all users have not given ratings for all movies

In [28]:
features_df = rating_popular_movie.pivot_table(index = 'title',columns = 'userId',values = 'rating').fillna(0)
features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


  # converting it into array matrix

In [31]:
from scipy.sparse import csr_matrix

In [32]:
features_df_matrix = csr_matrix(features_df.values)

In [35]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine',algorithm = 'brute')
model_knn.fit(features_df_matrix)

In [36]:
features_df.shape

(638, 608)

In [38]:
query_index = np.random.choice(features_df.shape[0])
print(query_index)
distances,indices = model_knn.kneighbors(features_df.iloc[query_index,:].values.reshape(1,-1),n_neighbors = 6)

128


In [39]:
features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
for i in range(0,len(distances.flatten())):
    if i == 0:
        print("Recommendations for {0}:\n".format(features_df.index[query_index]))
    else:
        print('{0}:{1},with distance of {2}'.format(i,features_df.index[indices.flatten()[i]],distances.flatten()[i]))

Recommendations for Cinderella (1950):

1:Peter Pan (1953),with distance of 0.35332638025283813
2:Alice in Wonderland (1951),with distance of 0.44317561388015747
3:Little Mermaid, The (1989),with distance of 0.458329975605011
4:Snow White and the Seven Dwarfs (1937),with distance of 0.46339863538742065
5:Pinocchio (1940),with distance of 0.4780101180076599
