In [25]:
#Import all libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Read the two input CSV files, keeping only the columns used below and
# pinning explicit dtypes for each of them.
# Movies: movieId and title.
movies_df = pd.read_csv(
    'movies.csv',
    dtype={'movieId': 'int32', 'title': 'str'},
    usecols=['movieId', 'title'],
)
# Ratings: one (userId, movieId, rating) record per row.
rating_df = pd.read_csv(
    'ratings.csv',
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
    usecols=['userId', 'movieId', 'rating'],
)

In [3]:
# Preview the first five rows of the movies frame
movies_df.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Preview the first five rows of the ratings frame
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# (rows, columns) of the ratings frame
rating_df.shape

(100836, 3)

In [6]:
# (rows, columns) of the movies frame
movies_df.shape

(9742, 2)

In [7]:
# Attach the movie title to every rating by joining on the shared movieId key.
# No `how` is given, so this is pandas' default inner join.
df = rating_df.merge(movies_df, on='movieId')

In [8]:
# Preview the first five rows of the merged ratings + titles frame
df.head(n=5)

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [9]:
# Count the missing values in each column of the merged frame
df.isna().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

In [11]:
# Group by the title and count how many ratings each title has received.
# NOTE(review): the key is the title, not movieId; if two distinct movieIds
# share a title their ratings are counted together - confirm this is intended.
movie_ratingCount = (
    df.groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)

In [12]:
# Preview the first five per-title rating counts
movie_ratingCount.head(n=5)

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [13]:
# Add each title's total rating count to every rating row.
# A left join keeps every row of the merged ratings frame.
rating_with_totalRatingCount = pd.merge(
    df,
    movie_ratingCount,
    how='left',
    on='title',
)

In [14]:
# Preview the first five rating rows with their per-title counts attached
rating_with_totalRatingCount.head(n=5)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [15]:
# Summary statistics of the number of ratings received per title.
# Floats are rendered with three decimals from here on (global display option).
pd.options.display.float_format = lambda value: '%.3f' % value
print(movie_ratingCount['totalRatingCount'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [17]:
# Keep only the ratings of "popular" titles, i.e. titles whose total rating
# count reaches the threshold below.
popularity_threshold = 50
rating_popular_movie = rating_with_totalRatingCount.loc[
    lambda frame: frame['totalRatingCount'] >= popularity_threshold
]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [18]:
# (rows, columns) of the ratings that survive the popularity filter
rating_popular_movie.shape

(41362, 5)

In [19]:
# Number of distinct titles that passed the popularity filter
rating_popular_movie['title'].nunique()

450

In [20]:
# Number of distinct titles in the full merged frame, for comparison
df['title'].nunique()

9719

In [23]:
# Build the title x user rating matrix: one row per title, one column per
# userId, cell = rating (pivot_table's default aggregation is the mean when a
# title/user pair occurs more than once). Missing pairs are filled with 0.
moview_feature_df = pd.pivot_table(
    rating_popular_movie,
    values='rating',
    index='title',
    columns='userId',
).fillna(0)
moview_feature_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [26]:
# Convert the pivot table to a compressed sparse row matrix for the KNN model
movie_feature_matrix = csr_matrix(moview_feature_df.to_numpy())

In [27]:
# Set up a brute-force nearest-neighbour search using cosine distance,
# then fit it on the sparse title x user matrix.
model_knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)
model_knn.fit(movie_feature_matrix)

In [28]:
# Pick a random row position in the pivot matrix to use as the query movie.
# The generator is seeded so that Restart & Run All selects the same movie on
# every run; change the seed to explore a different query.
rng = np.random.default_rng(42)
# integers(n) draws uniformly from [0, n); int() keeps the displayed value a
# plain Python integer.
query_index = int(rng.integers(moview_feature_df.shape[0]))
query_index

389

In [29]:
# Look up the 5 nearest rows to the query movie.
# kneighbors expects a 2-D input, so the query row is reshaped to (1, n_users).
query_vector = moview_feature_df.iloc[query_index].to_numpy().reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=5)

In [30]:
# Cosine distances from the query row to each returned neighbour
distances

array([[5.9604645e-08, 4.7734612e-01, 5.2980655e-01, 5.3756940e-01,
        5.4477549e-01]], dtype=float32)

In [31]:
# Row positions (in the pivot matrix) of the returned neighbours
indices

array([[389, 447, 179, 158, 436]], dtype=int64)

In [32]:
# Print the recommendations for the query movie.
# Pair every returned neighbour's row position with its distance.
neighbours = list(zip(indices.flatten(), distances.flatten()))
# Exclude the query movie by its row position rather than assuming it is the
# first hit: when another movie has an identical rating vector the two tie at
# distance 0 and the query may not be returned first.
recommendations = [
    (position, distance)
    for position, distance in neighbours
    if position != query_index
]
# Show the same number of recommendations as before (all hits but one), even
# in the tie case where the query itself was not among the returned hits.
recommendations = recommendations[:max(len(neighbours) - 1, 0)]

print('Recommendation for {0}: \n'.format(moview_feature_df.index[query_index]))
for rank, (position, distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, moview_feature_df.index[position], distance))

Recommendation for Sting, The (1973): 

1: Young Frankenstein (1974), with distance of 0.47734612226486206:
2: Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966), with distance of 0.5298065543174744:
3: Fish Called Wanda, A (1988), with distance of 0.5375694036483765:
4: When Harry Met Sally... (1989), with distance of 0.5447754859924316:
