In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
movies_df = pd.read_csv('movies_small.csv')
ratings_df = pd.read_csv('ratings_small.csv')

In [30]:
movies_df.shape

(9742, 3)

### Data preprocessing

Separating the year from the title

In [31]:
def rm_dates_from_title(df: pd.DataFrame) -> None:
  """Split the release year out of the ``title`` column, modifying ``df`` in place.

  Adds a ``year`` column holding the first four-digit group found inside
  parentheses in each title (kept as a string; missing when the title has no
  such group). Then removes every parenthesised four-digit group from
  ``title`` and strips leading/trailing whitespace.

  Args:
    df: Frame with a string ``title`` column. It is mutated; nothing is returned.
  """
  titles = df['title']
  df['year'] = titles.str.extract(r'\((\d{4})\)', expand=False)
  # Vectorised strip: unlike a per-element ``x.strip()``, it leaves missing
  # titles as missing instead of raising AttributeError on a float NaN.
  df['title'] = titles.str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

rm_dates_from_title(movies_df)
movies_df.sample(5)

Unnamed: 0,movieId,title,genres,year
6431,51573,Meshes of the Afternoon,Fantasy,1943
2914,3909,Woman on Top,Comedy|Romance,2000
5410,25788,Scarface,Crime|Drama,1932
1421,1944,From Here to Eternity,Drama|Romance|War,1953
2051,2730,Barry Lyndon,Drama|Romance|War,1975


Splitting the genre column by '|' and converting it into a list

In [32]:
movies_df.genres = movies_df.genres.str.split('|')
movies_df.sample(5)

Unnamed: 0,movieId,title,genres,year
2523,3379,On the Beach,[Drama],1959
7294,75816,Women in Trouble,[Comedy],2009
7789,91947,"Revenant, The","[Comedy, Horror]",2009
2169,2885,Guinevere,"[Drama, Romance]",1999
1233,1642,Indian Summer (a.k.a. Alive & Kicking),"[Comedy, Drama]",1996


In [33]:
ratings_df['userId'].value_counts()

414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
442      20
569      20
320      20
576      20
53       20
Name: userId, Length: 610, dtype: int64

In [34]:
ratings_df['userId'].unique().shape

(610,)

Keeping only users who rated more than 10 movies (every user in this dataset has at least 20 ratings, so no users are actually dropped)

In [35]:
# Boolean Series indexed by userId: True where the user has more than 10 ratings.
x = ratings_df['userId'].value_counts() > 10

In [36]:
x[x].shape

(610,)

In [37]:
y = x[x].index

In [38]:
y

Int64Index([414, 599, 474, 448, 274, 610,  68, 380, 606, 288,
            ...
            147, 257, 406, 431, 278, 442, 569, 320, 576,  53],
           dtype='int64', length=610)

In [39]:
ratings_df = ratings_df[ratings_df['userId'].isin(y)]

In [40]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [41]:
ratings_df.shape

(100836, 4)

In [42]:
ratings_with_movies = ratings_df.merge(movies_df, on='movieId')

In [43]:
ratings_with_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1,1,4.0,964982703,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,5,1,4.0,847434962,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
2,7,1,4.5,1106635946,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
3,15,1,2.5,1510577970,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
4,17,1,4.5,1305696483,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995


In [44]:
ratings_with_movies.shape

(100836, 7)

In [45]:
num_rating = ratings_with_movies.groupby('title')['rating'].count().reset_index()

In [46]:
num_rating.head()

Unnamed: 0,title,rating
0,'71,1
1,'Hellboy': The Seeds of Creation,1
2,'Round Midnight,2
3,'Salem's Lot,1
4,'Til There Was You,2


In [47]:
num_rating.rename(columns={'rating': 'num_of_rating'}, inplace=True)

In [48]:
num_rating.head()

Unnamed: 0,title,num_of_rating
0,'71,1
1,'Hellboy': The Seeds of Creation,1
2,'Round Midnight,2
3,'Salem's Lot,1
4,'Til There Was You,2


In [49]:
final_rating = ratings_with_movies.merge(num_rating, on='title')

In [50]:
final_rating.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,num_of_rating
0,1,1,4.0,964982703,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215
1,5,1,4.0,847434962,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215


In [51]:
final_rating.shape

(100836, 8)

In [52]:
final_rating = final_rating[final_rating['num_of_rating']>=10]

In [53]:
final_rating.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,num_of_rating
19147,524,21,3.0,852404913,Get Shorty,"[Comedy, Crime, Thriller]",1995,89
42589,483,1791,2.0,1230468666,Twilight,"[Crime, Drama, Thriller]",1998,25
19004,525,3949,4.0,1476478055,Requiem for a Dream,[Drama],2000,96
54888,483,34150,2.0,1415576529,Fantastic Four,"[Action, Adventure, Sci-Fi]",2005,41
64193,292,3476,2.0,1015375411,Jacob's Ladder,"[Horror, Mystery]",1990,26


In [54]:
final_rating.shape

(81761, 8)

In [55]:
# Keep one row per (user, title) pair. The result is reassigned rather than
# dropped in place because final_rating is a boolean-filtered slice, and
# mutating such a slice in place can emit SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(subset=['userId', 'title'])

In [56]:
final_rating.shape

(81422, 8)

In [57]:
final_rating

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,num_of_rating
0,1,1,4.0,964982703,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215
1,5,1,4.0,847434962,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215
2,7,1,4.5,1106635946,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215
3,15,1,2.5,1510577970,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215
4,17,1,4.5,1305696483,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215
...,...,...,...,...,...,...,...,...
96407,558,5128,1.0,1035415850,Queen of the Damned,"[Fantasy, Horror]",2002,11
96408,599,5128,2.5,1498514642,Queen of the Damned,"[Fantasy, Horror]",2002,11
96409,600,5128,2.5,1237714385,Queen of the Damned,"[Fantasy, Horror]",2002,11
96410,605,5128,3.0,1277176955,Queen of the Damned,"[Fantasy, Horror]",2002,11


In [58]:
# One row per title, one column per userId, cells hold the rating.
# pivot_table uses its default aggregation (mean) if a (title, userId) pair
# appears more than once; pairs with no rating become NaN.
movie_pivot = final_rating.pivot_table(columns='userId', index='title', values='rating')

In [59]:
movie_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The",,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer,,,,,,,,,,,...,,,,,,,,,,3.5
10 Cloverfield Lane,,,,,,,,,,,...,,,,,,,,,,4.0
10 Things I Hate About You,,,,,,,,,,,...,,,3.0,,5.0,,,,,
"10,000 BC",,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoolander,,,,,,,,,,,...,,,,,,,,3.0,,4.0
Zootopia,,,,,,,,,,,...,4.5,,,,,,,,,4.0
eXistenZ,,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx,,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0


In [60]:
movie_pivot.shape

(2269, 610)

In [61]:
# Replace missing ratings (NaN) with 0, in place; after this a 0 means
# "this user did not rate this title", not a rating of zero.
movie_pivot.fillna(0, inplace=True)

In [62]:
movie_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Cloverfield Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
"10,000 BC",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoolander,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Zootopia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
eXistenZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0


In [63]:
from scipy.sparse import csr_matrix

In [64]:
movie_sparse = csr_matrix(movie_pivot)

In [38]:
movie_sparse

<6106x26599 sparse matrix of type '<class 'numpy.float64'>'
	with 11821205 stored elements in Compressed Sparse Row format>

In [39]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute')

In [40]:
model.fit(movie_sparse)

In [41]:
pip install --user scikit-learn threadpoolctl




In [42]:
distance, suggestion = model.kneighbors(movie_pivot.iloc[237,:].values.reshape(1,-1), n_neighbors=6)

In [43]:
distance

array([[ 0.        , 81.91153765, 82.23442102, 82.87038072, 83.17301245,
        83.30066026]])

In [44]:
suggestion

array([[ 237, 2113, 4289, 1546, 4891, 2204]], dtype=int64)

In [45]:
# Each row of `suggestion` holds the positional indices of one query's
# neighbours; look them up in the pivot index and show the titles.
for neighbor_positions in suggestion:
  print(movie_pivot.index[neighbor_positions])

Index(['Alpha Dog', 'From Justin to Kelly', 'Pokémon Heroes', 'Disaster Movie',
       'Shanghai Surprise', 'Ghoulies II'],
      dtype='object', name='title')


In [46]:
movies_name = movie_pivot.index

In [47]:
import pickle
import os

# Persist the fitted model and the tables it was built from under ./artifacts.
directory = 'artifacts'
os.makedirs(directory, exist_ok=True)  # no error if the folder already exists

artifacts = {
  'model.pkl': model,
  'movies_name.pkl': movies_name,
  'final_rating.pkl': final_rating,
  'movie_pivot.pkl': movie_pivot,
}
for filename, obj in artifacts.items():
  # The context manager flushes and closes each file as soon as it is written,
  # instead of leaving the handle open until garbage collection.
  with open(os.path.join(directory, filename), 'wb') as fh:
    pickle.dump(obj, fh)

print(os.listdir(directory))

['final_rating.pkl', 'model.pkl', 'movies_name.pkl', 'movie_pivot.pkl']


In [48]:
def recommend_movie(movie_name, n_neighbors=6):
  """Print the titles nearest to ``movie_name`` according to the fitted model.

  Looks ``movie_name`` up in the index of the notebook-level ``movie_pivot``
  frame, queries the notebook-level ``model`` with that title's row of
  ratings, and prints one title per line in the order ``kneighbors`` returns
  them. In the run recorded in this notebook the first printed title is the
  queried movie itself.

  Args:
    movie_name: Title to look up; must equal an entry of ``movie_pivot.index``.
    n_neighbors: Number of neighbours requested from the model. Defaults to 6,
      the value that was previously hard-coded.

  Raises:
    IndexError: If ``movie_name`` is not present in ``movie_pivot.index``.
  """
  matches = np.where(movie_pivot.index == movie_name)[0]
  if len(matches) == 0:
    # Same exception type as the former bare ``[0][0]`` lookup, but with a
    # message that names the missing title.
    raise IndexError(f"Movie {movie_name!r} not found in movie_pivot.index")
  movie_id = matches[0]

  # kneighbors expects a 2-D array, so the single row is reshaped to (1, n_users).
  query = movie_pivot.iloc[movie_id, :].values.reshape(1, -1)
  _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

  # One query row was passed, so `suggestion` has exactly one row of positions.
  for title in movie_pivot.index[suggestion[0]]:
    print(title)


In [49]:
movie_name = '10 Things I Hate About You'
recommend_movie(movie_name)

10 Things I Hate About You
She's All That
Never Been Kissed
Bring It On
Can't Hardly Wait
Save the Last Dance
