In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

In [3]:
movies_df.shape

(27278, 3)

### Data preprocessing

Separating the year from the title

In [4]:
def rm_dates_from_title(df: pd.DataFrame):
  """Move the release year out of ``df['title']`` into a new ``'year'`` column.

  The frame is modified in place and nothing is returned.

  * ``year``  - the first parenthesised four-digit number found in the title,
    kept as a string; missing when the title has no such number.
  * ``title`` - the original title with every ``(dddd)`` group removed and the
    surrounding whitespace stripped.

  Missing titles stay missing instead of raising, because the whitespace
  clean-up uses the vectorised ``.str.strip()`` rather than calling
  ``x.strip()`` on each value.
  """
  df['year'] = df['title'].str.extract(r'\((\d{4})\)', expand=False)
  without_year = df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
  # .str.strip() propagates NaN, unlike apply(lambda x: x.strip()).
  df['title'] = without_year.str.strip()

rm_dates_from_title(movies_df)
movies_df.sample(5)

Unnamed: 0,movieId,title,genres,year
23659,112538,Black Moon,Horror,1934
6623,6733,I'm Going Home (Je rentre à la maison),Comedy|Drama,2001
8952,26613,Ashik Kerib,Drama|Romance,1988
13278,65045,Alien Raiders,Mystery|Sci-Fi|Thriller,2008
10945,44777,Evil Aliens,Comedy|Horror|Sci-Fi,2005


Splitting the genre column by '|' and converting it into a list

In [5]:
# Turn the pipe-delimited genre string into a list of genre names.
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.sample(5)

Unnamed: 0,movieId,title,genres,year
22724,108789,Beyond the Walls (Hors les murs),[Drama],2012
2171,2256,Parasite,"[Horror, Sci-Fi]",1982
15155,77298,Hell (L'enfer),[Drama],2005
15917,80727,Middle Men,"[Comedy, Crime, Drama]",2010
8468,25905,"Miracle of Morgan's Creek, The","[Comedy, Romance]",1944


In [6]:
ratings_df['userId'].value_counts()

118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
          ... 
89305       20
110463      20
96990       20
134747      20
6526        20
Name: userId, Length: 138493, dtype: int64

In [7]:
ratings_df['userId'].unique().shape

(138493,)

Keeping only users who rated more than 200 movies

In [8]:
# Boolean Series indexed by userId: True for users with strictly more than 200 ratings.
x = ratings_df['userId'].value_counts() > 200

In [9]:
x[x].shape

(26599,)

In [10]:
y = x[x].index

In [11]:
y

Int64Index([118205,   8405,  82418, 121535, 125794,  74142,  34576, 131904,
             83090,  59477,
            ...
             43882,  73903,  19094,  66002, 133714,  43934,  12039,  34693,
             43829, 112853],
           dtype='int64', length=26599)

In [12]:
ratings_df = ratings_df[ratings_df['userId'].isin(y)]

In [13]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
541,7,3,3.0,1011208463
542,7,7,3.0,1011208220
543,7,11,4.0,1011207889
544,7,15,2.0,1011208732
545,7,16,3.0,1011205378


In [14]:
ratings_df.shape

(12426476, 4)

In [15]:
ratings_with_movies = ratings_df.merge(movies_df, on='movieId')

In [16]:
ratings_with_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,7,3,3.0,1011208463,Grumpier Old Men,"[Comedy, Romance]",1995
1,91,3,3.0,1111531392,Grumpier Old Men,"[Comedy, Romance]",1995
2,96,3,4.0,1371089927,Grumpier Old Men,"[Comedy, Romance]",1995
3,116,3,2.0,1132728058,Grumpier Old Men,"[Comedy, Romance]",1995
4,156,3,2.0,1038801225,Grumpier Old Men,"[Comedy, Romance]",1995


In [17]:
ratings_with_movies.shape

(12426476, 7)

In [18]:
# Count how many ratings each title received (one row per title).
# NOTE(review): the release year was stripped from `title` earlier in this notebook,
# so different movies that share a name would be counted together here; confirm
# whether grouping on `movieId` is what is actually wanted.
num_rating = ratings_with_movies.groupby('title')['rating'].count().reset_index()

In [19]:
num_rating.head()

Unnamed: 0,title,rating
0,"""Great Performances"" Cats",35
1,#chicagoGirl: The Social Network Takes on a Di...,2
2,$ (Dollars),11
3,$5 a Day,16
4,$9.99,30


In [20]:
# Give the count column a descriptive name.
num_rating = num_rating.rename(columns={'rating': 'num_of_rating'})

In [21]:
num_rating.head()

Unnamed: 0,title,num_of_rating
0,"""Great Performances"" Cats",35
1,#chicagoGirl: The Social Network Takes on a Di...,2
2,$ (Dollars),11
3,$5 a Day,16
4,$9.99,30


In [22]:
final_rating = ratings_with_movies.merge(num_rating, on='title')

In [23]:
final_rating.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,num_of_rating
0,7,3,3.0,1011208463,Grumpier Old Men,"[Comedy, Romance]",1995,4665
1,91,3,3.0,1111531392,Grumpier Old Men,"[Comedy, Romance]",1995,4665


In [24]:
final_rating.shape

(12426476, 8)

In [25]:
# Keep only the ratings of titles that received at least 200 ratings.
final_rating = final_rating[final_rating['num_of_rating']>=200]

In [26]:
final_rating.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,num_of_rating
8907792,23780,8981,5.0,1233511184,Closer,"[Drama, Romance]",2004,2860
9656181,98091,319,2.0,1032680108,Shallow Grave,"[Comedy, Drama, Thriller]",1994,3669
11917510,47848,5447,3.0,1041858949,Sunshine State,[Drama],2002,408
7760351,69654,315,2.0,951876427,"Specialist, The","[Action, Drama, Thriller]",1994,3279
9822513,98628,3258,2.0,1100375706,Death Becomes Her,"[Comedy, Fantasy]",1992,2791


In [27]:
final_rating.shape

(11883373, 8)

In [28]:
# Keep a single rating per (user, title) pair. The name is rebound instead of using
# inplace=True because `final_rating` is a boolean-mask selection of a larger frame
# at this point, and mutating such a selection in place triggers pandas'
# SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(['userId', 'title'])

In [29]:
final_rating.shape

(11821205, 8)

In [30]:
final_rating

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,num_of_rating
0,7,3,3.0,1011208463,Grumpier Old Men,"[Comedy, Romance]",1995,4665
1,91,3,3.0,1111531392,Grumpier Old Men,"[Comedy, Romance]",1995,4665
2,96,3,4.0,1371089927,Grumpier Old Men,"[Comedy, Romance]",1995,4665
3,116,3,2.0,1132728058,Grumpier Old Men,"[Comedy, Romance]",1995,4665
4,156,3,2.0,1038801225,Grumpier Old Men,"[Comedy, Romance]",1995,4665
...,...,...,...,...,...,...,...,...
12308381,137342,6478,5.0,1258059686,"Life and Times of Judge Roy Bean, The","[Comedy, Western]",1972,231
12308382,137409,6478,3.0,1109610682,"Life and Times of Judge Roy Bean, The","[Comedy, Western]",1972,231
12308383,137854,6478,4.0,1094701431,"Life and Times of Judge Roy Bean, The","[Comedy, Western]",1972,231
12308384,137926,6478,3.0,1063286157,"Life and Times of Judge Roy Bean, The","[Comedy, Western]",1972,231


In [31]:
# Title x user rating matrix: one row per title, one column per userId.
# (title, user) pairs without a rating come out as NaN.
movie_pivot = final_rating.pivot_table(columns='userId', index='title', values='rating')

In [32]:
movie_pivot

userId,7,11,14,24,31,53,54,58,69,82,...,138456,138457,138459,138464,138467,138472,138474,138477,138483,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You,,,,,,,,,,,...,,,,,,,,,,
"'burbs, The",,,,,,,3.0,,,,...,3.0,3.0,,,,,4.0,,,
'night Mother,,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer,,,,,,,,,,,...,,,,,,3.5,,,,
*batteries not included,,5.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC],,,,,,,,,,,...,,,,,,,,,,
eXistenZ,,5.0,,3.0,,,,,,,...,,,,,,,,,,
xXx,,,,,,,,,,,...,,,,,,,,1.0,,
xXx: State of the Union,,,,,,,,,,,...,,,,,,,,,,


In [33]:
movie_pivot.shape

(6106, 26599)

In [34]:
movie_pivot.fillna(0, inplace=True)

In [35]:
movie_pivot

userId,7,11,14,24,31,53,54,58,69,82,...,138456,138457,138459,138464,138467,138472,138474,138477,138483,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The",0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,3.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
'night Mother,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
*batteries not included,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ,0.0,5.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
xXx: State of the Union,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
from scipy.sparse import csr_matrix

In [37]:
movie_sparse = csr_matrix(movie_pivot)

In [38]:
movie_sparse

<6106x26599 sparse matrix of type '<class 'numpy.float64'>'
	with 11821205 stored elements in Compressed Sparse Row format>

In [39]:
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour search over the per-movie rating vectors.
# NOTE(review): no `metric` argument is passed, so the library's default distance is
# used; confirm whether a cosine metric would suit zero-filled rating vectors better.
model = NearestNeighbors(algorithm='brute')

In [40]:
model.fit(movie_sparse)

In [41]:
pip install --user scikit-learn threadpoolctl




In [42]:
# Ask for the 6 nearest neighbours of the movie stored at row 237 of the pivot table.
# The query row itself comes back first, at distance 0.
distance, suggestion = model.kneighbors(movie_pivot.iloc[237,:].values.reshape(1,-1), n_neighbors=6)

In [43]:
distance

array([[ 0.        , 81.91153765, 82.23442102, 82.87038072, 83.17301245,
        83.30066026]])

In [44]:
suggestion

array([[ 237, 2113, 4289, 1546, 4891, 2204]], dtype=int64)

In [45]:
# Show the titles behind each row of neighbour indices returned by kneighbors.
for neighbor_rows in suggestion:
  print(movie_pivot.index[neighbor_rows])

Index(['Alpha Dog', 'From Justin to Kelly', 'Pokémon Heroes', 'Disaster Movie',
       'Shanghai Surprise', 'Ghoulies II'],
      dtype='object', name='title')


In [46]:
movies_name = movie_pivot.index

In [47]:
import pickle
import os

directory = 'artifacts'
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# so no separate existence check is needed.
os.makedirs(directory, exist_ok=True)

# Objects to persist, keyed by the file name they are stored under.
artifacts = {
  'model.pkl': model,
  'movies_name.pkl': movies_name,
  'final_rating.pkl': final_rating,
  'movie_pivot.pkl': movie_pivot,
}

# Write through a context manager so every file handle is flushed and closed
# deterministically rather than being left open until garbage collection.
for filename, obj in artifacts.items():
  with open(os.path.join(directory, filename), 'wb') as fh:
    pickle.dump(obj, fh)

print(os.listdir(directory))

['final_rating.pkl', 'model.pkl', 'movies_name.pkl', 'movie_pivot.pkl']


In [48]:
def recommend_movie(movie_name, n_neighbors=6, pivot=None, knn=None):
  """Print the titles of the movies closest to ``movie_name``.

  Parameters
  ----------
  movie_name : str
      Title to look up; it must match an entry of the pivot table's index exactly.
  n_neighbors : int, default 6
      Number of titles to print. The queried movie is normally the first
      result, so the default yields the movie itself plus five others.
  pivot : pandas.DataFrame, optional
      Title x user rating matrix. Defaults to the notebook-level ``movie_pivot``.
  knn : fitted neighbours model, optional
      Object exposing ``kneighbors(X, n_neighbors=...)``. Defaults to the
      notebook-level ``model``.

  Raises
  ------
  IndexError
      If ``movie_name`` is not present in the pivot table's index.
  """
  # Fall back to the notebook globals so existing one-argument calls keep working.
  pivot = movie_pivot if pivot is None else pivot
  knn = model if knn is None else knn

  matches = np.flatnonzero(pivot.index == movie_name)
  if matches.size == 0:
    # Same exception type as before, but with a message that names the problem.
    raise IndexError(f"{movie_name!r} is not a title in the rating pivot table")

  # kneighbors expects a 2-D array, hence the single-row reshape.
  query = pivot.iloc[matches[0], :].values.reshape(1, -1)
  _, suggestion = knn.kneighbors(query, n_neighbors=n_neighbors)

  for neighbor_rows in suggestion:
    for title in pivot.index[neighbor_rows]:
      print(title)


In [49]:
movie_name = '10 Things I Hate About You'
recommend_movie(movie_name)

10 Things I Hate About You
She's All That
Never Been Kissed
Bring It On
Can't Hardly Wait
Save the Last Dance
