In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie table from tmdb_movies.csv (path is relative to the working directory).
movies = pd.read_csv("tmdb_movies.csv")

In [3]:
#credits = pd.read_csv("tmdb_5000_credits.csv")

In [4]:
#credits.info()

In [5]:
# Summarize column names, dtypes and non-null counts to spot missing values.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [6]:
# Check the (rows, columns) dimensions of the loaded table.
movies.shape

(10000, 9)

In [7]:
# Feature selection: keep only the identifier, the title, and the two text
# columns (genre, overview) that the recommender is built from.
# .copy() makes the result an independent frame, so adding columns to it in a
# later cell does not trigger pandas' SettingWithCopyWarning.
movies = movies[['id', 'title', 'genre', 'overview']].copy()
movies.head()

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...


In [8]:
# Confirm the dimensions after narrowing the frame to the selected columns.
movies.shape

(10000, 4)

In [9]:
# Build one text field per movie from its overview and genre list.
# - Missing values are replaced with '' first: movies.info() reports nulls in
#   both columns, and NaN + str would turn the whole tag into NaN, throwing
#   away whichever field was present.
# - The two parts are joined with a space so the last word of the overview and
#   the first genre are never fused into a single token.
movies = (
    movies
    .assign(
        tags=(movies['overview'].fillna('') + ' ' + movies['genre'].fillna('')).str.strip()
    )
    .drop(columns=['genre', 'overview'])
)

In [10]:
# Bag-of-words vectorizer for the combined genre/overview text: the vocabulary
# is capped at 10,000 terms and English stop words are dropped.
cv = CountVectorizer(
    stop_words='english',
    max_features=10000,
)

In [11]:
# Display the configured vectorizer.
cv

In [12]:
# Learn the vocabulary from the tags and build the dense term-count matrix
# (one row per movie). The tags are cast to unicode strings first so that
# every entry handed to the vectorizer is text.
vector = (
    cv.fit_transform(movies['tags'].values.astype('U'))
    .toarray()
)
# Peek at the count row of the movie at position 1.
vector[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [13]:
# Pairwise cosine similarity between all rows of the count matrix:
# entry [i, j] compares the movie at row i with the movie at row j.
similarity = cosine_similarity(vector)

In [14]:
# Display the similarity matrix (NumPy abbreviates large arrays with "...").
similarity

array([[1.        , 0.05634362, 0.12888482, ..., 0.07559289, 0.11065667,
        0.06388766],
       [0.05634362, 1.        , 0.07624929, ..., 0.        , 0.03636965,
        0.        ],
       [0.12888482, 0.07624929, 1.        , ..., 0.02273314, 0.06655583,
        0.08645856],
       ...,
       [0.07559289, 0.        , 0.02273314, ..., 1.        , 0.03253   ,
        0.02817181],
       [0.11065667, 0.03636965, 0.06655583, ..., 0.03253   , 1.        ,
        0.0412393 ],
       [0.06388766, 0.        , 0.08645856, ..., 0.02817181, 0.0412393 ,
        1.        ]])

In [15]:
# Rank every movie by its similarity to the movie at row 2, best match first.
# enumerate() yields (row position, similarity) pairs; the key lambda receives
# one such pair (its parameter is named `vector`, unrelated to the count
# matrix) and returns the similarity score stored at position 1.
distance = sorted(
    enumerate(similarity[2]),
    key=lambda vector: vector[1],
    reverse=True,
)
# Print the five top-ranked titles; the list is not filtered, so the query
# movie itself can appear among them.
for i in distance[:5]:
    print(movies.iloc[i[0]].title)

The Godfather
The Godfather: Part II
Blood Ties
Joker
Bomb City


A brief explanation of this step: `enumerate` pairs every similarity score in the chosen row with the index of the movie it belongs to, giving a list of `(index, similarity)` tuples. We want this list sorted from highest to lowest similarity, hence `reverse = True`. The `key` lambda tells `sorted` which value to compare: it receives one tuple at a time and returns its similarity score, which is the second element of the tuple (position 1). Although the lambda's parameter is named `vector`, it does no vectorizing — the tags were already vectorized by `CountVectorizer` in the earlier cell.

In [16]:
def recommend(movie, n=5):
    """Print the titles of the ``n`` movies most similar to ``movie``.

    Looks the title up in the module-level ``movies`` frame and ranks all
    movies by their score in the matching row of the module-level
    ``similarity`` matrix, highest first.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``. If several rows share
        the title, the first one is used.
    n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has this title.
    """
    # Work with row positions: rows of `similarity` are positional, so the
    # lookup must not depend on the frame's index labels.
    matches = (movies['title'] == movie).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in movies['title']")
    index = matches[0]

    ranked = sorted(
        enumerate(similarity[index]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Drop the query movie by position instead of assuming it is ranked first:
    # sorted() is stable, so an earlier row with an identical score would
    # otherwise push the query movie itself into the recommendations.
    picks = [position for position, _score in ranked if position != index][:n]
    for position in picks:
        print(movies['title'].iloc[position])

In [17]:
recommend("Avatar")

Krull
Small Soldiers
Predator
Iron Man 3
Journey 2: The Mysterious Island


import pickle

# Persist the title table and the similarity matrix for reuse outside this
# notebook. `with` guarantees each file is flushed and closed even if
# pickling fails part-way.
with open('movies_list.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

# Read the table back as a round-trip check and display it.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('movies_list.pkl', 'rb') as movies_file:
    movies_reloaded = pickle.load(movies_file)
movies_reloaded