In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("imdb_top_1000.csv")

In [4]:
# Cleaning the data
df.Gross.fillna(method='ffill',axis=0,inplace=True)
df.Gross = df.Gross.str.replace(',','')
df.Certificate.dropna(inplace=True)
df.Meta_score.dropna(inplace=True)
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross              0
dtype: int64

In [5]:
# Create a CountVectorizer to convert the Overview column into a sparse matrix
vectorizer = CountVectorizer()
X_overview = vectorizer.fit_transform(df["Overview"])

In [6]:
# Extracting features other than Overview 
director = df["Director"]
star1 = df["Star1"]
star2 = df["Star2"]
star3 = df["Star3"]
star4 = df["Star4"]
gross = df["Gross"]
imdb_rating = df["IMDB_Rating"]

In [7]:
# Convert the Director, Star1, Star2, Star3, Star4 columns into one-hot encoded features
X_director = pd.get_dummies(director)
X_star1 = pd.get_dummies(star1)
X_star2 = pd.get_dummies(star2)
X_star3 = pd.get_dummies(star3)
X_star4 = pd.get_dummies(star4)

In [8]:
# Normalizing the gross and imdb_rating 
scaler = MinMaxScaler()
gross = scaler.fit_transform(np.array(gross).reshape(-1,1))
imdb_rating = scaler.fit_transform(np.array(imdb_rating).reshape(-1,1))

In [9]:
# Combine all the feature matrix to create a final feature matrix
X = np.hstack([X_overview.toarray(),X_director.to_numpy(), X_star1.to_numpy(), X_star2.to_numpy(), X_star3.to_numpy(), X_star4.to_numpy(), gross, imdb_rating])

In [10]:
# Compute the cosine similarity between all movies
similarities = cosine_similarity(X)

In [16]:
def recommend_movies(movie_title, n=10):
    # Create a dictionary that maps each movie to its most similar movies
    similar_movies = {i: list(np.argsort(-similarities[i])[1:n+1]) for i in range(df.shape[0])}
    # Get the index of the movie
    movie_idx = df[df.Series_Title == movie_title].index[0]
    # Get the most similar movies
    similar_movies_idx = similar_movies[movie_idx]
    # Get the movies that are similar to the given movie
    similar_movies_title = df.iloc[similar_movies_idx].Series_Title.unique()
    # Return the top n similar movies
    return similar_movies_title[:n]

In [17]:
movie_title = 'Pulp Fiction'

recommend_movies(movie_title, n=5)

array(['Pulp Fiction', 'Brokeback Mountain', 'Forrest Gump', 'Naked',
       'The Green Mile'], dtype=object)