In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings(action='ignore')

In [30]:
# Load the Bollywood movie details CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="BollywoodMovieDetail.csv")

In [31]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

(1284, 10)

In [32]:
# Preview the first rows (head() defaults to five) as plain text
print(df.head())

      imdbId                              title  releaseYear  releaseDate  \
0  tt0118578                             Albela         2001  20 Apr 2001   
1  tt0169102  Lagaan: Once Upon a Time in India         2001  08 May 2002   
2  tt0187279           Meri Biwi Ka Jawab Nahin         2004  02 Jul 2004   
3  tt0222024             Hum Tumhare Hain Sanam         2002  24 May 2002   
4  tt0227194                         One 2 Ka 4         2001  30 Mar 2001   

                         genre  \
0                      Romance   
1  Adventure | Drama | Musical   
2              Action | Comedy   
3              Drama | Romance   
4      Action | Comedy | Drama   

                                             writers  \
0  Honey Irani (screenplay) | Honey Irani (story)...   
1  Ashutosh Gowariker (story) | Ashutosh Gowarike...   
2                                                NaN   
3  K.S. Adiyaman | Arun Kumar (assistant dialogue...   
4  Sanjay Chhel | Raaj Kumar Dahima (screenplay) ...

In [33]:
# List the column names present in the dataset
print(df.columns)

Index(['imdbId', 'title', 'releaseYear', 'releaseDate', 'genre', 'writers',
       'actors', 'directors', 'sequel', 'hitFlop'],
      dtype='object')


In [34]:
# Per-column non-null counts and dtypes; reveals which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1284 entries, 0 to 1283
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   imdbId       1284 non-null   object 
 1   title        1284 non-null   object 
 2   releaseYear  1284 non-null   int64  
 3   releaseDate  1231 non-null   object 
 4   genre        1282 non-null   object 
 5   writers      1165 non-null   object 
 6   actors       1281 non-null   object 
 7   directors    1280 non-null   object 
 8   sequel       1281 non-null   float64
 9   hitFlop      1284 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 100.4+ KB


In [35]:
# Text (string) columns that are cleaned and merged into one string per movie
essential_features = [
    'title',
    'actors',
    'genre',
    'writers',
    'directors',
]

# New DF column combining all selected features
def combine_features(row, features=('title', 'actors', 'genre', 'writers', 'directors')):
    """Join the selected text fields of one movie into a single string.

    Parameters
    ----------
    row : pandas.Series or mapping
        One record, indexable by every name in ``features``.
    features : sequence of str, optional
        Field names to concatenate, in order. Defaults to the five columns
        this notebook combines, so ``df.apply(combine_features, axis=1)``
        keeps working unchanged.

    Returns
    -------
    str
        The field values separated by single spaces. A missing value
        (None/NaN) contributes an empty string instead of raising
        ``TypeError``, which gives the same text that plain ``+``
        concatenation produces once NaNs have been replaced with ``''``.
    """
    # str() also guards against non-string cells; pd.isna covers both None and NaN
    return " ".join("" if pd.isna(row[name]) else str(row[name]) for name in features)
# Replace NaN with an empty string in every selected column at once,
# so the string concatenation below never meets a missing value
df[essential_features] = df[essential_features].fillna('')

# axis=1 hands each row to combine_features; the result is one string per movie
combined_text = df.apply(combine_features, axis=1)
df["combined_features"] = combined_text

# Show the first few combined strings
print(df["combined_features"].head())

0    Albela Govinda | Aishwarya Rai Bachchan | Jack...
1    Lagaan: Once Upon a Time in India Aamir Khan |...
2    Meri Biwi Ka Jawab Nahin Akshay Kumar | Sridev...
3    Hum Tumhare Hain Sanam Shah Rukh Khan | Madhur...
4    One 2 Ka 4 Shah Rukh Khan | Juhi Chawla | Jack...
Name: combined_features, dtype: object


In [36]:
# Vectorizing the combined metadata text (title, actors, genre, writers, directors) using TF-IDF
tfidfvec = TfidfVectorizer(stop_words='english')
tfidf_movieid = tfidfvec.fit_transform(df["combined_features"])

# Pairwise cosine similarity between all movie vectors: row i / column j compares movie i with movie j
cosine_sim = cosine_similarity(tfidf_movieid, tfidf_movieid)

# Renumber rows 0..n-1 so DataFrame positions line up with the rows of cosine_sim.
# drop=True discards the old index instead of inserting it as a new column; without it
# every re-run of this cell adds another column ('index', then 'level_0') and finally
# raises ValueError.
df = df.reset_index(drop=True)
titles = df['title']

# Series mapping title -> row number, used to turn the entered name into a matrix row.
# Only the first row of a repeated title is kept so that indices[title] is always a
# single integer rather than a Series of several rows.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Input the movie from User
title=input("Enter bollywood movie name  whose release year is between 2001 to 2014    ")


Enter bollywood movie name  whose release year is between 2001 to 2014    Kranti


In [37]:
# Get the index of entered movie from its title
if title not in indices.index:
    # Same exception type as the raw lookup, but with an actionable message
    raise KeyError(f"Movie {title!r} was not found in the dataset (titles are matched exactly, including case)")
idx = indices[title]
if isinstance(idx, pd.Series):
    # Several movies share this title, so the lookup returned several rows; use the first
    idx = idx.iloc[0]

# List of (row number, similarity) tuples for every movie except the entered one.
# The entered movie is removed by its index rather than by assuming it sorts first:
# when another movie ties with it on score, slicing off position 0 can drop the
# wrong entry and leave the entered movie in its own recommendations.
sim_scores = [(row, score) for row, score in enumerate(cosine_sim[idx]) if row != idx]

# Arranging the similar movies having more similarity score with entered movie,
# then keeping the 30 best matches
sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)
sim_scores = sim_scores[:30]

#Printing pairwise cosine similarity score
print(sim_scores)

[(99, 0.2586863732759953), (89, 0.2007347067222752), (1258, 0.1918172131271181), (341, 0.1871178968486492), (894, 0.18620852241901537), (77, 0.16789430548321835), (477, 0.1658319911388399), (1283, 0.16070494947328684), (867, 0.16044352324946148), (32, 0.15839055415089748), (1138, 0.15110216448322425), (91, 0.14796232655505195), (908, 0.14689097080184424), (722, 0.14655858803876043), (328, 0.13894678594980087), (661, 0.13771215308891113), (460, 0.13723608064926224), (53, 0.13706967832515773), (1005, 0.13626315030015163), (399, 0.1362015165716708), (48, 0.13600902488284985), (1186, 0.13368029476862112), (481, 0.13285342896226618), (240, 0.13082068307513678), (104, 0.1299280827938123), (27, 0.12961817937316417), (1275, 0.1290622212138377), (910, 0.1274412250390664), (261, 0.12743509638064818), (1080, 0.1257770317222959)]


In [38]:
# Printing 10 similar movies
print("Top 10 similar movies to "+title+" are:\n")
# sim_scores is already ordered best-first, so its first ten entries are the top ten
for movie_row, _score in sim_scores[:10]:
    print(titles.iloc[movie_row])
 


Top 10 similar movies to Kranti are:

Yeh Hai Jalwa
23rd March 1931: Shaheed
Koyelaanchal
Tathastu
Diary of a Butterfly
Humko Tumse Pyaar Hai
Aap Ki Khatir
Zid
Chitkabrey
Gadar: Ek Prem Katha
