In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import json


In [2]:
data_movies = pd.read_csv("/content/tmdb_5000_movies.csv")
data_credits = pd.read_csv("/content/tmdb_5000_credits.csv")
# Display the first few rows of the dataset
print(data_movies.head())
# Check the exact column names in the dataset
print(data_movies.columns)

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [3]:
data_movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [5]:
def extractFullNames(self,names):
  fullName=''
  for name in names:
    fullName+=name['name']+' '
  return fullName




In [6]:
#fixing values removing spaces extracting from json
data_movies['genres']= data_movies['genres'].apply(lambda x: extractFullNames(x,json.loads(x)))
data_movies['keywords']= data_movies['keywords'].apply(lambda x: extractFullNames(x,json.loads(x)))
data_movies['production_companies']= data_movies['production_companies'].apply(lambda x: extractFullNames(x,json.loads(x)))
data_movies['production_countries']= data_movies['production_countries'].apply(lambda x: extractFullNames(x,json.loads(x)))
data_movies['spoken_languages']= data_movies['spoken_languages'].apply(lambda x: extractFullNames(x,json.loads(x)))


In [7]:
data_movies.isna().sum()

Unnamed: 0,0
budget,0
genres,0
homepage,3091
id,0
keywords,0
original_language,0
original_title,0
overview,3
popularity,0
production_companies,0


In [8]:
print(data_movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [9]:
data_movies['homepage'].fillna("Unknown", inplace=True)
data_movies['runtime'].fillna(data_movies['runtime'].median(), inplace=True)
data_movies['overview'].fillna("NoOverviewAvailable", inplace=True)
data_movies['tagline'].fillna("NoTagline", inplace=True)
data_movies.dropna(subset=['release_date'], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_movies['homepage'].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_movies['runtime'].fillna(data_movies['runtime'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediat

In [10]:
data_movies.isna().sum()

Unnamed: 0,0
budget,0
genres,0
homepage,0
id,0
keywords,0
original_language,0
original_title,0
overview,0
popularity,0
production_companies,0


In [11]:
def extractCast(casts):
  fullcast=''
  for cast in casts:
    fullcast+=cast['character']+' '
    fullcast+=cast['name']+' '
  return fullcast

data_credits['cast']= data_credits['cast'].apply(lambda x: extractCast(json.loads(x)))


In [12]:
data_credits['crew']
def extractCrew(crews):
  fullcrew=''
  for crew in crews:
    fullcrew+=crew['department']+' '
    fullcrew+=crew['job']+' '
    fullcrew+=crew['name']+' '
  return fullcrew

data_credits['crew']=data_credits['crew'].apply(lambda x: extractCrew(json.loads(x)))

In [13]:
data_credits

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,Jake Sully Sam Worthington Neytiri Zoe Saldana...,Editing Editor Stephen E. Rivkin Art Productio...
1,285,Pirates of the Caribbean: At World's End,Captain Jack Sparrow Johnny Depp Will Turner O...,Camera Director of Photography Dariusz Wolski ...
2,206647,Spectre,James Bond Daniel Craig Blofeld Christoph Walt...,Sound Original Music Composer Thomas Newman Di...
3,49026,The Dark Knight Rises,Bruce Wayne / Batman Christian Bale Alfred Pen...,Sound Original Music Composer Hans Zimmer Prod...
4,49529,John Carter,John Carter Taylor Kitsch Dejah Thoris Lynn Co...,Writing Screenplay Andrew Stanton Directing Di...
...,...,...,...,...
4798,9367,El Mariachi,El Mariachi Carlos Gallardo Bigotón Jaime de H...,Directing Director Robert Rodriguez Camera Dir...
4799,72766,Newlyweds,Buzzy Edward Burns Linda Kerry Bishé Marsha Ma...,Directing Director Edward Burns Production Pro...
4800,231617,"Signed, Sealed, Delivered",Oliver O’Toole Eric Mabius Shane McInerney Kri...,Costume & Make-Up Costume Design Carla Hetland...
4801,126186,Shanghai Calling,Sam Daniel Henney Amanda Eliza Coupe Donald Bi...,Directing Director Daniel Hsia Writing Writer ...


In [14]:
data_movies

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,Ingenious Film Partners Twentieth Century Fox ...,United States of America United Kingdom,2009-12-10,2787965087,162.0,English Español,Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,Walt Disney Pictures Jerry Bruckheimer Films S...,United States of America,2007-05-19,961000000,169.0,English,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6 bri...,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,Columbia Pictures Danjaq B24,United Kingdom United States of America,2015-10-26,880674609,148.0,Français English Español Italiano Deutsch,Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,Legendary Pictures Warner Bros. DC Entertainme...,United States of America,2012-07-16,1084939099,165.0,English,Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,Walt Disney Pictures,United States of America,2012-03-07,284139100,132.0,English,Released,"Lost in our world, found in another.",John Carter,6.1,2124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,220000,Action Crime Thriller,Unknown,9367,united states–mexico barrier legs arms paper k...,es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,Columbia Pictures,Mexico United States of America,1992-09-04,2040920,81.0,Español,Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238
4799,9000,Comedy Romance,Unknown,72766,,en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,,,2011-12-26,0,85.0,,Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5
4800,0,Comedy Drama Romance TV Movie,http://www.hallmarkchannel.com/signedsealeddel...,231617,date love at first sight narration investigati...,en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,Front Street Pictures Muse Entertainment Enter...,United States of America,2013-10-13,0,120.0,English,Released,NoTagline,"Signed, Sealed, Delivered",7.0,6
4801,0,,http://shanghaicalling.com/,126186,,en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,,United States of America China,2012-05-03,0,98.0,English,Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7


In [29]:
df = pd.merge(data_credits, data_movies, left_on="movie_id", right_on="id", how="inner")

text_columns = df.select_dtypes(include=['object']).columns

df[text_columns] = df[text_columns].fillna("")

df["combinedFeatures"] = df[text_columns].agg(" ".join, axis=1)

df = df[["combinedFeatures"]]

df

Unnamed: 0,combinedFeatures
0,Avatar Jake Sully Sam Worthington Neytiri Zoe ...
1,Pirates of the Caribbean: At World's End Capta...
2,Spectre James Bond Daniel Craig Blofeld Christ...
3,The Dark Knight Rises Bruce Wayne / Batman Chr...
4,John Carter John Carter Taylor Kitsch Dejah Th...
...,...
4797,El Mariachi El Mariachi Carlos Gallardo Bigotó...
4798,Newlyweds Buzzy Edward Burns Linda Kerry Bishé...
4799,"Signed, Sealed, Delivered Oliver O’Toole Eric ..."
4800,Shanghai Calling Sam Daniel Henney Amanda Eliz...


In [37]:
vectorizer = TfidfVectorizer()

# Fit and transform the dataset
tfidf_matrix = vectorizer.fit_transform(df["combinedFeatures"])
def find_similar_rows(input_text, df, top_n=4):
    input_vector = vectorizer.transform([input_text])

    similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()

    top_indices = similarities.argsort()[-top_n:][::-1]
    return df.iloc[top_indices].copy().assign(similarity=similarities[top_indices])

input_text = "batman begins"

# Find the top 4 most similar rows
similar_rows = find_similar_rows(input_text, df)

print(similar_rows)


                                       combinedFeatures  similarity
428   Batman Returns Bruce Wayne/Batman Michael Keat...    0.416695
3854  Batman: The Dark Knight Returns, Part 2 Batman...    0.369175
299   Batman Forever Bruce Wayne / Batman Val Kilmer...    0.336486
1359  Batman Jack Napier/The Joker Jack Nicholson Br...    0.324753
