Import statements

In [3]:
import pandas as pd
import numpy as np
import ast
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib
import pickle


Importing the datasets

In [4]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [5]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


Gives values of a specific columns

In [6]:
credits.head(1)['cast'].values

array(['[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "ge

In [7]:
movies.shape

(4803, 20)

In [8]:
credits.shape

(4803, 4)

Merging two datasets [movies and credits]

In [9]:
movies = movies.merge(credits,on='title')

In [10]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [11]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

Important Columns for my Recommendation System: Genres, ID, Keyword, Title, Overview, Cast, Crew

In [12]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


Dropping the empty data

Checking if there is any empty/null data

In [13]:
movies.dropna(inplace=True)
movies.isnull().sum()

Unnamed: 0,0
movie_id,0
title,0
overview,0
genres,0
keywords,0
cast,0
crew,0


Checking if there is any duplicate data

In [14]:
movies.duplicated().sum()

0

1. The function **convert(obj)** takes a string representation of a list of dictionaries and extracts the values associated with the key 'name' from each dictionary, return them as a list.

2. **ast.literal_eval(obj)** - to safely convert the string into a Python list of dictionaries

In [15]:
def convert(obj):
  L = []
  for i in ast.literal_eval(obj):
    L.append(i['name'])
  return L

Converts the string into a list of dictionaries ⬇️

In [16]:
movies['genres'] = movies['genres'].apply(convert)

Same thing apply on the keywords column

In [17]:
movies['keywords'] = movies['keywords'].apply(convert)

The function **convert3(obj)** is similar to the previous convert(obj) function but with an added constraint: it extracts at most three names from the list of dictionaries.

In [18]:
def convert3(obj):
  L = []
  counter = 0
  for i in ast.literal_eval(obj):
    if counter != 3:
      L.append(i['name'])
      counter+=1
    else:
      break
  return L

Returns 3 cast members name and then store in the movies data

In [19]:
movies['cast'] = movies['cast'].apply(convert3)

The function **fetch_director(obj)** extracts the director's name from a string-encoded list of dictionaries where each dictionary represents a person and their role in a movie's crew.

In [20]:
def fetch_director(obj):
  L = []
  for i in ast.literal_eval(obj):
    if i['job'] == 'Director':
      L.append(i['name'])
      break
  return L

Returns the director name

In [21]:
movies['crew'] = movies['crew'].apply(fetch_director)

Converting each row of overview into list ⬇️

**x.split()** → Splits each string (movie overview) into a list of words based on whitespace.

In [22]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [23]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


1. We need to apply a transformation on the keywords, genres, cast, crew columns to reduce the spaces between the names of a person to make them a single entity. Eg: Sam Worthington and Sam Mendes are two different person bt both have Sam in their name so to avoid problem in future while recommendation of a movie we make SamWorthington a single entity and SamMendes a single different entity.

2. **apply(lambda x:[i.replace(" ","") for i in x])** - Applies a function to each value in the "genres" column

  **for i in x** → Iterates over each element (i) in x assuming x is a list

   **i.replace(" ", "")** → Removes spaces from each genre name.

In [24]:
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [25]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


Concat/Joining these 5 columns and making a single column named tags

In [26]:
movies['tags'] = movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [27]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [28]:
new_df = movies[['movie_id','title','tags']]

Removing all the unnecessary columns from the dataset and storing it under a new name.

Converting all the list present in tags into string

In [29]:
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))


In [30]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [31]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

Converting every letter present in tags in lowercase

In [32]:
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


In [33]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


**PorterStemmer()** - used to reduce words to their root or base form

In [34]:
ps = PorterStemmer()

The stem(text) function splits the input text into words, stems each word using ps.stem(), and joins the stemmed words back into a sentence.

In [35]:
def stem(text):
  y = []
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

Applying stem function in tags columns to reduce words to their base form like happily -> happy etc....

In [36]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


#Vectorization

CountVectorizer takes two arguments max_features means only take 5000 different words from tags column and remove the stop_words like are,is,in,of,do etc..... from vectorization

In [37]:
cv = CountVectorizer(max_features=5000,stop_words='english')

Passing tags column gives a Compressed sparse row format convert this to an array and then printing the shape

Gives output like(movies,words)

In [38]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [39]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [40]:
vectors.shape

(4806, 5000)

cv.get_feature_names_out() - gives all the selected 5000 words from the tags column

In [41]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

**Cosine similarity** calculate the angle between two points(like movies) if angle is smaller similar movies and angle larger means dissimilar movies and then we pass vectors in this.

In short what happens here: each movie is mapped to 4806 movies present in the dataset that is why the shape returns as (4806,4806)

In [42]:
similarity = cosine_similarity(vectors)

In [43]:
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

find_closest_title(movie, movie_list)
This function helps find the closest matching movie title using fuzzy matching from the difflib library.

It searches for the most similar title in movie_list and returns the closest match if its similarity score is at least 0.6.

If no match is found, it returns None.

In [44]:
def find_closest_title(movie, movie_list):
    """Finds the closest matching movie title using fuzzy matching."""
    match = difflib.get_close_matches(movie, movie_list, n=1, cutoff=0.6)
    return match[0] if match else None


1. **recommend(movie)** -
This function suggests five similar movies based on the given movie title.

2. **movie_list = new_df['title'].tolist()** - Extracts all movie titles from new_df (a DataFrame) and stores them in movie_list.

3. **if movie not in movie_list:** - If the input movie is not found in movie_list, the function tries to find a similar title using find_closest_title().

4. If a close match is not found, the function suggests the corrected title and updates movie to this closest match.
If no match is found, the function terminates with a message.

5. **movie_index = new_df[new_df['title'] == movie].index[0]** - this first check the movie name in the database and then returns the index of that movie.

6. **distances = similarity[movie_index]** - this finds similar movies related to movie tags and then finds the distance between them.(similarity score)

7. **movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]** - **enumerate(distances)** - Converts the distances list into an enumerated list of tuples, where each tuple is in the format (index, distance),
rest of the code sorts distances in descending order with their indices, then selects the top 5 most similar items (excluding the first, which is likely the same item being compared).

8. **for i in movies_list:**       
   **print(new_df.iloc[i[0]].title)**  - this first searches the name of the similar movies and then return their title.

In [45]:
def recommend(movie):
    movie_list = new_df['title'].tolist()

    # Check if the movie exists
    if movie not in movie_list:
        closest_match = find_closest_title(movie, movie_list)
        if closest_match:
            print(f"Movie not found! Did you mean '{closest_match}'?")
            movie = closest_match
        else:
            print("Movie not found. Try another title.")
            return

    # Get movie index
    movie_index = new_df[new_df['title'] == movie].index[0]

    # Get similarity scores
    distances = similarity[movie_index]

    # Sort by highest similarity
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    # Print recommended movies
    print("Recommended Movies:")
    for i in movies_list:
        print(new_df.iloc[i[0]].title)


In [46]:
recommend("Avatar")

Recommended Movies:
Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [47]:
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [48]:
pickle.dump(similarity,open('similarity.pkl','wb'))