## *Import Libraries*

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
import joblib

In [None]:
# Read the two TMDB 5000 CSV exports; both paths are relative, so the files
# are expected in the notebook's working directory.
movies_csv = 'tmdb_5000_movies.csv'
credits_csv = 'tmdb_5000_credits.csv'

movies = pd.read_csv(movies_csv)    # per-movie metadata
credits = pd.read_csv(credits_csv)  # per-movie cast and crew


In [4]:
# Preview the first rows of the movie metadata to see which columns are available.
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
# Preview the first rows of the credits table (movie_id, title, cast, crew).
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [None]:
# (rows, columns) of each raw table, shown side by side as a tuple.
(movies.shape, credits.shape)


((4803, 20), (4803, 4))

## *Clean and Merge Data*

In [8]:
# The credits table keys each movie by 'movie_id' while the movies table uses
# 'id'; give both the same key name so they can be joined on it.
credits_renamed = credits.rename(columns={"movie_id": "id"})

# Join metadata with cast/crew on the shared 'id' key (pandas' default join
# type). Both tables carry a 'title' column, so the merged frame ends up with
# 'title_x' and 'title_y'.
movies_merge = pd.merge(movies, credits_renamed, on='id')

In [9]:
# Sanity check: (rows, columns) of the merged table.
movies_merge.shape

(4803, 23)

In [10]:
# Count missing values per column to decide which columns need cleaning.
movies_merge.isna().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title_x                    0
vote_average               0
vote_count                 0
title_y                    0
cast                       0
crew                       0
dtype: int64

In [11]:


# Remove the columns that are not used further on, and replace missing
# overviews with an empty string so every row holds text for vectorisation.
movies_cleaned = (
    movies_merge
    .drop(columns=['homepage', 'title_x', 'title_y', 'status', 'production_countries'])
    .assign(overview=lambda frame: frame['overview'].fillna(''))
)


In [13]:
# Discard rows that have no release date. The surviving rows keep their
# original index labels (nothing resets the index here), so from this point on
# the index is no longer guaranteed to be a gap-free 0..n-1 range.
movies_cleaned = movies_cleaned.dropna(subset=['release_date'])

In [14]:
# Discard rows that have no runtime. As with the release-date filter, the
# remaining rows keep their original index labels, so labels and row positions
# can differ afterwards.
movies_cleaned = movies_cleaned.dropna(subset=['runtime'])


In [15]:
# Replace missing taglines with a readable placeholder so the column has no NaN.
movies_cleaned['tagline'] = movies_cleaned['tagline'].fillna('No tagline available')


In [16]:
# Confirm the cleaning steps left no missing values in any column.
movies_cleaned.isna().sum()

budget                  0
genres                  0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
tagline                 0
vote_average            0
vote_count              0
cast                    0
crew                    0
dtype: int64

## *Convert Text to Numerical Features Using TF-IDF*

In [17]:
# Configure the TF-IDF vectoriser that turns each overview into a weighted
# bag of word n-grams.
tfv = TfidfVectorizer(
    analyzer='word',           # build features from word tokens
    token_pattern=r'\w{1,}',   # a token is any run of one or more word characters
    strip_accents='unicode',   # fold accented characters to their unaccented form
    stop_words='english',      # drop the built-in English stop-word list
    ngram_range=(1, 3),        # unigrams, bigrams and trigrams
    min_df=3,                  # ignore terms that occur in fewer than 3 overviews
    max_features=None,         # no cap on the vocabulary size
)

# Learn the vocabulary and encode every overview in a single pass.
overviews = movies_cleaned['overview']
tfv_matrix = tfv.fit_transform(overviews)

# Report (number of movies, number of retained terms).
print("\nTF-IDF Matrix Shape:", tfv_matrix.shape)


TF-IDF Matrix Shape: (4800, 10416)


## *Compute Similarity Using Sigmoid Kernel*

In [18]:
# Pairwise sigmoid-kernel similarity between all overview vectors. Passing
# only one matrix makes scikit-learn compare it with itself, which yields a
# square matrix with one row and one column per movie.
sig = sigmoid_kernel(tfv_matrix)

# Report the size of the similarity matrix.
print("\nSigmoid Kernel Matrix Shape:", sig.shape)


Sigmoid Kernel Matrix Shape: (4800, 4800)


## *Create Reverse Mapping for Movie Titles and Indices*


In [20]:
# Map each movie title to its ROW POSITION (0..n-1) in movies_cleaned.
#
# Positions, not index labels, are what the similarity matrix and `.iloc`
# understand. movies_cleaned.index still carries the labels from before the
# dropna() calls, so it has gaps and its largest label exceeds the matrix
# size; storing those labels would send every movie after the first dropped
# row to the wrong similarity row (or out of bounds).
indices = pd.Series(range(len(movies_cleaned)), index=movies_cleaned['original_title'])

# Keep only the first occurrence of each title so that `indices[title]` always
# yields a single integer. (Series.drop_duplicates() would compare the values,
# which are already unique, and leave duplicated titles in place.)
indices = indices[~indices.index.duplicated(keep='first')]

print("\nReverse Mapping:")
print(indices.head(10))


Reverse Mapping:
original_title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
Spider-Man 3                                5
Tangled                                     6
Avengers: Age of Ultron                     7
Harry Potter and the Half-Blood Prince      8
Batman v Superman: Dawn of Justice          9
dtype: int64


## *Function to Get Recommendations*

In [22]:
def give_recommendations(title, sig=sig, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Value of ``original_title`` to look up in the global ``indices``
        mapping.
    sig : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. The default is bound when this cell is executed,
        so re-run the cell after recomputing ``sig``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` of up to ``top_n`` movies, most similar first.
        If ``title`` is unknown, a one-element list containing an error
        message is returned instead.

    Notes
    -----
    Assumes ``indices`` maps a title to a row position that is valid for both
    ``sig`` and ``movies_cleaned.iloc`` -- verify where ``indices`` is built.
    """
    # Only the title lookup can legitimately fail with KeyError; keeping the
    # try-block this narrow avoids masking unrelated errors as "not found".
    try:
        idx = indices[title]
    except KeyError:
        return ["Movie not found. Please try another title."]

    # A title that occurs more than once in the mapping yields a Series of
    # matches; use the first so that sig[idx] is a single row of scores.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every movie by its similarity to the requested one, highest first.
    # sorted() is stable, so tied scores keep their original row order.
    scores = sig[idx]
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    # Exclude the requested movie by position rather than assuming it ranks
    # first: ties with other movies could otherwise drop a genuine
    # recommendation and return the movie itself.
    movie_indices = [pos for pos in ranked if pos != idx][:top_n]

    return movies_cleaned['original_title'].iloc[movie_indices]


In [23]:

# Smoke-test the recommender with a title known to be in the data set.
avatar_recommendations = give_recommendations('Avatar')

print("\nRecommendations for 'Avatar':")
print(avatar_recommendations)


Recommendations for 'Avatar':
1341                Obitaemyy Ostrov
634                       The Matrix
3604                       Apollo 18
2130                    The American
775                        Supernova
529                 Tears of the Sun
151                          Beowulf
311     The Adventures of Pluto Nash
847                         Semi-Pro
942                 The Book of Life
Name: original_title, dtype: object


## *Save the Model and Data*

In [24]:
# Persist the computed artefacts with joblib so they can be reloaded later
# without recomputing them. File names are relative to the working directory.
# NOTE(review): the '.pkl' files are pickle-based -- only load them back from
# trusted sources.

# Save the similarity matrix
joblib.dump(sig, 'sig_matrix.pkl')

# Save the TF-IDF Vectorizer
joblib.dump(tfv, 'tfv_vectorizer.pkl')

# Save the reverse mapping
joblib.dump(indices, 'indices.pkl')

# Save the cleaned data
# (this is the cell's last expression, so its return value -- the list of
# written file names -- is what the notebook displays as the cell output)
joblib.dump(movies_cleaned, 'movies_cleaned.pkl')


['movies_cleaned.pkl']