In [1]:
import pandas as pd
import numpy as np
import ast # For safely evaluating string representations of lists/dicts

# Load the datasets
movies = pd.read_csv('../data/tmdb_5000_movies.csv')
credits = pd.read_csv('../data/tmdb_5000_credits.csv')

# Merge on the TMDB id rather than on 'title'. Titles are not a unique key:
# a title join pairs every movie with the credits of every same-named movie,
# which yields duplicated rows carrying the wrong cast/crew.
# The credits copy of 'title' is dropped first so the merged frame keeps one
# unsuffixed 'title' column alongside 'movie_id'.
# NOTE(review): assumes the movies file exposes the key as 'id' and the
# credits file as 'movie_id' (the standard TMDB 5000 layout) - confirm.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

In [2]:
# Keep only the columns needed to build the tags:
# movie_id, title, overview, genres, keywords, cast, crew.
# Rows with any missing value are dropped. This is done as one chained
# expression instead of dropna(inplace=True), so no column subset is mutated
# in place (a common source of SettingWithCopyWarning).
# reset_index(drop=True) renumbers the surviving rows 0..n-1 so that index
# labels equal row positions; anything later that is positional (such as a
# row-by-row similarity matrix) then lines up with the frame's index.
movies = (
    movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

In [3]:
# Helper: pull the 'name' field out of every entry of a stringified list of dicts
def convert(obj):
    """Return the 'name' value of each entry in a stringified list of dicts.

    `obj` is parsed with ast.literal_eval; names are returned in their
    original order. An empty list literal yields an empty list.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# 'genres' and 'keywords' are both run through convert, one column at a time
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

# For 'cast', keep only the first few actors (3 by default)
def convert3(obj, limit=3):
    """Return the 'name' of the first `limit` entries of a stringified list.

    Args:
        obj: String holding a Python-literal list of dicts that each have a
            'name' key; it is parsed with ast.literal_eval.
        limit: Maximum number of names to return. Defaults to 3, so existing
            single-argument calls behave exactly as before.

    Returns:
        A list of at most `limit` names, in their original order.

    Raises:
        ValueError: If `limit` is negative.
    """
    if limit < 0:
        # A negative slice bound would silently mean "all but the last k".
        raise ValueError("limit must be non-negative")
    return [entry['name'] for entry in ast.literal_eval(obj)[:limit]]

# Replace each stringified cast list with the names convert3 returns for it
movies['cast'] = movies['cast'].map(convert3)

# For 'crew', keep only the first entry whose job is 'Director'
def fetch_director(obj):
    """Return a one-element list with the first director's name, or [].

    `obj` is a stringified list of dicts parsed with ast.literal_eval. The
    entries are scanned in order and the scan stops at the first one whose
    'job' equals 'Director'.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

# Replace each stringified crew list with the list fetch_director returns for it
movies['crew'] = movies['crew'].map(fetch_director)

In [4]:
# Split the free-text overview on whitespace into a list of words
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Collapse the spaces inside each multi-word name so it stays a single token
# (e.g., "James Cameron" -> "JamesCameron")
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

# Concatenate the per-row lists into one combined 'tags' list
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# Keep only the identifying columns plus the combined tags.
# The explicit .copy() makes new_df an independent frame, so assigning to its
# 'tags' column below does not raise SettingWithCopyWarning.
essential_columns = ['movie_id', 'title', 'tags']
new_df = movies[essential_columns].copy()

# Turn each tag list into a single space-separated, lowercase string
new_df['tags'] = new_df['tags'].apply(lambda words: " ".join(words).lower())

# Preview the first rows to check the final 'tags' column
print(new_df.head())

   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                                tags  
0  in the 22nd century, a paraplegic marine is di...  
1  captain barbossa, long believed to be dead, ha...  
2  a cryptic message from bond’s past sends him o...  
3  following the death of district attorney harve...  
4  john carter is a war-weary, former military ca...  


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF vectorization of the tag strings.
# TF-IDF is used here in preference to plain counts (CountVectorizer).
# max_features=5000 caps the vocabulary at the 5000 most frequent terms, and
# stop_words='english' removes common English words.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tag_matrix = tfidf.fit_transform(new_df['tags'])
vectors = tag_matrix.toarray()

# Cosine similarity between every pair of rows: entry [i, j] compares
# movie i with movie j.
similarity = cosine_similarity(vectors)

In [6]:
import pickle

# Save the DataFrame (which contains movie titles) as a plain dict.
# Each file is opened in a 'with' block so the handle is flushed and closed
# as soon as the dump finishes, even if pickling raises.
with open('static/movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(new_df.to_dict(), movie_dict_file)

# Save the similarity matrix
with open('static/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

print("Model saved successfully!")

Model saved successfully!
