In [1]:
import pandas as pd
import numpy as np
import ast
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Join on the movie id instead of 'title'. Titles are not a unique key: two
# different movies sharing a title would each be paired with both credits rows,
# producing duplicated rows that carry the wrong cast/crew.
# NOTE(review): assumes the standard TMDB 5000 layout, where the movies file
# exposes `id` and the credits file exposes `movie_id` plus its own `title`
# copy -- confirm against the CSV headers.
# The redundant credits `title` is dropped so the merged frame keeps a single,
# unsuffixed `title` column.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)


In [2]:
# Narrow the frame to the identifier, the title, and the fields that feed the tags
keep_columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = movies[keep_columns]


In [3]:
# Convert JSON-style strings to lists

def convert(obj):
    """Extract every ``'name'`` value from a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Python-literal text such as ``"[{'id': 28, 'name': 'Action'}]"``.
        ``None``, NaN, and blank strings are treated as "no entries".

    Returns
    -------
    list
        The ``'name'`` of each entry, in the order the entries appear.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is not a valid Python literal (raised by ``ast.literal_eval``).
    KeyError
        If an entry has no ``'name'`` key.
    """
    # A missing CSV cell arrives as None or NaN; NaN is the only float that is
    # not equal to itself, which avoids needing an extra import to detect it.
    if obj is None or (isinstance(obj, float) and obj != obj):
        return []
    if isinstance(obj, str) and not obj.strip():
        return []
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Both columns are parsed by the same helper, so handle them in one pass
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)


In [4]:
def convert_cast(obj, limit=3):
    """Extract the ``'name'`` of the first ``limit`` entries of a stringified list.

    Parameters
    ----------
    obj : str
        Python-literal text holding a list of dicts, each with a ``'name'`` key.
        ``None``, NaN, and blank strings are treated as "no entries".
    limit : int, optional
        Maximum number of names to return, counted from the start of the list.
        Defaults to 3.

    Returns
    -------
    list
        At most ``limit`` names, in their original order.

    Raises
    ------
    ValueError
        If ``limit`` is negative, or ``obj`` is not a valid Python literal.
    SyntaxError
        If ``obj`` cannot be parsed as a Python literal.
    KeyError
        If one of the first ``limit`` entries has no ``'name'`` key.
    """
    if limit < 0:
        raise ValueError("limit must be non-negative")
    # A missing CSV cell arrives as None or NaN; NaN is the only float that is
    # not equal to itself, which avoids needing an extra import to detect it.
    if obj is None or (isinstance(obj, float) and obj != obj):
        return []
    if isinstance(obj, str) and not obj.strip():
        return []
    # Only the leading entries are read, so entries past `limit` are never
    # inspected for a 'name' key.
    leading = list(ast.literal_eval(obj))[:limit]
    return [member['name'] for member in leading]

# Swap the raw cast text for the short list of names produced by convert_cast
movies = movies.assign(cast=movies['cast'].apply(convert_cast))


In [None]:
def fetch_director(obj):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Parameters
    ----------
    obj : str
        Python-literal text holding a list of dicts, each with ``'job'`` and
        ``'name'`` keys. ``None``, NaN, and blank strings are treated as
        "no entries".

    Returns
    -------
    list
        A one-element list with the director's name, or an empty list when no
        entry has the job ``'Director'``.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is not a valid Python literal (raised by ``ast.literal_eval``).
    KeyError
        If an inspected entry lacks the ``'job'`` (or matching ``'name'``) key.
    """
    # A missing CSV cell arrives as None or NaN; NaN is the only float that is
    # not equal to itself, which avoids needing an extra import to detect it.
    if obj is None or (isinstance(obj, float) and obj != obj):
        return []
    if isinstance(obj, str) and not obj.strip():
        return []
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Stop at the first match; later directors are intentionally ignored.
            return [member['name']]
    return []

# Swap the raw crew text for the (possibly empty) director list from fetch_director
movies = movies.assign(crew=movies['crew'].apply(fetch_director))


In [None]:
# Replace missing overviews with an empty string, then break each one into
# whitespace-separated words (an empty overview becomes an empty list)
overview_text = movies['overview'].fillna("")
movies['overview'] = overview_text.apply(str.split)


In [None]:
# Concatenate the per-movie lists, in order: overview words, genres, keywords,
# cast names, director
content_tokens = movies['overview'] + movies['genres'] + movies['keywords']
people_tokens = movies['cast'] + movies['crew']
movies['tags'] = content_tokens + people_tokens
    

In [None]:
# Collapse each token list into one space-separated, lowercased string
movies['tags'] = [" ".join(tokens).lower() for tokens in movies['tags']]


In [None]:
# Build TF-IDF features from the tag strings: English stop words are dropped
# and the vocabulary is capped at 5000 terms
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tag_matrix = tfidf.fit_transform(movies['tags'])
vectors = tag_matrix.toarray()

# Pairwise cosine similarity between the rows of `vectors`
similarity = cosine_similarity(vectors)


In [None]:
# Keep only the columns worth persisting (genres is retained alongside the tags)
final_df = movies[['movie_id', 'title', 'tags', 'genres']].copy()

# Save pickles. The context managers flush and close each file handle as soon as
# its dump finishes, instead of leaving the handles open until garbage collection.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(final_df, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
