In [2]:
import pandas as pd
import numpy as np
import ast

In [6]:
movies = pd.read_csv("dataset/tmdb_5000_movies.csv")
credits = pd.read_csv("dataset/tmdb_5000_credits.csv")

In [24]:
data = movies.merge(credits, left_on='id', right_on='movie_id')
data.drop('movie_id', axis=1, inplace=True)

In [25]:
data.drop(['homepage', 'tagline', 'status'], axis=1, inplace=True)

In [26]:
data = data.dropna(subset=['overview', 'genres', 'cast', 'crew'])
data = data.reset_index(drop=True)


In [27]:
def extract_names(text, top_n=5):
    return " ".join(
        [i['name'].replace(" ", "") for i in ast.literal_eval(text)[:top_n]]
    )

In [28]:
def get_director(text):
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            return i['name'].replace(" ", "")
    return ""

In [29]:
data['cast'] = data['cast'].apply(extract_names)
data['genre'] = data['genres'].apply(extract_names)
data['director'] = data['crew'].apply(get_director)

In [30]:
data['features'] = (
    data['cast'] * 2 +
    data['director'] * 3 +
    data['genre'] * 3 +
    data['overview']
)
data['features'] = data['features'].str.lower()

In [31]:
print(data.columns)


Index(['budget', 'genres', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'title_x', 'vote_average', 'vote_count', 'title_y',
       'cast', 'crew', 'genre', 'director', 'features'],
      dtype='object')


In [40]:
data['features'] = data['features'].apply(lambda x: x.lower())
data['title'] = data['original_title'].str.strip().str.lower()

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['features'])

In [43]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [44]:
indices = pd.Series(data.index, index=data['title']).drop_duplicates()

In [45]:
def recommend_movies_hybrid(title, top_n=10, w_genre=0.4, w_cast=0.4, w_features=0.2):
    title = title.strip().lower()

    if title not in indices:
        print(f"Movie '{title}' not found in dataset.")
        return pd.Series(dtype='str')

    idx = indices[title]

    target_genre_set = set(data['genre'].iloc[idx].split())
    target_cast_set = set(data['cast'].iloc[idx].split())

    base_scores = list(enumerate(cosine_sim[idx]))
    scores = []

    for i, base in base_scores:
        if i == idx:
            continue

        score = 0

        # Genre overlap (accuracy fix)
        genre_i = set(data['genre'].iloc[i].split())
        genre_overlap = len(genre_i & target_genre_set) / (len(target_genre_set) + 1e-6)
        score += w_genre * genre_overlap

        # Cast overlap (top cast only)
        cast_i = set(data['cast'].iloc[i].split())
        cast_overlap = len(cast_i & target_cast_set) / (len(target_cast_set) + 1e-6)
        score += w_cast * cast_overlap

        # TF-IDF similarity
        score += w_features * base

        scores.append((i, score))

    scores = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]
    movie_indices = [i for i, _ in scores]

    return data['title'].iloc[movie_indices]


In [46]:
recommend_movies_hybrid("avatar")

Unnamed: 0,title
14,man of steel
46,x-men: days of future past
2163,the covenant
813,superman
72,suicide squad
3493,beastmaster 2: through the portal of time
870,superman ii
1192,spawn
1932,sheena
1191,small soldiers


In [47]:
data[data['title'].str.contains('avatar', na=False)]


Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,...,title_x,vote_average,vote_count,title_y,cast,crew,genre,director,features,title
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",...,Avatar,7.2,11800,Avatar,SamWorthington ZoeSaldana SigourneyWeaver Step...,"[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",Action Adventure Fantasy ScienceFiction,JamesCameron,samworthington zoesaldana sigourneyweaver step...,avatar
