In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load only the first 10,000 rows; nrows stops parsing once they are read.
df = pd.read_csv("../data/data.csv", nrows=10000)

# Drop rows that lack a title or genres, then renumber the rows 0..n-1.
# The similarity matrix built below is addressed by row *position*, while
# `indices` stores index labels; resetting the index makes the two coincide.
# Without it, every dropped row shifts later lookups onto the wrong movie.
df = df.dropna(subset=["title", "genres"]).reset_index(drop=True)

# Turn "Drama, Romance" into "Drama Romance" so the vectorizer receives a
# whitespace-separated genre string.
df["genres_str"] = df["genres"].str.replace(", ", " ", regex=False)

# TF-IDF vectorization of the genre strings
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["genres_str"])

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
# NOTE: the result is a dense n x n array (n = number of rows kept).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Map each title to its row number. Series.drop_duplicates() would compare the
# row numbers (which are all unique) and therefore remove nothing, so repeated
# titles are filtered on the title index instead, keeping the first occurrence.
indices = pd.Series(df.index, index=df["title"])
indices = indices[~indices.index.duplicated(keep="first")]

def recommend(title, num_recommendations=5):
    """Return the movies whose genre vectors are most similar to *title*.

    Relies on the module-level ``indices`` (title -> index label of ``df``),
    ``cosine_sim`` (square similarity matrix addressed by row position) and
    ``df`` (the movie table).

    Args:
        title: Exact title to look up in ``indices``.
        num_recommendations: Maximum number of rows to return. Values below
            zero are treated as zero.

    Returns:
        A DataFrame with the columns ``title``, ``genres`` and
        ``imdbAverageRating`` for the most similar movies, best match first,
        never including the queried movie itself. If *title* is unknown the
        string ``"Title not found."`` is returned instead (kept for backward
        compatibility with existing callers).
    """
    label = indices.get(title)
    if label is None:
        return "Title not found."

    # A title that occurs more than once makes .get() return a Series of
    # labels; use the first occurrence rather than failing further down.
    if isinstance(label, pd.Series):
        label = label.iloc[0]

    # `indices` stores index labels, but cosine_sim and .iloc work on
    # positions. Translate explicitly so gaps in df.index cannot misalign them.
    row = df.index.get_loc(label)
    scores = cosine_sim[row]

    # sorted() is stable, so rows with equal scores keep their original order.
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    # Exclude the queried row by position instead of assuming it sorts first:
    # many movies share identical genres and tie with it at the top score.
    limit = max(num_recommendations, 0)
    movie_indices = [pos for pos in ranked if pos != row][:limit]

    return df[["title", "genres", "imdbAverageRating"]].iloc[movie_indices]

# Demo: print the default number of recommendations for a single title
print(recommend(title="Forrest Gump"))


                  title          genres  imdbAverageRating
20        Before Sunset  Drama, Romance                8.1
32          Open Hearts  Drama, Romance                7.5
50   Brokeback Mountain  Drama, Romance                7.7
102            Chocolat  Drama, Romance                7.2
120   Good Will Hunting  Drama, Romance                8.3
