## Load dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

movies_path = "dataset/tmdb_5000_movies.csv"
credits_path = "dataset/tmdb_5000_credits.csv"

movies = pd.read_csv(movies_path)
credits = pd.read_csv(credits_path)

data = movies.copy()
data = data.dropna(subset=["vote_average", "overview"])
data = data.reset_index(drop=True)

train_df, temp_df = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    shuffle=True,
)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)


In [2]:
print(train_df.shape)
print(train_df.columns)

(3360, 20)
Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [6]:
import ast
import pandas as pd

TOP_K_CAST = 5      
TOP_N_ACTORS = 100  

def parse_cast(cast_str):
    if pd.isna(cast_str):
        return []
    try:
        return ast.literal_eval(cast_str)
    except (ValueError, SyntaxError):
        return []


def get_top_k_actors(cast_list, k):
    if not isinstance(cast_list, list):
        return []
    cast_sorted = sorted(cast_list, key=lambda x: x.get("order", 1e9))
    top_k = cast_sorted[:k]
    return [member.get("name") for member in top_k if "name" in member]


credits_parsed = credits.copy()
credits_parsed["cast_parsed"] = credits_parsed["cast"].apply(parse_cast)

records = []
for row in credits_parsed.itertuples(index=False):
    movie_id = row.movie_id
    top_actors = get_top_k_actors(row.cast_parsed, TOP_K_CAST)
    for actor_name in top_actors:
        records.append({"movie_id": movie_id, "actor_name": actor_name})

top_cast_df = pd.DataFrame(records)

movie_ratings = data[["id", "vote_average"]].rename(columns={"id": "movie_id"})
merged = top_cast_df.merge(movie_ratings, on="movie_id", how="inner")

actor_counts = merged["actor_name"].value_counts()

top_actor_names = actor_counts.head(TOP_N_ACTORS).index
filtered = merged[merged["actor_name"].isin(top_actor_names)]

actor_stats = (
    filtered
    .groupby("actor_name")
    .agg(
        n_movies=("movie_id", "nunique"),    
        appearances=("movie_id", "size"),    
        mean_rating=("vote_average", "mean"), 
    )
    .reset_index()
    .sort_values("mean_rating", ascending=False)
)



In [5]:
print("Best 10 actors")
display(actor_stats.head(10))

print("Worst 10 Actors")
display(actor_stats.tail(10))


Best 10 actors


Unnamed: 0,actor_name,n_movies,appearances,mean_rating
63,Leonardo DiCaprio,22,22,7.072727
94,Tom Hanks,29,29,7.051724
13,Brad Pitt,32,32,6.8125
19,Christian Bale,23,23,6.786957
77,Philip Seymour Hoffman,22,22,6.777273
65,Mark Ruffalo,22,22,6.768182
29,Ed Harris,25,25,6.756
25,Denzel Washington,30,30,6.73
20,Christopher Plummer,26,26,6.723077
79,Ralph Fiennes,25,25,6.696


Worst 10 Actors


Unnamed: 0,actor_name,n_movies,appearances,mean_rating
96,Will Ferrell,28,28,6.017857
73,Nicolas Cage,35,35,5.988571
28,Dwayne Johnson,21,21,5.985714
52,John Travolta,27,27,5.985185
0,Adam Sandler,24,24,5.970833
76,Paul Rudd,24,24,5.958333
92,Sylvester Stallone,27,27,5.888889
75,Owen Wilson,31,31,5.848387
30,Eddie Murphy,28,28,5.8
57,Justin Long,21,21,5.619048


## Feature Extraction

### Build the feature vectors for each movie:
Turns the overview text into a bag-of-words representation:
Top 500 most frequent words (after English stop-word removal).
Each word is a binary feature (1 = word appears in the overview, 0 = does not).

Adds 4 numeric features:
**budget**, **popularity**, **runtime**
Standardized (zero mean, unit variance) using StandardScaler.
Concatenate text features and numeric features into a single sparse matrix:
First 500 dimensions = text features (overview words)
Last 4 dimensions = numeric features

In [7]:
import ast
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error


TOP_K_CAST = 5      
TOP_N_ACTORS = 100  


def parse_cast(cast_str):
    if pd.isna(cast_str):
        return []
    try:
        return ast.literal_eval(cast_str)
    except (ValueError, SyntaxError):
        return []


def get_top_k_actors(cast_list, k):
    if not isinstance(cast_list, list):
        return []
    cast_sorted = sorted(cast_list, key=lambda x: x.get("order", 1e9))
    top_k = cast_sorted[:k]
    return [m.get("name") for m in top_k if "name" in m]


def build_cast_features(
    train_df,
    val_df,
    test_df,
    credits_df,
    top_k_cast=5,
    top_n_actors=100,
):
    """
    Build one-hot actor features for each movie.
    Length = top_n_actors; 1 if actor is in top_k_cast for that movie.
    """

    credits_parsed = credits_df.copy()
    credits_parsed["cast_parsed"] = credits_parsed["cast"].apply(parse_cast)

    movie_to_actors = {}
    records = []
    for row in credits_parsed.itertuples(index=False):
        movie_id = row.movie_id
        top_actors = get_top_k_actors(row.cast_parsed, top_k_cast)
        movie_to_actors[movie_id] = top_actors
        for name in top_actors:
            records.append((movie_id, name))

    cast_df = pd.DataFrame(records, columns=["movie_id", "actor_name"])

    actor_counts = cast_df["actor_name"].value_counts()
    top_actor_names = list(actor_counts.head(top_n_actors).index)
    actor_to_idx = {name: i for i, name in enumerate(top_actor_names)}

    def make_matrix(df):
        n = len(df)
        mat = np.zeros((n, len(top_actor_names)), dtype=np.float32)
        movie_ids = df["id"].values  # TMDB movie id in movies.csv
        for i, mid in enumerate(movie_ids):
            actors = movie_to_actors.get(mid, [])
            for name in actors:
                j = actor_to_idx.get(name)
                if j is not None:
                    mat[i, j] = 1.0
        return csr_matrix(mat)

    X_train = make_matrix(train_df)
    X_val = make_matrix(val_df)
    X_test = make_matrix(test_df)

    feature_names = np.array([f"cast__{name}" for name in top_actor_names])
    return (X_train, X_val, X_test), feature_names



In [8]:
def build_overview_features(train_df, val_df, test_df, max_features=500):
    vectorizer = CountVectorizer(
        max_features=max_features,
        binary=True,
        stop_words="english",
    )
    X_train = vectorizer.fit_transform(train_df["overview"])
    X_val = vectorizer.transform(val_df["overview"])
    X_test = vectorizer.transform(test_df["overview"])
    words = vectorizer.get_feature_names_out()
    feature_names = np.array([f"overview__{w}" for w in words])
    return (X_train, X_val, X_test), feature_names, vectorizer


def build_numeric_features(train_df, val_df, test_df, cols):
    for col in cols:
        if col not in train_df.columns:
            train_df[col] = 0.0
            val_df[col] = 0.0
            test_df[col] = 0.0
    numeric_train = train_df[cols].fillna(0.0).values.astype(float)
    numeric_val = val_df[cols].fillna(0.0).values.astype(float)
    numeric_test = test_df[cols].fillna(0.0).values.astype(float)

    scaler = StandardScaler()
    numeric_train_scaled = scaler.fit_transform(numeric_train)
    numeric_val_scaled = scaler.transform(numeric_val)
    numeric_test_scaled = scaler.transform(numeric_test)

    X_train = csr_matrix(numeric_train_scaled)
    X_val = csr_matrix(numeric_val_scaled)
    X_test = csr_matrix(numeric_test_scaled)
    feature_names = np.array(cols)
    return (X_train, X_val, X_test), feature_names, scaler

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix

def build_overview_features(train_df, val_df, test_df, max_features=500):
    vectorizer = CountVectorizer(
        max_features=max_features,
        binary=True,
        stop_words="english"
    )
    X_train = vectorizer.fit_transform(train_df["overview"])
    X_val = vectorizer.transform(val_df["overview"])
    X_test = vectorizer.transform(test_df["overview"])
    words = vectorizer.get_feature_names_out()
    feature_names = np.array([f"overview__{w}" for w in words])
    return (X_train, X_val, X_test), feature_names, vectorizer

def build_numeric_features(train_df, val_df, test_df, cols):
    for col in cols:
        if col not in train_df.columns:
            train_df[col] = 0.0
            val_df[col] = 0.0
            test_df[col] = 0.0
    numeric_train = train_df[cols].fillna(0.0).values.astype(float)
    numeric_val = val_df[cols].fillna(0.0).values.astype(float)
    numeric_test = test_df[cols].fillna(0.0).values.astype(float)
    scaler = StandardScaler()
    numeric_train_scaled = scaler.fit_transform(numeric_train)
    numeric_val_scaled = scaler.transform(numeric_val)
    numeric_test_scaled = scaler.transform(numeric_test)
    X_train = csr_matrix(numeric_train_scaled)
    X_val = csr_matrix(numeric_val_scaled)
    X_test = csr_matrix(numeric_test_scaled)
    feature_names = np.array(cols)
    return (X_train, X_val, X_test), feature_names, scaler

from sklearn.metrics import mean_squared_error, mean_absolute_error

def build_features(train_df, val_df, test_df, max_overview_features, numeric_cols, verbose=False):
    feature_blocks_train = []
    feature_blocks_val = []
    feature_blocks_test = []
    feature_name_blocks = []

    (overview_train, overview_val, overview_test), overview_feature_names, overview_vectorizer = build_overview_features(
        train_df, val_df, test_df, max_features=max_overview_features
    )
    feature_blocks_train.append(overview_train)
    feature_blocks_val.append(overview_val)
    feature_blocks_test.append(overview_test)
    feature_name_blocks.append(overview_feature_names)

    if len(numeric_cols) > 0:
        (numeric_train_sparse, numeric_val_sparse, numeric_test_sparse), numeric_feature_names, numeric_scaler = build_numeric_features(
            train_df, val_df, test_df, numeric_cols
        )
        feature_blocks_train.append(numeric_train_sparse)
        feature_blocks_val.append(numeric_val_sparse)
        feature_blocks_test.append(numeric_test_sparse)
        feature_name_blocks.append(numeric_feature_names)
    else:
        numeric_feature_names = np.array([])

    X_train = hstack(feature_blocks_train).tocsr()
    X_val = hstack(feature_blocks_val).tocsr()
    X_test = hstack(feature_blocks_test).tocsr()

    feature_names = np.concatenate(feature_name_blocks)

    y_train = train_df["vote_average"].values
    y_val = val_df["vote_average"].values
    y_test = test_df["vote_average"].values

    global_mean_rating = y_train.mean()

    if verbose:
        print("X_train shape:", X_train.shape)
        print("X_val shape:", X_val.shape)
        print("X_test shape:", X_test.shape)
        print()
        print("Total number of features:", X_train.shape[1])
        print("Number of text features:", len(overview_feature_names))
        print("Number of numeric features:", len(numeric_feature_names))
        print()
        print("First 20 feature names:")
        print(feature_names[:20])

    return {
        "X_train": X_train,
        "X_val": X_val,
        "X_test": X_test,
        "overview_train": overview_train,
        "overview_val": overview_val,
        "overview_test": overview_test,
        "feature_names": feature_names,
        "overview_feature_names": overview_feature_names,
        "numeric_feature_names": numeric_feature_names,
        "overview_vectorizer": overview_vectorizer,
        "y_train": y_train,
        "y_val": y_val,
        "y_test": y_test,
        "global_mean_rating": global_mean_rating,
    }


In [9]:
def build_features(
    train_df,
    val_df,
    test_df,
    max_overview_features,
    numeric_cols,
    use_cast=False,
    top_k_cast=TOP_K_CAST,
    top_n_actors=TOP_N_ACTORS,
    verbose=False,
):
    feature_blocks_train = []
    feature_blocks_val = []
    feature_blocks_test = []
    feature_name_blocks = []

    # Text features (overview)
    (overview_train, overview_val, overview_test), overview_feature_names, overview_vectorizer = (
        build_overview_features(
            train_df,
            val_df,
            test_df,
            max_features=max_overview_features,
        )
    )
    feature_blocks_train.append(overview_train)
    feature_blocks_val.append(overview_val)
    feature_blocks_test.append(overview_test)
    feature_name_blocks.append(overview_feature_names)

    # Numeric features
    if len(numeric_cols) > 0:
        (numeric_train_sparse, numeric_val_sparse, numeric_test_sparse), numeric_feature_names, numeric_scaler = (
            build_numeric_features(
                train_df,
                val_df,
                test_df,
                numeric_cols,
            )
        )
        feature_blocks_train.append(numeric_train_sparse)
        feature_blocks_val.append(numeric_val_sparse)
        feature_blocks_test.append(numeric_test_sparse)
        feature_name_blocks.append(numeric_feature_names)
    else:
        numeric_feature_names = np.array([])

    # Cast one-hot features (top-N actors, only if enabled)
    if use_cast:
        (cast_train, cast_val, cast_test), cast_feature_names = build_cast_features(
            train_df,
            val_df,
            test_df,
            credits,
            top_k_cast=top_k_cast,
            top_n_actors=top_n_actors,
        )
        feature_blocks_train.append(cast_train)
        feature_blocks_val.append(cast_val)
        feature_blocks_test.append(cast_test)
        feature_name_blocks.append(cast_feature_names)
    else:
        cast_feature_names = np.array([])

    # Combine all blocks
    X_train = hstack(feature_blocks_train).tocsr()
    X_val = hstack(feature_blocks_val).tocsr()
    X_test = hstack(feature_blocks_test).tocsr()

    feature_names = np.concatenate(feature_name_blocks)

    y_train = train_df["vote_average"].values
    y_val = val_df["vote_average"].values
    y_test = test_df["vote_average"].values

    global_mean_rating = y_train.mean()

    if verbose:
        print("X_train shape:", X_train.shape)
        print("X_val shape:", X_val.shape)
        print("X_test shape:", X_test.shape)
        print()
        print("Total number of features:", X_train.shape[1])
        print("Number of text features:", len(overview_feature_names))
        print("Number of numeric features:", len(numeric_feature_names))
        print("Number of cast features:", len(cast_feature_names))
        print()
        print("First 20 feature names:")
        print(feature_names[:20])

    return {
        "X_train": X_train,
        "X_val": X_val,
        "X_test": X_test,
        "overview_train": overview_train,
        "overview_val": overview_val,
        "overview_test": overview_test,
        "feature_names": feature_names,
        "overview_feature_names": overview_feature_names,
        "numeric_feature_names": numeric_feature_names,
        "cast_feature_names": cast_feature_names,
        "overview_vectorizer": overview_vectorizer,
        "y_train": y_train,
        "y_val": y_val,
        "y_test": y_test,
        "global_mean_rating": global_mean_rating,
    }


In [None]:
feats = build_features(
    train_df,
    val_df,
    test_df,
    max_overview_features=200,
    numeric_cols=["budget", "popularity", "runtime"],
    use_cast=False,
    verbose=True
)


X_train shape: (3360, 203)
X_val shape: (720, 203)
X_test shape: (720, 203)

Total number of features: 203
Number of text features: 200
Number of numeric features: 3
Number of cast features: 0

First 20 feature names:
['overview__accident' 'overview__action' 'overview__adventure'
 'overview__agent' 'overview__america' 'overview__american'
 'overview__angeles' 'overview__army' 'overview__attempt' 'overview__away'
 'overview__based' 'overview__battle' 'overview__beautiful'
 'overview__begin' 'overview__begins' 'overview__best' 'overview__big'
 'overview__black' 'overview__boy' 'overview__british']


In [12]:
feats = build_features(
    train_df,
    val_df,
    test_df,
    max_overview_features=200,
    numeric_cols=["budget", "popularity", "runtime"],
    use_cast=True,
    top_k_cast=TOP_K_CAST,
    top_n_actors=TOP_N_ACTORS,
    verbose=True
)


X_train shape: (3360, 303)
X_val shape: (720, 303)
X_test shape: (720, 303)

Total number of features: 303
Number of text features: 200
Number of numeric features: 3
Number of cast features: 100

First 20 feature names:
['overview__accident' 'overview__action' 'overview__adventure'
 'overview__agent' 'overview__america' 'overview__american'
 'overview__angeles' 'overview__army' 'overview__attempt' 'overview__away'
 'overview__based' 'overview__battle' 'overview__beautiful'
 'overview__begin' 'overview__begins' 'overview__best' 'overview__big'
 'overview__black' 'overview__boy' 'overview__british']


In [14]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_matrix(X_query, X_ref):
    return cosine_similarity(X_query, X_ref)

def weighted_average_predict(sim_matrix, y_train, k=20, global_mean=None):
    if global_mean is None:
        global_mean = float(np.mean(y_train))
    n_query = sim_matrix.shape[0]
    y_pred = np.empty(n_query, dtype=float)
    for i in range(n_query):
        sims = sim_matrix[i]
        if k is not None and k < sims.shape[0]:
            idx = np.argpartition(-sims, k)[:k]
        else:
            idx = np.arange(sims.shape[0])
        neighbor_sims = sims[idx]
        neighbor_ratings = y_train[idx]
        positive = neighbor_sims > 0
        if not np.any(positive):
            y_pred[i] = global_mean
        else:
            weights = neighbor_sims[positive]
            ratings = neighbor_ratings[positive]
            y_pred[i] = np.sum(weights * ratings) / np.sum(weights)
    return y_pred


## Experiments

In [16]:
def compute_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    return mse, rmse, mae

word_sizes = [1, 100, 200, 400, 800, 1200]

numeric_configs = [
    ("all", ["budget", "popularity", "runtime", "vote_count"]),
    ("drop_vote_count", ["budget", "popularity", "runtime"]),
    ("drop_runtime", ["budget", "popularity", "vote_count"]),
    ("drop_all", []),
    ("drop_budget", ["popularity", "runtime", "vote_count"]),
]

rows = []

for max_words in word_sizes:
    for numeric_name, numeric_cols in numeric_configs:
        feats = build_features(
            train_df,
            val_df,
            test_df,
            max_overview_features=max_words,
            numeric_cols=numeric_cols,
            use_cast=True,
            top_k_cast=TOP_K_CAST,
            top_n_actors=TOP_N_ACTORS,
            verbose=False
        )

        X_train = feats["X_train"]
        X_val = feats["X_val"]
        overview_train = feats["overview_train"]
        overview_val = feats["overview_val"]
        y_train = feats["y_train"]
        y_val = feats["y_val"]
        global_mean_rating = feats["global_mean_rating"]

        cos_sim_val_train = cosine_similarity_matrix(X_val, X_train)
        y_pred_cos = weighted_average_predict(
            cos_sim_val_train,
            y_train,
            k=20,
            global_mean=global_mean_rating
        )
        mse_cos, rmse_cos, mae_cos = compute_metrics(y_val, y_pred_cos)

        rows.append({
            "word_features": max_words,
            "numeric_config": numeric_name,
            "MSE": mse_cos,
            "RMSE": rmse_cos,
            "MAE": mae_cos,
        })

ablation_df = pd.DataFrame(rows)
ablation_df.sort_values(["word_features", "numeric_config"]).reset_index(drop=True)


Unnamed: 0,word_features,numeric_config,MSE,RMSE,MAE
0,1,all,0.921092,0.959735,0.66937
1,1,drop_all,1.316173,1.147246,0.794058
2,1,drop_budget,0.986364,0.993159,0.715733
3,1,drop_runtime,0.85273,0.923434,0.682352
4,1,drop_vote_count,0.949936,0.974647,0.6899
5,100,all,1.007774,1.00388,0.690306
6,100,drop_all,1.343712,1.159186,0.815122
7,100,drop_budget,1.081073,1.039747,0.731789
8,100,drop_runtime,1.124408,1.060381,0.743496
9,100,drop_vote_count,1.045684,1.022587,0.712887
