## Load dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Locations of the TMDB 5000 CSV files, given relative to the working directory.
movies_path = "dataset/tmdb_5000_movies.csv"
credits_path = "dataset/tmdb_5000_credits.csv"

movies = pd.read_csv(movies_path)
# NOTE(review): `credits` is loaded but not referenced in the cells visible here.
credits = pd.read_csv(credits_path)

# Keep only rows that have both the rating target and overview text.
# dropna returns a new frame, so `movies` itself is left untouched.
data = movies.dropna(subset=["vote_average", "overview"]).reset_index(drop=True)

# Hold out 30% of the rows, then halve the hold-out into validation and test
# (70 / 15 / 15 overall). Fixed seeds keep the split reproducible.
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, shuffle=True)

# Renumber every split from 0 so row labels match row positions.
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)


In [2]:
# Sanity check on the training split: (rows, columns) followed by the column names.
print(train_df.shape)
print(train_df.columns)

(3360, 20)
Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


## Feature Extraction

### Build the feature vectors for each movie:
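Turns the overview text into a bag-of-words representation:
- Keeps the top `max_features` most frequent words after English stop-word removal (the function default is 500; the demo cell below uses 200 and the ablation sweeps several sizes).
- Each word is a binary feature (1 = word appears in the overview, 0 = does not).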

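Adds numeric features:
**budget**, **popularity**, **runtime** (the ablation below also tries **vote_count** as a fourth).
Missing values are filled with 0, then each column is standardized (zero mean, unit variance) using a StandardScaler fitted on the training split only.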
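Concatenates the text features and numeric features into a single sparse matrix:
- The first block of dimensions = text features (overview words, at most `max_overview_features` of them)
- The remaining dimensions = numeric features, one per requested numeric column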

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix

def build_overview_features(train_df, val_df, test_df, max_features=500):
    """Encode the ``overview`` column of each split as binary bag-of-words features.

    The vocabulary is learned from the training split only and then applied
    unchanged to the validation and test splits.

    Parameters
    ----------
    train_df, val_df, test_df : DataFrames with an ``overview`` text column.
    max_features : int, cap on the vocabulary size handed to CountVectorizer.

    Returns
    -------
    (X_train, X_val, X_test) : tuple of matrices, one row per movie.
    feature_names : np.ndarray of vocabulary terms, each prefixed with ``overview__``.
    vectorizer : the fitted CountVectorizer.
    """
    bow = CountVectorizer(
        stop_words="english",
        binary=True,  # presence/absence rather than term counts
        max_features=max_features,
    )

    # Fit on train only; val/test reuse the learned vocabulary.
    train_matrix = bow.fit_transform(train_df["overview"])
    val_matrix, test_matrix = (
        bow.transform(frame["overview"]) for frame in (val_df, test_df)
    )

    prefixed_terms = [f"overview__{term}" for term in bow.get_feature_names_out()]
    return (train_matrix, val_matrix, test_matrix), np.array(prefixed_terms), bow

def build_numeric_features(train_df, val_df, test_df, cols):
    """Build standardized numeric feature matrices for the three splits.

    Unlike the previous version, the input DataFrames are never modified:
    a column that is missing from ``train_df`` is represented by an all-zero
    column in the returned matrices only, instead of being written back into
    the caller's frames.

    Parameters
    ----------
    train_df, val_df, test_df : DataFrames holding the numeric columns.
    cols : sequence of column names, in the order they should appear.

    Returns
    -------
    (X_train, X_val, X_test) : tuple of scipy CSR matrices, one column per
        entry of ``cols``.
    feature_names : np.ndarray built from ``cols``.
    scaler : the StandardScaler fitted on the training split.

    Raises
    ------
    KeyError
        If a column is present in ``train_df`` but missing from ``val_df`` or
        ``test_df`` (same as before).

    Notes
    -----
    Missing values are replaced with 0.0 *before* scaling, so they are treated
    as raw zeros rather than being mean-imputed.
    """
    cols = list(cols)

    # As before, "missing" is decided from the training frame alone; such a
    # column is zero-filled in every split so the scaler sees a constant column.
    absent_from_train = {col for col in cols if col not in train_df.columns}

    def _numeric_block(df):
        # Assemble a fresh (n_rows, n_cols) float array without touching ``df``.
        columns = [
            np.zeros(len(df), dtype=float)
            if col in absent_from_train
            else df[col].fillna(0.0).values.astype(float)
            for col in cols
        ]
        return np.column_stack(columns)

    numeric_train = _numeric_block(train_df)
    numeric_val = _numeric_block(val_df)
    numeric_test = _numeric_block(test_df)

    # Scaling statistics come from the training split only.
    scaler = StandardScaler()
    numeric_train_scaled = scaler.fit_transform(numeric_train)
    numeric_val_scaled = scaler.transform(numeric_val)
    numeric_test_scaled = scaler.transform(numeric_test)

    # Wrapped as sparse so the result can be hstack-ed with the text features.
    X_train = csr_matrix(numeric_train_scaled)
    X_val = csr_matrix(numeric_val_scaled)
    X_test = csr_matrix(numeric_test_scaled)
    feature_names = np.array(cols)
    return (X_train, X_val, X_test), feature_names, scaler

from sklearn.metrics import mean_squared_error, mean_absolute_error

def build_features(train_df, val_df, test_df, max_overview_features, numeric_cols, verbose=False):
    """Assemble the full feature matrices and rating targets for all three splits.

    Text features from ``build_overview_features`` come first; when
    ``numeric_cols`` is non-empty, the scaled numeric features from
    ``build_numeric_features`` are appended to the right of them.

    Parameters
    ----------
    train_df, val_df, test_df : DataFrames with ``overview`` and ``vote_average``.
    max_overview_features : int, vocabulary cap for the overview text features.
    numeric_cols : list of numeric column names; an empty list disables the
        numeric block.
    verbose : bool, print matrix shapes and the first feature names when True.

    Returns
    -------
    dict with the combined matrices (``X_*``), the text-only matrices
    (``overview_*``), feature-name arrays, the fitted overview vectorizer,
    the targets (``y_*``) and the mean training rating (``global_mean_rating``).
    """
    overview_blocks, overview_feature_names, overview_vectorizer = build_overview_features(
        train_df, val_df, test_df, max_features=max_overview_features
    )
    overview_train, overview_val, overview_test = overview_blocks

    # One list of column blocks per split, kept in train / val / test order.
    blocks_per_split = [[overview_train], [overview_val], [overview_test]]
    name_blocks = [overview_feature_names]

    if len(numeric_cols) > 0:
        numeric_blocks, numeric_feature_names, _ = build_numeric_features(
            train_df, val_df, test_df, numeric_cols
        )
        for split_blocks, numeric_block in zip(blocks_per_split, numeric_blocks):
            split_blocks.append(numeric_block)
        name_blocks.append(numeric_feature_names)
    else:
        numeric_feature_names = np.array([])

    # Stack the blocks side by side and convert to CSR.
    X_train, X_val, X_test = (
        hstack(split_blocks).tocsr() for split_blocks in blocks_per_split
    )
    feature_names = np.concatenate(name_blocks)

    y_train, y_val, y_test = (
        frame["vote_average"].values for frame in (train_df, val_df, test_df)
    )

    # Mean training rating, returned for callers that need a fallback prediction.
    global_mean_rating = y_train.mean()

    if verbose:
        print("X_train shape:", X_train.shape)
        print("X_val shape:", X_val.shape)
        print("X_test shape:", X_test.shape)
        print()
        print("Total number of features:", X_train.shape[1])
        print("Number of text features:", len(overview_feature_names))
        print("Number of numeric features:", len(numeric_feature_names))
        print()
        print("First 20 feature names:")
        print(feature_names[:20])

    return {
        "X_train": X_train,
        "X_val": X_val,
        "X_test": X_test,
        "overview_train": overview_train,
        "overview_val": overview_val,
        "overview_test": overview_test,
        "feature_names": feature_names,
        "overview_feature_names": overview_feature_names,
        "numeric_feature_names": numeric_feature_names,
        "overview_vectorizer": overview_vectorizer,
        "y_train": y_train,
        "y_val": y_val,
        "y_test": y_test,
        "global_mean_rating": global_mean_rating,
    }


In [4]:
# Demo feature set: up to 200 overview words plus three standardized numeric
# columns. verbose=True prints the matrix shapes and the first feature names.
feats = build_features(
    train_df,
    val_df,
    test_df,
    max_overview_features=200,
    numeric_cols=["budget", "popularity", "runtime"],
    verbose=True
)


X_train shape: (3360, 203)
X_val shape: (720, 203)
X_test shape: (720, 203)

Total number of features: 203
Number of text features: 200
Number of numeric features: 3

First 20 feature names:
['overview__accident' 'overview__action' 'overview__adventure'
 'overview__agent' 'overview__america' 'overview__american'
 'overview__angeles' 'overview__army' 'overview__attempt' 'overview__away'
 'overview__based' 'overview__battle' 'overview__beautiful'
 'overview__begin' 'overview__begins' 'overview__best' 'overview__big'
 'overview__black' 'overview__boy' 'overview__british']


In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_matrix(X_query, X_ref):
    """Return the pairwise cosine similarities between rows of two matrices.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_similarity``:
    entry ``[i, j]`` of the result compares row ``i`` of ``X_query`` with
    row ``j`` of ``X_ref``.
    """
    similarities = cosine_similarity(X_query, X_ref)
    return similarities

def weighted_average_predict(sim_matrix, y_train, k=20, global_mean=None):
    """Predict ratings as a similarity-weighted average of neighbour ratings.

    For each query row, the ``k`` reference items with the highest similarity
    are selected; among those, only neighbours with strictly positive
    similarity contribute, weighted by their similarity. A query with no
    positive-similarity neighbour receives ``global_mean``.

    Parameters
    ----------
    sim_matrix : array-like of shape (n_query, n_ref)
        Similarity of every query item to every reference item. A scipy
        sparse matrix is densified.
    y_train : array-like of length n_ref
        Ratings of the reference items, in the same order as the columns of
        ``sim_matrix``. A pandas Series is accepted regardless of its index;
        values are always taken by position.
    k : int or None, default 20
        Number of neighbours to consider. ``None`` (or any value >= n_ref)
        uses every reference item.
    global_mean : float or None
        Fallback prediction; defaults to the mean of ``y_train``.

    Returns
    -------
    np.ndarray of shape (n_query,) with the predicted ratings.

    Raises
    ------
    ValueError
        If ``k`` is negative, ``sim_matrix`` is not 2-D, or its column count
        does not match ``len(y_train)``.
    """
    # Work on plain ndarrays: indexing a pandas Series with an integer array
    # is label-based, which breaks (or silently misaligns) on a non-default index.
    y_train = np.asarray(y_train, dtype=float)
    if hasattr(sim_matrix, "toarray"):
        sim_matrix = sim_matrix.toarray()
    sim_matrix = np.asarray(sim_matrix, dtype=float)

    if sim_matrix.ndim != 2:
        raise ValueError("sim_matrix must be 2-dimensional (n_query, n_ref)")
    n_query, n_ref = sim_matrix.shape
    if y_train.ndim != 1 or y_train.shape[0] != n_ref:
        raise ValueError(
            "y_train must be 1-dimensional with one rating per column of sim_matrix"
        )
    if k is not None and k < 0:
        # A negative k would make argpartition/slicing pick the wrong items.
        raise ValueError("k must be a non-negative integer or None")

    if global_mean is None:
        global_mean = float(np.mean(y_train))

    use_top_k = k is not None and k < n_ref

    y_pred = np.empty(n_query, dtype=float)
    for i in range(n_query):
        sims = sim_matrix[i]
        if use_top_k:
            # argpartition on the negated row puts the k largest similarities
            # first (in arbitrary order) without a full sort.
            idx = np.argpartition(-sims, k)[:k]
            neighbor_sims = sims[idx]
            neighbor_ratings = y_train[idx]
        else:
            neighbor_sims = sims
            neighbor_ratings = y_train

        positive = neighbor_sims > 0
        if not np.any(positive):
            y_pred[i] = global_mean
        else:
            weights = neighbor_sims[positive]
            ratings = neighbor_ratings[positive]
            y_pred[i] = np.sum(weights * ratings) / np.sum(weights)
    return y_pred


## Experiments

In [6]:
def compute_metrics(y_true, y_pred):
    """Return ``(MSE, RMSE, MAE)`` for the predictions against the true values.

    RMSE is the square root of the MSE computed by scikit-learn.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    return squared_error, squared_error ** 0.5, absolute_error

# Ablation grid: every overview vocabulary size is crossed with every
# numeric-feature subset, and each combination is scored on the validation split.
word_sizes = [100, 200, 400, 800, 1200]

# (label, numeric columns to include); "drop_all" uses text features only.
# NOTE(review): vote_count is tied to how the rating itself was collected;
# confirm it is legitimately available at prediction time.
numeric_configs = [
    ("all", ["budget", "popularity", "runtime", "vote_count"]),
    ("drop_vote_count", ["budget", "popularity", "runtime"]),
    ("drop_runtime", ["budget", "popularity", "vote_count"]),
    ("drop_all", []),
    ("drop_budget", ["popularity", "runtime", "vote_count"]),
]

# One result record per configuration; turned into a DataFrame once at the end.
rows = []

for max_words in word_sizes:
    for numeric_name, numeric_cols in numeric_configs:
        # Rebuilds all features for this configuration; this rebinds the
        # notebook-level `feats` created by the earlier demo cell.
        feats = build_features(
            train_df,
            val_df,
            test_df,
            max_overview_features=max_words,
            numeric_cols=numeric_cols,
            verbose=False
        )

        X_train = feats["X_train"]
        X_val = feats["X_val"]
        # overview_train / overview_val are extracted but not used in this loop.
        overview_train = feats["overview_train"]
        overview_val = feats["overview_val"]
        y_train = feats["y_train"]
        y_val = feats["y_val"]
        global_mean_rating = feats["global_mean_rating"]

        # Similarity of every validation movie to every training movie,
        # computed on the combined (text + numeric) feature vectors.
        cos_sim_val_train = cosine_similarity_matrix(X_val, X_train)
        # Predict from the 20 most similar training movies, falling back to the
        # mean training rating when no neighbour has positive similarity.
        y_pred_cos = weighted_average_predict(
            cos_sim_val_train,
            y_train,
            k=20,
            global_mean=global_mean_rating
        )
        mse_cos, rmse_cos, mae_cos = compute_metrics(y_val, y_pred_cos)

        rows.append({
            "word_features": max_words,
            "numeric_config": numeric_name,
            "MSE": mse_cos,
            "RMSE": rmse_cos,
            "MAE": mae_cos,
        })

ablation_df = pd.DataFrame(rows)
# Displayed sorted for readability; `ablation_df` itself keeps the loop order.
ablation_df.sort_values(["word_features", "numeric_config"]).reset_index(drop=True)


Unnamed: 0,word_features,numeric_config,MSE,RMSE,MAE
0,100,all,1.015266,1.007604,0.689959
1,100,drop_all,1.327912,1.152351,0.794432
2,100,drop_budget,1.052835,1.026077,0.724144
3,100,drop_runtime,1.112336,1.054673,0.728012
4,100,drop_vote_count,1.03851,1.019073,0.707575
5,200,all,1.027442,1.013628,0.690299
6,200,drop_all,1.347282,1.160725,0.808208
7,200,drop_budget,1.086291,1.042253,0.730124
8,200,drop_runtime,1.159565,1.076831,0.734692
9,200,drop_vote_count,1.048237,1.023834,0.703956
