# Imports & Dataset

## Importing relevant packages

In [None]:
import pandas as pd
import numpy as np
import ast
from collections import Counter
from itertools import chain
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import gdown
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

## Loading Preprocessed files into Colab

In [None]:
# --- Movie metadata: fetch the preprocessed CSV from Google Drive, then load it ---
metadata_drive = '1vUxq_77g3r1jH1S3Zctmj-u5pArq7iZd'
gdown.download(
    f"https://drive.google.com/uc?id={metadata_drive}",
    "movies_metadata.csv",
    quiet=False,
)
df_movies_metadata = pd.read_csv("movies_metadata.csv")

# --- User ratings: same pattern, with a separate Drive file id ---
ratings_small_drive = '1lNJbFv82a2HTSaHqY7yloXoPovUpy6zn'
gdown.download(
    f"https://drive.google.com/uc?id={ratings_small_drive}",
    "ratings_small.csv",
    quiet=False,
)
df_ratings = pd.read_csv("ratings_small.csv")


Downloading...
From: https://drive.google.com/uc?id=1vUxq_77g3r1jH1S3Zctmj-u5pArq7iZd
To: /content/movies_metadata.csv
100%|██████████| 50.9M/50.9M [00:00<00:00, 54.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1lNJbFv82a2HTSaHqY7yloXoPovUpy6zn
To: /content/ratings_small.csv
100%|██████████| 1.46M/1.46M [00:00<00:00, 123MB/s]


In [None]:
# Preview the first rows of the movie metadata to check which columns were loaded.
df_movies_metadata.head()

Unnamed: 0,adult,genres,id,original_language,original_title,overview,production_companies,production_countries,spoken_languages,release_year,runtime_category,vote_count_log,vote_average_norm,vote_count_norm,popularity_norm,years_since_release,keyword_values,textual_representation
0,False,"Animation, Comedy, Family",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,United States of America,English,1995,Medium,8.597113,0.77,0.900011,0.040087,22,"jealousy, toy, boy, friendship, friends, rival...","This movie is titled Toy Story, produced in Un..."
1,False,"Adventure, Fantasy, Family",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,"English, Français",1995,Medium,7.78904,0.69,0.815416,0.031079,22,"board game, disappearance, based on children's...","This movie is titled Jumanji, produced in Unit..."
2,False,"Romance, Comedy",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,"Warner Bros., Lancaster Gate",United States of America,English,1995,Medium,4.532599,0.65,0.474507,0.021394,22,"fishing, best friend, duringcreditsstinger, ol...","This movie is titled Grumpier Old Men, produce..."
3,False,"Comedy, Drama, Romance",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Twentieth Century Fox Film Corporation,United States of America,English,1995,Long,3.555348,0.61,0.372201,0.007049,22,"based on novel, interracial relationship, sing...","This movie is titled Waiting to Exhale, produc..."
4,False,Comedy,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,"Sandollar Productions, Touchstone Pictures",United States of America,English,1995,Medium,5.159055,0.57,0.540089,0.01532,22,"baby, midlife crisis, confidence, aging, daugh...",This movie is titled Father of the Bride Part ...


# LSTM Data Pre-processing

Defining functions to clean text and combine genre and keyword fields

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is available before it is read below.
nltk.download('stopwords')
# English stopwords as a set; preprocess_text tests each token for membership in it.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a raw string into lowercase, stopword-free tokens.

    Punctuation and digits are deleted (not replaced by spaces), the text is
    lowercased and stripped, and every whitespace-separated token found in the
    module-level `stop_words` set is dropped.

    Returns the remaining tokens joined by single spaces, or "" when `text`
    is not a string.
    """
    if not isinstance(text, str):
        return ""

    without_punct = re.sub(r"[^\w\s]", '', text)
    without_digits = re.sub(r"\d+", '', without_punct)
    normalized = without_digits.lower().strip()

    # Keep only the tokens that are not stopwords, preserving their order.
    kept = filter(lambda token: token not in stop_words, normalized.split())
    return ' '.join(kept)

def build_doc(row):
    """Build the text document for one movie from its genres and keywords.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with 'genres' and
            'keyword_values' entries.

    Returns:
        The cleaned genres followed by the cleaned keywords, separated by a
        single space and stripped. A field that is not a string contributes
        nothing.
    """
    def _clean_field(value):
        # Non-string values (such as a missing field) contribute no tokens.
        if not isinstance(value, str):
            return ""
        # preprocess_text deletes punctuation without inserting a space, so
        # both list separators are turned into spaces first. Otherwise a value
        # like "Action,Comedy" or "Action|Comedy" would fuse into one token.
        return preprocess_text(value.replace('|', ' ').replace(',', ' '))

    genres = _clean_field(row['genres'])
    keywords = _clean_field(row['keyword_values'])

    # Combine all parts into one document
    return f"{genres} {keywords}".strip()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training a Word2Vec model on tokenized genre and keyword text

In [None]:
# Install/upgrade gensim in the running environment (shell command, not Python).
!pip install gensim --upgrade ## If encounter error, restart kernel and run again
from gensim.models import Word2Vec

# One cleaned "genres + keywords" document per movie (see build_doc).
df_movies_metadata['doc'] = df_movies_metadata.apply(build_doc, axis=1)
# Not the last expression of the cell, so this preview is not displayed.
df_movies_metadata['doc'].head()

# Word2Vec expects each sentence as a list of tokens.
tokenized_docs = [doc.split() for doc in df_movies_metadata['doc'].tolist()]


# Train 100-dimensional skip-gram embeddings on the movie documents.
# NOTE(review): training is stochastic and multi-threaded (workers=4), so the
# vectors are presumably not identical between runs — confirm if exact
# reproducibility matters.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,  # Skip-gram
    workers=4,
    epochs=20
)



Generating average Word2Vec embeddings for each movie document

In [None]:
def embed_doc(doc, w2v_model, embedding_dim=100):
    """Return the mean word vector of the tokens in `doc`.

    Tokens are obtained by whitespace-splitting `doc`; tokens missing from
    `w2v_model.wv` are skipped. When no token is known, a zero vector of
    length `embedding_dim` is returned instead.
    """
    known_vectors = []
    for token in doc.split():
        if token in w2v_model.wv:
            known_vectors.append(w2v_model.wv[token])

    if not known_vectors:
        return np.zeros(embedding_dim)
    return np.mean(known_vectors, axis=0)

# Apply to all rows
# Embed every movie document; the zero-vector fallback is sized to the trained model.
embedding_dim = w2v_model.vector_size
df_movies_metadata['embedding'] = df_movies_metadata['doc'].apply(
    embed_doc, args=(w2v_model, embedding_dim)
)


For each movie, the code concatenates three types of features:
- A semantic vector from the Word2Vec model, representing textual information (e.g., genres, keywords),
- Three normalized numerical features: `vote_average_norm`, `vote_count_norm`, and `release_year_norm`,
- Encoded IDs for categorical attributes like production company and runtime category (stored separately).

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns so they can index embedding tables later.
prod_encoder = LabelEncoder()
runtime_encoder = LabelEncoder()

df_movies_metadata['prod_id'] = prod_encoder.fit_transform(df_movies_metadata['production_companies'])
df_movies_metadata['runtime_id'] = runtime_encoder.fit_transform(df_movies_metadata['runtime_category'])

# Min-max scale the release year to [0, 1].
min_year = df_movies_metadata['release_year'].min()
max_year = df_movies_metadata['release_year'].max()
# Guard against a zero range (all movies from the same year), which would
# otherwise turn the whole column into NaN.
year_span = (max_year - min_year) or 1

df_movies_metadata['release_year_norm'] = (df_movies_metadata['release_year'] - min_year) / year_span

# Vocabulary sizes for the categorical embedding layers.
num_prods = df_movies_metadata['prod_id'].nunique()
num_runtimes = df_movies_metadata['runtime_id'].nunique()

# Build the feature matrix in one vectorised step instead of iterating rows.
# A row obtained from iterrows() is an object-dtype Series, so concatenating a
# slice of it produced object-dtype vectors; here everything is plain float64
# with the same values.
semantic_part = np.stack(df_movies_metadata['embedding'].to_numpy()).astype(np.float64)
numeric_part = df_movies_metadata[
    ['vote_average_norm', 'vote_count_norm', 'release_year_norm']
].to_numpy(dtype=np.float64)

# Final matrix for LSTM input
combined_vector = np.hstack([semantic_part, numeric_part])  # shape: [num_samples, input_dim]
combined_features = list(combined_vector)                   # one 1-D vector per movie, in row order

# Keep separate category IDs for embedding
prod_ids = df_movies_metadata['prod_id'].to_numpy()         # shape: [num_samples]
runtime_ids = df_movies_metadata['runtime_id'].to_numpy()   # shape: [num_samples]


In [None]:
# Store each movie's combined feature vector alongside its metadata row.
df_movies_metadata['combined_vector'] = combined_features
# Rebuild the 2-D feature matrix from the column (one row per movie).
combined_vector = np.stack(df_movies_metadata['combined_vector'].values)
df_movies_metadata.head(10)

Unnamed: 0,adult,genres,id,original_language,original_title,overview,production_companies,production_countries,spoken_languages,release_year,...,popularity_norm,years_since_release,keyword_values,textual_representation,doc,embedding,prod_id,runtime_id,release_year_norm,combined_vector
0,False,"Animation, Comedy, Family",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,United States of America,English,1995,...,0.040087,22,"jealousy, toy, boy, friendship, friends, rival...","This movie is titled Toy Story, produced in Un...",animation comedy family jealousy toy boy frien...,"[-0.11513898, 0.008001769, -0.019505141, -0.09...",15199,1,0.828767,"[-0.11513897776603699, 0.008001768961548805, -..."
1,False,"Adventure, Fantasy, Family",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,"English, Français",1995,...,0.031079,22,"board game, disappearance, based on children's...","This movie is titled Jumanji, produced in Unit...",adventure fantasy family board game disappeara...,"[-0.3166961, 0.2819797, -0.15894498, -0.022609...",19622,1,0.828767,"[-0.3166961073875427, 0.2819797098636627, -0.1..."
2,False,"Romance, Comedy",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,"Warner Bros., Lancaster Gate",United States of America,English,1995,...,0.021394,22,"fishing, best friend, duringcreditsstinger, ol...","This movie is titled Grumpier Old Men, produce...",romance comedy fishing best friend duringcredi...,"[-0.31996247, 0.27372667, -0.28294957, -0.0098...",21675,1,0.828767,"[-0.3199624717235565, 0.2737266719341278, -0.2..."
3,False,"Comedy, Drama, Romance",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Twentieth Century Fox Film Corporation,United States of America,English,1995,...,0.007049,22,"based on novel, interracial relationship, sing...","This movie is titled Waiting to Exhale, produc...",comedy drama romance based novel interracial r...,"[-0.17462923, 0.01776824, -0.24283282, -0.0685...",19763,0,0.828767,"[-0.17462922632694244, 0.017768239602446556, -..."
4,False,Comedy,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,"Sandollar Productions, Touchstone Pictures",United States of America,English,1995,...,0.01532,22,"baby, midlife crisis, confidence, aging, daugh...",This movie is titled Father of the Bride Part ...,comedy baby midlife crisis confidence aging da...,"[0.13128638, 0.073717125, -0.13439795, -0.2470...",16838,1,0.828767,"[0.1312863826751709, 0.07371712476015091, -0.1..."
5,False,"Action, Crime, Drama, Thriller",949,en,Heat,"Obsessive master thief, Neil McCauley leads a ...","Regency Enterprises, Forward Pass, Warner Bros.",United States of America,"English, Español",1995,...,0.03274,22,"robbery, detective, bank, obsession, chase, sh...","This movie is titled Heat, produced in United ...",action crime drama thriller robbery detective ...,"[-0.35876963, -0.190521, -0.094962895, 0.09381...",16114,0,0.828767,"[-0.35876962542533875, -0.1905210018157959, -0..."
6,False,"Comedy, Romance",11860,en,Sabrina,An ugly duckling having undergone a remarkable...,"Paramount Pictures, Scott Rudin Productions, M...","Germany, United States of America","Français, English",1995,...,0.012196,22,"paris, brother brother relationship, chauffeur...","This movie is titled Sabrina, produced in Germ...",comedy romance paris brother brother relations...,"[-0.12749855, -0.0057888348, -0.14918678, 0.09...",14774,0,0.828767,"[-0.1274985522031784, -0.005788834765553474, -..."
7,False,"Action, Adventure, Drama, Family",45325,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",Walt Disney Pictures,United States of America,"English, Deutsch",1995,...,0.004678,22,"Action, Adventure, Drama, Family","This movie is titled Tom and Huck, produced in...",action adventure drama family action adventure...,"[-0.1416744, 0.23635162, -0.16573212, 0.128426...",21394,1,0.828767,"[-0.14167439937591553, 0.2363516241312027, -0...."
8,False,"Action, Adventure, Thriller",9091,en,Sudden Death,International action superstar Jean Claude Van...,"Universal Pictures, Imperial Entertainment, Si...",United States of America,English,1995,...,0.009556,22,"terrorist, hostage, explosive, vice president","This movie is titled Sudden Death, produced in...",action adventure thriller terrorist hostage ex...,"[-0.37880528, -0.06083382, 0.07977502, 0.20052...",20643,1,0.828767,"[-0.3788052797317505, -0.06083381921052933, 0...."
9,False,"Adventure, Action, Thriller",710,en,GoldenEye,James Bond must unmask the mysterious head of ...,"United Artists, Eon Productions","United Kingdom, United States of America","English, Pусский, Español",1995,...,0.026824,22,"cuba, falsely accused, secret identity, comput...","This movie is titled GoldenEye, produced in Un...",adventure action thriller cuba falsely accused...,"[-0.14259414, 0.07861366, 0.07763441, 0.018767...",20312,0,0.828767,"[-0.1425941437482834, 0.07861366122961044, 0.0..."


Merging Movie Metadata with Ratings to Form User Interaction History

After the merge, the resulting DataFrame (`df_small_ratings`) is sorted by `userId` and `date_time` to maintain a chronological order of user activity. This ordering is crucial for models like LSTM, which rely on the temporal sequence of user interactions to learn behavioral patterns.

In [None]:
# First, align the column names
df_movies_and_keywords = df_movies_metadata.rename(columns={'id': 'movieId'})

# NOTE(review): this join assumes the metadata `id` and the ratings `movieId`
# come from the same id namespace. The metadata ids look like TMDB ids
# (Toy Story is 862) — confirm the ratings file uses the same ids; if it uses
# a different catalogue's ids, map them through a links table before merging.

# Keep one metadata row per movie so a duplicated id cannot multiply a user's
# rating rows (the class-index mapping built later also de-duplicates on movieId).
movie_features = df_movies_and_keywords[
    ['movieId', 'original_title', 'prod_id', 'runtime_id', 'combined_vector']
].drop_duplicates(subset='movieId')

# Then perform the merge
df_small_ratings = df_ratings.merge(
    movie_features,
    on='movieId',
    how='inner'  # or 'left' if you want to keep all ratings
)

# Chronological order within each user is what the sequence model relies on.
df_small_ratings = df_small_ratings.sort_values(by=['userId', 'date_time'])


df_small_ratings.head()

Unnamed: 0,userId,movieId,rating,date_time,original_title,prod_id,runtime_id,combined_vector
4,1,2294,2.0,2009-12-14 02:51:48,Jay and Silent Bob Strike Back,5488,1,"[-0.2089790254831314, 0.1709032654762268, -0.2..."
5,1,2455,2.5,2009-12-14 02:51:53,Vivement dimanche!,10955,1,"[-0.11371078342199326, -0.0551886148750782, 0...."
0,1,1371,2.5,2009-12-14 02:52:15,Rocky III,20264,1,"[-0.5076473951339722, -0.06163134425878525, 0...."
2,1,2105,4.0,2009-12-14 02:52:19,American Pie,20810,1,"[-0.41693365573883057, 0.29266980290412903, -0..."
3,1,2193,2.0,2009-12-14 02:53:18,My Tutor,4904,1,"[-0.20992015302181244, 0.2695636451244354, -0...."


Preparing Movie Embedding Matrix for Similarity Computation


In [None]:
# Map each movie ID to its feature vector, converted to a float32 numpy array.
# When an ID occurs more than once, the last occurrence wins.
movie_id_to_vector = dict(
    zip(
        df_movies_and_keywords['movieId'],
        (
            np.array(vector, dtype=np.float32)
            for vector in df_movies_and_keywords['combined_vector']
        ),
    )
)

# Ascending movie IDs give a stable row order for the matrix below.
movie_id_list = sorted(movie_id_to_vector)

# One row per movie, in the same order as movie_id_list.
movie_db_vectors = np.stack([movie_id_to_vector[movie_id] for movie_id in movie_id_list])


# LSTM with Softmax Classification

## Preparing Train-Test Data

### Preparing Input for LSTM Softmax Classification

To prepare inputs for the LSTM softmax classification model, we start by defining a fixed `max_len` that determines how many previous movie interactions are used as a sequence input. Each user's interaction history is sorted and processed using a sliding window approach, where each windowed segment of past interactions becomes one input sequence, and the next movie becomes the prediction target.

To ensure sufficient sequence length, only users with at least `max_interactions` interactions are selected. We construct a mapping (`movie_id_to_idx`) that indexes each unique movie, and prepare the full movie embedding matrix (`movie_db_vectors`) for downstream lookup and comparison.

Users are then randomly split into a training + validation group (80%) and a test group (20%) using `train_test_split`; the training + validation group is split 80/20 again into training users and validation users. The training and validation users are used to generate multiple (input, target) pairs for supervised learning, while for the test set, only one held-out prediction per user is kept for fair evaluation.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

max_len = 5  # number of historical time steps
# Used both as the minimum history length required here and, in the next cell,
# as the cap on how many interactions are kept per user.
max_interactions = 19

# Class index space: one class per unique movie, ordered by ascending movieId.
movie_id_list = sorted(df_movies_and_keywords['movieId'].drop_duplicates())
movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_id_list)}
# Row i holds the feature vector of movie_id_list[i] (first occurrence kept for duplicated ids).
movie_db_vectors = np.vstack(df_movies_and_keywords.drop_duplicates('movieId').sort_values('movieId')['combined_vector'].tolist())

# === Group by user and filter ===
grouped = df_small_ratings.groupby('userId')
filtered_users = [uid for uid, group in grouped if len(group) >= max_interactions]
# User-level split: 20% of users are held out for testing, then 20% of the
# remainder for validation; fixed seeds keep the split repeatable.
train_val_users, test_users = train_test_split(filtered_users, test_size=0.2, random_state=42)
train_users, val_users = train_test_split(train_val_users, test_size=0.2, random_state=42)

Each user's (capped) sequence is cut at the 80% mark. For every position before that mark, the model takes a left-padded sequence of up to `max_len` previous movies as input and the movie at that position as the prediction target, so multiple samples are generated from a single user. These samples go to the training set or the validation set depending on which group the user belongs to.

Each training and validation sample includes not just the sequence of movie embeddings but also the corresponding categorical metadata (`prod_id`, `runtime_id`), which are stored separately for embedding purposes.

For test users, exactly one final sequence is created after the training/validation split point to simulate a real-world setting where the model must predict the next movie a user might watch. This ensures no information from the test interaction leaks into training and preserves the independence of evaluation.

In [None]:
# Initialize empty lists to store training, validation, and test data
prod_train, runtime_train = [], []
prod_val, runtime_val = [], []
prod_test, runtime_test = [], []
X_train, y_train = [], []
X_val, y_val = [], []
X_test, y_test = [], []

# Set views of the user splits so the membership checks below are cheap.
train_user_set = set(train_users)
val_user_set = set(val_users)
test_user_set = set(test_users)


def _left_padded_window(vectors, end, window):
    """Return the up-to-`window` vectors before position `end`, left-padded with zero vectors."""
    history = vectors[max(0, end - window):end]
    padding = [np.zeros_like(vectors[0])] * (window - len(history))
    return np.array(padding + history)


# Loop through each user in the filtered user set
for uid in filtered_users:
    # Retrieve and sort the user's interaction history by timestamp
    group = grouped.get_group(uid).sort_values('date_time')
    group = group.iloc[:max_interactions]  # Limit to max number of interactions

    # Extract sequences of movie features and metadata
    seq = group['combined_vector'].tolist()
    movie_ids = group['movieId'].tolist()
    prod_ids_seq = group['prod_id'].tolist()
    runtime_ids_seq = group['runtime_id'].tolist()

    # Define the split point for training/validation (80%)
    split_point = int(0.8 * len(seq))

    # Pick the destination lists once per user (test users get none here).
    if uid in train_user_set:
        sink = (X_train, y_train, prod_train, runtime_train)
    elif uid in val_user_set:
        sink = (X_val, y_val, prod_val, runtime_val)
    else:
        sink = None

    # Build training/validation samples using a sliding window
    if sink is not None:
        X_out, y_out, prod_out, runtime_out = sink
        for i in range(1, split_point):
            X_out.append(_left_padded_window(seq, i, max_len))
            # The movie at position i is the prediction target.
            y_out.append(movie_id_to_idx[movie_ids[i]])
            # Categorical context comes from the LAST OBSERVED movie (i - 1).
            # Reading it at position i would hand the model the target movie's
            # own production company and runtime category, i.e. leak the label.
            prod_out.append(prod_ids_seq[i - 1])
            runtime_out.append(runtime_ids_seq[i - 1])

    # Add one test sample for each test user: predict the movie at the split
    # point from the history before it. At least one observed movie is needed
    # to supply the categorical context.
    if 0 < split_point < len(seq) and uid in test_user_set:
        X_test.append(_left_padded_window(seq, split_point, max_len))
        y_test.append(movie_id_to_idx[movie_ids[split_point]])
        prod_test.append(prod_ids_seq[split_point - 1])
        runtime_test.append(runtime_ids_seq[split_point - 1])


DataLoader Preparation for LSTM Softmax Model

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

def _stack_float32(samples):
    """Stack per-sample sequences into a single float32 array."""
    return np.stack([np.array(sample, dtype=np.float32) for sample in samples])


# === Sequence inputs as float32 arrays ===
X_train = _stack_float32(X_train)
X_val = _stack_float32(X_val)
X_test = _stack_float32(X_test)

# === Training split: inputs, labels and categorical IDs ===
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
prod_train_tensor = torch.tensor(prod_train, dtype=torch.long)
runtime_train_tensor = torch.tensor(runtime_train, dtype=torch.long)

# === Validation split ===
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
prod_val_tensor = torch.tensor(prod_val, dtype=torch.long)
runtime_val_tensor = torch.tensor(runtime_val, dtype=torch.long)

# === Test split ===
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
prod_test_tensor = torch.tensor(prod_test, dtype=torch.long)
runtime_test_tensor = torch.tensor(runtime_test, dtype=torch.long)

# === Datasets: every item is (sequence, label, prod_id, runtime_id) ===
train_dataset = TensorDataset(
    X_train_tensor, y_train_tensor, prod_train_tensor, runtime_train_tensor
)
val_dataset = TensorDataset(
    X_val_tensor, y_val_tensor, prod_val_tensor, runtime_val_tensor
)
test_dataset = TensorDataset(
    X_test_tensor, y_test_tensor, prod_test_tensor, runtime_test_tensor
)

# === DataLoaders: only the training data is shuffled ===
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# === Summary ===
print("Per-user temporal split completed with filtering and capping!")
print("X_train shape:", X_train_tensor.shape)
print("X_val shape:", X_val_tensor.shape)
print("X_test shape:", X_test_tensor.shape)
print("Number of extra features per sample: prod_id, runtime_id")


Per-user temporal split completed with filtering and capping!
X_train shape: torch.Size([4592, 5, 103])
X_val shape: torch.Size([1148, 5, 103])
X_test shape: torch.Size([103, 5, 103])
Number of extra features per sample: prod_id, runtime_id


### Defining Evaluation Metrics Functions
In this recommendation setup, our goal is not to predict the exact next movie a user will watch, but rather to recommend **relevant** ones.

We modify standard metrics (Precision@K, Recall@K, F1, NDCG) to use **cosine similarity** instead of exact ID matching. This allows us to reward predictions that are semantically close to the ground truth movie—even if the IDs differ—making our evaluation more flexible and aligned with real-world recommendation goals.

In [None]:
import numpy as np

def precision_at_k_cosine(y_true, y_probs, movie_db_vectors, k=10, threshold=0.8):
    """Mean fraction of top-k predictions that are cosine-similar to the true movie.

    Args:
        y_true: sequence of true class indices, one per sample.
        y_probs: 2-D array of per-class scores, one row per sample.
        movie_db_vectors: matrix whose row i is the feature vector of class i.
        k: number of highest-scoring classes considered per sample.
        threshold: minimum cosine similarity for a prediction to count as a hit.

    Returns:
        Average over samples of (hits among the top-k) / k, or 0.0 when
        `y_true` is empty.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    vectors = np.asarray(movie_db_vectors)
    total_precision = 0.0

    for i in range(n_samples):
        # Indices of the k highest scores, best first.
        top_k = y_probs[i].argsort()[-k:][::-1]

        true_vec = np.asarray(vectors[y_true[i]], dtype=np.float64)
        candidates = np.asarray(vectors[top_k], dtype=np.float64)

        # Cosine similarity of every candidate to the true vector in one shot;
        # a zero-norm vector yields similarity 0 instead of a division error.
        norms = np.linalg.norm(candidates, axis=1) * np.linalg.norm(true_vec)
        sims = np.divide(candidates @ true_vec, norms,
                         out=np.zeros(len(top_k)), where=norms > 0)

        total_precision += np.count_nonzero(sims >= threshold) / k  # per-sample precision

    return total_precision / n_samples


def recall_at_k_cosine(y_true, y_probs, movie_db_vectors, k=10, threshold=0.8):
    """Fraction of samples with at least one cosine-similar movie in the top-k.

    Args:
        y_true: sequence of true class indices, one per sample.
        y_probs: 2-D array of per-class scores, one row per sample.
        movie_db_vectors: matrix whose row i is the feature vector of class i.
        k: number of highest-scoring classes considered per sample.
        threshold: minimum cosine similarity for a prediction to count as a hit.

    Returns:
        Share of samples for which any top-k prediction reaches `threshold`,
        or 0.0 when `y_true` is empty.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    vectors = np.asarray(movie_db_vectors)
    hits = 0

    for i in range(n_samples):
        # Indices of the k highest scores, best first.
        top_k = y_probs[i].argsort()[-k:][::-1]

        true_vec = np.asarray(vectors[y_true[i]], dtype=np.float64)
        candidates = np.asarray(vectors[top_k], dtype=np.float64)

        # Cosine similarity of every candidate to the true vector in one shot;
        # a zero-norm vector yields similarity 0 instead of a division error.
        norms = np.linalg.norm(candidates, axis=1) * np.linalg.norm(true_vec)
        sims = np.divide(candidates @ true_vec, norms,
                         out=np.zeros(len(top_k)), where=norms > 0)

        # One sufficiently similar prediction is enough for the sample to count.
        if np.any(sims >= threshold):
            hits += 1

    return hits / n_samples


def f1_at_k(precision, recall):
    """Harmonic mean of `precision` and `recall`; 0.0 when their sum is zero."""
    denominator = precision + recall
    return 2 * (precision * recall) / denominator if denominator != 0 else 0.0

def dcg_at_k(ranked_list, true_label, k=10):
    """Discounted gain of the first occurrence of `true_label` in the top-k.

    Returns 1 / log2(rank + 2) for the 0-based rank of the first match among
    the first `k` entries of `ranked_list`, or 0.0 when there is no match.
    """
    for rank, candidate in enumerate(ranked_list):
        if rank >= k:
            break
        if candidate == true_label:
            return 1 / np.log2(rank + 2)  # rank is 0-based, hence the +2
    return 0.0

def ndcg_cosine_at_k(y_true, y_probs, movie_db_vectors, k=10, threshold=0.8):
    """Mean NDCG@k with binary relevance defined by cosine similarity.

    A top-k prediction is relevant when its cosine similarity to the true
    movie's vector is at least `threshold`. The ideal ranking is the same set
    of retrieved predictions with all relevant ones placed first.

    Args:
        y_true: sequence of true class indices, one per sample.
        y_probs: 2-D array of per-class scores, one row per sample.
        movie_db_vectors: matrix whose row i is the feature vector of class i.
        k: number of highest-scoring classes considered per sample.
        threshold: minimum cosine similarity for a prediction to be relevant.

    Returns:
        Average NDCG over samples (a sample with no relevant prediction scores
        0), or 0.0 when `y_true` is empty.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    vectors = np.asarray(movie_db_vectors)
    total_ndcg = 0.0

    for i in range(n_samples):
        # Get top-K predictions, best first
        top_k_indices = y_probs[i].argsort()[-k:][::-1]

        true_vec = np.asarray(vectors[y_true[i]], dtype=np.float64)
        candidates = np.asarray(vectors[top_k_indices], dtype=np.float64)

        # Cosine similarity of every candidate to the true vector in one shot;
        # a zero-norm vector yields similarity 0 instead of a division error.
        norms = np.linalg.norm(candidates, axis=1) * np.linalg.norm(true_vec)
        sims = np.divide(candidates @ true_vec, norms,
                         out=np.zeros(len(top_k_indices)), where=norms > 0)

        # Relevance vector (True if cosine >= threshold)
        relevant = sims >= threshold
        # Position discount 1 / log2(rank + 2) for 0-based ranks.
        discounts = 1.0 / np.log2(np.arange(len(relevant)) + 2)

        # DCG sums the discounts at the relevant positions; the ideal DCG puts
        # the same number of relevant items at the top of the list.
        dcg = discounts[relevant].sum()
        idcg = discounts[:int(relevant.sum())].sum()

        total_ndcg += dcg / idcg if idcg > 0 else 0.0

    return total_ndcg / n_samples



## LSTM Model

### LSTM Model Architecture for Movie Classification

This model combines sequential movie watch history with categorical metadata to classify the next movie a user is likely to watch.

The model architecture consists of the following components:

- **LSTM Layer:** Processes sequences of movie feature vectors representing user history. The final hidden state summarizes temporal patterns of watched movies.
- **Categorical Embeddings:** Production company and runtime category are embedded into lower-dimensional vectors using trainable `nn.Embedding` layers. These embeddings provide auxiliary context to the model.
- **Feature Fusion:** The final LSTM output is concatenated with the categorical embeddings to form a single feature vector for each user.
- **Feedforward Layers:** This fused representation passes through two linear layers with ReLU activation and dropout for regularization. The final output layer maps the hidden representation to a probability distribution over all movie classes.
- **Output:** A softmax-compatible output layer predicts the most likely next movie from the list of known movies in the training set.



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MovieClassifier(nn.Module):
    """LSTM over a movie-feature sequence, fused with two categorical embeddings.

    The last LSTM output is concatenated with a production-company embedding
    and a runtime-category embedding, passed through two ReLU layers with
    dropout, and projected to one logit per movie class.
    """

    def __init__(self, input_dim, num_movies, num_prods, num_runtimes,
                 emb_dim=16, hidden_dim=64, dropout_rate=0.5):
        """Create the layers.

        Args:
            input_dim: feature size of each sequence step.
            num_movies: number of output classes.
            num_prods: number of distinct production-company IDs.
            num_runtimes: number of distinct runtime-category IDs.
            emb_dim: size of each categorical embedding.
            hidden_dim: LSTM hidden size.
            dropout_rate: dropout applied to the fused vector before the first dense layer.
        """
        super().__init__()

        # Sequence encoder over the combined per-movie features
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)

        # Lookup tables for the two categorical inputs
        self.prod_emb = nn.Embedding(num_prods, emb_dim)
        self.runtime_emb = nn.Embedding(num_runtimes, emb_dim)

        # Fused vector = LSTM output + production embedding + runtime embedding
        fused_dim = hidden_dim + 2 * emb_dim

        # Feedforward head
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dense1 = nn.Linear(fused_dim, 128)
        self.dropout2 = nn.Dropout(0.3)
        self.concat_dense = nn.Linear(128, 128)

        # One logit per movie class
        self.output_layer = nn.Linear(128, num_movies)

    def forward(self, x, prod_ids, runtime_ids):
        """Return class logits for a batch.

        Args:
            x: sequence input, [batch_size, seq_len, input_dim].
            prod_ids: production-company ID per sample.
            runtime_ids: runtime-category ID per sample.
        """
        # Explicit all-zero initial hidden and cell states, created on x's device.
        state_shape = (1, x.size(0), self.lstm.hidden_size)
        h0 = torch.zeros(*state_shape, device=x.device)
        c0 = torch.zeros(*state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (h0, c0))
        last_step = sequence_out[:, -1]  # output at the final time step

        # Fuse the sequence summary with both categorical embeddings
        fused = torch.cat(
            (last_step, self.prod_emb(prod_ids), self.runtime_emb(runtime_ids)),
            dim=1,
        )

        # Dropout -> dense -> ReLU, twice, then the classification layer
        hidden = F.relu(self.dense1(self.dropout1(fused)))
        hidden = F.relu(self.concat_dense(self.dropout2(hidden)))
        return self.output_layer(hidden)


Training and validation process for the LSTM-based movie recommendation model with a softmax classification head. During each epoch, the model learns to predict the next movie from a user's watch sequence using categorical features such as production company and runtime.

In [None]:
def train_lstm_model(model, train_loader, val_loader, device, num_classes, epochs, lr):
    """Train `model` with Adam and cross-entropy, validating after every epoch.

    Each loader must yield (sequence, label, prod_id, runtime_id) batches.
    For every epoch the summed batch loss and the accuracy are printed for the
    training pass and then for the validation pass. `num_classes` is accepted
    but not used. Nothing is returned; `model` is updated in place.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    model.to(device)

    def _score_batch(batch):
        """Run one batch through the model; return (loss, number correct, batch size)."""
        inputs, targets, prod_ids, runtime_ids = (tensor.to(device) for tensor in batch)
        logits = model(inputs, prod_ids, runtime_ids)
        batch_loss = criterion(logits, targets)
        n_correct = (logits.argmax(dim=1) == targets).sum().item()
        return batch_loss, n_correct, targets.size(0)

    for epoch in range(epochs):
        # === Training pass ===
        model.train()
        total_loss, correct, total = 0, 0, 0

        for batch in train_loader:
            optimizer.zero_grad()
            batch_loss, n_correct, n_items = _score_batch(batch)
            batch_loss.backward()
            optimizer.step()

            total_loss += batch_loss.item()
            correct += n_correct
            total += n_items

        acc = correct / total
        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f} - Accuracy: {acc:.4f}")

        # === Validation pass (no gradient tracking, dropout disabled) ===
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                batch_loss, n_correct, n_items = _score_batch(batch)
                val_loss += batch_loss.item()
                val_correct += n_correct
                val_total += n_items

        val_acc = val_correct / val_total
        print(f"Validation Loss = {val_loss:.4f} | Val Acc = {val_acc:.4f}")


### Model Training

In [None]:
import numpy as np
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model dimensions derived from the prepared data
input_dim = combined_vector.shape[1]     # feature size of each sequence step
num_classes = movie_db_vectors.shape[0]  # one class per movie in the catalogue

# Embedding and hidden sizes are left at the class defaults.
model = MovieClassifier(input_dim, num_classes, num_prods, num_runtimes)

# 20 epochs of Adam at learning rate 0.001
train_lstm_model(
    model,
    train_loader,
    val_loader,
    device,
    num_classes,
    epochs=20,
    lr=0.001,
)





Epoch 1/20 - Loss: 605.4605 - Accuracy: 0.0348
Validation Loss = 122.6591 | Val Acc = 0.1359
Epoch 2/20 - Loss: 426.3916 - Accuracy: 0.1032
Validation Loss = 107.8969 | Val Acc = 0.2160
Epoch 3/20 - Loss: 367.2227 - Accuracy: 0.1827
Validation Loss = 94.3835 | Val Acc = 0.3145
Epoch 4/20 - Loss: 316.9558 - Accuracy: 0.2493
Validation Loss = 82.5455 | Val Acc = 0.4329
Epoch 5/20 - Loss: 277.8885 - Accuracy: 0.3095
Validation Loss = 74.5351 | Val Acc = 0.5401
Epoch 6/20 - Loss: 246.2767 - Accuracy: 0.3491
Validation Loss = 69.6137 | Val Acc = 0.6106
Epoch 7/20 - Loss: 221.2745 - Accuracy: 0.3874
Validation Loss = 65.9933 | Val Acc = 0.6890
Epoch 8/20 - Loss: 198.7829 - Accuracy: 0.4307
Validation Loss = 64.3018 | Val Acc = 0.7108
Epoch 9/20 - Loss: 188.1177 - Accuracy: 0.4475
Validation Loss = 62.5756 | Val Acc = 0.7274
Epoch 10/20 - Loss: 176.1982 - Accuracy: 0.4732
Validation Loss = 60.8072 | Val Acc = 0.7535
Epoch 11/20 - Loss: 169.2770 - Accuracy: 0.4885
Validation Loss = 60.1590 | V

### Evaluation

For the softmax-based LSTM classification model, we evaluate its ability to predict the next movie a user is likely to watch from a predefined set of movie labels seen during training. The model is trained to output a probability distribution over all known movie classes using a softmax layer, and the prediction corresponds to the class with the highest probability.

While classification accuracy serves as a direct measure of how often the model correctly identifies the next movie, this metric alone may not fully capture the quality of recommendations, especially in cases where semantically similar alternatives exist. To address this, we complement accuracy with ranking-based metrics: the top-K movies are taken from the predicted softmax probabilities, and each one is compared to the ground-truth movie using the cosine similarity between their embedding vectors.

In this extended evaluation, we calculate Precision@K, Recall@K, F1@K, and NDCG@K. These metrics help assess whether the model ranks relevant or similar movies highly within the top-K predictions, based on cosine similarity to the ground truth movie vector. This is particularly useful in recommendation settings, where offering suitable or thematically related content is often more important than identifying a single correct label.



In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch

def evaluate_model(model, test_loader, device, movie_db_vectors, k=10, threshold=0.8):
    """Score the softmax classifier on a loader and print/return its metrics.

    The model is run in eval mode over every batch. Top-1 accuracy comes from
    the arg-max of the logits; the stacked softmax probabilities are then
    handed to the cosine-based Precision@k / Recall@k / NDCG@k helpers.

    Args:
        model: module called as ``model(x, prod, runtime)`` that returns logits.
        test_loader: iterable of ``(x, y, prod, runtime)`` tensor batches.
        device: device the batch tensors are moved to before the forward pass.
        movie_db_vectors: movie vectors forwarded to the cosine metric helpers.
        k: cut-off used for the ranking metrics.
        threshold: cosine-similarity cut-off forwarded to the metric helpers.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, ndcg)``.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    prob_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in test_loader:
            # Each batch is (sequence, label, production id, runtime id).
            seq, target, prod, runtime = (t.to(device) for t in batch)

            logits = model(seq, prod, runtime)
            prob_chunks.append(torch.softmax(logits, dim=1).cpu().numpy())
            label_chunks.append(target.cpu().numpy())

            # Top-1 bookkeeping straight from the logits.
            n_correct += (logits.argmax(dim=1) == target).sum().item()
            n_seen += target.size(0)

    accuracy = n_correct / n_seen
    print(f"Accuracy: {accuracy:.4f}")

    # Cosine-based ranking metrics over the whole loader at once.
    y_probs = np.vstack(prob_chunks)
    y_true = np.concatenate(label_chunks)

    precision = precision_at_k_cosine(y_true, y_probs, movie_db_vectors, k, threshold)
    recall = recall_at_k_cosine(y_true, y_probs, movie_db_vectors, k, threshold)
    f1 = f1_at_k(precision, recall)
    ndcg = ndcg_cosine_at_k(y_true, y_probs, movie_db_vectors, k, threshold)

    for label, value in (("Precision", precision), ("Recall", recall), ("F1", f1), ("NDCG", ndcg)):
        print(f"{label}@{k}: {value:.4f}")

    return accuracy, precision, recall, f1, ndcg


In [None]:
# Same metric suite on both splits; the gap between them indicates overfitting.
print("\n--- Training Set Metrics ---")
evaluate_model(
    model=model,
    test_loader=train_loader,  # the parameter is named test_loader but accepts any loader
    device=device,
    movie_db_vectors=movie_db_vectors,
    k=10,
)

print("\n--- Test Set Metrics ---")
evaluate_model(
    model=model,
    test_loader=test_loader,
    device=device,
    movie_db_vectors=movie_db_vectors,
    k=10,
)


--- Training Set Metrics ---
Accuracy: 0.9103
Precision@10: 0.2855
Recall@10: 0.9937
F1@10: 0.4435
NDCG@10: 0.8746

--- Test Set Metrics ---
Accuracy: 0.8447
Precision@10: 0.2660
Recall@10: 0.9515
F1@10: 0.4158
NDCG@10: 0.8190


(0.8446601941747572,
 0.2660194174757282,
 0.9514563106796117,
 0.4157879252411701,
 0.8190156580641321)

### Preparing Movie Embedding Matrix for Top-K Recommendation

In [None]:

# Reverse lookup: class index -> movieId.
index_to_movie_id = {class_idx: mid for mid, class_idx in movie_id_to_idx.items()}

# Width of one full movie vector, read off the first row.
embedding_dim = len(df_small_ratings['combined_vector'].iloc[0])

# One row per class index; rows for movies that never appear stay all-zero.
movie_embedding_matrix = np.zeros((len(movie_id_list), embedding_dim))

# Every rating row writes its movie's vector into that movie's slot
# (later rows for the same movie simply overwrite earlier ones).
for row in df_small_ratings.itertuples():
    movie_embedding_matrix[movie_id_to_idx[row.movieId]] = row.combined_vector

# movieId -> title, used when printing recommendations.
movie_id_to_title = df_movies_and_keywords.set_index('movieId')['original_title'].to_dict()


### Generating Top K Movie Recommendations

In [None]:
import random
from sklearn.metrics.pairwise import cosine_similarity

# Collect softmax probabilities and labels for the whole test loader.
model.eval()
all_probs = []
all_true = []

with torch.no_grad():
    for x_batch, y_batch, prod_batch, runtime_batch in test_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        prod_batch = prod_batch.to(device)
        runtime_batch = runtime_batch.to(device)

        logits = model(x_batch, prod_batch, runtime_batch)
        probs = torch.softmax(logits, dim=1).cpu().numpy()
        all_probs.append(probs)
        all_true.append(y_batch.cpu().numpy())

# Flatten predictions and labels
y_probs = np.vstack(all_probs)
y_true = np.concatenate(all_true)

# Sample up to 10 random predictions. random.sample raises ValueError when
# asked for more items than exist, so cap the request at the dataset size.
sample_indices = random.sample(range(len(y_true)), min(10, len(y_true)))

for idx in sample_indices:
    true_idx = y_true[idx]
    true_movie_id = index_to_movie_id[true_idx]
    true_title = movie_id_to_title.get(true_movie_id, "Unknown")
    true_vec = movie_embedding_matrix[true_idx]

    # Top-10 predictions, highest probability first
    top_k = y_probs[idx].argsort()[-10:][::-1]
    top_movie_ids = [index_to_movie_id[i] for i in top_k]
    top_titles = [movie_id_to_title.get(mid, "Unknown") for mid in top_movie_ids]

    print(f"\n=== Sample {idx} ===")
    print(f"Ground Truth: {true_title}")
    print("Top-10 Predictions:")

    # Reuse the titles resolved above instead of looking each one up again.
    for rank, (pred_idx, pred_title) in enumerate(zip(top_k, top_titles), 1):
        pred_vec = movie_embedding_matrix[pred_idx]

        # Similarity of the predicted movie to the ground-truth movie.
        cos_sim = cosine_similarity([true_vec], [pred_vec])[0][0]
        print(f"{rank}. {pred_title} (cosine: {cos_sim:.3f})")



=== Sample 73 ===
Ground Truth: One Night at McCool's
Top-10 Predictions:
1. One Night at McCool's (cosine: 1.000)
2. The Killing (cosine: 0.852)
3. The Garden of Eden (cosine: 0.777)
4. Fever Pitch (cosine: 0.709)
5. Rush Hour (cosine: 0.800)
6. Taxi 4 (cosine: 0.853)
7. Bandyta (cosine: 0.779)
8. In the Mouth of Madness (cosine: 0.793)
9. While You Were Sleeping (cosine: 0.720)
10. La science des rêves (cosine: 0.771)

=== Sample 24 ===
Ground Truth: Back to the Future Part II
Top-10 Predictions:
1. Back to the Future Part II (cosine: 1.000)
2. Back to the Future (cosine: 0.934)
3. Back to the Future Part III (cosine: 0.868)
4. A Nightmare on Elm Street (cosine: 0.778)
5. Sommersturm (cosine: 0.797)
6. License to Wed (cosine: 0.721)
7. Cousin, Cousine (cosine: 0.738)
8. Tuya de hun shi (cosine: 0.768)
9. Meet the Parents (cosine: 0.831)
10. Say Anything... (cosine: 0.860)

=== Sample 58 ===
Ground Truth: Little Buddha
Top-10 Predictions:
1. The Grapes of Wrath (cosine: 0.748)
2. Lon

# LSTM with Triplet Margin Loss

## Preparing Train-Test Data

### Preparing Input Data for LSTM with Triplet Loss

To train our LSTM-based recommendation model with triplet loss, we structure each user’s watch history using a sliding window. For every user, we extract recent movie sequences of fixed length (`max_len`) as input, with the next movie serving as the positive example.

To train the model to distinguish relevant from irrelevant content, we also sample negative movies the user hasn't watched. These help the model learn to pull the user closer to the positive movie and push away the negatives in the embedding space.

We further enrich each sample with categorical features like production company and runtime category, which are embedded and fed alongside the sequence to improve context-aware learning.


#### Example Representation of Data

| userId | x_seq (prev 5 movies)    | pos_vec (next) | neg_vecs (5 sampled)   | prod_id | runtime_id |
|--------|---------------------------|----------------|-------------------------|---------|------------|
| 001    | [m1, m2, m3, m4, m5]      | m6             | [m20, m34, m50, m77, m9]| 22   | 1      |
| 002    | [m4, m5, m6, m7, m8]      | m9             | [m12, m14, m60, m88, m2]| 223  | 2     |

This preprocessing ensures that the model learns to position the user embedding closer to relevant (positive) movies and farther from irrelevant (negative) ones within the same embedding space.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

max_len = 5  # Number of previous movies to consider in the input sequence
neg_sample_size = 5  # Number of negative samples for each training example
min_required = max_len + 1  # Minimum interactions required per user
X_seq, pos_vecs, neg_vecs = [], [], []  # Lists to store training triplets
prod_ids_seq, runtime_ids_seq = [], []  # Store categorical features for each training sample

# Group the dataset by user (the same groupby object is reused below)
grouped = df_small_ratings.groupby('userId')

# Filter users who have enough interactions to form at least one sequence
filtered_users = [uid for uid, group in grouped if len(group) >= min_required]
filtered_user_set = set(filtered_users)  # O(1) membership test inside the loop

# Every movie that has a vector; the per-user negative pool is derived from this
all_vector_movie_ids = set(movie_id_to_vector.keys())

# Iterate through each user group
for uid, group in grouped:
    if uid not in filtered_user_set:
        continue  # Skip users with too few interactions

    group = group.sort_values('date_time')  # Sort interactions chronologically
    vectors = group['combined_vector'].tolist()  # List of movie embedding vectors
    movie_ids = group['movieId'].tolist()  # Corresponding movie IDs
    prod_ids = group['prod_id'].tolist()  # Production company IDs
    runtime_ids = group['runtime_id'].tolist()  # Runtime category IDs

    # Negative candidates = movies this user never watched. The pool depends only
    # on the user, so build it once per user rather than once per time step.
    watched_ids = set(movie_ids)
    available_neg_ids = list(all_vector_movie_ids - watched_ids)

    # Iterate through each time step starting from the second interaction
    for i in range(1, len(vectors)):
        start = max(0, i - max_len)  # Determine the window of past interactions
        x_seq = vectors[start:i]  # Extract past movie vectors
        x_padded = [np.zeros_like(x_seq[0])] * (max_len - len(x_seq)) + x_seq  # Left-pad to fixed length

        target_movie_id = movie_ids[i]  # The target movie at current time step
        pos_vec = movie_id_to_vector[target_movie_id]  # Positive sample (actual next movie vector)

        # Sample negative movie vectors (not watched by this user)
        sampled_neg_ids = np.random.choice(available_neg_ids, size=neg_sample_size, replace=False)
        sampled_neg_vecs = [movie_id_to_vector[mid] for mid in sampled_neg_ids]

        # Append training sample: input sequence, positive and negative vectors, categorical features
        X_seq.append(np.array(x_padded, dtype=np.float32))
        pos_vecs.append(pos_vec)
        neg_vecs.append(np.array(sampled_neg_vecs, dtype=np.float32))
        prod_ids_seq.append(prod_ids[i])
        runtime_ids_seq.append(runtime_ids[i])


### Splitting Train-Test Data
We split the data into training and testing sets (80/20) to ensure the model learns from historical user interactions while evaluating its performance on unseen sequences and associated movie features.

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 random split over the sliding-window samples. Passing all five lists in
# one call keeps them aligned row-for-row; random_state=42 pins the shuffle.
(
    X_train, X_test,
    pos_train, pos_test,
    neg_train, neg_test,
    prod_train, prod_test,
    runtime_train, runtime_test,
) = train_test_split(
    X_seq,
    pos_vecs,
    neg_vecs,
    prod_ids_seq,
    runtime_ids_seq,
    test_size=0.2,
    random_state=42,
)


### Creating Custom DataLoader
To prepare the data for training with triplet loss, we define a custom PyTorch dataset that returns five components for each sample: a padded sequence of movie embeddings representing a user’s viewing history, the embedding of the actual next movie watched (positive sample), several embeddings of movies the user has not seen (negative samples), and categorical identifiers for production company and runtime. These components are bundled into a `TripletDataset` and loaded into PyTorch `DataLoader`s to facilitate batch-wise training and evaluation. This format ensures that each training step has the necessary contrastive signals to teach the model which movies are more relevant to a user’s preferences.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class TripletDataset(Dataset):
    """Dataset of (history, positive, negatives, prod id, runtime id) samples.

    Args:
        X_seq: per-sample history sequences, each ``[seq_len, input_dim]``.
        pos_vecs: per-sample vector of the positive (next) movie.
        neg_vecs: per-sample stack of negative vectors,
            ``[neg_sample_size, embedding_dim]``.
        prod_ids: per-sample integer production-company id.
        runtime_ids: per-sample integer runtime-category id.

    All inputs may be lists of NumPy arrays / Python sequences or arrays; they
    are copied into tensors once at construction time.
    """

    def __init__(self, X_seq, pos_vecs, neg_vecs, prod_ids, runtime_ids):
        # Collapse each input into a single ndarray before handing it to torch:
        # torch.tensor() on a *list* of ndarrays goes through a slow
        # element-by-element path and emits a UserWarning about it.
        self.X_seq = torch.tensor(np.asarray(X_seq), dtype=torch.float32)
        self.pos_vecs = torch.tensor(np.asarray(pos_vecs), dtype=torch.float32)
        self.neg_vecs = torch.tensor(np.asarray(neg_vecs), dtype=torch.float32)
        self.prod_ids = torch.tensor(np.asarray(prod_ids), dtype=torch.long)
        self.runtime_ids = torch.tensor(np.asarray(runtime_ids), dtype=torch.long)

    def __len__(self):
        """Number of samples."""
        return len(self.X_seq)

    def __getitem__(self, idx):
        """Return the five components of sample ``idx`` as a tuple."""
        return (
            self.X_seq[idx],        # [seq_len, input_dim]
            self.pos_vecs[idx],     # [embedding_dim]
            self.neg_vecs[idx],     # [neg_sample_size, embedding_dim]
            self.prod_ids[idx],     # scalar (int)
            self.runtime_ids[idx],  # scalar (int)
        )


# Wrap each split in a TripletDataset.
train_dataset = TripletDataset(
    X_seq=X_train, pos_vecs=pos_train, neg_vecs=neg_train,
    prod_ids=prod_train, runtime_ids=runtime_train,
)
test_dataset = TripletDataset(
    X_seq=X_test, pos_vecs=pos_test, neg_vecs=neg_test,
    prod_ids=prod_test, runtime_ids=runtime_test,
)

# Shuffle only the training batches; keep the test order fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)



  self.X_seq = torch.tensor(X_seq, dtype=torch.float32)


## LSTM Triplet Loss Model

We use an **LSTM-based model** to capture the temporal sequence of user interactions and learn personalized user representations based on their past watched movies. Unlike matrix factorization or static models like NCF that assume user preferences are fixed, our approach acknowledges that user interests evolve over time.

The LSTM processes each user’s movie-watching history in chronological order, encoding not only the individual movie features but also the context and order of consumption. We enrich this with additional movie-level categorical features (production company, runtime category) using embedding layers to help the model capture nuanced user preferences.

By training the model using **triplet loss**, it learns to project each user sequence into an embedding space such that the next movie the user watched (positive sample) is closer to the user vector than randomly sampled movies (negative samples). This way, the model learns discriminative temporal representations that support more personalized and accurate ranking-based recommendations.



In [None]:
import torch.nn as nn
import torch

class UserLSTMEncoder(nn.Module):
    """Encode a watch-history sequence plus two categorical ids into one vector.

    The final LSTM hidden state is concatenated with learned embeddings of the
    production-company id and the runtime-category id, passed through dropout,
    and projected by a two-layer MLP (hidden width 128) to ``out_dim``.

    Args:
        input_dim: size of each vector in the input sequence.
        num_prods: number of rows in the production-company embedding table.
        num_runtimes: number of rows in the runtime-category embedding table.
        hidden_dim: LSTM hidden-state size.
        emb_dim: width of each categorical embedding.
        out_dim: size of the returned vector.
        dropout_rate: dropout probability applied to the concatenated features.
    """

    def __init__(self, input_dim, num_prods, num_runtimes, hidden_dim=64, emb_dim=16, out_dim=103, dropout_rate=0.3):
        super().__init__()

        # Sequence model over the history (batch dimension first).
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)

        # Learnable lookup tables for the two categorical inputs.
        self.prod_emb = nn.Embedding(num_prods, emb_dim)
        self.runtime_emb = nn.Embedding(num_runtimes, emb_dim)

        # Regularisation applied to the fused feature vector.
        self.dropout = nn.Dropout(dropout_rate)

        # Projection head: (hidden + 2 * emb) -> 128 -> out_dim.
        fused_dim = hidden_dim + 2 * emb_dim
        self.fc1 = nn.Linear(fused_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, out_dim)

    def forward(self, x_seq, prod_id, runtime_id):
        """Return a ``[batch, out_dim]`` vector for each input sequence."""
        # Only the final hidden state is used; it arrives as [1, batch, hidden_dim].
        _, (final_hidden, _) = self.lstm(x_seq)
        seq_features = final_hidden.squeeze(0)

        # Fuse sequence features with both categorical embeddings: [batch, fused_dim].
        fused = torch.cat(
            [seq_features, self.prod_emb(prod_id), self.runtime_emb(runtime_id)],
            dim=1,
        )

        # Dropout -> Linear -> ReLU -> Linear.
        hidden = self.relu(self.fc1(self.dropout(fused)))
        return self.fc2(hidden)


### Training with Triplet Margin Loss for LSTM Recommendation Model

The `train_triplet_lstm_model()` function trains an LSTM-based recommendation model using triplet margin loss. It learns user embeddings from watch history sequences and movie metadata (production company and runtime), encouraging the user vector to be closer to a positive movie than to a negative one by a defined margin.

During training, the LSTM processes sequences of movie embeddings, and categorical features are embedded and concatenated with the LSTM output. The triplet loss guides the model to learn a meaningful distance metric, penalizing cases where the positive is not sufficiently closer than the negative.

Training accuracy is tracked by how often the user vector is more similar to the positive than the negative movie, reflecting how well the model ranks relevant content. This ranking-based approach aligns with real-world recommendation tasks where relative preferences matter more than exact classification.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def train_triplet_lstm_model(model, train_loader, device, epochs, lr, margin=0.3):
    """Train ``model`` in place with a triplet margin loss and print per-epoch stats.

    For every batch the model output (anchor) is pushed closer to the positive
    movie vector than to the first sampled negative by at least ``margin``.
    After each epoch the sample-weighted mean loss is printed, together with
    the fraction of triplets whose anchor has a higher cosine similarity to
    the positive than to the negative.

    Args:
        model: module called as ``model(x_seq, prod_ids, runtime_ids)``.
        train_loader: iterable of
            ``(x_seq, pos_vec, neg_vecs, prod_ids, runtime_ids)`` batches, where
            ``neg_vecs`` is indexed as ``[batch, negative, dim]``.
        device: device the model and batches are moved to.
        epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
        margin: margin passed to ``nn.TripletMarginLoss``.

    Returns:
        None. The model is updated in place.
    """
    # Adam optimizer for efficient gradient updates
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Triplet margin loss: encourages anchor (user_vec) to be closer to pos_vec than neg_vec by at least the margin
    loss_fn = nn.TripletMarginLoss(margin=margin)

    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total_samples = 0
        correct = 0  # for monitoring similarity comparisons

        for x_seq, pos_vec, neg_vecs, prod_ids, runtime_ids in train_loader:
            x_seq = x_seq.to(device)
            pos_vec = pos_vec.to(device)
            neg_vec = neg_vecs[:, 0, :].to(device)  # Use only the first negative sample for training
            prod_ids = prod_ids.to(device)
            runtime_ids = runtime_ids.to(device)

            optimizer.zero_grad()

            # Forward pass through the LSTM encoder with categorical embeddings
            user_vec = model(x_seq, prod_ids, runtime_ids)

            # Compute triplet loss
            loss = loss_fn(user_vec, pos_vec, neg_vec)

            # Backpropagation
            loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by batch size so a smaller final batch
            # does not skew the epoch average.
            total_loss += loss.item() * x_seq.size(0)
            total_samples += x_seq.size(0)

            # --- Compute similarity-based "accuracy" ---
            # Monitoring only, so no autograd graph is needed here.
            with torch.no_grad():
                user_norm = F.normalize(user_vec, dim=1)
                pos_norm = F.normalize(pos_vec, dim=1)
                neg_norm = F.normalize(neg_vec, dim=1)

                # Cosine similarity = dot product of the normalized vectors
                sim_pos = (user_norm * pos_norm).sum(dim=1)
                sim_neg = (user_norm * neg_norm).sum(dim=1)

                # How often the positive is scored higher than the negative
                correct += (sim_pos > sim_neg).sum().item()

        if total_samples == 0:
            # An empty loader would otherwise divide by zero below.
            print(f"Epoch {epoch + 1}/{epochs} - no samples in train_loader")
            continue

        avg_loss = total_loss / total_samples
        accuracy = correct / total_samples

        print(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f} - Accuracy: {accuracy:.4f}")


### Model Training

Training accuracy is the fraction of training triplets for which cosine_similarity(user, positive) > cosine_similarity(user, negative), where the negative is the first of the sampled negatives for that example.

In [None]:
# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Encoder over the watch history plus the two categorical embeddings.
model = UserLSTMEncoder(
    combined_vector.shape[1],   # input_dim: width of each movie vector
    num_prods,                  # rows in the production-company embedding table
    num_runtimes,               # rows in the runtime-category embedding table
    hidden_dim=64,              # LSTM hidden size
    emb_dim=16,                 # width of each categorical embedding
    out_dim=103,                # width of the produced user vector
)
model = model.to(device)

# Fit with the triplet margin loss (default margin): 10 epochs, Adam at lr = 0.001.
train_triplet_lstm_model(model, train_loader, device, epochs=10, lr=0.001)


Epoch 1/10 - Loss: 0.0828 - Accuracy: 0.8903
Epoch 2/10 - Loss: 0.0599 - Accuracy: 0.9252
Epoch 3/10 - Loss: 0.0506 - Accuracy: 0.9402
Epoch 4/10 - Loss: 0.0448 - Accuracy: 0.9484
Epoch 5/10 - Loss: 0.0400 - Accuracy: 0.9548
Epoch 6/10 - Loss: 0.0370 - Accuracy: 0.9599
Epoch 7/10 - Loss: 0.0342 - Accuracy: 0.9624
Epoch 8/10 - Loss: 0.0317 - Accuracy: 0.9658
Epoch 9/10 - Loss: 0.0298 - Accuracy: 0.9680
Epoch 10/10 - Loss: 0.0289 - Accuracy: 0.9698


### Defining Evaluation Metrics Functions
In this recommendation setup, our goal is not to predict the exact next movie a user will watch, but rather to recommend **relevant** ones.

We modify standard metrics (Precision@K, Recall@K, F1, NDCG) to use **cosine similarity** instead of exact ID matching. This allows us to reward predictions that are semantically close to the ground truth movie—even if the IDs differ—making our evaluation more flexible and aligned with real-world recommendation goals.

In [None]:
import numpy as np

def _top_k_relevance(y_true, y_probs, movie_db_vectors, k, threshold):
    """Yield one boolean array per sample marking the relevant top-k predictions.

    Entry ``r`` of the yielded array is True when the movie ranked ``r`` (by
    descending score in ``y_probs[i]``) has cosine similarity >= ``threshold``
    to the ground-truth movie vector ``movie_db_vectors[y_true[i]]``. A vector
    with zero norm gets similarity 0 to everything.
    """
    for i in range(len(y_true)):
        true_vec = np.asarray(movie_db_vectors[y_true[i]], dtype=np.float64)
        top_k = np.asarray(y_probs[i]).argsort()[-k:][::-1]  # top-k indices, best first

        # Gather the candidate vectors one by one so any indexable container works.
        candidates = np.asarray([movie_db_vectors[j] for j in top_k], dtype=np.float64)
        candidates = candidates.reshape(len(top_k), -1)

        # All k similarities in one matrix-vector product instead of k library calls.
        true_norm = np.linalg.norm(true_vec)
        cand_norms = np.linalg.norm(candidates, axis=1)
        true_norm = true_norm if true_norm > 0 else 1.0
        cand_norms = np.where(cand_norms > 0, cand_norms, 1.0)
        sims = (candidates @ true_vec) / (cand_norms * true_norm)

        yield sims >= threshold


def precision_at_k_cosine(y_true, y_probs, movie_db_vectors, k=10, threshold=0.8):
    """Mean over samples of (number of relevant top-k predictions) / k.

    A prediction is relevant when its vector's cosine similarity to the
    ground-truth movie vector is at least ``threshold``.

    Args:
        y_true: ground-truth indices into ``movie_db_vectors``, one per sample.
        y_probs: per-sample score vectors; a higher score ranks earlier.
        movie_db_vectors: indexable collection of movie vectors.
        k: number of top predictions considered.
        threshold: minimum cosine similarity counted as a hit.

    Returns:
        Average precision@k as a float; 0.0 when there are no samples.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        return 0.0

    total = 0.0
    for relevant in _top_k_relevance(y_true, y_probs, movie_db_vectors, k, threshold):
        total += int(relevant.sum()) / k  # precision for this sample
    return total / n_samples


def recall_at_k_cosine(y_true, y_probs, movie_db_vectors, k=10, threshold=0.8):
    """Fraction of samples with at least one relevant movie in the top k.

    Arguments and the relevance rule are the same as for
    ``precision_at_k_cosine``.

    Returns:
        Hit rate as a float; 0.0 when there are no samples.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        return 0.0

    hits = 0
    for relevant in _top_k_relevance(y_true, y_probs, movie_db_vectors, k, threshold):
        if relevant.any():  # one match is enough to count the sample
            hits += 1
    return hits / n_samples


def f1_at_k(precision, recall):
    """Harmonic mean of ``precision`` and ``recall``; 0.0 when both are zero."""
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def dcg_at_k(ranked_list, true_label, k=10):
    """Discounted gain of the first occurrence of ``true_label`` in the top k.

    Returns ``1 / log2(position + 2)`` for the 0-based position of the first
    match among the first ``k`` entries of ``ranked_list``, or 0.0 if absent.
    """
    for i in range(min(k, len(ranked_list))):
        if ranked_list[i] == true_label:
            return 1 / np.log2(i + 2)  # offset by 2 so that rank 0 divides by log2(2) = 1
    return 0.0


def ndcg_cosine_at_k(y_true, y_probs, movie_db_vectors, k=10, threshold=0.8):
    """Mean NDCG@k with binary, cosine-thresholded relevance.

    For each sample the DCG of the top-k list is divided by the DCG of the
    same relevance labels sorted best-first, i.e. the ideal ordering of the
    retrieved items. Samples with no relevant item contribute 0.

    Arguments are the same as for ``precision_at_k_cosine``.

    Returns:
        Average NDCG@k as a float; 0.0 when there are no samples.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        return 0.0

    total_ndcg = 0.0
    for relevant in _top_k_relevance(y_true, y_probs, movie_db_vectors, k, threshold):
        n_relevant = int(relevant.sum())
        if n_relevant == 0:
            continue  # IDCG is 0, so the sample scores 0

        discounts = 1.0 / np.log2(np.arange(relevant.size) + 2)
        dcg = discounts[relevant].sum()
        idcg = discounts[:n_relevant].sum()  # all relevant items ranked first
        total_ndcg += float(dcg / idcg)

    return total_ndcg / n_samples


### Evaluation

In this evaluation, we assess the effectiveness of our LSTM-based model for movie recommendation by measuring how well it retrieves relevant movies, rather than predicting the exact next movie a user will watch. The model learns user representations based on their historical watch sequences and then ranks all movies in the database according to their similarity to the predicted user embedding.

To evaluate this, we use cosine similarity to compare the user embedding with every movie vector in the database, generating a ranked list of recommendations. We then locate the ground truth movie (the actual next movie the user watched) within this list. If the predicted ranking places the correct movie—or a semantically similar one—among the top results, the model is considered effective.

We report several metrics. Accuracy reflects how often the correct movie is ranked at the top. Precision@K and Recall@K evaluate whether the model retrieves relevant movies within the top K recommendations, where a movie counts as relevant when its cosine similarity to the ground-truth movie reaches the threshold. F1@K combines precision and recall to give a balanced measure, while NDCG@K accounts for the positions of the relevant movies within the top K, rewarding higher placements more heavily.

This evaluation design prioritizes semantic relevance over exact matches, aligning better with real-world recommendation goals where offering related or suitable content often matters more than predicting the precise item.


In [None]:
def evaluate_lstm_ranking_model(model, test_loader, movie_db_vectors, movie_id_to_vector, movie_id_list, device, k=10, threshold=0.8):
    """Rank every movie against each predicted user vector and report metrics.

    For each sample the model output is compared by cosine similarity with
    every movie vector (ordered as in ``movie_id_list``). The ground-truth
    index is recovered by finding the first movie whose vector matches the
    sample's positive vector within tolerance; samples with no match are
    skipped.

    Args:
        model: module called as ``model(x_seq, prod_ids, runtime_ids)``.
        test_loader: iterable of
            ``(x_seq, pos_vec, neg_vecs, prod_ids, runtime_ids)`` batches.
        movie_db_vectors: not used by this function; the movie matrix is
            rebuilt from ``movie_id_to_vector`` and ``movie_id_list``. Kept so
            existing calls keep working.
        movie_id_to_vector: mapping movieId -> movie vector.
        movie_id_list: ordered movieIds defining the row order of the matrix.
        device: device used for the forward pass.
        k: cut-off for the ranking metrics.
        threshold: cosine-similarity cut-off passed to the metric helpers.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``ndcg``.
    """
    model.eval()
    all_y_true, all_y_probs = [], []
    correct_count = 0

    # One row per movie, in movie_id_list order. Kept as a NumPy array because
    # the similarity computation and the metrics below all run on the CPU.
    movie_db_matrix = np.stack([np.array(movie_id_to_vector[mid], dtype=np.float32) for mid in movie_id_list])

    with torch.no_grad():
        for x_seq_batch, pos_vec_batch, _, prod_batch, runtime_batch in test_loader:
            # Move model inputs to the same device as the model
            x_seq_batch = x_seq_batch.to(device)
            prod_batch = prod_batch.to(device)
            runtime_batch = runtime_batch.to(device)

            # Forward pass to get the user representation
            user_vec_batch = model(x_seq_batch, prod_batch, runtime_batch)

            # Cosine similarity between each user vector and every movie in the DB
            sims = cosine_similarity(user_vec_batch.cpu().numpy(), movie_db_matrix)

            true_vecs = pos_vec_batch.cpu().numpy()
            for i, true_vec in enumerate(true_vecs):
                # Vectorized form of np.allclose(movie_vec, true_vec, atol=1e-5)
                # applied to every movie row at once; the first match wins.
                is_match = np.isclose(movie_db_matrix, true_vec, atol=1e-5).all(axis=1)
                if not is_match.any():
                    continue  # ground truth not found in the DB: skip the sample
                matched_index = int(is_match.argmax())

                all_y_true.append(matched_index)
                all_y_probs.append(sims[i])

                # Accuracy = how often the top-1 prediction is correct
                if np.argmax(sims[i]) == matched_index:
                    correct_count += 1

    total_count = len(all_y_true)

    if total_count == 0:
        # Nothing could be matched (or the loader was empty): report zeros
        # instead of dividing by zero inside the metric helpers.
        accuracy = precision = recall = f1 = ndcg = 0.0
    else:
        # Convert accumulated predictions and labels to arrays
        all_y_probs = np.array(all_y_probs)
        all_y_true = np.array(all_y_true)

        # Compute metrics using cosine-based definitions
        precision = precision_at_k_cosine(all_y_true, all_y_probs, movie_db_matrix, k=k, threshold=threshold)
        recall = recall_at_k_cosine(all_y_true, all_y_probs, movie_db_matrix, k=k, threshold=threshold)
        f1 = f1_at_k(precision, recall)
        ndcg = ndcg_cosine_at_k(all_y_true, all_y_probs, movie_db_matrix, k=k, threshold=threshold)
        accuracy = correct_count / total_count

    # Output results
    print(f"\n--- Evaluation Results ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision@{k}: {precision:.4f}")
    print(f"Recall@{k}: {recall:.4f}")
    print(f"F1@{k}: {f1:.4f}")
    print(f"NDCG@{k}: {ndcg:.4f}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "ndcg": ndcg
    }


In [None]:
# Ranking metrics on the training split (train_loader fills the test_loader slot).
train_metrics = evaluate_lstm_ranking_model(
    model,
    train_loader,
    movie_db_vectors,
    movie_id_to_vector,
    movie_id_list,
    device,
    k=10,
)
print("\n--- Train Set Metrics ---")
print(train_metrics)


--- Evaluation Results ---
Accuracy: 0.0527
Precision@10: 0.4922
Recall@10: 0.8467
F1@10: 0.6226
NDCG@10: 0.6757

--- Train Set Metrics ---
{'accuracy': 0.05271182133882944, 'precision': 0.4922386289844462, 'recall': 0.8467206866371156, 'f1': 0.6225560777842516, 'ndcg': 0.6756728779756033}


In [None]:
# Ranking metrics on the held-out split.
metrics = evaluate_lstm_ranking_model(
    model,
    test_loader,
    movie_db_vectors,       # full matrix of movie vectors
    movie_id_to_vector,     # movieId -> vector
    movie_id_list,          # ordered list of all movieIds
    device,
    k=10,                   # top-k cut-off
)
print("\n--- Test Set Metrics ---")
print(metrics)



--- Evaluation Results ---
Accuracy: 0.0519
Precision@10: 0.4776
Recall@10: 0.8469
F1@10: 0.6108
NDCG@10: 0.6674

--- Test Set Metrics ---
{'accuracy': 0.05194805194805195, 'precision': 0.47761716544324984, 'recall': 0.8468661773009599, 'f1': 0.6107707210182102, 'ndcg': 0.6673899106824986}


## Generating Top-K Movies Recommendations

In [None]:
import random

def display_top_k_recommendations(model, test_loader, movie_id_to_vector, movie_id_to_title, movie_id_list, device, k=10, num_samples=5, seed=None):
    """Print the top-k recommended titles for a few randomly chosen samples.

    Every example yielded by ``test_loader`` is collected, ``num_samples`` of
    them are drawn at random, and for each one the model output is ranked
    against every movie in ``movie_id_list`` by cosine similarity. The k
    best-ranked titles are printed next to the ground-truth title.

    Note: the value printed beside each prediction is the cosine similarity
    between the *ground-truth* movie vector and the predicted movie vector,
    not the score used for ranking, so it is not necessarily decreasing.

    Args:
        model: Callable invoked as ``model(x_seq, prod_id, runtime_id)`` on
            single-example batches; it is switched to eval mode.
        test_loader: Iterable of 5-tuples ``(x_seq, pos_vec, _, prod,
            runtime)``; the third element is ignored.
        movie_id_to_vector: Mapping movieId -> vector. Used both to build the
            candidate matrix and to recover the ground-truth movieId.
        movie_id_to_title: Mapping movieId -> title; missing ids are shown as
            "Unknown".
        movie_id_list: Ordered movieIds defining the rows of the candidate
            matrix.
        device: Device the model inputs are moved to.
        k: Number of predictions printed per sample.
        num_samples: Maximum number of samples to print (values below zero
            are treated as zero).
        seed: Optional seed for a private random generator so the same
            samples are drawn on every run. ``None`` (default) keeps using
            the global ``random`` state, as before.

    Returns:
        None. Results are written to stdout.
    """
    model.eval()

    # One row per entry of movie_id_list. The ranking below is computed on host
    # arrays, so the matrix stays in NumPy instead of being copied to `device`
    # and pulled back for every sample.
    movie_db_matrix = np.stack([np.array(movie_id_to_vector[mid], dtype=np.float32) for mid in movie_id_list])

    # Flatten the batches into per-example tuples so individual examples can be sampled.
    all_inputs = []
    for x_seq_batch, pos_vec_batch, _, prod_batch, runtime_batch in test_loader:
        all_inputs.extend(zip(x_seq_batch, pos_vec_batch, prod_batch, runtime_batch))

    # Clamp so a negative num_samples yields no samples rather than a ValueError.
    sample_count = max(0, min(num_samples, len(all_inputs)))
    rng = random if seed is None else random.Random(seed)
    sampled_inputs = rng.sample(all_inputs, sample_count)

    with torch.no_grad():
        for idx, (x_seq, pos_vec, prod_id, runtime_id) in enumerate(sampled_inputs):
            # The model is fed a batch of size one.
            user_vec = model(
                x_seq.unsqueeze(0).to(device),
                prod_id.unsqueeze(0).to(device),
                runtime_id.unsqueeze(0).to(device),
            )
            # NOTE(review): cosine_similarity is defined elsewhere in the notebook
            # (presumably sklearn.metrics.pairwise.cosine_similarity) - confirm.
            sims = cosine_similarity(user_vec.detach().cpu().numpy(), movie_db_matrix)[0]

            # Recover the ground-truth movieId by looking for a stored vector
            # that equals the target vector within tolerance (first match wins).
            true_vec = pos_vec.cpu().numpy()
            matched_movie_id = next(
                (
                    mid
                    for mid, vec in movie_id_to_vector.items()
                    if np.allclose(vec, true_vec, atol=1e-5)
                ),
                None,
            )

            if matched_movie_id is None:
                # Say so explicitly; otherwise the sample numbering has unexplained gaps.
                print(f"\n=== Sample {idx + 1} ===")
                print("Ground truth vector not found in movie_id_to_vector; skipping sample.")
                continue

            ground_truth_title = movie_id_to_title.get(matched_movie_id, "Unknown")
            top_k_indices = sims.argsort()[::-1][:k]  # highest model score first

            print(f"\n=== Sample {idx + 1} ===")
            print(f"Ground Truth: {ground_truth_title}")
            print(f"Top-{k} Predictions:")
            for rank, top_idx in enumerate(top_k_indices, 1):
                pred_mid = movie_id_list[top_idx]
                pred_title = movie_id_to_title.get(pred_mid, "Unknown")
                # Similarity of this prediction to the ground-truth movie (not the ranking score).
                cos_sim = cosine_similarity([true_vec], [movie_db_matrix[top_idx]])[0][0]
                print(f"{rank}. {pred_title} (cosine similarity: {cos_sim:.4f})")


In [None]:
# Lookup from movieId to original title, used to label the printed recommendations.
movie_id_to_title = (
    df_movies_and_keywords
    .set_index("movieId")["original_title"]
    .to_dict()
)

# Print recommendations for a handful of test samples
# (`num_samples` controls how many are shown).
display_top_k_recommendations(
    model=model,
    device=device,
    test_loader=test_loader,
    movie_id_list=movie_id_list,
    movie_id_to_vector=movie_id_to_vector,
    movie_id_to_title=movie_id_to_title,
    k=10,
    num_samples=5,
)



=== Sample 1 ===
Ground Truth: 48 Hrs.
Top-10 Predictions:
1. Strangers on a Train (cosine similarity: 0.8226)
2. Stagecoach (cosine similarity: 0.8388)
3. Back to the Future (cosine similarity: 0.8177)
4. Return of the Jedi (cosine similarity: 0.7758)
5. The Gold Rush (cosine similarity: 0.7819)
6. The Sea Wolf (cosine similarity: 0.7675)
7. Outland (cosine similarity: 0.7308)
8. Steamboat Bill, Jr. (cosine similarity: 0.8294)
9. Reservoir Dogs (cosine similarity: 0.8245)
10. Marnie (cosine similarity: 0.8479)

=== Sample 2 ===
Ground Truth: Stellet Licht
Top-10 Predictions:
1. 生きる (cosine similarity: 0.8791)
2. Close Encounters of the Third Kind (cosine similarity: 0.7568)
3. City of Angels (cosine similarity: 0.8088)
4. The Hours (cosine similarity: 0.9044)
5. A Beautiful Mind (cosine similarity: 0.7878)
6. The Notebook (cosine similarity: 0.8299)
7. City Lights (cosine similarity: 0.7921)
8. Airplane! (cosine similarity: 0.7096)
9. Vertigo (cosine similarity: 0.8268)
10. Short Cut