<a href="https://colab.research.google.com/github/M-Amrollahi/RecSys-movieLens/blob/main/recom_sys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download movielens dataset

In [3]:
# Create data directory if it doesn't exist
#!mkdir -p data

# Download the dataset to the data folder
#!wget -nv https://files.grouplens.org/datasets/movielens/ml-1m.zip -P data

# Unzip the file inside the data folder
#!unzip -qo data/ml-1m.zip -d data

/bin/bash: /home/martin/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
2024-12-14 11:20:29 URL:https://files.grouplens.org/datasets/movielens/ml-1m.zip [5917549/5917549] -> "ml-1m.zip" [1]
/bin/bash: /home/martin/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)


## import packages

In [4]:
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import top_k_accuracy_score
import numpy as np
from pprint import pprint
from tqdm.notebook import tqdm

from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F

from sentence_transformers import SentenceTransformer

import faiss

import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = 100

## Read datasets and prepare to join dataframes 

In [5]:
# Read the data files with the correct separator and column names
df_movies = pd.read_csv("data/ml-1m/movies.dat", 
                       sep="::", 
                       header=None, 
                       names=['item_id', 'title', 'genres'],
                       encoding='latin-1',
                       engine='python')

# Extract unique genres
all_genres = set()
for genres in df_movies['genres'].str.split('|'):
    all_genres.update(genres)

# Replace "Children's" with "Children" to match format
all_genres = {genre.replace("Children's", "Children") for genre in all_genres}
# Replace "Film-Noir" with "Film_Noir" to match format
all_genres = {genre.replace("Film-Noir", "Film_Noir") for genre in all_genres}

df_ratings = pd.read_csv("data/ml-1m/ratings.dat", 
                        sep="::", 
                        header=None, 
                        names=['user_id', 'item_id', 'rating', 'timestamp'],
                        encoding='latin-1',
                        engine='python')

df_users = pd.read_csv("data/ml-1m/users.dat", 
                      sep="::", 
                      header=None, 
                      names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
                      encoding='latin-1',
                      engine='python')

# Create genre columns, including 'unknown'
genres = sorted(list(all_genres))
genres.append('unknown')  # Add 'unknown' to the genre list
genre_columns = {genre: [] for genre in genres}

# Fill genre columns, handling missing genres
for _, row in df_movies.iterrows():
    movie_genres = row['genres'].split('|')
    # Replace genre names to match format
    movie_genres = [g.replace("Children's", "Children").replace("Film-Noir", "Film_Noir") for g in movie_genres]
    
    if not movie_genres:  # Check if movie_genres is empty
        for genre in genres:
            genre_columns[genre].append(1 if genre == 'unknown' else 0)
    else:
        for genre in genres:
            genre_columns[genre].append(1 if genre in movie_genres else 0)

# Add genre columns to movies dataframe
for genre in genres:
    df_movies[genre] = genre_columns[genre]

# Drop the original genres column
df_movies = df_movies.drop('genres', axis=1)

# Create gender dummy variables
df_users = pd.get_dummies(df_users, prefix=['gen'], columns=['gender'])

# Merge all dataframes
df_combined = pd.merge(df_ratings, df_users, on='user_id', how='inner')
df_combined = pd.merge(df_combined, df_movies, on='item_id', how='inner')


# Reorder columns to match desired format
column_order = ['user_id', 'item_id', 'rating', 'timestamp', 'age', 'occupation', 
                'zip_code', 'title'] + genres + ['gen_F', 'gen_M']

df_combined = df_combined[column_order]

In [6]:
df_combined.head()

Unnamed: 0,user_id,item_id,rating,timestamp,age,occupation,zip_code,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,unknown,gen_F,gen_M
0,1,1193,5,978300760,1,10,48067,One Flew Over the Cuckoo's Nest (1975),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,True,False
1,1,661,3,978302109,1,10,48067,James and the Giant Peach (1996),0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,True,False
2,1,914,3,978301968,1,10,48067,My Fair Lady (1964),0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,True,False
3,1,3408,4,978300275,1,10,48067,Erin Brockovich (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,True,False
4,1,2355,5,978824291,1,10,48067,"Bug's Life, A (1998)",0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,True,False


## Encode the gender column to one-hot

## normalize the age and timestamp

In [7]:
model_norm = StandardScaler()
df_combined[["age","timestamp"]] = model_norm.fit_transform(df_combined[["age", "timestamp"]])

## smooth cleaning on movie title

In [8]:
df_combined["title"] = df_combined["title"].str.lower().str.replace(r"[\:\&\,]","")

In [9]:
df_combined["user_id"] = df_combined["user_id"] - 1
df_combined["item_id"] = df_combined["item_id"] - 1

## Preparing negative samples
For simplicity, we consider each record in dataset as positive and each potential record as negative.\
Then we create the y as target and name it "like" and label it as 0 or 1.

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import torch
from torch.utils.data import Dataset, DataLoader


NEGATIVE_SAMPLES_PER_USER = 80
def preprocess_title(title):
    """
    Preprocess the movie title by converting to lowercase and removing specific punctuation.
    """
    if pd.isna(title):
        raise ValueError(f"Title is NaN: {title}")
    return title.lower().replace(':', '').replace('&', '').replace(',', '')

def generate_negative_samples(df_source, user_ids, all_item_ids, user_positive_items, df_movies, df_users, sample_size=NEGATIVE_SAMPLES_PER_USER):
    """
    Generate negative samples with complete item and user information.

    Args:
        df_source (pd.DataFrame): The source DataFrame containing user-item interactions.
        user_ids (list): List of unique user IDs.
        all_item_ids (list): List of all unique item IDs.
        user_positive_items (dict): Dictionary mapping user IDs to sets of positive item IDs.
        df_movies (pd.DataFrame): DataFrame containing movie information.
        df_users (pd.DataFrame): DataFrame containing user information.
        sample_size (int): Number of negative samples to generate per user.

    Returns:
        pd.DataFrame: DataFrame containing negative samples with item and user information.
    """
    neg_samples = {
        'user_id': [],
        'item_id': []
    }

    for user in user_ids:
        positive_items = user_positive_items.get(user, set())
        negative_candidates = np.setdiff1d(all_item_ids, list(positive_items))

        if len(negative_candidates) == 0:
            print(f"User {user} has interacted with all items. Skipping negative sampling.")
            continue

        if len(negative_candidates) >= sample_size:
            sampled_items = np.random.choice(negative_candidates, size=sample_size, replace=False)
        else:
            sampled_items = np.random.choice(negative_candidates, size=sample_size, replace=True)

        neg_samples['user_id'].extend([user] * len(sampled_items))
        neg_samples['item_id'].extend(sampled_items)

    # Create DataFrame with negative samples
    df_neg = pd.DataFrame(neg_samples)

    # Merge with users data to get user features using an inner join to ensure all user_ids are valid
    df_neg = df_neg.merge(df_users, on='user_id', how='inner')

    # Merge with movies data to get item features
    df_neg = df_neg.merge(df_movies, on='item_id', how='left')  # Assuming all item_ids are present

    # Assign default values for negative samples
    df_neg['rating'] = 0  # Assuming 0 indicates no rating
    df_neg['timestamp'] = df_source['timestamp'].max() + 1  # Assign a timestamp after the last interaction
    df_neg['like'] = 0  # Negative samples have 'like' as 0

    # Identify all columns that need to be present
    required_columns = df_source.columns.tolist()

    # Fill missing columns with default values
    for col in required_columns:
        if col not in df_neg.columns:
            if col in ['age', 'occupation', 'zip_code']:
                df_neg[col] = 0  # Assign a default integer value
            elif col in ['gen_F', 'gen_M']:
                df_neg[col] = False  # Assign a default boolean value
            else:
                df_neg[col] = 0  # Assign a generic default value
        else:
            # For user-specific columns, ensure no NaNs
            if col in ['age', 'occupation', 'zip_code']:
                df_neg[col] = df_neg[col].fillna(0)
            elif col in ['gen_F', 'gen_M']:
                df_neg[col] = df_neg[col].fillna(False)

    return df_neg
df_combined['like'] = (df_combined['rating'] >= 4).astype(int)

# Step 1: Split data
print("Splitting data into training and testing sets...")
df_train, df_test = train_test_split(df_combined, train_size=0.8, random_state=42, shuffle=True)
print(f"Training set size: {df_train.shape}")
print(f"Testing set size: {df_test.shape}")
print(df_train.isnull().sum())

# Step 2: Create user-item interaction dictionary
print("\nCreating user-item interaction dictionary...")
user_positive_items = df_combined.groupby('user_id')['item_id'].apply(set).to_dict()
print(f"Number of users with interactions: {len(user_positive_items)}")

# Step 3: Get all unique item IDs
print("\nRetrieving all unique item IDs...")
all_item_ids = df_combined['item_id'].unique()
print(f"Total unique items: {len(all_item_ids)}")
# Verify that all item_ids exist in df_movies
missing_item_ids = set(all_item_ids) - set(df_movies['item_id'])
if missing_item_ids:
    print(f"Warning: {len(missing_item_ids)} item_id(s) are missing in df_movies.")
    # Optionally, remove these item_ids from all_item_ids to prevent sampling them
    all_item_ids = np.array(list(set(all_item_ids) - missing_item_ids))
    print(f"Updated total unique items after removing missing item_ids: {len(all_item_ids)}")
else:
    print("All item_ids are present in df_movies.")
# Step 4: Generate negative samples for training and testing
print("\nGenerating negative samples for training set...")
df_neg_train = generate_negative_samples(
    df_source=df_combined,
    user_ids=df_train['user_id'].unique(),
    all_item_ids=all_item_ids,
    user_positive_items=user_positive_items,
    df_movies=df_movies,
    df_users=df_users,
    sample_size=NEGATIVE_SAMPLES_PER_USER
)

print("\nGenerating negative samples for testing set...")
df_neg_test = generate_negative_samples(
    df_source=df_combined,
    user_ids=df_test['user_id'].unique(),
    all_item_ids=all_item_ids,
    user_positive_items=user_positive_items,
    df_movies=df_movies,
    df_users=df_users,
    sample_size=NEGATIVE_SAMPLES_PER_USER
)

# Handle missing titles after negative sampling
#First count the number of missing titles
print("\nHandling missing titles...")
print("Missing titles in training negatives:", df_neg_train['title'].isna().sum())
print("Missing titles in testing negatives:", df_neg_test['title'].isna().sum())
# print the missing titles in the normal dataset
print("Missing titles in normal dataset:", df_combined['title'].isna().sum())
print("Missing titles handled.")

# Step 5: Set labels
print("\nSetting labels for positive and negative samples...")
num_nan_like = df_combined['like'].isna().sum()
print(f"Number of NaNs in 'like' after assignment: {num_nan_like}")
df_neg_train['like'] = 0
df_neg_test['like'] = 0

# Step 6: Combine positive and negative samples
print("\nCombining positive and negative samples for training set...")
df_train = pd.concat([df_train, df_neg_train], axis=0).reset_index(drop=True)
print(f"Combined training set size: {df_train.shape}")

print("\nCombining positive and negative samples for testing set...")
df_test = pd.concat([df_test, df_neg_test], axis=0).reset_index(drop=True)
print(f"Combined testing set size: {df_test.shape}")

# Apply title preprocessing to df_train and df_test after combining
df_train["title"] = df_train["title"].apply(preprocess_title)
df_test["title"] = df_test["title"].apply(preprocess_title)

# Step 7: Create and apply item_id mapping
print("\nCreating and applying item_id mapping...")
unique_item_ids = df_combined['item_id'].unique()
item_id_mapping = {original_id: new_id for new_id, original_id in enumerate(unique_item_ids)}

# Apply mapping to all DataFrames
for df_name, df in zip(['df_combined', 'df_train', 'df_test'], [df_combined, df_train, df_test]):
    print(f"\nMapping 'item_id' for {df_name}...")
    df['item_id'] = df['item_id'].map(item_id_mapping)
    missing_items = df['item_id'].isna().sum()
    if missing_items > 0:
        print(f"Warning: {df_name} has {missing_items} missing 'item_id'(s) after mapping.")
    else:
        print(f"{df_name} has no missing 'item_id'(s) after mapping.")

# Step 8: Drop any rows with missing 'item_id's
print("\nDropping rows with missing 'item_id's in training set...")
initial_train_size = df_train.shape[0]
df_train = df_train.dropna(subset=['item_id']).reset_index(drop=True)
final_train_size = df_train.shape[0]
print(f"Dropped {initial_train_size - final_train_size} rows from training set.")

print("\nDropping rows with missing 'item_id's in testing set...")
initial_test_size = df_test.shape[0]
df_test = df_test.dropna(subset=['item_id']).reset_index(drop=True)
final_test_size = df_test.shape[0]
print(f"Dropped {initial_test_size - final_test_size} rows from testing set.")

# Step 9: Convert 'item_id' to integer type
print("\nConverting 'item_id' to integer type...")
for df_name, df in zip(['df_combined', 'df_train', 'df_test'], [df_combined, df_train, df_test]):
    df['item_id'] = df['item_id'].astype(int)
    print(f"'item_id' for {df_name} converted to {df['item_id'].dtype}.")

# Step 10: Shuffle the final datasets
print("\nShuffling the training and testing datasets...")
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
print("Shuffling completed.")

# Step 11: Verify the results
print("\nVerifying the results...")
print("\nTraining set shape:", df_train.shape)
print("Testing set shape:", df_test.shape)

print("\nTraining set 'like' distribution:")
print(df_train['like'].value_counts(normalize=True))

print("\nTesting set 'like' distribution:")
print(df_test['like'].value_counts(normalize=True))

# Step 12: Verify data integrity
print("\nChecking for missing values in training set:")
print(df_train.isnull().sum())

print("\nChecking for missing values in testing set:")
print(df_test.isnull().sum())

Splitting data into training and testing sets...
Training set size: (800167, 30)
Testing set size: (200042, 30)
user_id        0
item_id        0
rating         0
timestamp      0
age            0
occupation     0
zip_code       0
title          0
Action         0
Adventure      0
Animation      0
Children       0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film_Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
unknown        0
gen_F          0
gen_M          0
like           0
dtype: int64

Creating user-item interaction dictionary...
Number of users with interactions: 6040

Retrieving all unique item IDs...
Total unique items: 3706
Updated total unique items after removing missing item_ids: 3648

Generating negative samples for training set...

Generating negative samples for testing set...

Handling missing titles...
Missing titles in training ne

In [11]:
# Verify 'like' column in training set
print("NaNs in Training 'like' column:", df_train['like'].isna().sum())
print(df_train['like'].head())

# Verify 'like' column in testing set
print("NaNs in Testing 'like' column:", df_test['like'].isna().sum())
print(df_test['like'].head())

NaNs in Training 'like' column: 0
0    0
1    0
2    1
3    0
4    0
Name: like, dtype: int64
NaNs in Testing 'like' column: 0
0    0
1    0
2    0
3    0
4    0
Name: like, dtype: int64


In [12]:
# Check for NaNs in 'rating'
num_nan_ratings = df_combined['rating'].isna().sum()
print(f"Number of NaNs in 'rating': {num_nan_ratings}")

# If there are NaNs, decide to either drop them or fill them
if num_nan_ratings > 0:
    # Option 1: Drop rows with NaN ratings
    df_combined = df_combined.dropna(subset=['rating'])
    print(f"Dropped {num_nan_ratings} rows with NaN ratings.")
    
    # Option 2: Fill NaNs with a default rating (e.g., median rating)
    # median_rating = df_combined['rating'].median()
    # df_combined['rating'] = df_combined['rating'].fillna(median_rating)
    # print("Filled NaNs in 'rating' with median value.")

Number of NaNs in 'rating': 0


In [13]:
df_test = df_test.reset_index(drop=True )
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)


df_train.sample(3)

Unnamed: 0,user_id,item_id,rating,timestamp,age,occupation,zip_code,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,unknown,gen_F,gen_M,like
394122,418,352,4,5.978712,-0.998837,3,55422,austin powers the spy who shagged me (1999),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,False,True,1
997940,4073,911,0,7.106609,25.0,6,70003,i know what you did last summer (1997),0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,False,True,0
1136841,5486,2175,0,7.106609,25.0,0,17821,guilty as sin (1993),0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,False,True,0


In [14]:
# Check for NaNs in item_id
for name, df in zip(['df_combined', 'df_train', 'df_test', 'df_neg_train', 'df_neg_test'],
                    [df_combined, df_train, df_test, df_neg_train, df_neg_test]):
    missing = df['item_id'].isna().sum()
    if missing > 0:
        print(f"{name} has {missing} missing item_id(s).")
    else:
        print(f"{name} has no missing item_id(s).")

df_combined has no missing item_id(s).
df_train has no missing item_id(s).
df_test has no missing item_id(s).
df_neg_train has no missing item_id(s).
df_neg_test has no missing item_id(s).


In [15]:
   # After generating negative samples
print("Missing titles in training negatives:", df_neg_train['title'].isna().sum())
print("Missing titles in testing negatives:", df_neg_test['title'].isna().sum())

Missing titles in training negatives: 0
Missing titles in testing negatives: 0


In [16]:
print(df_train.columns)

Index(['user_id', 'item_id', 'rating', 'timestamp', 'age', 'occupation',
       'zip_code', 'title', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western', 'unknown', 'gen_F', 'gen_M', 'like'],
      dtype='object')


In [17]:
df_train.rating.value_counts()


rating
0    483120
4    279323
3    209081
5    180796
2     86194
1     44773
Name: count, dtype: int64

## Encode the title of the movie with SentenceTransformer and save then in a dict to be used next

In [18]:
from sentence_transformers import SentenceTransformer

# Initialize the Sentence Transformer model
model_strans = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the dictionary to store embeddings
dict_titlEmb = {}

# Combine unique titles from both training and testing sets
all_unique_titles = pd.concat([df_train, df_test])["title"].unique()

# Encode each unique title and store it in the dictionary
for title in all_unique_titles:
    dict_titlEmb[title] = torch.tensor(model_strans.encode(title))

# Verify that all titles are embedded
combined_unique_titles = set(df_train['title'].unique()).union(set(df_test['title'].unique()))
missing_titles = combined_unique_titles - set(dict_titlEmb.keys())

if missing_titles:
    print(f"Number of missing titles in dict_titlEmb: {len(missing_titles)}")
    print("Sample of missing titles:", list(missing_titles)[:10])
    # Optionally, handle missing titles
    for title in missing_titles:
        dict_titlEmb[title] = torch.tensor(model_strans.encode(title))
else:
    print("All titles are successfully embedded.")

All titles are successfully embedded.


# Deep model in PyTorch

In [19]:
import torch
from torch.utils.data import Dataset

class cls_dataset(Dataset):
    def __init__(self, data, user_count, item_count, title_emb_tensor, title_to_idx):
        super().__init__()
        self.m_data = data.reset_index(drop=True)
        self.user_count = user_count
        self.item_count = item_count
        self.title_emb_tensor = title_emb_tensor  # Keep on CPU
        self.title_to_idx = title_to_idx

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return len(self.m_data)

    def __getitem__(self, index):
        """Retrieve a single sample from the dataset."""
        user_id = self.m_data.at[index, "user_id"]
        item_id = self.m_data.at[index, "item_id"]

        # User features
        user_features = torch.tensor(
            self.m_data.iloc[index][["user_id", "age", "occupation"]].values.astype(np.float32),
            dtype=torch.float32
        )
        user_features = torch.cat((
            user_features,
            torch.tensor(
                [self.m_data.at[index, "gen_F"], self.m_data.at[index, "gen_M"]],
                dtype=torch.float32
            )
        ))

        # Item features
        item_features = torch.tensor(
            self.m_data.iloc[index][[
                "item_id", "timestamp", "unknown", "Action", "Adventure", "Animation", "Children",
                "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film_Noir", "Horror",
                "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
            ]].values.astype(np.float32),
            dtype=torch.float32
        )

        # Title embedding
        title = self.m_data.at[index, "title"]
        title_idx = self.title_to_idx.get(title, -1)
        if title_idx != -1:
            ts_title = self.title_emb_tensor[title_idx]
        else:
            ts_title = torch.zeros(384)  # Keep on CPU

        # Concatenate item features with title embeddings
        concatenated = torch.cat((item_features, ts_title), dim=0)

        return (
            user_features,
            concatenated,
            torch.tensor(int(self.m_data.at[index, "like"]), dtype=torch.long),
            torch.tensor(float(self.m_data.at[index, "rating"]), dtype=torch.float32)
        )

In [20]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        BCE_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduction == 'mean':
            return torch.mean(F_loss)
        elif self.reduction == 'sum':
            return torch.sum(F_loss)
        else:
            return F_loss

# Initialize FocalLoss

## Build a two-tower model

In [21]:
class cls_model(nn.Module):
    def __init__(self, userCount, itemCount, user_embSize=32, item_embSize=32):
        super().__init__()

        self.m_userEmb = nn.Embedding(userCount, user_embSize)
        self.m_itemEmb = nn.Embedding(itemCount, item_embSize)

        # Updated input size based on actual concatenated item features
        item_input_size = 404 + item_embSize  # 404 from item features + 32 from item embedding = 436

        self.m_modelUser = nn.Sequential(
            nn.Linear(36, 64),  # 4 user features + 32 embedding
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, user_embSize)
        )

        self.m_modelItem = nn.Sequential(
            nn.Linear(item_input_size, 128),  # Updated from 437 to 436
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, item_embSize)
        )

        self.m_modelClassify = nn.Sequential(
            nn.Linear(user_embSize + item_embSize, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )

    def forward(self, dataUser, dataItem):
        embu = self.m_userEmb(dataUser[:, 0].long())              # [batch_size, 32]
        embi = self.m_itemEmb(dataItem[:, 0].long())              # [batch_size, 32]

        inputU = torch.cat((dataUser[:, 1:].float(), embu), dim=1)  # [batch_size, 36]
        inputI = torch.cat((dataItem[:, 1:].float(), embi), dim=1)  # [batch_size, 436]

        logitsU = self.m_modelUser(inputU)                         # [batch_size, 32]
        logitsI = self.m_modelItem(inputI)                         # [batch_size, 32]

        logits = self.m_modelClassify(torch.cat((logitsU, logitsI), dim=1))  # [batch_size, 2]
        return logits

    def predict(self, userID: torch.tensor):
        """
        Get the similarity between userIDs and all available items
        """
        embu = self.m_userEmb(userID.long())                      # [batch_size, 32]
        embi = self.m_itemEmb.weight.data                          # [itemCount, 32]

        res = embu @ embi.T                                         # [batch_size, itemCount]
        normU = torch.linalg.norm(embu, dim=1, ord=2)              # [batch_size]
        normI = torch.linalg.norm(embi, dim=1, ord=2)              # [itemCount]

        normU = normU.unsqueeze(dim=1)                             # [batch_size, 1]
        normI = normI.unsqueeze(dim=0)                             # [1, itemCount]

        res = res / (normU @ normI)                                 # [batch_size, itemCount]

        return res

In [22]:
def debug_shapes(model, dataUser, dataItem):
    print(f"User data shape: {dataUser.shape}")
    print(f"Item data shape: {dataItem.shape}")
    print(f"User IDs shape: {dataUser[:, 0].shape}")
    print(f"User features shape: {dataUser[:, 1:].shape}")
    print(f"Item IDs shape: {dataItem[:, 0].shape}")
    print(f"Item features shape: {dataItem[:, 1:].shape}")

## Training and evaluating

In [23]:
import os
import time
import torch
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader
import torch.nn as nn
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from torch.utils.tensorboard import SummaryWriter
import multiprocessing

# Environment settings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialize TensorBoard writer
writer = SummaryWriter(log_dir='runs/recommendation_experiment')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device is {device}")

# ... [Data preparation steps] ...
# Determine the maximum user_id and item_id
max_user_id = df_combined["user_id"].max()
max_item_id = df_combined["item_id"].max()

# Set userCount and itemCount to max IDs + 1
userCount = max_user_id + 1
itemCount = max_item_id + 1

epochs = 5
all_unique_titles = pd.concat([df_train, df_test])["title"].unique()

# Title to index mapping
title_to_idx = {title: idx for idx, title in enumerate(all_unique_titles)}
list_titlEmb = [dict_titlEmb[title] for title in title_to_idx]
title_emb_tensor = torch.stack(list_titlEmb)

# Dataset and DataLoader
ds_train = cls_dataset(df_train, userCount, itemCount, title_emb_tensor, title_to_idx)
ds_test = cls_dataset(df_test, userCount, itemCount, title_emb_tensor, title_to_idx)

num_workers = multiprocessing.cpu_count()
ds_trainLoader = DataLoader(ds_train, batch_size=512, num_workers=num_workers, pin_memory=True)
ds_testLoader = DataLoader(ds_test, batch_size=1000, num_workers=num_workers, pin_memory=True)

# Model initialization
modelRec = cls_model(userCount, itemCount, user_embSize=32, item_embSize=32)
if torch.cuda.device_count() > 1:
    modelRec = nn.DataParallel(modelRec)
modelRec = modelRec.to(device)

# Loss and optimizer
criterion = FocalLoss(alpha=1, gamma=2, reduction='mean').to(device)
optim = torch.optim.Adam(modelRec.parameters(), lr=1e-3)

# GradScaler for mixed precision
scaler = GradScaler()

for epoch in range(epochs):
    modelRec.train()
    loss_acc = 0
    for i, (x1, x2, y, _) in enumerate(tqdm(ds_trainLoader, desc=f"Epoch {epoch+1}")):
        x1 = x1.to(device, non_blocking=True)
        x2 = x2.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        optim.zero_grad()
        with autocast():
            logits = modelRec(x1, x2)
            loss = criterion(logits, y.squeeze())

        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()

        loss_acc += loss.item()

        # Optional: Log every N batches
        if (i + 1) % 100 == 0:
            avg_loss = loss_acc / 100
            print(f"Epoch {epoch+1}, Batch {i+1}, Train Loss: {avg_loss:.4f}")
            writer.add_scalar('Train/Loss', avg_loss, epoch * len(ds_trainLoader) + i + 1)
            loss_acc = 0

    # Validation at the end of each epoch
    modelRec.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for x1_val, x2_val, y_val, _ in tqdm(ds_testLoader, desc="Validation"):
            x1_val = x1_val.to(device, non_blocking=True)
            x2_val = x2_val.to(device, non_blocking=True)
            y_val = y_val.to(device, non_blocking=True)

            with autocast():
                logits_val = modelRec(x1_val, x2_val)
                loss_val = criterion(logits_val, y_val.squeeze())

            val_loss += loss_val.item()
            preds = torch.argmax(logits_val, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_val.squeeze().cpu().numpy())

    avg_val_loss = val_loss / len(ds_testLoader)
    val_accuracy = accuracy_score(all_labels, all_preds)

    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")
    writer.add_scalar('Validation/Loss', avg_val_loss, epoch)
    writer.add_scalar('Validation/Accuracy', val_accuracy, epoch)

writer.close()

Device is cuda


  scaler = GradScaler()


Epoch 1:   0%|          | 0/2507 [00:00<?, ?it/s]

  with autocast():


Epoch 1, Batch 100, Train Loss: 0.1089
Epoch 1, Batch 200, Train Loss: 0.1009
Epoch 1, Batch 300, Train Loss: 0.0993
Epoch 1, Batch 400, Train Loss: 0.0976
Epoch 1, Batch 500, Train Loss: 0.0979
Epoch 1, Batch 600, Train Loss: 0.0967
Epoch 1, Batch 700, Train Loss: 0.0958
Epoch 1, Batch 800, Train Loss: 0.0954
Epoch 1, Batch 900, Train Loss: 0.0953
Epoch 1, Batch 1000, Train Loss: 0.0943
Epoch 1, Batch 1100, Train Loss: 0.0935
Epoch 1, Batch 1200, Train Loss: 0.0934
Epoch 1, Batch 1300, Train Loss: 0.0931
Epoch 1, Batch 1400, Train Loss: 0.0934
Epoch 1, Batch 1500, Train Loss: 0.0920
Epoch 1, Batch 1600, Train Loss: 0.0923
Epoch 1, Batch 1700, Train Loss: 0.0916
Epoch 1, Batch 1800, Train Loss: 0.0916
Epoch 1, Batch 1900, Train Loss: 0.0914
Epoch 1, Batch 2000, Train Loss: 0.0911
Epoch 1, Batch 2100, Train Loss: 0.0906
Epoch 1, Batch 2200, Train Loss: 0.0905
Epoch 1, Batch 2300, Train Loss: 0.0902
Epoch 1, Batch 2400, Train Loss: 0.0905
Epoch 1, Batch 2500, Train Loss: 0.0904


Validation:   0%|          | 0/684 [00:00<?, ?it/s]

  with autocast():


Epoch 1 Validation Loss: 0.0422, Accuracy: 0.9156


Epoch 2:   0%|          | 0/2507 [00:00<?, ?it/s]

  with autocast():


Epoch 2, Batch 100, Train Loss: 0.0892
Epoch 2, Batch 200, Train Loss: 0.0886
Epoch 2, Batch 300, Train Loss: 0.0891
Epoch 2, Batch 400, Train Loss: 0.0893
Epoch 2, Batch 500, Train Loss: 0.0895
Epoch 2, Batch 600, Train Loss: 0.0893
Epoch 2, Batch 700, Train Loss: 0.0892
Epoch 2, Batch 800, Train Loss: 0.0886
Epoch 2, Batch 900, Train Loss: 0.0896
Epoch 2, Batch 1000, Train Loss: 0.0885
Epoch 2, Batch 1100, Train Loss: 0.0880
Epoch 2, Batch 1200, Train Loss: 0.0886
Epoch 2, Batch 1300, Train Loss: 0.0887
Epoch 2, Batch 1400, Train Loss: 0.0890
Epoch 2, Batch 1500, Train Loss: 0.0880
Epoch 2, Batch 1600, Train Loss: 0.0885
Epoch 2, Batch 1700, Train Loss: 0.0879
Epoch 2, Batch 1800, Train Loss: 0.0881
Epoch 2, Batch 1900, Train Loss: 0.0882
Epoch 2, Batch 2000, Train Loss: 0.0878
Epoch 2, Batch 2100, Train Loss: 0.0877
Epoch 2, Batch 2200, Train Loss: 0.0876
Epoch 2, Batch 2300, Train Loss: 0.0873
Epoch 2, Batch 2400, Train Loss: 0.0882
Epoch 2, Batch 2500, Train Loss: 0.0879


Validation:   0%|          | 0/684 [00:00<?, ?it/s]

  with autocast():


Epoch 2 Validation Loss: 0.0414, Accuracy: 0.9179


Epoch 3:   0%|          | 0/2507 [00:00<?, ?it/s]

  with autocast():


Epoch 3, Batch 100, Train Loss: 0.0870
Epoch 3, Batch 200, Train Loss: 0.0864
Epoch 3, Batch 300, Train Loss: 0.0870
Epoch 3, Batch 400, Train Loss: 0.0873
Epoch 3, Batch 500, Train Loss: 0.0880
Epoch 3, Batch 600, Train Loss: 0.0875
Epoch 3, Batch 700, Train Loss: 0.0876
Epoch 3, Batch 800, Train Loss: 0.0869
Epoch 3, Batch 900, Train Loss: 0.0879
Epoch 3, Batch 1000, Train Loss: 0.0871
Epoch 3, Batch 1100, Train Loss: 0.0864
Epoch 3, Batch 1200, Train Loss: 0.0870
Epoch 3, Batch 1300, Train Loss: 0.0872
Epoch 3, Batch 1400, Train Loss: 0.0874
Epoch 3, Batch 1500, Train Loss: 0.0866
Epoch 3, Batch 1600, Train Loss: 0.0870
Epoch 3, Batch 1700, Train Loss: 0.0868
Epoch 3, Batch 1800, Train Loss: 0.0867
Epoch 3, Batch 1900, Train Loss: 0.0872
Epoch 3, Batch 2000, Train Loss: 0.0863
Epoch 3, Batch 2100, Train Loss: 0.0867
Epoch 3, Batch 2200, Train Loss: 0.0864
Epoch 3, Batch 2300, Train Loss: 0.0863
Epoch 3, Batch 2400, Train Loss: 0.0871
Epoch 3, Batch 2500, Train Loss: 0.0867


Validation:   0%|          | 0/684 [00:00<?, ?it/s]

  with autocast():


Epoch 3 Validation Loss: 0.0410, Accuracy: 0.9187


Epoch 4:   0%|          | 0/2507 [00:00<?, ?it/s]

  with autocast():


In [24]:
torch.save(modelRec.state_dict(), "modelRec.pth")

In [25]:
# Initialize the model
modelRec = cls_model(userCount, itemCount, user_embSize=32, item_embSize=32)
modelRec.load_state_dict(torch.load('modelRec.pth'))
modelRec.to(device)  # Move the model to the appropriate device
modelRec.eval()  # Set the model to evaluation mode

  modelRec.load_state_dict(torch.load('modelRec.pth'))


cls_model(
  (m_userEmb): Embedding(6040, 32)
  (m_itemEmb): Embedding(3706, 32)
  (m_modelUser): Sequential(
    (0): Linear(in_features=36, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=64, bias=True)
    (4): ReLU()
    (5): Linear(in_features=64, out_features=32, bias=True)
  )
  (m_modelItem): Sequential(
    (0): Linear(in_features=436, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Linear(in_features=64, out_features=32, bias=True)
  )
  (m_modelClassify): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [26]:
def get_user_top_rated_movies(user_id, k=10):
    """
    Retrieve top k rated movies for a specific user.
    
    Args:
    - user_id: ID of the user
    - k: Number of top rated movies to retrieve
    
    Returns:
    - DataFrame of top rated movies
    """
    # Filter ratings for the specific user
    user_ratings = df_combined[df_combined['user_id'] == user_id]
    
    # Check if the DataFrame is empty
    if user_ratings.empty:
        print(f"No ratings found for user ID {user_id}.")
        return None  # or handle accordingly

    # Sort by rating in descending order and get top k
    top_rated = user_ratings.sort_values('rating', ascending=False).head(k)
    
    # List of genre columns
    genre_columns = [
        'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
    ]
    
    # Return the relevant columns including genre columns
    return top_rated[['item_id', 'title', 'rating'] + genre_columns]

def get_top_k_recommendations(model, user_id, k=10, device=device):
    """
    Get top k item recommendations for a given user with more detailed information.
    
    Args:
    - model: Trained recommendation model
    - user_id: ID of the user to get recommendations for
    - k: Number of top recommendations to retrieve
    - device: Device to run the model on (cuda/cpu)
    
    Returns:
    - List of top k item recommendations with their predicted scores
    """
    # Ensure the model is in evaluation mode
    model.eval()
    
    # Convert user_id to tensor and move to device
    user_tensor = torch.tensor([user_id]).to(device)
    
    # Get similarity scores for the user with all items
    with torch.no_grad():
        similarity_scores = model.predict(user_tensor).squeeze().cpu()
    
    # Sort items by similarity scores in descending order
    _, top_item_indices = torch.topk(similarity_scores, k=k)
    
    # Prepare recommendations with item details
    recommendations = []
    for item_idx in top_item_indices:
        # Find the original item ID
        original_item_id = list(item_id_mapping.keys())[list(item_id_mapping.values()).index(item_idx.item())]
        
        # Get movie details
        movie_info = df_movies[df_movies['item_id'] == original_item_id]
        
        # Check if movie_info is empty
        if movie_info.empty:
            print(f"No movie information found for item ID {original_item_id}.")
            continue  # Skip this iteration if no movie info is found
        
        # Extract genre information safely
        genre_columns = [
            'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
            'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
            'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
        ]
        
        # Ensure all genre columns exist, if not, create them
        movie_genres = []
        for genre in genre_columns:
            if genre in movie_info.columns:
                movie_genres.append(movie_info[genre].values[0])
            else:
                movie_genres.append(0)
        
        recommendations.append({
            'item_id': original_item_id,
            'title': movie_info['title'].values[0],
            'score': similarity_scores[item_idx].item(),
            'genres': movie_genres  # Store the one-hot encoded genres
        })
    
    return recommendations

def detailed_user_recommendations(user_id, k=10):
    """
    Provide a comprehensive recommendation analysis for a user.
    """
    print(f"Recommendation Analysis for User ID {user_id}:\n")
    
    modelRec.to(device)
    modelRec.eval()
    
    # Initialize counters for prediction statistics
    liked_correct = 0
    liked_total = 0
    disliked_correct = 0
    disliked_total = 0
    
    # Get user data
    user_rated_movies = df_combined[df_combined['user_id'] == user_id]
    
    # Separate training and test data for this user
    user_train_data = df_train[df_train['user_id'] == user_id]
    user_test_data = df_test[df_test['user_id'] == user_id]
    
    print(f"\nData Distribution for User {user_id}:")
    print(f"Total ratings: {len(user_rated_movies)}")
    print(f"Training set ratings: {len(user_train_data)}")
    print(f"Test set ratings: {len(user_test_data)}")
    
    # Analyze only test data
    liked_movies = user_test_data[user_test_data['rating'] >= 4]
    disliked_movies = user_test_data[user_test_data['rating'] < 4]
    
    print(f"\nAnalyzing only test set data:")
    print(f"Number of liked movies in test set: {len(liked_movies)}")
    print(f"Number of disliked movies in test set: {len(disliked_movies)}\n")
    
    # Prepare user features
    user_data = df_combined[df_combined['user_id'] == user_id].iloc[0]
    user_features = torch.tensor([
        user_data['user_id'],
        user_data['age'],
        user_data['occupation'],
        user_data['gen_F'],
        user_data['gen_M']
    ], dtype=torch.float32).unsqueeze(0).to(device)
    
    # Predict likes for liked movies (test set only)
    print("\nPredictions for Movies User Liked in Test Set (Rating >= 4):")
    for _, movie in liked_movies.iterrows():
        liked_total += 1
        
        # Prepare item features
        item_features = torch.tensor(
            movie[[
                'item_id', 'timestamp', 'unknown', 'Action', 'Adventure', 'Animation',
                'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                'Thriller', 'War', 'Western'
            ]].values.astype(np.float32)
        ).unsqueeze(0).to(device)
        
        # Get title embedding
        title = movie['title']
        title_idx = title_to_idx.get(title, -1)
        if title_idx != -1:
            title_emb = title_emb_tensor[title_idx].to(device)
        else:
            title_emb = torch.zeros(384, device=device)
        
        # Concatenate item features with title embedding
        item_features = torch.cat((item_features, title_emb.unsqueeze(0)), dim=1)
        
        with torch.no_grad():
            logits = modelRec(user_features, item_features)
            predicted_like = torch.argmax(logits, dim=1).item()
            
            if predicted_like == 1:  # Model predicted "like"
                liked_correct += 1
                
            print(f"Movie: {movie['title']}")
            print(f"Actual Rating: {movie['rating']:.1f}")
            print(f"Predicted: {'Like' if predicted_like == 1 else 'Dislike'}")
            print(f"Prediction {'Correct' if predicted_like == 1 else 'Incorrect'}\n")

    # Predict likes for disliked movies (test set only)
    print("\nPredictions for Movies User Disliked in Test Set (Rating < 4):")
    for _, movie in disliked_movies.iterrows():
        disliked_total += 1
        
        # Prepare item features
        item_features = torch.tensor(
            movie[[
                'item_id', 'timestamp', 'unknown', 'Action', 'Adventure', 'Animation',
                'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                'Thriller', 'War', 'Western'
            ]].values.astype(np.float32)
        ).unsqueeze(0).to(device)
        
        # Get title embedding
        title = movie['title']
        title_idx = title_to_idx.get(title, -1)
        if title_idx != -1:
            title_emb = title_emb_tensor[title_idx].to(device)
        else:
            title_emb = torch.zeros(384, device=device)
        
        # Concatenate item features with title embedding
        item_features = torch.cat((item_features, title_emb.unsqueeze(0)), dim=1)
        
        with torch.no_grad():
            logits = modelRec(user_features, item_features)
            predicted_like = torch.argmax(logits, dim=1).item()
            
            if predicted_like == 0:  # Model predicted "dislike"
                disliked_correct += 1
                
            print(f"Movie: {movie['title']}")
            print(f"Actual Rating: {movie['rating']:.1f}")
            print(f"Predicted: {'Like' if predicted_like == 1 else 'Dislike'}")
            print(f"Prediction {'Correct' if predicted_like == 0 else 'Incorrect'}\n")

    # Calculate and display statistics (test set only)
    print("\n=== Prediction Statistics (Test Set Only) ===")
    
    # Liked movies statistics
    liked_accuracy = (liked_correct / liked_total * 100) if liked_total > 0 else 0
    print(f"\nLiked Movies (Rating >= 4):")
    print(f"Total in test set: {liked_total}")
    print(f"Correctly Predicted: {liked_correct}")
    print(f"Accuracy: {liked_accuracy:.2f}%")
    
    # Disliked movies statistics
    disliked_accuracy = (disliked_correct / disliked_total * 100) if disliked_total > 0 else 0
    print(f"\nDisliked Movies (Rating < 4):")
    print(f"Total in test set: {disliked_total}")
    print(f"Correctly Predicted: {disliked_correct}")
    print(f"Accuracy: {disliked_accuracy:.2f}%")
    
    # Overall statistics
    total_movies = liked_total + disliked_total
    total_correct = liked_correct + disliked_correct
    overall_accuracy = (total_correct / total_movies * 100) if total_movies > 0 else 0
    print(f"\nOverall Test Set Statistics:")
    print(f"Total Movies in test set: {total_movies}")
    print(f"Total Correct Predictions: {total_correct}")
    print(f"Overall Accuracy: {overall_accuracy:.2f}%")

# Test with a specific user ID
test_user_id = 10  # You can change this to any user ID in your dataset
detailed_user_recommendations(test_user_id, k=10)

Recommendation Analysis for User ID 10:


Data Distribution for User 10:
Total ratings: 137
Training set ratings: 192
Test set ratings: 105

Analyzing only test set data:
Number of liked movies in test set: 14
Number of disliked movies in test set: 91


Predictions for Movies User Liked in Test Set (Rating >= 4):
Movie: strictly ballroom (1992)
Actual Rating: 4.0
Predicted: Like
Prediction Correct

Movie: kids in the hall brain candy (1996)
Actual Rating: 4.0
Predicted: Dislike
Prediction Incorrect

Movie: kalifornia (1993)
Actual Rating: 5.0
Predicted: Dislike
Prediction Incorrect

Movie: trainspotting (1996)
Actual Rating: 4.0
Predicted: Like
Prediction Correct

Movie: american beauty (1999)
Actual Rating: 5.0
Predicted: Like
Prediction Correct

Movie: my cousin vinny (1992)
Actual Rating: 4.0
Predicted: Like
Prediction Correct

Movie: jurassic park (1993)
Actual Rating: 4.0
Predicted: Like
Prediction Correct

Movie: so i married an axe murderer (1993)
Actual Rating: 5.0
Predicted: D

In [27]:
def analyze_multiple_users(num_users=100, seed=42):
    """
    Analyze predictions for multiple random users.
    
    Args:
    - num_users: Number of users to analyze
    - seed: Random seed for reproducibility
    """
    np.random.seed(seed)
    
    # Get unique users from test set
    unique_users = df_test['user_id'].unique()
    
    # Randomly sample users
    selected_users = np.random.choice(unique_users, min(num_users, len(unique_users)), replace=False)
    
    # Initialize global statistics
    total_liked_correct = 0
    total_liked = 0
    total_disliked_correct = 0
    total_disliked = 0
    
    # Store individual user statistics
    user_stats = []
    
    modelRec.to(device)
    modelRec.eval()
    
    print(f"Analyzing {len(selected_users)} users...\n")
    
    for user_id in tqdm(selected_users):
        # Get user test data
        user_test_data = df_test[df_test['user_id'] == user_id]
        
        if user_test_data.empty:
            continue
            
        # Split into liked and disliked
        liked_movies = user_test_data[user_test_data['rating'] >= 4]
        disliked_movies = user_test_data[user_test_data['rating'] < 4]
        
        # Initialize user statistics
        user_liked_correct = 0
        user_disliked_correct = 0
        
        # Prepare user features
        user_data = df_combined[df_combined['user_id'] == user_id].iloc[0]
        user_features = torch.tensor([
            user_data['user_id'],
            user_data['age'],
            user_data['occupation'],
            user_data['gen_F'],
            user_data['gen_M']
        ], dtype=torch.float32).unsqueeze(0).to(device)
        
        # Process liked movies
        for _, movie in liked_movies.iterrows():
            item_features = torch.tensor(
                movie[[
                    'item_id', 'timestamp', 'unknown', 'Action', 'Adventure', 'Animation',
                    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                    'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                    'Thriller', 'War', 'Western'
                ]].values.astype(np.float32)
            ).unsqueeze(0).to(device)
            
            title = movie['title']
            title_idx = title_to_idx.get(title, -1)
            if title_idx != -1:
                title_emb = title_emb_tensor[title_idx].to(device)
            else:
                title_emb = torch.zeros(384, device=device)
            
            item_features = torch.cat((item_features, title_emb.unsqueeze(0)), dim=1)
            
            with torch.no_grad():
                logits = modelRec(user_features, item_features)
                predicted_like = torch.argmax(logits, dim=1).item()
                
                if predicted_like == 1:
                    user_liked_correct += 1
        
        # Process disliked movies
        for _, movie in disliked_movies.iterrows():
            item_features = torch.tensor(
                movie[[
                    'item_id', 'timestamp', 'unknown', 'Action', 'Adventure', 'Animation',
                    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                    'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                    'Thriller', 'War', 'Western'
                ]].values.astype(np.float32)
            ).unsqueeze(0).to(device)
            
            title = movie['title']
            title_idx = title_to_idx.get(title, -1)
            if title_idx != -1:
                title_emb = title_emb_tensor[title_idx].to(device)
            else:
                title_emb = torch.zeros(384, device=device)
            
            item_features = torch.cat((item_features, title_emb.unsqueeze(0)), dim=1)
            
            with torch.no_grad():
                logits = modelRec(user_features, item_features)
                predicted_like = torch.argmax(logits, dim=1).item()
                
                if predicted_like == 0:
                    user_disliked_correct += 1
        
        # Update global statistics
        total_liked_correct += user_liked_correct
        total_liked += len(liked_movies)
        total_disliked_correct += user_disliked_correct
        total_disliked += len(disliked_movies)
        
        # Calculate user accuracies
        liked_acc = (user_liked_correct / len(liked_movies) * 100) if len(liked_movies) > 0 else 0
        disliked_acc = (user_disliked_correct / len(disliked_movies) * 100) if len(disliked_movies) > 0 else 0
        overall_acc = ((user_liked_correct + user_disliked_correct) / 
                      (len(liked_movies) + len(disliked_movies)) * 100)
        
        # Store user statistics
        user_stats.append({
            'user_id': user_id,
            'liked_accuracy': liked_acc,
            'disliked_accuracy': disliked_acc,
            'overall_accuracy': overall_acc,
            'total_movies': len(liked_movies) + len(disliked_movies)
        })
    
    # Calculate global statistics
    print("\n=== Global Statistics ===")
    
    # Liked movies
    global_liked_acc = (total_liked_correct / total_liked * 100) if total_liked > 0 else 0
    print(f"\nLiked Movies (Rating >= 4):")
    print(f"Total: {total_liked}")
    print(f"Correctly Predicted: {total_liked_correct}")
    print(f"Accuracy: {global_liked_acc:.2f}%")
    
    # Disliked movies
    global_disliked_acc = (total_disliked_correct / total_disliked * 100) if total_disliked > 0 else 0
    print(f"\nDisliked Movies (Rating < 4):")
    print(f"Total: {total_disliked}")
    print(f"Correctly Predicted: {total_disliked_correct}")
    print(f"Accuracy: {global_disliked_acc:.2f}%")
    
    # Overall
    total_movies = total_liked + total_disliked
    total_correct = total_liked_correct + total_disliked_correct
    global_acc = (total_correct / total_movies * 100) if total_movies > 0 else 0
    print(f"\nOverall Statistics:")
    print(f"Total Movies: {total_movies}")
    print(f"Total Correct Predictions: {total_correct}")
    print(f"Overall Accuracy: {global_acc:.2f}%")
    
    # Calculate distribution of accuracies
    accuracies = np.array([stat['overall_accuracy'] for stat in user_stats])
    print(f"\nAccuracy Distribution:")
    print(f"Mean: {np.mean(accuracies):.2f}%")
    print(f"Median: {np.median(accuracies):.2f}%")
    print(f"Std Dev: {np.std(accuracies):.2f}%")
    print(f"Min: {np.min(accuracies):.2f}%")
    print(f"Max: {np.max(accuracies):.2f}%")
    
    return user_stats

# Analyze 100 random users
user_stats = analyze_multiple_users(num_users=100)

Analyzing 100 users...



  0%|          | 0/100 [00:00<?, ?it/s]


=== Global Statistics ===

Liked Movies (Rating >= 4):
Total: 1747
Correctly Predicted: 1407
Accuracy: 80.54%

Disliked Movies (Rating < 4):
Total: 9366
Correctly Predicted: 8317
Accuracy: 88.80%

Overall Statistics:
Total Movies: 11113
Total Correct Predictions: 9724
Overall Accuracy: 87.50%

Accuracy Distribution:
Mean: 87.98%
Median: 92.63%
Std Dev: 15.11%
Min: 22.22%
Max: 100.00%
