In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import sys
# add root folder to path
folder = "../../"
sys.path.append(folder)
from src.utils import load_data
from src.utils import plot_metrics_grid
from src.utils import load_baseline_rec_result
from src.metrics import evaluate_recommender_system

In [2]:
users,ratings,movies = load_data('../../data/ml-1m')
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [3]:
# import pandas as pd
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from sklearn.model_selection import train_test_split
# from torch.utils.data import Dataset, DataLoader
# from sklearn.preprocessing import StandardScaler, LabelEncoder
# import itertools

# # Assuming users, ratings, and movies are already loaded as DataFrames
# # Merge the datasets

# # Convert categorical columns to numeric
# label_encoders = {}
# for column in ['gender', 'occupation', 'genres']:
#     le = LabelEncoder()
#     data[column] = le.fit_transform(data[column])
#     label_encoders[column] = le

# # Extract year from title
# data['year'] = data['title'].str.extract(r'\((\d{4})\)').astype(float)
# data = data.drop(columns=['title', 'timestamp', 'zip'])

# # Fill missing year values with the median year
# data['year'].fillna(data['year'].median(), inplace=True)

# # Split the data into train and test sets
# train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# # Generate pairs for pairwise ranking
# def generate_pairs(data, k=100):
#     pairs = []
#     grouped = data.groupby('user_id')
#     for _, group in grouped:
#         group = group.sample(frac=1).reset_index(drop=True)  # Shuffle the group
#         n = len(group)
#         if n < 2:
#             continue
#         pair_indices = list(itertools.combinations(range(n), 2))
#         pair_indices = pair_indices[:k]  # Limit to k pairs per user
#         for i, j in pair_indices:
#             pairs.append((group.iloc[i], group.iloc[j]))
#     return pairs

# train_pairs = generate_pairs(train_data, k=100)
# test_pairs = generate_pairs(test_data, k=100)

# # Define dataset class for pairs
# class PairwiseMovieDataset(Dataset):
#     def __init__(self, pairs):
#         self.pairs = pairs
#         self.scaler = StandardScaler()
#         all_data = pd.concat([pd.DataFrame([pair[0], pair[1]]) for pair in pairs])
#         self.scaler.fit(all_data[['age', 'year']])
    
#     def __len__(self):
#         return len(self.pairs)
    
#     def __getitem__(self, idx):
#         pos_item, neg_item = self.pairs[idx]
#         pos_user_features = torch.tensor([pos_item['gender'], pos_item['age'], pos_item['occupation']], dtype=torch.float)
#         pos_movie_features = torch.tensor([pos_item['genres'], pos_item['year']], dtype=torch.float)
#         neg_user_features = torch.tensor([neg_item['gender'], neg_item['age'], neg_item['occupation']], dtype=torch.float)
#         neg_movie_features = torch.tensor([neg_item['genres'], neg_item['year']], dtype=torch.float)
        
#         pos_user_features[1] = torch.tensor(self.scaler.transform(pos_user_features[1].reshape(1, -1)), dtype=torch.float)
#         neg_user_features[1] = torch.tensor(self.scaler.transform(neg_user_features[1].reshape(1, -1)), dtype=torch.float)
        
#         return pos_user_features, pos_movie_features, neg_user_features, neg_movie_features

# # Create DataLoader
# train_dataset = PairwiseMovieDataset(train_pairs)
# test_dataset = PairwiseMovieDataset(test_pairs)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [4]:
from sklearn.model_selection import train_test_split
# Merge the datasets
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')

# Extract year from title
data['year'] = data['title'].str.extract(r'\((\d{4})\)').astype(float)

# Split genres into separate columns
data['genres'] = data['genres'].str.split('|')

# Create a DataFrame for each unique genre and merge them into the main DataFrame
genres_expanded = data['genres'].explode().unique()
genre_columns = pd.get_dummies(data['genres'].explode()).groupby(level=0).max()

# Join the new genre columns to the main DataFrame
data = data.join(genre_columns)

# Drop unnecessary columns
data = data.drop(columns=['title', 'genres', 'movie_id', 'zip'])

# Fill missing year values with the median year
data['year'].fillna(data['year'].median(), inplace=True)

data['gender'] = data['gender'].apply(lambda x: x == 'M')
data.drop(columns=['timestamp'], inplace=True)
# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Print the resulting DataFrames to check the changes
train_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['year'].fillna(data['year'].median(), inplace=True)


Unnamed: 0,user_id,rating,gender,age,occupation,year,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
416292,2507,2,True,25,4,1955.0,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
683230,4087,4,True,1,4,1999.0,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2434,19,3,True,1,10,1993.0,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
688533,4118,4,True,25,3,1983.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
472584,2907,4,False,35,5,1996.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
import numpy as np
import itertools

def generate_pairs(data, k=100_000):
    grouped_data = data.groupby('user_id')
    all_dfs = []
    for user_idx, sub_df in grouped_data:
        total_users = sub_df.shape[0]
        all_combinations = list(itertools.combinations(range(total_users), 2))
        min_samples = 1 if k//len(grouped_data) < 1 else k//len(grouped_data)
        selected_combinations_idx = np.random.choice(range(len(all_combinations)), min(min_samples, len(all_combinations)), replace=False)
        
        selected_combinations = [all_combinations[i] for i in selected_combinations_idx]
        for combination in selected_combinations:
            # print(sub_df.iloc[combination[0:1]], sub_df.iloc[combination[1]])
            all_dfs.append(pd.merge(sub_df.iloc[combination[0]:combination[0]+1], sub_df.iloc[combination[1]:combination[1]+1], on='user_id', suffixes=('_1', '_2')))

    return pd.concat(all_dfs, axis=0)

train_pairs = generate_pairs(train_data, k=100_000)
test_pairs = generate_pairs(test_data, k=10_000)

In [6]:
train_pairs['label'] = train_pairs['rating_1'] > train_pairs['rating_2']
test_pairs['label'] = test_pairs['rating_1'] > test_pairs['rating_2']
test_pairs.columns

Index(['user_id', 'rating_1', 'gender_1', 'age_1', 'occupation_1', 'year_1',
       'Action_1', 'Adventure_1', 'Animation_1', 'Children's_1', 'Comedy_1',
       'Crime_1', 'Documentary_1', 'Drama_1', 'Fantasy_1', 'Film-Noir_1',
       'Horror_1', 'Musical_1', 'Mystery_1', 'Romance_1', 'Sci-Fi_1',
       'Thriller_1', 'War_1', 'Western_1', 'rating_2', 'gender_2', 'age_2',
       'occupation_2', 'year_2', 'Action_2', 'Adventure_2', 'Animation_2',
       'Children's_2', 'Comedy_2', 'Crime_2', 'Documentary_2', 'Drama_2',
       'Fantasy_2', 'Film-Noir_2', 'Horror_2', 'Musical_2', 'Mystery_2',
       'Romance_2', 'Sci-Fi_2', 'Thriller_2', 'War_2', 'Western_2', 'label'],
      dtype='object')

In [7]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

class PairwiseMovieDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
        self.scaler = StandardScaler()
        features = pairs.drop(columns='label')
        self.scaler.fit(features)
    
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        row = self.pairs.iloc[idx]
        user_features = torch.tensor([
            row['gender_1'], row['age_1'], row['occupation_1'],
            row['rating_2'], row['gender_2'], row['age_2'], row['occupation_2']
        ], dtype=torch.float32)
        
        film1_features = torch.tensor([
            row['year_1'], row['Action_1'], row['Adventure_1'], row['Animation_1'],
            row['Children\'s_1'], row['Comedy_1'], row['Crime_1'], row['Documentary_1'],
            row['Drama_1'], row['Fantasy_1'], row['Film-Noir_1'], row['Horror_1'],
            row['Musical_1'], row['Mystery_1'], row['Romance_1'], row['Sci-Fi_1'],
            row['Thriller_1'], row['War_1'], row['Western_1']
        ], dtype=torch.float32)
        
        film2_features = torch.tensor([
            row['year_2'], row['Action_2'], row['Adventure_2'], row['Animation_2'],
            row['Children\'s_2'], row['Comedy_2'], row['Crime_2'], row['Documentary_2'],
            row['Drama_2'], row['Fantasy_2'], row['Film-Noir_2'], row['Horror_2'],
            row['Musical_2'], row['Mystery_2'], row['Romance_2'], row['Sci-Fi_2'],
            row['Thriller_2'], row['War_2'], row['Western_2']
        ], dtype=torch.float32)
        
        label = torch.tensor(row['label'], dtype=torch.float32)
        
        return user_features, film1_features, film2_features, label

# Create DataLoader
train_dataset = PairwiseMovieDataset(train_pairs)
test_dataset = PairwiseMovieDataset(test_pairs)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

class PairwiseRankingModel(nn.Module):
    def __init__(self):
        super(PairwiseRankingModel, self).__init__()
        # Define the architecture
        self.fc1 = nn.Linear(7 + 19, 64)  # User features (7) + Film1 features (19)
        self.fc2 = nn.Linear(64 + 19, 32)  # Add Film2 features (19)
        self.fc3 = nn.Linear(32, 1)  # Output

    def forward(self, user_features, film1_features, film2_features):
        # Concatenate user features and film1 features
        x = torch.cat([user_features, film1_features], dim=1)
        x = torch.relu(self.fc1(x))
        # Concatenate with film2 features
        x = torch.cat([x, film2_features], dim=1)
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [11]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import tqdm

# Instantiate the model and move to CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = PairwiseRankingModel().to(device)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    all_labels = []
    all_outputs = []
    all_preds = []
    for user_features, film1_features, film2_features, labels in tqdm(dataloader, desc="Training"):
        # Move data to the same device as the model
        user_features = user_features.to(device)
        film1_features = film1_features.to(device)
        film2_features = film2_features.to(device)
        labels = labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(user_features, film1_features, film2_features).squeeze()
        
        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Update running loss and collect outputs for metrics
        running_loss += loss.item() * user_features.size(0)
        all_labels.extend(labels.cpu().numpy())
        all_outputs.extend(torch.sigmoid(outputs).detach().cpu().numpy())
        all_preds.extend((torch.sigmoid(outputs) > 0.5).cpu().numpy())
    
    epoch_loss = running_loss / len(dataloader.dataset)
    auc = roc_auc_score(all_labels, all_outputs)
    acc = accuracy_score(all_labels, all_preds)
    print(f"Training loss: {epoch_loss:.4f}, AUC: {auc:.4f}, Accuracy: {acc:.4f}")

# Evaluation function
def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_labels = []
    all_outputs = []
    all_preds = []
    with torch.no_grad():
        for user_features, film1_features, film2_features, labels in tqdm(dataloader, desc="Evaluating"):
            user_features = user_features.to(device)
            film1_features = film1_features.to(device)
            film2_features = film2_features.to(device)
            labels = labels.to(device)
            
            outputs = model(user_features, film1_features, film2_features).squeeze()
            loss = criterion(outputs, labels)
            
            running_loss += loss.item() * user_features.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_outputs.extend(torch.sigmoid(outputs).cpu().numpy())
            all_preds.extend((torch.sigmoid(outputs) > 0.5).cpu().numpy())
    
    epoch_loss = running_loss / len(dataloader.dataset)
    auc = roc_auc_score(all_labels, all_outputs)
    acc = accuracy_score(all_labels, all_preds)
    print(f"Validation loss: {epoch_loss:.4f}, AUC: {auc:.4f}, Accuracy: {acc:.4f}")

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train(model, train_loader, criterion, optimizer, device)
    evaluate(model, test_loader, criterion, device)


Epoch 1/10


Training: 100%|██████████| 3020/3020 [00:24<00:00, 122.39it/s]


Training loss: 0.6215, AUC: 0.6634, Accuracy: 0.6718


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 158.62it/s]


Validation loss: 0.5608, AUC: 0.8650, Accuracy: 0.7090
Epoch 2/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 141.28it/s]


Training loss: 0.5071, AUC: 0.8050, Accuracy: 0.7516


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 176.34it/s]


Validation loss: 0.6227, AUC: 0.8660, Accuracy: 0.6569
Epoch 3/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 139.64it/s]


Training loss: 0.4877, AUC: 0.8227, Accuracy: 0.7603


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 168.19it/s]


Validation loss: 0.4712, AUC: 0.8662, Accuracy: 0.7542
Epoch 4/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 141.62it/s]


Training loss: 0.4804, AUC: 0.8290, Accuracy: 0.7639


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 171.35it/s]


Validation loss: 0.5688, AUC: 0.8671, Accuracy: 0.7262
Epoch 5/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 140.64it/s]


Training loss: 0.4768, AUC: 0.8324, Accuracy: 0.7652


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 167.60it/s]


Validation loss: 0.4572, AUC: 0.8666, Accuracy: 0.7881
Epoch 6/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 142.49it/s]


Training loss: 0.4721, AUC: 0.8362, Accuracy: 0.7674


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 161.46it/s]


Validation loss: 0.4481, AUC: 0.8666, Accuracy: 0.7727
Epoch 7/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 141.87it/s]


Training loss: 0.4685, AUC: 0.8400, Accuracy: 0.7693


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 167.83it/s]


Validation loss: 0.5059, AUC: 0.8666, Accuracy: 0.7491
Epoch 8/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 139.18it/s]


Training loss: 0.4667, AUC: 0.8415, Accuracy: 0.7700


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 167.01it/s]


Validation loss: 0.4399, AUC: 0.8656, Accuracy: 0.7851
Epoch 9/10


Training: 100%|██████████| 3020/3020 [00:22<00:00, 135.97it/s]


Training loss: 0.4631, AUC: 0.8445, Accuracy: 0.7720


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 167.57it/s]


Validation loss: 0.4421, AUC: 0.8650, Accuracy: 0.7822
Epoch 10/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 140.35it/s]


Training loss: 0.4624, AUC: 0.8448, Accuracy: 0.7729


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 163.45it/s]

Validation loss: 0.4392, AUC: 0.8662, Accuracy: 0.7885



