In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import sys
# add root folder to path
folder = "../../"
sys.path.append(folder)
from src.utils import load_data
from src.utils import plot_metrics_grid
from src.utils import load_baseline_rec_result
from src.metrics import evaluate_recommender_system

In [2]:
users,ratings,movies = load_data('../../data/ml-1m')
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [3]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
from sklearn.model_selection import train_test_split
# Merge the datasets
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')

# Extract year from title
data['year'] = data['title'].str.extract(r'\((\d{4})\)').astype(float)

# Split genres into separate columns
data['genres'] = data['genres'].str.split('|')

# Create a DataFrame for each unique genre and merge them into the main DataFrame
genres_expanded = data['genres'].explode().unique()
genre_columns = pd.get_dummies(data['genres'].explode()).groupby(level=0).max()

# Join the new genre columns to the main DataFrame
data = data.join(genre_columns)

# Drop unnecessary columns
data = data.drop(columns=['title', 'genres', 'zip'])

# Fill missing year values with the median year
data['year'].fillna(data['year'].median(), inplace=True)

data['gender'] = data['gender'].apply(lambda x: x == 'M')
data.drop(columns=['timestamp'], inplace=True)
# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Print the resulting DataFrames to check the changes
train_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['year'].fillna(data['year'].median(), inplace=True)


Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,year,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
416292,2507,3035,2,True,25,4,1955.0,False,False,False,...,False,False,False,False,False,False,False,False,True,False
683230,4087,2840,4,True,1,4,1999.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2434,19,457,3,True,1,10,1993.0,True,False,False,...,False,False,False,False,False,False,False,True,False,False
688533,4118,2804,4,True,25,3,1983.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
472584,2907,805,4,False,35,5,1996.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
test_data

Unnamed: 0,user_id,movie_id,rating,gender,age,occupation,year,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
895536,5412,2683,2,True,25,12,1999.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
899739,5440,904,5,False,45,2,1954.0,False,False,False,...,False,False,False,False,True,False,False,True,False,False
55687,368,3717,4,True,25,0,2000.0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
63727,425,1721,4,True,25,12,1997.0,False,False,False,...,False,False,False,False,False,True,False,False,False,False
822011,4942,3697,1,True,45,12,1990.0,True,False,False,...,False,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756007,4505,1396,4,True,50,7,1992.0,False,False,False,...,False,False,False,False,False,False,True,False,False,False
477775,2934,724,1,False,35,20,1996.0,False,False,False,...,False,False,True,False,False,False,False,False,False,False
424188,2572,968,5,True,35,14,1968.0,False,False,False,...,False,False,True,False,False,False,True,False,False,False
293600,1748,1625,3,True,50,1,1997.0,False,False,False,...,False,False,False,False,True,False,False,True,False,False


In [6]:
import numpy as np
import itertools

def generate_pairs(data, k=100_000):
    grouped_data = data.groupby('user_id')
    all_dfs = []
    for user_idx, sub_df in grouped_data:
        total_users = sub_df.shape[0]
        all_combinations = list(itertools.combinations(range(total_users), 2))
        min_samples = 1 if k//len(grouped_data) < 1 else k//len(grouped_data)
        selected_combinations_idx = np.random.choice(range(len(all_combinations)), min(min_samples, len(all_combinations)), replace=False)
        
        selected_combinations = [all_combinations[i] for i in selected_combinations_idx]
        for combination in selected_combinations:
            # print(sub_df.iloc[combination[0:1]], sub_df.iloc[combination[1]])
            all_dfs.append(pd.merge(sub_df.iloc[combination[0]:combination[0]+1], sub_df.iloc[combination[1]:combination[1]+1], on='user_id', suffixes=('_1', '_2')))

    return pd.concat(all_dfs, axis=0)

train_pairs = generate_pairs(train_data, k=100_000)
test_pairs = generate_pairs(test_data, k=10_000)

In [7]:
train_pairs['label'] = train_pairs['rating_1'] > train_pairs['rating_2']
test_pairs['label'] = test_pairs['rating_1'] > test_pairs['rating_2']
test_pairs.columns

Index(['user_id', 'movie_id_1', 'rating_1', 'gender_1', 'age_1',
       'occupation_1', 'year_1', 'Action_1', 'Adventure_1', 'Animation_1',
       'Children's_1', 'Comedy_1', 'Crime_1', 'Documentary_1', 'Drama_1',
       'Fantasy_1', 'Film-Noir_1', 'Horror_1', 'Musical_1', 'Mystery_1',
       'Romance_1', 'Sci-Fi_1', 'Thriller_1', 'War_1', 'Western_1',
       'movie_id_2', 'rating_2', 'gender_2', 'age_2', 'occupation_2', 'year_2',
       'Action_2', 'Adventure_2', 'Animation_2', 'Children's_2', 'Comedy_2',
       'Crime_2', 'Documentary_2', 'Drama_2', 'Fantasy_2', 'Film-Noir_2',
       'Horror_2', 'Musical_2', 'Mystery_2', 'Romance_2', 'Sci-Fi_2',
       'Thriller_2', 'War_2', 'Western_2', 'label'],
      dtype='object')

In [8]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

class PairwiseMovieDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
        self.scaler = StandardScaler()
        features = pairs.drop(columns='label')
        self.scaler.fit(features)
    
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        row = self.pairs.iloc[idx]
        user_features = torch.tensor([
            row['gender_1'], row['age_1'], row['occupation_1']
        ], dtype=torch.float32)
        
        film1_features = torch.tensor([
            row['year_1'], row['Action_1'], row['Adventure_1'], row['Animation_1'],
            row['Children\'s_1'], row['Comedy_1'], row['Crime_1'], row['Documentary_1'],
            row['Drama_1'], row['Fantasy_1'], row['Film-Noir_1'], row['Horror_1'],
            row['Musical_1'], row['Mystery_1'], row['Romance_1'], row['Sci-Fi_1'],
            row['Thriller_1'], row['War_1'], row['Western_1']
        ], dtype=torch.float32)
        
        film2_features = torch.tensor([
            row['year_2'], row['Action_2'], row['Adventure_2'], row['Animation_2'],
            row['Children\'s_2'], row['Comedy_2'], row['Crime_2'], row['Documentary_2'],
            row['Drama_2'], row['Fantasy_2'], row['Film-Noir_2'], row['Horror_2'],
            row['Musical_2'], row['Mystery_2'], row['Romance_2'], row['Sci-Fi_2'],
            row['Thriller_2'], row['War_2'], row['Western_2']
        ], dtype=torch.float32)
        
        label = torch.tensor(row['label'], dtype=torch.float32)
        
        return user_features, film1_features, film2_features, label

# Create DataLoader
train_dataset = PairwiseMovieDataset(train_pairs)
test_dataset = PairwiseMovieDataset(test_pairs)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

class PairwiseRankingModel(nn.Module):
    def __init__(self):
        super(PairwiseRankingModel, self).__init__()
        # Define the architecture
        self.fc1 = nn.Linear(3 + 19, 64)  # User features (7) + Film1 features (19)
        self.fc2 = nn.Linear(64 + 19, 32)  # Add Film2 features (19)
        self.fc3 = nn.Linear(32, 1)  # Output

    def forward(self, user_features, film1_features, film2_features):
        # Concatenate user features and film1 features
        x = torch.cat([user_features, film1_features], dim=1)
        x = torch.relu(self.fc1(x))
        # Concatenate with film2 features
        x = torch.cat([x, film2_features], dim=1)
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [10]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import tqdm

# Instantiate the model and move to CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = PairwiseRankingModel().to(device)

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    all_labels = []
    all_outputs = []
    all_preds = []
    for user_features, film1_features, film2_features, labels in tqdm(dataloader, desc="Training"):
        # Move data to the same device as the model
        user_features = user_features.to(device)
        film1_features = film1_features.to(device)
        film2_features = film2_features.to(device)
        labels = labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(user_features, film1_features, film2_features).squeeze()
        
        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Update running loss and collect outputs for metrics
        running_loss += loss.item() * user_features.size(0)
        all_labels.extend(labels.cpu().numpy())
        all_outputs.extend(torch.sigmoid(outputs).detach().cpu().numpy())
        all_preds.extend((torch.sigmoid(outputs) > 0.5).cpu().numpy())
    
    epoch_loss = running_loss / len(dataloader.dataset)
    auc = roc_auc_score(all_labels, all_outputs)
    acc = accuracy_score(all_labels, all_preds)
    print(f"Training loss: {epoch_loss:.4f}, AUC: {auc:.4f}, Accuracy: {acc:.4f}")

# Evaluation function
def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_labels = []
    all_outputs = []
    all_preds = []
    with torch.no_grad():
        for user_features, film1_features, film2_features, labels in tqdm(dataloader, desc="Evaluating"):
            user_features = user_features.to(device)
            film1_features = film1_features.to(device)
            film2_features = film2_features.to(device)
            labels = labels.to(device)
            
            outputs = model(user_features, film1_features, film2_features).squeeze()
            loss = criterion(outputs, labels)
            
            running_loss += loss.item() * user_features.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_outputs.extend(torch.sigmoid(outputs).cpu().numpy())
            all_preds.extend((torch.sigmoid(outputs) > 0.5).cpu().numpy())
    
    epoch_loss = running_loss / len(dataloader.dataset)
    auc = roc_auc_score(all_labels, all_outputs)
    acc = accuracy_score(all_labels, all_preds)
    print(f"Validation loss: {epoch_loss:.4f}, AUC: {auc:.4f}, Accuracy: {acc:.4f}")

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train(model, train_loader, criterion, optimizer, device)
    evaluate(model, test_loader, criterion, device)


Epoch 1/10


Training: 100%|██████████| 3020/3020 [00:23<00:00, 128.85it/s]


Training loss: 0.8460, AUC: 0.5287, Accuracy: 0.5831


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 170.71it/s]


Validation loss: 0.7501, AUC: 0.5819, Accuracy: 0.4477
Epoch 2/10


Training: 100%|██████████| 3020/3020 [00:20<00:00, 146.73it/s]


Training loss: 0.7536, AUC: 0.5312, Accuracy: 0.5929


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 176.76it/s]


Validation loss: 0.6577, AUC: 0.5888, Accuracy: 0.6175
Epoch 3/10


Training: 100%|██████████| 3020/3020 [00:20<00:00, 147.19it/s]


Training loss: 0.7041, AUC: 0.5379, Accuracy: 0.6090


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 172.45it/s]


Validation loss: 0.6428, AUC: 0.5875, Accuracy: 0.6446
Epoch 4/10


Training: 100%|██████████| 3020/3020 [00:21<00:00, 143.67it/s]


Training loss: 0.6886, AUC: 0.5371, Accuracy: 0.6133


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 180.89it/s]


Validation loss: 0.6418, AUC: 0.5864, Accuracy: 0.6477
Epoch 5/10


Training: 100%|██████████| 3020/3020 [00:20<00:00, 148.64it/s]


Training loss: 0.6679, AUC: 0.5382, Accuracy: 0.6272


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 184.14it/s]


Validation loss: 0.6412, AUC: 0.5863, Accuracy: 0.6447
Epoch 6/10


Training: 100%|██████████| 3020/3020 [00:20<00:00, 145.90it/s]


Training loss: 0.6547, AUC: 0.5439, Accuracy: 0.6396


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 177.46it/s]


Validation loss: 0.6581, AUC: 0.5847, Accuracy: 0.6429
Epoch 7/10


Training: 100%|██████████| 3020/3020 [00:20<00:00, 145.31it/s]


Training loss: 0.6488, AUC: 0.5384, Accuracy: 0.6473


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 181.52it/s]


Validation loss: 0.6497, AUC: 0.5811, Accuracy: 0.6429
Epoch 8/10


Training: 100%|██████████| 3020/3020 [00:20<00:00, 145.78it/s]


Training loss: 0.6457, AUC: 0.5244, Accuracy: 0.6527


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 178.59it/s]


Validation loss: 0.6510, AUC: 0.5462, Accuracy: 0.6429
Epoch 9/10


Training: 100%|██████████| 3020/3020 [00:20<00:00, 147.59it/s]


Training loss: 0.6460, AUC: 0.5154, Accuracy: 0.6529


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 163.15it/s]


Validation loss: 0.6506, AUC: 0.5534, Accuracy: 0.6429
Epoch 10/10


Training: 100%|██████████| 3020/3020 [00:20<00:00, 144.82it/s]


Training loss: 0.6460, AUC: 0.5173, Accuracy: 0.6528


Evaluating: 100%|██████████| 189/189 [00:01<00:00, 173.84it/s]

Validation loss: 0.6502, AUC: 0.5584, Accuracy: 0.6429





# Now let's measure how well our model performs for ranking

In [11]:
user_features = test_data[['user_id', 'gender', 'age', 'occupation']].drop_duplicates().set_index('user_id')
movie_features = test_data[['movie_id', 'year', 'Action', 'Adventure', 'Animation', "Children's", 
                            'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                            'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 
                            'War', 'Western']].drop_duplicates().set_index('movie_id')

In [12]:
def generate_random_recommendations(user_ids, movie_ids, num_recommendations=5):
    recommendations = {}
    for user_id in user_ids:
        recommendations[user_id] = np.random.choice(movie_ids, num_recommendations, replace=False).tolist()
    return recommendations

user_ids = user_features.index.tolist()
movie_ids = movie_features.index.tolist()
random_recommendations = generate_random_recommendations(user_ids, movie_ids)

user_features = user_features.apply(pd.to_numeric, errors='coerce')
movie_features = movie_features.apply(pd.to_numeric, errors='coerce')

In [27]:
def rank_recommendations(model, user_features, movie_features, recommendations):
    model.eval()
    ranked_recommendations = {}
    
    with torch.no_grad():
        for user_id, movies in recommendations.items():
            user_feat = torch.tensor(user_features.loc[user_id].values.astype(np.float32), dtype=torch.float32).unsqueeze(0).to(device)
            victory_count = {movie_id: 0 for movie_id in movies}
            
            for i, film1_id in enumerate(movies):
                for film2_id in movies[i+1:]:
                    film1_feat = torch.tensor(movie_features.loc[film1_id].values.astype(np.float32), dtype=torch.float32).unsqueeze(0).to(device)
                    film2_feat = torch.tensor(movie_features.loc[film2_id].values.astype(np.float32), dtype=torch.float32).unsqueeze(0).to(device)
                    
                    output = model(user_feat, film1_feat, film2_feat).item()
                    
                    if output > 0.5:
                        victory_count[film1_id] += 1
                    else:
                        victory_count[film2_id] += 1
            
            # Sort movies by victory count in descending order
            
            sorted_movies = sorted(victory_count.keys(), key=lambda x: victory_count[x], reverse=True)
            
            ranked_recommendations[user_id] = sorted_movies
    
    return ranked_recommendations


ranked_recommendations = rank_recommendations(model, user_features, movie_features, random_recommendations)


In [31]:
total_amount_of_movies = len(movie_ids)
random_res = evaluate_recommender_system(random_recommendations, ratings, total_amount_of_movies)

In [32]:
random_res

{'Precision@K': np.float64(0.047532295462073534),
 'Recall@K': np.float64(0.0015017692261627075),
 'NDCG@K': np.float64(0.6149229805672399),
 'MAP@K': np.float64(0.481709669920967),
 'MRR': np.float64(0.09653582864083028),
 'Hit Rate@K': np.float64(0.19791321629678701),
 'Coverage@K': 0.9997096399535423}

In [33]:
total_amount_of_movies = len(movie_ids)
ranked_random_res = evaluate_recommender_system(ranked_recommendations, ratings, total_amount_of_movies)

In [34]:
ranked_random_res

{'Precision@K': np.float64(0.047532295462073534),
 'Recall@K': np.float64(0.0015017692261627075),
 'NDCG@K': np.float64(0.6202371693930591),
 'MAP@K': np.float64(0.4879474662947466),
 'MRR': np.float64(0.09793805895992051),
 'Hit Rate@K': np.float64(0.19791321629678701),
 'Coverage@K': 0.9997096399535423}

The ranking metrics showed a slight improvement after applying the ranking technique. Specifically, there was a increase in NDCG@K, MAP@K, and MRR, which are critical measures for ranking performance. This suggests that the model's ability to rank items correctly has improved, even though the changes are incremental.

Overall, while the model has demonstrated some improvements in terms of ranking metrics, there is still room for further optimization and enhancement. Future efforts could focus on exploring different architectures, fine-tuning hyperparameters, or employing more advanced ranking techniques to achieve better performance.

In [36]:
#save model

torch.save(model.state_dict(), '../../artifacts/pairwise_ranking_model.pth')
