In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split


In [49]:
class CollaborativeFiltering(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=16, hidden_dim=32, output_dim=16):
        super(CollaborativeFiltering, self).__init__()
        
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        self.ffnn_item = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
        
        self.ffnn_user = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
        
    def forward(self, user_ids, item_ids):
        user_embed = self.user_embedding(user_ids)
        item_embed = self.item_embedding(item_ids)
        item_scores = self.ffnn_item(item_embed)
        user_scores = self.ffnn_user(user_embed)
        
        return user_scores, item_scores

In [31]:
data_path = "kaggle_data/collab_filtering_prelim_data.csv"
data = pd.read_csv(data_path)

In [32]:
data.shape

(308678, 3)

In [33]:
# create a dictionary to map string ids to integer indices
user_to_idx = {user_id: idx for idx, user_id in enumerate(data['customer_id'].unique())}
item_to_idx = {item_id: idx for idx, item_id in enumerate(data['article_id'].unique())}

# add integer index columns to the DataFrame
data['user_idx'] = data['customer_id'].map(user_to_idx)
data['item_idx'] = data['article_id'].map(item_to_idx)

In [50]:
class HandMDataset(Dataset):
    def __init__(self, df):
        self.customer_ids = df['user_idx'].values
        self.article_ids = df['item_idx'].values
        self.labels = df['label'].values
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.customer_ids[idx], self.article_ids[idx], self.labels[idx]

In [51]:
import time

# start timer
start_time = time.time()


# end timer
end_time = time.time()

# print elapsed time
print(f"Elapsed time: {end_time - start_time} seconds")

Elapsed time: 2.5033950805664062e-05 seconds


In [52]:
10%20000

10

In [37]:
num_users=data['customer_id'].nunique()
num_items=data['article_id'].nunique()

In [38]:
train_data, int_data = train_test_split(data, test_size=0.1, random_state=42)
val_data, test_data  = train_test_split(int_data, test_size=0.5, random_state=42)

In [39]:
train_dataset = HandMDataset(train_data)
val_dataset = HandMDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True, num_workers=4)

In [40]:
train_len=len(train_data)
val_len=len(val_data)

In [41]:
model = CollaborativeFiltering(num_users, num_items)

In [42]:
# Define the loss function and optimizer
criterion = nn.CosineEmbeddingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [43]:
data.tail()

Unnamed: 0,customer_id,article_id,label,user_idx,item_idx
308673,6d58705bc45b09dfc9ee700cc2f8c02d65444a7be9f444...,869070005,1,1597,2384
308674,08ae22429179bdb5f42c3f6c31d8361df962f18f5a328e...,571706010,1,327,12455
308675,f62307ef91a8c2ac16d8a2b920beb5361f4b54f9a6a6ba...,837306003,1,893,1479
308676,ce3886acb6dcbddb0d58ccbd015f5875ba2d65e09910ea...,910096002,-1,1325,8805
308677,e2fcb273201a0354f6fda384f67d5ef4d8b95274129797...,904416001,-1,351,6027


In [53]:
torch.save(model, 'kaggle_data/model_mid_training.pt')

PicklingError: Can't pickle <class '__main__.CollaborativeFiltering'>: it's not the same object as __main__.CollaborativeFiltering

In [None]:
val_loss = 0.0
model.eval()
with torch.no_grad():
    for user_ids, item_ids, ratings in val_loader:
        user_scores,item_scores = model(user_ids, item_ids)
        loss = criterion(user_scores,item_scores,ratings)
        val_loss += loss.item()
print(f"Loss {val_loss / val_len:.4f}")

start_time = time.time()
for epoch in range(40):
    running_loss = 0.0
    val_loss = 0.0
    t=0
    
    model.train()

    for user_ids, item_ids, ratings in train_loader:
        # Zero the gradients
        optimizer.zero_grad()


        # Forward pass
        user_scores,item_scores = model(user_ids, item_ids)
        

        
        
        # target_rating = torch.tensor(rating, dtype=torch.float)
        
        # target_rating = torch.tensor([[rating]], dtype=torch.float)
        
        # Compute the loss
        # predicted_rating = torch.round(torch.sigmoid(model(user_ids, item_ids)))
        loss = criterion(user_scores,item_scores, ratings)

        # Backward pass
        loss.backward()

        # Update the weights
        optimizer.step()

        # Accumulate the loss
        running_loss += loss.item()
        
        current_time = time.time()
        
        if t%4000==0:
            # print(t)
            # print(predicted_rating)
            print(f"Elapsed time: {current_time - start_time} seconds after {t} runs")
            # print(f"user_id: {user_id} item_id: {item_id} rating: {rating}")
        t=t+1
    
    model.eval()
    with torch.no_grad():
        for user_ids, item_ids, ratings in val_loader:
            user_scores,item_scores = model(user_ids, item_ids)
            loss = criterion(user_scores,item_scores,ratings)
            val_loss += loss.item()

    print(f"Epoch {epoch}: Loss {running_loss / train_len:.4f}")
    print(f"Epoch {epoch}: Loss {val_loss / val_len:.4f}")

Loss 0.0098
Elapsed time: 0.14185261726379395 seconds after 0 runs
Elapsed time: 43.661386489868164 seconds after 4000 runs
Elapsed time: 83.15505790710449 seconds after 8000 runs
Epoch 0: Loss 0.0088
Epoch 0: Loss 0.0098
Elapsed time: 94.25689625740051 seconds after 0 runs
Elapsed time: 139.18778252601624 seconds after 4000 runs
Elapsed time: 187.51252222061157 seconds after 8000 runs
Epoch 1: Loss 0.0088
Epoch 1: Loss 0.0098
Elapsed time: 195.11572241783142 seconds after 0 runs
Elapsed time: 242.09832239151 seconds after 4000 runs
Elapsed time: 287.26224851608276 seconds after 8000 runs
Epoch 2: Loss 0.0087
Epoch 2: Loss 0.0099
Elapsed time: 297.4813530445099 seconds after 0 runs
Elapsed time: 336.89497470855713 seconds after 4000 runs
Elapsed time: 380.21383476257324 seconds after 8000 runs
Epoch 3: Loss 0.0087
Epoch 3: Loss 0.0098
Elapsed time: 388.7944242954254 seconds after 0 runs
Elapsed time: 434.7913875579834 seconds after 4000 runs
Elapsed time: 482.6916778087616 seconds afte

In [None]:
torch.tensor([[1.0]], dtype=torch.float)

In [138]:
"Target size (torch.Size([32])) must be the same as input size (torch.Size([32, 32]))" when completeing this line of code "criterion(predicted_rating,ratings)"

SyntaxError: invalid syntax (4056817658.py, line 1)