In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import lightning as L
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


class MatrixFactorization(L.LightningModule):
    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

    def forward(self, user_id, item_id):
        user_vector = self.user_embedding(user_id)
        item_vector = self.item_embedding(item_id)
        return (user_vector * item_vector).sum(1)

    def training_step(self, batch, batch_idx):
        user_id, item_id, rating = batch
        prediction = self(user_id, item_id)
        loss = nn.functional.mse_loss(prediction, rating)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return optim.SGD(self.parameters(), lr=0.01)


class MovieLensDataset(Dataset):
    def __init__(self, dataframe=None):
        self.dataframe = self.load_movielens_data() if dataframe is None else dataframe
        self.num_users = self.dataframe['user_id'].nunique()
        self.num_items = self.dataframe['item_id'].nunique()

    @staticmethod
    def load_movielens_data(path_u='data/ml-100k/u.data'):
        column_names = ['user_id', 'item_id', 'rating', 'timestamp']
        df = pd.read_csv(path_u, sep='\t', names=column_names)
        df['user_id'] -= 1
        df['item_id'] -= 1
        return df

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        user_id = self.dataframe.iloc[idx, 0]
        item_id = self.dataframe.iloc[idx, 1]
        rating = self.dataframe.iloc[idx, 2]
        return (
            torch.tensor(user_id, dtype=torch.long),
            torch.tensor(item_id, dtype=torch.long),
            torch.tensor(rating, dtype=torch.float)
        )

In [21]:
df = MovieLensDataset.load_movielens_data()

# Split the data into training and validation sets
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['user_id'])

In [22]:
train_dataset = MovieLensDataset(train_df)
test_dataset = MovieLensDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

model = MatrixFactorization(train_dataset.num_users, train_dataset.num_items, 32)
trainer = L.Trainer(max_epochs=10)
trainer.fit(model, train_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name           | Type      | Params | Mode 
-----------------------------------------------------
0 | user_embedding | Embedding | 30.2 K | train
1 | item_embedding | Embedding | 52.7 K | train
-----------------------------------------------------
82.9 K    Trainable params
0         Non-trainable params
82.9 K    Total params
0.332     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode
/Users/dominykas.seputis/github/msc-thesis/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 1250/1250 [00:06<00:00, 187.37it/s, v_num=72]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1250/1250 [00:06<00:00, 187.28it/s, v_num=72]


In [27]:
predictions = []
for batch in test_loader:
    user_id, item_id, rating = batch
    prediction = model(user_id, item_id)
    predictions.append(prediction)
    print(prediction)

predictions[0]

tensor([-0.8109, -4.2718, -0.9283, -1.0590, -2.5068,  4.4384,  3.7847,  0.6953,
         1.1353,  3.0365,  4.0712, -1.5655,  8.3063, -6.2752, -3.1119,  1.8457,
         0.0949,  0.4974, -1.5860, -2.3402,  4.1899, -0.3536,  5.8201,  1.7977,
         0.7605,  0.9641, -4.0269, -1.7355, -0.3192,  6.5090, -2.0786,  1.2006,
         1.8513, -0.5446, -0.0166,  2.3374,  0.3110,  1.8306,  1.2042,  0.3505,
         3.8013,  3.6846,  0.0374,  4.1648, -0.8510, -5.0725,  0.9266, -4.5456,
        -3.3205, -1.5469,  0.6198,  2.7928,  0.4988, -0.8167, -0.4994, -2.5567,
         1.8124,  3.7027, -0.2175, -0.1840,  3.2345, -5.6300, -0.2207, -1.9232],
       grad_fn=<SumBackward1>)
tensor([ -0.0623,   2.9896,  -2.7594,   3.9754,   6.8108,   1.2885,  -2.7222,
          5.3615,   1.0612,   0.8897,  -3.0274,   3.1576,  -1.3941, -12.9348,
         -2.1775,  -2.1606,   2.9174,   0.4766,   0.9635,   0.8593,  -0.7423,
         -2.4109,   3.0689,  -7.5407,   4.7294,   0.9155,   0.5348,   6.2378,
          3.7361

IndexError: index out of range in self