In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

import pandas as pd
from tqdm import tqdm

In [None]:
root = '.'
# root = 'ml-latest-small'

num_workers = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device, num_workers

(device(type='cuda'), 2)

In [None]:
df = pd.read_csv(f'{root}/ratings.csv').drop('timestamp', axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [None]:
user_ids = df['userId'].unique()
item_ids = df['movieId'].unique()

user_map = {user: idx for idx, user in enumerate(user_ids)}
item_map = {item: idx for idx, item in enumerate(item_ids)}

df['userId'] = df['userId'].map(user_map)
df['movieId'] = df['movieId'].map(item_map)

print(f'There are {len(user_ids)} unique users and {len(item_ids)} unique items in {len(df)} user-item pairs.')

There are 610 unique users and 9724 unique items in 100836 user-item pairs.


In [None]:
class UIDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.users = torch.tensor(df['userId'].values, dtype=torch.long)
        self.items = torch.tensor(df['movieId'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float)
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

    def watched_items(self, user_id):
        watched = self.df[self.df['userId'] == user_id]
        return watched['movieId']

df_train, df_val = train_test_split(df, test_size=0.2)

train_set = UIDataset(df_train)
val_set = UIDataset(df_val)

print(f'Train size={len(train_set)}, Validation size={len(val_set)}')

Train size=80668, Validation size=20168


In [None]:
train_loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_set, batch_size=64, shuffle=False, num_workers=num_workers)

len(train_loader), len(val_loader)

(1261, 316)

In [None]:
class CollaborativeFiltering(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(CollaborativeFiltering, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.fc = nn.Linear(embedding_dim, 1)

    def forward(self, user_id, item_id):
        user_vecs = self.user_embedding(user_id)
        item_vecs = self.item_embedding(item_id)
        interaction = user_vecs * item_vecs
        output = self.fc(interaction)
        return output.squeeze()

model = CollaborativeFiltering(len(user_ids), len(item_ids), embedding_dim=50).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [19]:
num_epochs = 10

for epoch in range(1, 1+num_epochs):
    model.train()
    train_loss = 0

    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc='Training')
    for idx, (user_id, item_id, rating) in pbar:
        user_id, item_id, rating = user_id.to(device), item_id.to(device), rating.to(device)

        preds = model(user_id, item_id)
        loss = criterion(preds, rating)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        if idx % (len(train_loader) // 10) == 0:
            pbar.set_postfix(f'Batch {idx}/{len(train_loader)}, Loss: {loss.item():.4f}')

    train_loss /= len(train_loader)
    print(f'Ecpoh {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}')

Batch 1260/1261, Loss: 0.5150: 100%|██████████| 1261/1261 [00:04<00:00, 255.65it/s]

Ecpoh 1/10, Train Loss: 0.2400



Batch 1260/1261, Loss: 0.1964: 100%|██████████| 1261/1261 [00:04<00:00, 256.01it/s]

Ecpoh 2/10, Train Loss: 0.1880



Batch 1260/1261, Loss: 0.1802: 100%|██████████| 1261/1261 [00:07<00:00, 168.49it/s]

Ecpoh 3/10, Train Loss: 0.1594



Batch 1260/1261, Loss: 0.2059: 100%|██████████| 1261/1261 [00:06<00:00, 201.29it/s]

Ecpoh 4/10, Train Loss: 0.1400



Batch 1260/1261, Loss: 0.1142: 100%|██████████| 1261/1261 [00:06<00:00, 204.03it/s]

Ecpoh 5/10, Train Loss: 0.1266



Batch 1260/1261, Loss: 0.1088: 100%|██████████| 1261/1261 [00:04<00:00, 257.18it/s]

Ecpoh 6/10, Train Loss: 0.1162



Batch 1260/1261, Loss: 0.1446: 100%|██████████| 1261/1261 [00:06<00:00, 209.24it/s]

Ecpoh 7/10, Train Loss: 0.1083



Batch 1260/1261, Loss: 0.1338: 100%|██████████| 1261/1261 [00:05<00:00, 238.17it/s]

Ecpoh 8/10, Train Loss: 0.1029



Batch 1260/1261, Loss: 0.1762: 100%|██████████| 1261/1261 [00:05<00:00, 247.11it/s]

Ecpoh 9/10, Train Loss: 0.0973



Batch 1260/1261, Loss: 0.1231: 100%|██████████| 1261/1261 [00:06<00:00, 209.46it/s]

Ecpoh 10/10, Train Loss: 0.0943





In [20]:
model.eval()
with torch.no_grad():
    test_loss = 0
    for user_id, item_id, rating in val_loader:
        user_id, item_id, rating = user_id.to(device), item_id.to(device), rating.to(device)

        preds = model(user_id, item_id)
        loss = criterion(preds, rating)
        test_loss += loss.item()

    test_loss /= len(val_loader)
    print(f"Test Loss: {test_loss:.4f}")


Test Loss: 1.1929


In [22]:
def recommend_for_user(user_id, top_k=5):
    user_idx = torch.tensor([user_map[user_id]], dtype=torch.long)
    watched_items = train_set.watched_items(user_idx.item()).to_list()
    watched_items = []
    all_items = list(item_map.values())
    new_items = torch.tensor([item for item in all_items if (item not in watched_items)], dtype=torch.long)

    with torch.no_grad():
        scores = model(user_idx.repeat(len(new_items)).to(device), new_items.to(device))
        top_items = torch.topk(scores.cpu(), top_k).indices

    recommended_items = [list(item_map.keys())[new_items[idx]] for idx in top_items.tolist()]
    return recommended_items, watched_items

recom_item_ids, watched_items = recommend_for_user(1)
recom_item_ids, watched_items

([3836, 26587, 1393, 1203, 2409], [])

In [24]:
movie_names = pd.read_csv(f'{root}/movies.csv')
movie_names

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [25]:
movie_names[movie_names['movieId'].isin(recom_item_ids)]

Unnamed: 0,movieId,title,genres
905,1203,12 Angry Men (1957),Drama
1073,1393,Jerry Maguire (1996),Drama|Romance
1809,2409,Rocky II (1979),Action|Drama
2868,3836,Kelly's Heroes (1970),Action|Comedy|War
5532,26587,"Decalogue, The (Dekalog) (1989)",Crime|Drama|Romance
