In [22]:
import os
import csv
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

DATA_DIR = '/kaggle/input/movielens-100k-dataset/ml-100k'
OUTPUT_DIR = './'

class Config:
    device='cpu'
    epochs=40
    seed=17
    train_bs=8
    valid_bs=8
    embedding_dim=20
    lr=1e-2
    num_workers=None       
    verbose_step=100
    
def torch_seed_everything(seed_value=777):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

config=Config()
torch_seed_everything(config.seed)

# load data

In [23]:
df = pd.read_csv(os.path.join(DATA_DIR, 'u.data'), sep='\t', header=None)
df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
#df = df.sort_values('timestamp').reset_index(drop=True)
n_user = df.user_id.nunique()
n_item = df.item_id.nunique()
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [24]:
print('user_num', n_user)
print('item_num', n_item)

user_num 943
item_num 1682


# split data

In [25]:
train_df, valid_df = train_test_split(df, test_size=0.2, stratify=df['user_id'], random_state=config.seed)
assert train_df.user_id.nunique() == valid_df.user_id.nunique()
print(train_df.shape, valid_df.shape)
#print(valid_df.user_id.nunique())

(80000, 4) (20000, 4)


# Dataset

In [26]:
class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user_id, item_id, rating, _ = self.df.iloc[idx]
        # index starts with 0
        sample = {"user": user_id - 1, "item": item_id - 1, "rating": rating}
        return sample

# model

In [27]:
class MatrixFactorizationPyTorch(nn.Module):
    def __init__(self, n_user, n_item, k=20):
        """
        n_user: user num
        n_item: item num
        k: embedding dim
        """
        super().__init__()
        self.user_factors = nn.Embedding(n_user, k, sparse=True)
        self.item_factors = nn.Embedding(n_item, k, sparse=True)

    def forward(self, user, item):
        #print(user, item)
        u_emb = self.user_factors(user)
        i_emb = self.item_factors(item)
        # print(u_emb.shape, i_emb.shape)
        # print((u_emb * i_emb).shape)
        # print((u_emb * i_emb).sum(axis=1).shape)
        return (u_emb * i_emb).sum(axis=1)

In [28]:
train_loader = DataLoader(MovieLensDataset(train_df), batch_size=2, shuffle=True,)
next(iter(train_loader))

{'user': tensor([670, 485]),
 'item': tensor([ 11, 251]),
 'rating': tensor([5, 3])}

In [29]:
data = next(iter(train_loader))
user, item = data['user'], data['item']
model = MatrixFactorizationPyTorch(n_user, n_item, k=config.embedding_dim)
model(user, item)

tensor([5.9666, 9.7543], grad_fn=<SumBackward1>)

# train

In [30]:
def train_one_epoch(epoch, model, loss_fn, optimizer,
                    train_loader, device, scheduler=None):
    model.train()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    data_cnt = 0
    total_loss = 0.0

    # 学習データをシャッフルしてループ
    for step, data in pbar:
        user = data['user']
        item = data['item']
        rating = data['rating']
        data_cnt += user.shape[0]

        # 勾配リセット
        optimizer.zero_grad()

        #順伝搬、逆伝搬
        outputs = model(user, item)
        #print('outupts', outputs)
        #print(rating)
        loss = loss_fn(outputs,  rating.float())
        #print('loss', loss)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        #print(total_loss)
        if ((step + 1) % config.verbose_step == 0) or ((step + 1) == len(train_loader)):
            description = f'train epoch {epoch} loss: {total_loss / data_cnt:.4f}'
            pbar.set_description(description)

    total_loss = total_loss / len(train_loader)
    print('train loss = {:.4f}'.format(total_loss))

def valid_one_epoch(epoch, model, loss_fn, val_loader, device):

    model.eval()
    total_loss = 0.0
    data_cnt = 0
    #preds = []
    pbar = tqdm(enumerate(val_loader), total=len(val_loader))

    for step, data in pbar:
        user = data['user']
        item = data['item']
        rating = data['rating']
        data_cnt += user.shape[0]

        outputs = model(user, item)
        loss = loss_fn(outputs, rating)
        total_loss += loss
        
        # preds.append(outputs.detach().cpu().numpy())

        if ((step + 1) % config.verbose_step == 0) or ((step + 1) == len(val_loader)):
            description = f'val epoch {epoch} loss: {total_loss / data_cnt:.4f}'
            pbar.set_description(description)
        

    valid_loss = total_loss / len(val_loader)
    print('val loss = {:.4f}'.format(valid_loss))
    return valid_loss 

def run_train(train_loader, valid_loader):
    device = torch.device(config.device)
    model = MatrixFactorizationPyTorch(n_user, n_item, k=config.embedding_dim)
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=config.lr)
    best_loss=1e10
    for epoch in range(config.epochs):
        train_one_epoch(epoch, model, loss_fn, optimizer, train_loader, device)

        with torch.no_grad():
            val_loss = valid_one_epoch(epoch, model, loss_fn, valid_loader, device)
        
        if best_loss > val_loss:
            best_loss = val_loss
            best_rmse = torch.sqrt(best_loss)
            best_epoch = epoch
            # TODO: save model,  figure
            best_path =  os.path.join(OUTPUT_DIR,f'best_model.bin')
            torch.save({'model':model.state_dict(),},
                           best_path)
    print(f'----- result ------')
    print(f'Best epoch: {epoch}')
    print(f'Best loss: {best_loss}, RMSE: {best_rmse}')

In [31]:
train_loader = DataLoader(MovieLensDataset(train_df), batch_size=config.train_bs, shuffle=True,)
valid_loader = DataLoader(MovieLensDataset(valid_df), batch_size=config.valid_bs, shuffle=False,)
run_train(train_loader, valid_loader)

train epoch 0 loss: 3.2163: 100%|██████████| 10000/10000 [00:21<00:00, 474.10it/s]
val epoch 0 loss: 2.5378:   2%|▏         | 59/2500 [00:00<00:04, 587.42it/s]

train loss = 25.7305


val epoch 0 loss: 2.5837: 100%|██████████| 2500/2500 [00:03<00:00, 661.52it/s]
  1%|          | 51/10000 [00:00<00:19, 502.44it/s]

val loss = 20.6697


train epoch 1 loss: 2.0273: 100%|██████████| 10000/10000 [00:21<00:00, 471.84it/s]
val epoch 1 loss: 1.9015:   3%|▎         | 73/2500 [00:00<00:03, 726.09it/s]

train loss = 16.2183


val epoch 1 loss: 1.9673: 100%|██████████| 2500/2500 [00:03<00:00, 719.90it/s]
  0%|          | 49/10000 [00:00<00:20, 484.92it/s]

val loss = 15.7385


train epoch 2 loss: 1.3020: 100%|██████████| 10000/10000 [00:21<00:00, 463.96it/s]
val epoch 2 loss: 1.2111:   2%|▏         | 62/2500 [00:00<00:03, 612.90it/s]

train loss = 10.4161


val epoch 2 loss: 1.2931: 100%|██████████| 2500/2500 [00:03<00:00, 683.04it/s]
  0%|          | 48/10000 [00:00<00:20, 479.18it/s]

val loss = 10.3452


train epoch 3 loss: 0.7676: 100%|██████████| 10000/10000 [00:21<00:00, 466.98it/s]
val epoch 3 loss: 0.8274:   2%|▏         | 60/2500 [00:00<00:04, 597.50it/s]

train loss = 6.1411


val epoch 3 loss: 0.8965: 100%|██████████| 2500/2500 [00:04<00:00, 606.76it/s]
  0%|          | 36/10000 [00:00<00:27, 357.08it/s]

val loss = 7.1724


train epoch 4 loss: 0.5071: 100%|██████████| 10000/10000 [00:21<00:00, 465.36it/s]
val epoch 4 loss: 0.6390:   3%|▎         | 77/2500 [00:00<00:03, 765.76it/s]

train loss = 4.0565


val epoch 4 loss: 0.6907: 100%|██████████| 2500/2500 [00:03<00:00, 685.03it/s]
train epoch 5 loss: 0.4228:   0%|          | 50/10000 [00:00<00:20, 497.05it/s]

val loss = 5.5255


train epoch 5 loss: 0.3730: 100%|██████████| 10000/10000 [00:22<00:00, 435.72it/s]
val epoch 5 loss: 0.5315:   3%|▎         | 76/2500 [00:00<00:03, 752.56it/s]

train loss = 2.9843


val epoch 5 loss: 0.5708: 100%|██████████| 2500/2500 [00:03<00:00, 659.58it/s]
train epoch 6 loss: 0.2946:   1%|          | 52/10000 [00:00<00:19, 511.76it/s]

val loss = 4.5661


train epoch 6 loss: 0.2954: 100%|██████████| 10000/10000 [00:21<00:00, 460.42it/s]
val epoch 6 loss: 0.4635:   3%|▎         | 68/2500 [00:00<00:03, 674.37it/s]

train loss = 2.3636


val epoch 6 loss: 0.4941: 100%|██████████| 2500/2500 [00:04<00:00, 621.75it/s]
  0%|          | 39/10000 [00:00<00:26, 381.67it/s]

val loss = 3.9527


train epoch 7 loss: 0.2465: 100%|██████████| 10000/10000 [00:21<00:00, 474.89it/s]
val epoch 7 loss: 0.4163:   2%|▏         | 60/2500 [00:00<00:04, 593.60it/s]

train loss = 1.9720


val epoch 7 loss: 0.4414: 100%|██████████| 2500/2500 [00:03<00:00, 696.42it/s]
train epoch 8 loss: 0.2336:   0%|          | 50/10000 [00:00<00:19, 498.03it/s]

val loss = 3.5313


train epoch 8 loss: 0.2135: 100%|██████████| 10000/10000 [00:21<00:00, 459.62it/s]
val epoch 8 loss: 0.3832:   2%|▏         | 59/2500 [00:00<00:04, 587.96it/s]

train loss = 1.7078


val epoch 8 loss: 0.4036: 100%|██████████| 2500/2500 [00:03<00:00, 653.16it/s]
train epoch 9 loss: 0.1777:   0%|          | 49/10000 [00:00<00:20, 484.54it/s]

val loss = 3.2289


train epoch 9 loss: 0.1900: 100%|██████████| 10000/10000 [00:21<00:00, 474.22it/s]
val epoch 9 loss: 0.3583:   3%|▎         | 76/2500 [00:00<00:03, 752.66it/s]

train loss = 1.5203


val epoch 9 loss: 0.3753: 100%|██████████| 2500/2500 [00:03<00:00, 636.14it/s]
  0%|          | 49/10000 [00:00<00:20, 482.43it/s]

val loss = 3.0023


train epoch 10 loss: 0.1727: 100%|██████████| 10000/10000 [00:20<00:00, 480.58it/s]
val epoch 10 loss: 0.3379:   3%|▎         | 69/2500 [00:00<00:03, 689.93it/s]

train loss = 1.3815


val epoch 10 loss: 0.3532: 100%|██████████| 2500/2500 [00:03<00:00, 702.50it/s]
  0%|          | 38/10000 [00:00<00:26, 372.09it/s]

val loss = 2.8260


train epoch 11 loss: 0.1594: 100%|██████████| 10000/10000 [00:21<00:00, 467.20it/s]
val epoch 11 loss: 0.3231:   3%|▎         | 77/2500 [00:00<00:03, 764.94it/s]

train loss = 1.2752


val epoch 11 loss: 0.3358: 100%|██████████| 2500/2500 [00:03<00:00, 718.83it/s]
  0%|          | 48/10000 [00:00<00:20, 478.54it/s]

val loss = 2.6866


train epoch 12 loss: 0.1489: 100%|██████████| 10000/10000 [00:20<00:00, 478.15it/s]
val epoch 12 loss: 0.3104:   3%|▎         | 73/2500 [00:00<00:03, 720.36it/s]

train loss = 1.1914


val epoch 12 loss: 0.3216: 100%|██████████| 2500/2500 [00:03<00:00, 680.09it/s]
  1%|          | 52/10000 [00:00<00:19, 512.66it/s]

val loss = 2.5732


train epoch 13 loss: 0.1405: 100%|██████████| 10000/10000 [00:22<00:00, 450.72it/s]
val epoch 13 loss: 0.3005:   3%|▎         | 70/2500 [00:00<00:03, 698.31it/s]

train loss = 1.1242


val epoch 13 loss: 0.3098: 100%|██████████| 2500/2500 [00:03<00:00, 690.94it/s]
  0%|          | 47/10000 [00:00<00:21, 469.02it/s]

val loss = 2.4785


train epoch 14 loss: 0.1336: 100%|██████████| 10000/10000 [00:22<00:00, 452.65it/s]
val epoch 14 loss: 0.2920:   3%|▎         | 71/2500 [00:00<00:03, 708.77it/s]

train loss = 1.0690


val epoch 14 loss: 0.3001: 100%|██████████| 2500/2500 [00:03<00:00, 670.47it/s]
train epoch 15 loss: 0.1305:   1%|          | 51/10000 [00:00<00:19, 509.08it/s]

val loss = 2.4005


train epoch 15 loss: 0.1279: 100%|██████████| 10000/10000 [00:21<00:00, 462.89it/s]
val epoch 15 loss: 0.2849:   3%|▎         | 74/2500 [00:00<00:03, 731.31it/s]

train loss = 1.0228


val epoch 15 loss: 0.2917: 100%|██████████| 2500/2500 [00:03<00:00, 655.99it/s]
train epoch 16 loss: 0.1170:   1%|          | 52/10000 [00:00<00:19, 511.64it/s]

val loss = 2.3338


train epoch 16 loss: 0.1230: 100%|██████████| 10000/10000 [00:21<00:00, 462.58it/s]
val epoch 16 loss: 0.2792:   3%|▎         | 70/2500 [00:00<00:03, 693.36it/s]

train loss = 0.9840


val epoch 16 loss: 0.2846: 100%|██████████| 2500/2500 [00:03<00:00, 702.56it/s]
train epoch 17 loss: 0.1137:   1%|          | 53/10000 [00:00<00:19, 521.78it/s]

val loss = 2.2766


train epoch 17 loss: 0.1188: 100%|██████████| 10000/10000 [00:22<00:00, 450.49it/s]
val epoch 17 loss: 0.2736:   2%|▏         | 58/2500 [00:00<00:04, 570.44it/s]

train loss = 0.9506


val epoch 17 loss: 0.2782: 100%|██████████| 2500/2500 [00:03<00:00, 680.96it/s]
  0%|          | 47/10000 [00:00<00:21, 466.34it/s]

val loss = 2.2254


train epoch 18 loss: 0.1152: 100%|██████████| 10000/10000 [00:21<00:00, 465.23it/s]
val epoch 18 loss: 0.2693:   3%|▎         | 75/2500 [00:00<00:03, 745.73it/s]

train loss = 0.9218


val epoch 18 loss: 0.2727: 100%|██████████| 2500/2500 [00:03<00:00, 689.29it/s]
  0%|          | 50/10000 [00:00<00:19, 499.66it/s]

val loss = 2.1817


train epoch 19 loss: 0.1121: 100%|██████████| 10000/10000 [00:21<00:00, 475.70it/s]
val epoch 19 loss: 0.2651:   3%|▎         | 74/2500 [00:00<00:03, 735.53it/s]

train loss = 0.8966


val epoch 19 loss: 0.2679: 100%|██████████| 2500/2500 [00:03<00:00, 643.66it/s]
train epoch 20 loss: 0.1123:   0%|          | 50/10000 [00:00<00:20, 496.08it/s]

val loss = 2.1433


train epoch 20 loss: 0.1093: 100%|██████████| 10000/10000 [00:22<00:00, 446.85it/s]
val epoch 20 loss: 0.2625:   3%|▎         | 69/2500 [00:00<00:03, 685.89it/s]

train loss = 0.8744


val epoch 20 loss: 0.2637: 100%|██████████| 2500/2500 [00:04<00:00, 600.14it/s]
  0%|          | 49/10000 [00:00<00:20, 485.83it/s]

val loss = 2.1095


train epoch 21 loss: 0.1068: 100%|██████████| 10000/10000 [00:20<00:00, 478.83it/s]
val epoch 21 loss: 0.2595:   3%|▎         | 76/2500 [00:00<00:03, 751.30it/s]

train loss = 0.8546


val epoch 21 loss: 0.2600: 100%|██████████| 2500/2500 [00:03<00:00, 645.51it/s]
  0%|          | 36/10000 [00:00<00:28, 354.99it/s]

val loss = 2.0798


train epoch 22 loss: 0.1046: 100%|██████████| 10000/10000 [00:22<00:00, 442.95it/s]
val epoch 22 loss: 0.2563:   3%|▎         | 71/2500 [00:00<00:03, 706.62it/s]

train loss = 0.8370


val epoch 22 loss: 0.2564: 100%|██████████| 2500/2500 [00:03<00:00, 711.56it/s]
  0%|          | 49/10000 [00:00<00:20, 486.02it/s]

val loss = 2.0509


train epoch 23 loss: 0.1026: 100%|██████████| 10000/10000 [00:21<00:00, 457.81it/s]
val epoch 23 loss: 0.2541:   3%|▎         | 69/2500 [00:00<00:03, 683.71it/s]

train loss = 0.8210


val epoch 23 loss: 0.2533: 100%|██████████| 2500/2500 [00:03<00:00, 680.13it/s]
  0%|          | 48/10000 [00:00<00:20, 477.41it/s]

val loss = 2.0264


train epoch 24 loss: 0.1008: 100%|██████████| 10000/10000 [00:21<00:00, 466.62it/s]
val epoch 24 loss: 0.2521:   3%|▎         | 77/2500 [00:00<00:03, 761.12it/s]

train loss = 0.8067


val epoch 24 loss: 0.2505: 100%|██████████| 2500/2500 [00:03<00:00, 711.79it/s]
train epoch 25 loss: 0.0922:   0%|          | 50/10000 [00:00<00:19, 499.08it/s]

val loss = 2.0043


train epoch 25 loss: 0.0992: 100%|██████████| 10000/10000 [00:21<00:00, 476.04it/s]
val epoch 25 loss: 0.2498:   3%|▎         | 73/2500 [00:00<00:03, 728.10it/s]

train loss = 0.7937


val epoch 25 loss: 0.2480: 100%|██████████| 2500/2500 [00:03<00:00, 658.03it/s]
  0%|          | 49/10000 [00:00<00:20, 489.79it/s]

val loss = 1.9839


train epoch 26 loss: 0.0977: 100%|██████████| 10000/10000 [00:21<00:00, 467.83it/s]
val epoch 26 loss: 0.2474:   2%|▏         | 57/2500 [00:00<00:04, 562.31it/s]

train loss = 0.7817


val epoch 26 loss: 0.2457: 100%|██████████| 2500/2500 [00:03<00:00, 691.58it/s]
  0%|          | 36/10000 [00:00<00:28, 353.81it/s]

val loss = 1.9657


train epoch 27 loss: 0.0963: 100%|██████████| 10000/10000 [00:21<00:00, 472.23it/s]
val epoch 27 loss: 0.2463:   2%|▏         | 59/2500 [00:00<00:04, 580.80it/s]

train loss = 0.7706


val epoch 27 loss: 0.2436: 100%|██████████| 2500/2500 [00:03<00:00, 700.51it/s]
  0%|          | 49/10000 [00:00<00:20, 477.87it/s]

val loss = 1.9487


train epoch 28 loss: 0.0950: 100%|██████████| 10000/10000 [00:22<00:00, 452.60it/s]
val epoch 28 loss: 0.2451:   3%|▎         | 78/2500 [00:00<00:03, 772.98it/s]

train loss = 0.7604


val epoch 28 loss: 0.2416: 100%|██████████| 2500/2500 [00:03<00:00, 637.24it/s]
  0%|          | 39/10000 [00:00<00:26, 381.38it/s]

val loss = 1.9331


train epoch 29 loss: 0.0939: 100%|██████████| 10000/10000 [00:21<00:00, 458.03it/s]
val epoch 29 loss: 0.2439:   3%|▎         | 73/2500 [00:00<00:03, 720.13it/s]

train loss = 0.7511


val epoch 29 loss: 0.2398: 100%|██████████| 2500/2500 [00:03<00:00, 665.72it/s]
  0%|          | 49/10000 [00:00<00:20, 488.36it/s]

val loss = 1.9184


train epoch 30 loss: 0.0928: 100%|██████████| 10000/10000 [00:21<00:00, 460.03it/s]
val epoch 30 loss: 0.2424:   2%|▏         | 59/2500 [00:00<00:04, 588.66it/s]

train loss = 0.7423


val epoch 30 loss: 0.2383: 100%|██████████| 2500/2500 [00:03<00:00, 628.25it/s]
  0%|          | 38/10000 [00:00<00:26, 378.26it/s]

val loss = 1.9063


train epoch 31 loss: 0.0917: 100%|██████████| 10000/10000 [00:20<00:00, 492.78it/s]
val epoch 31 loss: 0.2415:   3%|▎         | 75/2500 [00:00<00:03, 740.46it/s]

train loss = 0.7340


val epoch 31 loss: 0.2368: 100%|██████████| 2500/2500 [00:03<00:00, 695.22it/s]
  0%|          | 39/10000 [00:00<00:25, 384.36it/s]

val loss = 1.8942


train epoch 32 loss: 0.0908: 100%|██████████| 10000/10000 [00:23<00:00, 429.64it/s]
val epoch 32 loss: 0.2407:   2%|▏         | 59/2500 [00:00<00:04, 580.29it/s]

train loss = 0.7264


val epoch 32 loss: 0.2354: 100%|██████████| 2500/2500 [00:04<00:00, 604.90it/s]
  1%|          | 51/10000 [00:00<00:19, 508.65it/s]

val loss = 1.8829


train epoch 33 loss: 0.0899: 100%|██████████| 10000/10000 [00:21<00:00, 460.77it/s]
val epoch 33 loss: 0.2396:   3%|▎         | 76/2500 [00:00<00:03, 757.93it/s]

train loss = 0.7189


val epoch 33 loss: 0.2341: 100%|██████████| 2500/2500 [00:03<00:00, 698.29it/s]
  0%|          | 50/10000 [00:00<00:20, 496.12it/s]

val loss = 1.8726


train epoch 34 loss: 0.0890: 100%|██████████| 10000/10000 [00:22<00:00, 437.19it/s]
val epoch 34 loss: 0.2386:   3%|▎         | 75/2500 [00:00<00:03, 749.18it/s]

train loss = 0.7124


val epoch 34 loss: 0.2329: 100%|██████████| 2500/2500 [00:04<00:00, 566.58it/s]
train epoch 35 loss: 0.0906:   0%|          | 50/10000 [00:00<00:19, 499.62it/s]

val loss = 1.8631


train epoch 35 loss: 0.0883: 100%|██████████| 10000/10000 [00:21<00:00, 458.42it/s]
val epoch 35 loss: 0.2378:   2%|▏         | 58/2500 [00:00<00:04, 579.08it/s]

train loss = 0.7061


val epoch 35 loss: 0.2317: 100%|██████████| 2500/2500 [00:03<00:00, 670.68it/s]
  0%|          | 48/10000 [00:00<00:21, 473.71it/s]

val loss = 1.8535


train epoch 36 loss: 0.0875: 100%|██████████| 10000/10000 [00:22<00:00, 446.95it/s]
val epoch 36 loss: 0.2372:   3%|▎         | 77/2500 [00:00<00:03, 766.91it/s]

train loss = 0.7000


val epoch 36 loss: 0.2306: 100%|██████████| 2500/2500 [00:03<00:00, 695.97it/s]
  0%|          | 50/10000 [00:00<00:20, 494.06it/s]

val loss = 1.8450


train epoch 37 loss: 0.0868: 100%|██████████| 10000/10000 [00:21<00:00, 458.93it/s]
val epoch 37 loss: 0.2367:   3%|▎         | 74/2500 [00:00<00:03, 732.57it/s]

train loss = 0.6942


val epoch 37 loss: 0.2297: 100%|██████████| 2500/2500 [00:03<00:00, 670.85it/s]
  0%|          | 48/10000 [00:00<00:20, 476.33it/s]

val loss = 1.8376


train epoch 38 loss: 0.0861: 100%|██████████| 10000/10000 [00:21<00:00, 469.04it/s]
val epoch 38 loss: 0.2360:   3%|▎         | 74/2500 [00:00<00:03, 737.95it/s]

train loss = 0.6887


val epoch 38 loss: 0.2288: 100%|██████████| 2500/2500 [00:03<00:00, 638.92it/s]
  1%|          | 51/10000 [00:00<00:19, 507.40it/s]

val loss = 1.8307


train epoch 39 loss: 0.0854: 100%|██████████| 10000/10000 [00:21<00:00, 459.99it/s]
val epoch 39 loss: 0.2354:   3%|▎         | 74/2500 [00:00<00:03, 731.80it/s]

train loss = 0.6835


val epoch 39 loss: 0.2280: 100%|██████████| 2500/2500 [00:03<00:00, 684.40it/s]

val loss = 1.8237
----- result ------
Best epoch: 39
Best loss: 1.8237441778182983, RMSE: 1.3504607677459717





# get recommendation

In [32]:
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
item_df = pd.read_csv(os.path.join(DATA_DIR, 'u.item'), sep='|', encoding="iso-8859-1", usecols=range(5), names=m_cols)
item_df.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [109]:
def load_model():
    best_path = os.path.join(OUTPUT_DIR, 'best_model.bin')
    model = MatrixFactorizationPyTorch(n_user, n_item, k=20)
    model.load_state_dict(torch.load(best_path)['model'])
    return model

def predict_rating(rec_df):
    """
    predict unwatched item ratings
    """
    model = load_model()
    model.eval()
    dataloader = DataLoader(MovieLensDataset(rec_df), batch_size=10, shuffle=False,)
    pbar = tqdm(dataloader, total=len(dataloader))
    preds = []
    for data in pbar:
        user_id = data['user']
        item_id = data['item']
        rating = data['rating']

        preds += model(user_id, item_id)

    return torch.stack(preds).detach().numpy()

def recommend_for_user(user_id, rating_df, item_df, top_n=10):
    """
    """
    rec_df = rating_df.query("user_id != @user_id")
    rec_df['user_id'] = user_id
    rec_df = rec_df.drop_duplicates(subset=['user_id','item_id'])
    rec_df['rating'] = predict_rating(rec_df)
    
    # clip rating
    rec_df = rec_df.query('0.5 <= rating <= 5.5 ')

    # add title column 
    d = dict(zip(item_df.movie_id, item_df.title))
    rec_df['title'] = rec_df['item_id'].map(d)
    rec_df = rec_df.sort_values('rating', ascending=False)

    # show recommend movies
    print('-'*30 + 'recommendations' + '-'*30)
    print(rec_df[['title','rating']].head(top_n))
#     for i, row in rec_df.head(top_n).iterrows():
#         title, rating = row['title'],row['rating']
#         print(f'{i:}: title:{title}  score:{rating}')

    # show movies which user have watched before
    user_df = rating_df.query("user_id == @user_id")
    user_df['title'] = user_df['item_id'].map(d)
    user_df = user_df.sort_values('rating', ascending=False)

    print('-'*30 + 'watched_movies' + '-'*30)
    print(user_df[['title','rating']].head(top_n))
#     for i, row in user_df.head(top_n).iterrows():
#         title, rating = row['title'], row['rating']
#         print(f'{i}: title:{title}  score:{rating}')



In [112]:
user_id = random.choice(df.user_id.values)
print(user_id)
recommend_for_user(user_id, df, item_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 33%|███▎      | 56/169 [00:00<00:00, 558.64it/s]

495


100%|██████████| 169/169 [00:00<00:00, 563.28it/s]

------------------------------recommendations------------------------------
                                    title    rating
4682                My Man Godfrey (1936)  5.490833
746   Fast, Cheap & Out of Control (1997)  5.426092
671        American President, The (1995)  5.407290
7288                 Nobody's Fool (1994)  5.400203
6519        Great Day in Harlem, A (1994)  5.365497
1270                   Sling Blade (1996)  5.325870
1676        In the Bleak Midwinter (1995)  5.307295
243               Schindler's List (1993)  5.293337
313      Shawshank Redemption, The (1994)  5.292449
383             Dazed and Confused (1993)  5.281991
------------------------------watched_movies------------------------------
                                title  rating
23410  Raiders of the Lost Ark (1981)       5
33036    Pink Floyd - The Wall (1982)       5
60868   Independence Day (ID4) (1996)       5
39673              Up in Smoke (1978)       5
60993  Remains of the Day, The (1993)       5
3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
