In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import urllib.request
import zipfile
from functools import partial

import torch
import torch.nn as nn
from torch.optim import AdamW, lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, RandomSampler, BatchSampler
)

# Local
from recommender_split import split_dataframe_by_group, report_splits

In [None]:
from miniai.datasets import DataLoaders
from miniai.learner import MetricsCB, DeviceCB, ProgressCB, TrainLearner
from miniai.activations import set_seed
from miniai.sgd import BatchSchedCB, RecorderCB

In [None]:
# Column names used throughout the notebook.
# NOTE(review): ITEM is the movie *title* column, not the numeric movie id.
USER = 'user_id'
ITEM = 'title'
TARGET = 'rating'

## Load Movielens 1M

In [None]:
# MovieLens 1M archive; fetched over HTTPS so the download is protected in transit.
movielens_1M_url = 'https://files.grouplens.org/datasets/movielens/ml-1m.zip'

filename = Path(movielens_1M_url).name          # archive file name, taken from the URL
zip_path = Path('.')                            # directory the archive is extracted into
extract_path = zip_path / Path(filename).stem   # folder created by the extraction

In [None]:
# Download the archive only when it is not already on disk.
if not Path(filename).exists():
    with urllib.request.urlopen(movielens_1M_url) as response:
        Path(filename).write_bytes(response.read())

# Unpack only when the extracted folder does not exist yet.
if not extract_path.exists():
    with zipfile.ZipFile(filename, "r") as zip_ref:
        zip_ref.extractall(zip_path)

In [None]:
# ratings.dat rows are `UserID::MovieID::Rating::Timestamp`: the second column
# is the numeric movie id, so it is named 'item_id' here (not ITEM, the title).
ratings_1m = pd.read_csv(
    extract_path/'ratings.dat', delimiter='::', header=None, engine='python',
    names=[USER, 'item_id', TARGET, 'timestamp'])

# The timestamp is not used anywhere below.
ratings_1m = ratings_1m[[USER, 'item_id', TARGET]].copy()

# Only the first two columns of movies.dat (id and title) are needed.
movies_1m = pd.read_csv(
    extract_path/'movies.dat', delimiter='::', encoding='ISO-8859-1',
    usecols=(0,1), header=None, engine='python', names=('item_id','title')
)

# Attach the title (ITEM) to every rating through the shared numeric id.
ratings_1m = ratings_1m.merge(movies_1m, on='item_id')
ratings_1m

Unnamed: 0,user_id,item_id,rating,title
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975)
...,...,...,...,...
1000204,5949,2198,5,Modulations (1998)
1000205,5675,2703,3,Broken Vessels (1998)
1000206,5780,2845,1,White Boys (1999)
1000207,5851,3607,5,One Little Indian (1973)


In [None]:
# Working alias for the merged ratings frame (same object, not a copy).
data = ratings_1m

## Split train, valid

In [None]:
# First split, grouped by item (filter_by="item") with ratio 0.8.
# NOTE(review): presumably an 80/20 train/valid split within each title -- confirm in recommender_split.
splits = split_dataframe_by_group(
    data, ratio=0.8, filter_by="item",
    seed=104577657, col_user=USER, col_item=ITEM
)

In [None]:
# Materialise both partitions from the positional indices held in `splits`.
train_df, valid_df = data.iloc[splits[0]], data.iloc[splits[1]]

In [None]:
# Print user / item counts for each split.
report_splits(data, splits, USER, ITEM, ITEM)

Number of title with one review: 114
Number of users in train: 6040
Number of items in train: 3706
Number of users in valid: 6037
Number of items in valid: 3592


Some users are not present in the validation split:

In [None]:
# Users that appear in train but never in valid.
users_not_in_valid = set(train_df[USER]).difference(valid_df[USER])
users_not_in_valid

{160, 298, 2490}

The items missing from the validation split are exactly those with a single review:

In [None]:
# For every rating row, the total number of ratings its title received
# (the result is aligned with `data`, one value per row).
number_of_reviews_by_item = data.groupby(ITEM)[ITEM].transform('count')

In [None]:
# Sanity check: the number of single-review titles equals the number of titles
# present in train but absent from valid.
n_single_review = (number_of_reviews_by_item == 1).sum()
n_missing_from_valid = len(train_df[ITEM].unique()) - len(valid_df[ITEM].unique())
assert n_single_review == n_missing_from_valid

### Force splits to have all users in valid

In [None]:
sort_column = 'numrev_by_item'
tmp_filter = data[USER].isin(users_not_in_valid)

# Ratings of the users missing from valid, annotated with the review count of each title.
df_new_splits = data.loc[tmp_filter].assign(
    **{sort_column: number_of_reviews_by_item[tmp_filter]}
)

df_new_splits.head()

Unnamed: 0,user_id,item_id,rating,title,numrev_by_item
1747,298,661,4,James and the Giant Peach (1996),525
2926,160,3408,4,Erin Brockovich (2000),1315
16791,160,2791,3,Airplane! (1980),1731
21737,298,2321,4,Pleasantville (1998),1158
25956,298,527,5,Schindler's List (1993),2304


In [None]:
# Second split, grouped by user this time, over only the users missing from valid.
# NOTE(review): `sort_column` is forwarded to the helper; its exact effect is defined in recommender_split.
new_splits = split_dataframe_by_group(
    df_new_splits, ratio=0.8, filter_by="user",
    seed=104577657, col_user=USER, col_item=ITEM, sort_column=sort_column
)

In [None]:
# The users in `users_not_in_valid` have no rows in valid, so every one of their
# ratings is already part of the original train split. Those rows must be taken
# out of it before the per-user re-split is appended; otherwise they are
# duplicated in train and the re-split validation rows also remain in train.
# NOTE(review): assumes both split results are lists of row positions into `data`.
reassigned = set(new_splits[0]) | set(new_splits[1])
final_splits = [
    [i for i in splits[0] if i not in reassigned] + list(new_splits[0]),
    list(splits[1]) + list(new_splits[1]),
]

In [None]:
# Rebuild both frames from the combined index lists.
train_df, valid_df = data.iloc[final_splits[0]], data.iloc[final_splits[1]]

In [None]:
# Row counts of the final train / valid frames.
len(train_df), len(valid_df)

(798812, 201466)

In [None]:
# Same report as before, now for the combined splits.
report_splits(data, final_splits, USER, ITEM, ITEM)

Number of title with one review: 114
Number of users in train: 6040
Number of items in train: 3706
Number of users in valid: 6040
Number of items in valid: 3592


## PyTorch Dataset

Build lookup tables: user id → index and title → index.

In [None]:
# Sorting the unique ids gives every user / title a stable, contiguous index.
users = sorted(data[USER].unique())
items = sorted(data[ITEM].unique())

# index -> user id, and its inverse
uidx2u = dict(enumerate(users))
u2uidx = {user: idx for idx, user in enumerate(users)}

# index -> title, and its inverse
tidx2t = dict(enumerate(items))
t2tidx = {title: idx for idx, title in enumerate(items)}

In [None]:
# Number of distinct users and titles.
len(uidx2u), len(tidx2t)

(6040, 3706)

In [None]:
# Spot-check two index -> title lookups.
tidx2t[890], tidx2t[2452]

('Decline of Western Civilization, The (1981)',
 "One Flew Over the Cuckoo's Nest (1975)")

In [None]:
class CollabDataset(Dataset):
    """Batch-indexed dataset that looks rows up in a DataFrame on every call.

    ``__getitem__`` receives a list of positions, turns them into DataFrame
    index labels and maps users / items to indices through the notebook-level
    ``u2uidx`` / ``t2tidx`` dictionaries. All of that work is repeated for each
    batch.
    """

    def __init__(self, df:pd.DataFrame, user_col='user', item_col='item', target='rating'):
        # Index labels of `df`, used to translate positions into labels for `.loc`.
        self.indices = torch.tensor(df.index)
        # Keep only the three columns that are read later.
        self.df = df[[user_col, item_col, target]]
        self.user_col = user_col
        self.item_col = item_col
        self.target = target
    
    def __len__(self):
        "Number of rows (ratings) in the dataset."
        return len(self.df)

    def apply_x_transforms(self):
        "Return a tensor with one `(user index, item index)` row per entry of `self.df_idxs`."
        users = self.df.loc[self.df_idxs][self.user_col].map(u2uidx)
        items = self.df.loc[self.df_idxs][self.item_col].map(t2tidx)
        sample_df = pd.DataFrame(zip(users, items))
        return torch.tensor(sample_df.values)

    def apply_y_transforms(self):
        "Return the targets of the rows in `self.df_idxs` as a float tensor."
        targets = self.df.loc[self.df_idxs][self.target].values
        return torch.tensor(targets, dtype=torch.float)
           
    def __getitem__(self, idx_list):
        "Return `(x, y)` for a whole batch of positions."
        # The selected labels are stored on the instance and read by both helpers.
        self.df_idxs = self.indices[idx_list]
        x = self.apply_x_transforms()
        y = self.apply_y_transforms()
        return (x, y)

In [None]:
class CollabDatasetOpt(Dataset):
    """Batch-indexed collaborative-filtering dataset backed by NumPy arrays.

    User and item ids are mapped to contiguous indices once, in ``__init__``,
    so ``__getitem__`` only performs array indexing. The dataset is indexed
    with a whole batch of positions at a time.

    Args:
        df: frame holding at least the three columns named below.
        user_col: name of the user-id column.
        item_col: name of the item-id column.
        target: name of the rating column.
        user_map: optional ``{user id: index}`` mapping; defaults to the
            notebook-level ``u2uidx``.
        item_map: optional ``{item id: index}`` mapping; defaults to the
            notebook-level ``t2tidx``.

    Raises:
        ValueError: if a user or item in ``df`` is missing from its mapping.
    """
    def __init__(self, df:pd.DataFrame, user_col='user', item_col='item', target='rating',
                 user_map=None, item_map=None):
        user_map = u2uidx if user_map is None else user_map
        item_map = t2tidx if item_map is None else item_map
        self.users = self._encode(df[user_col], user_map)
        self.items = self._encode(df[item_col], item_map)
        self.target = df[target].to_numpy()

    @staticmethod
    def _encode(column, mapping):
        "Map raw ids to integer indices, failing loudly on ids absent from `mapping`."
        encoded = column.map(mapping)
        n_missing = int(encoded.isna().sum())
        if n_missing:
            # Series.map yields NaN for unknown keys, which would silently turn
            # the index array into floats and break the embedding lookup later.
            raise ValueError(
                f"{n_missing} value(s) in column {column.name!r} have no index mapping")
        return encoded.to_numpy(dtype=np.int64)

    def __len__(self):
        "Number of rows (ratings) in the dataset."
        return len(self.users)

    def get_batch(self, batch_idxs=None):
        """Return `(x, y)` for `batch_idxs`.

        `x` holds one `(user index, item index)` row per position and `y` the
        float targets. With no argument, the positions last passed to
        `__getitem__` are used.
        """
        if batch_idxs is None:
            batch_idxs = self.batch_idxs
        x_b = np.column_stack((self.users[batch_idxs], self.items[batch_idxs]))
        y_b = torch.tensor(self.target[batch_idxs], dtype=torch.float)
        return torch.tensor(x_b), y_b

    def __getitem__(self, batch_idxs):
        # Still recorded so that a bare `get_batch()` call keeps working.
        self.batch_idxs = batch_idxs
        return self.get_batch(batch_idxs)

In [None]:
# Baseline (DataFrame-backed) datasets for both splits.
train_dataset = CollabDataset(train_df, user_col=USER, item_col=ITEM)

valid_dataset = CollabDataset(valid_df, user_col=USER, item_col=ITEM)

## BatchSampler

In [None]:
bs = 64

# Each sampler yields lists of up to `bs` shuffled positions; the final,
# shorter batch is kept (drop_last=False).
t_sampler = BatchSampler(
    RandomSampler(train_dataset),
    batch_size=bs,
    drop_last=False)

# NOTE(review): validation batches are shuffled as well (RandomSampler).
v_sampler = BatchSampler(
    RandomSampler(valid_dataset),
    batch_size=bs,
    drop_last=False)

In [None]:
# Number of training batches per epoch.
len(t_sampler)

12482

In [None]:
# Draw one full pass of batches and keep the first 100 as a fixed timing workload.
fixed_sampler = list(t_sampler)[:100]

In [None]:
%%time
for _ in fixed_sampler: 
    train_dataset[_]

CPU times: user 46.6 s, sys: 7.4 ms, total: 46.6 s
Wall time: 46.6 s


In [None]:
# Array-backed datasets built from the same train / valid frames.
t_opt_dataset = CollabDatasetOpt(train_df, user_col=USER, item_col=ITEM)

v_opt_dataset = CollabDatasetOpt(valid_df, user_col=USER, item_col=ITEM)

In [None]:
%%time

for _ in t_sampler: 
    t_opt_dataset[_]

CPU times: user 2.71 s, sys: 51 ms, total: 2.76 s
Wall time: 1.31 s


## PyTorch DataLoader

In [None]:
bs = 64 * 3
# The samplers are sized from train_dataset / valid_dataset, which were built
# from the same frames as the *_opt_dataset objects they are paired with below.
t_sampler = BatchSampler(RandomSampler(train_dataset), batch_size=bs, drop_last=False)
v_sampler = BatchSampler(RandomSampler(valid_dataset), batch_size=bs, drop_last=False)

In [None]:
# The BatchSampler is passed as `sampler` with batch_size=None: automatic
# batching is disabled and each dataset call receives a whole list of indices.
t_dataloader = DataLoader(t_opt_dataset, sampler=t_sampler, batch_size=None, num_workers=6)

v_dataloader = DataLoader(v_opt_dataset, sampler=v_sampler, batch_size=None, num_workers=6)

In [None]:
%%time
# One full pass over the training DataLoader, discarding the batches.
for _ in t_dataloader: pass

CPU times: user 11.7 s, sys: 6.6 s, total: 18.3 s
Wall time: 17.8 s


Without the optimized dataset, this pass took 6:50 min.

## miniai DataLoaders

In [None]:
# Bundle the train and valid loaders for the miniai learner.
dls = DataLoaders(t_dataloader, v_dataloader)

In [None]:
# Pull a single training batch to inspect it and to smoke-test the model later.
dt = dls.train
xb, yb = next(iter(dt))

In [None]:
# First five (user index, title index) pairs and their ratings.
xb[:5,:], yb[:5]

(tensor([[4385,  861],
         [2115, 3007],
         [1536, 3137],
         [3680,  208],
         [1283, 3416]]),
 tensor([2., 4., 4., 3., 3.]))

In [None]:
# Embedding table sizes: one row per distinct user and per distinct title.
n_users = data[USER].unique().size
n_title = data[ITEM].unique().size
n_users, n_title

(6040, 3706)

## Model

In [None]:
# functions from fastai
def sigmoid_range(x, low, high):
    "Squash `x` with a sigmoid and rescale the result to lie between `low` and `high`"
    span = high - low
    return low + span * torch.sigmoid(x)

def trunc_normal_(x, mean=0., std=1.):
    "Fill `x` in place with an approximate truncated normal and return it"
    # Trick from https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12:
    # standard-normal draws are folded with fmod(2) before being scaled and shifted.
    x.normal_()
    x.fmod_(2)
    x.mul_(std)
    x.add_(mean)
    return x

class Embedding(nn.Embedding):
    "`nn.Embedding` whose weights are re-initialised with `trunc_normal_` using standard deviation `std`"
    def __init__(self, ni, nf, std=0.01):
        super().__init__(num_embeddings=ni, embedding_dim=nf)
        # Overwrite the default initialisation in place.
        trunc_normal_(self.weight.data, std=std)

In [None]:
class DotProductBias(nn.Module):
    """Dot-product collaborative-filtering model with user and movie bias terms.

    `forward` reads the user index from `x[:, 0]` and the movie index from
    `x[:, 1]`, and squashes the score into `y_range` with `sigmoid_range`.
    """
    def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
        super().__init__()
        # Latent factors plus a scalar bias for every user ...
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        # ... and for every movie.
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        user_idx, movie_idx = x[:, 0], x[:, 1]
        interaction = self.user_factors(user_idx) * self.movie_factors(movie_idx)
        score = interaction.sum(dim=1, keepdim=True)
        score += self.user_bias(user_idx) + self.movie_bias(movie_idx)
        return sigmoid_range(score, *self.y_range).flatten()

In [None]:
# 50 latent factors per user and per title.
model = DotProductBias(n_users, n_title, 50)

In [None]:
# Smoke test: predictions of the untrained model for the sample batch.
model(xb)[:5]

tensor([2.7459, 2.7571, 2.7557, 2.7377, 2.7327], grad_fn=<SliceBackward0>)

## miniai training

In [None]:
# Base miniai callbacks: device placement, metrics, text-only progress (plot=False).
metrics = MetricsCB()
cbs = [DeviceCB(), metrics, ProgressCB(plot=False)]

In [None]:
# Accessors handed to RecorderCB under the names `lr` and `mom`.
# NOTE(review): `cb.pg` is presumably the optimizer param group exposed by RecorderCB -- confirm in miniai.sgd.
def _lr(cb): return cb.pg['lr']
def _beta1(cb): return cb.pg['betas'][0]

rec = RecorderCB(lr=_lr, mom=_beta1)

In [None]:
# NOTE(review): `model` and the samplers were created in earlier cells, before
# this seed is set.
set_seed(42)
lr, epochs = 5e-3, 5

In [None]:
# Total scheduler steps = epochs x batches per epoch.
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)

# Extra callbacks: the batch-level scheduler and the lr / momentum recorder.
xtra = [BatchSchedCB(sched), rec]

loss_f = nn.MSELoss()

In [None]:
# Learner combining the model, loaders, MSE loss, all callbacks and AdamW (weight_decay=0.15).
learn = TrainLearner(
    model, dls, loss_f, lr=lr, cbs=cbs+xtra, 
    opt_func=partial(AdamW, weight_decay=0.15)
)

In [None]:
# Train for `epochs` epochs.
learn.fit(epochs)

loss,epoch,train
1.164,0,train
0.869,0,eval
0.848,1,train
0.831,1,eval
0.797,2,train
0.786,2,eval
0.732,3,train
0.74,3,eval
0.66,4,train
0.728,4,eval


## Speed tests

In [None]:
%%time
# Variant 1: build the tensor directly from the mapped pandas Series.
torch.tensor(data[USER].map(u2uidx))

CPU times: user 15.9 s, sys: 0 ns, total: 15.9 s
Wall time: 15.9 s


tensor([   0,    1,   11,  ..., 5779, 5850, 5937])

In [None]:
%%time
# Variant 2: stop at the NumPy array.
data[USER].map(u2uidx).to_numpy()

CPU times: user 34.1 ms, sys: 0 ns, total: 34.1 ms
Wall time: 29.2 ms


array([   0,    1,   11, ..., 5779, 5850, 5937])

In [None]:
%%time
# Variant 3: convert to NumPy first, then build the tensor from the array.
torch.tensor(data[USER].map(u2uidx).to_numpy())

CPU times: user 58.2 ms, sys: 0 ns, total: 58.2 ms
Wall time: 36.4 ms


tensor([   0,    1,   11,  ..., 5779, 5850, 5937])

In [None]:
class CollabDatasetOptParts(Dataset):
    """Variant of the batch dataset that maps ids to indices on every batch.

    The raw user / item columns are kept as pandas Series and only the rows of
    the requested batch are mapped, so the mapping cost is paid per batch
    instead of once up front.

    Args:
        df: frame holding at least the three columns named below.
        user_col: name of the user-id column.
        item_col: name of the item-id column.
        target: name of the rating column.
        user_map: optional ``{user id: index}`` mapping; defaults to the
            notebook-level ``u2uidx``.
        item_map: optional ``{item id: index}`` mapping; defaults to the
            notebook-level ``t2tidx``.
    """
    def __init__(self, df:pd.DataFrame, user_col='user', item_col='item', target='rating',
                 user_map=None, item_map=None):
        self.users = df[user_col]
        self.items = df[item_col]
        self.target = df[target].to_numpy()
        self.user_map = user_map
        self.item_map = item_map

    def __len__(self):
        "Number of rows (ratings) in the dataset."
        return len(self.users)

    def get_recommend_values(self):
        "Return `(x, y)` for the positions last passed to `__getitem__`."
        user_map = u2uidx if self.user_map is None else self.user_map
        item_map = t2tidx if self.item_map is None else self.item_map
        # `.iloc` selects by position. Plain `series[list]` selects by index
        # label, which is only equivalent when the frame has a 0..n-1 index
        # (not the case for a subset such as a train / valid frame).
        users_b = self.users.iloc[self.batch_idxs].map(user_map).to_numpy()
        items_b = self.items.iloc[self.batch_idxs].map(item_map).to_numpy()
        x_b = np.column_stack((users_b, items_b))
        target_b = self.target[self.batch_idxs]
        y_b = torch.tensor(target_b, dtype=torch.float)
        return torch.tensor(x_b), y_b

    def __getitem__(self, batch_idxs):
        self.batch_idxs = batch_idxs
        return self.get_recommend_values()

In [None]:
# Both dataset variants over the full ratings frame, for the comparisons below.
full_dataset_01 = CollabDatasetOpt(data, user_col=USER, item_col=ITEM)
full_dataset_02 = CollabDatasetOptParts(data, user_col=USER, item_col=ITEM)

In [None]:
# Smaller batches (32) for the speed tests; this sampler is shared by every loader below.
bs = 32 
full_sampler = BatchSampler(RandomSampler(full_dataset_01), batch_size=bs, drop_last=False)

In [None]:
# 6-worker loader over the dataset that maps ids once, up front.
full_dataloader_01 = DataLoader(
    full_dataset_01, sampler=full_sampler, batch_size=None, num_workers=6)

# 6-worker loader over the dataset that maps ids on every batch.
full_dataloader_02 = DataLoader(
    full_dataset_02, sampler=full_sampler, batch_size=None, num_workers=6)

In [None]:
%%time
# Full pass: torch DataLoader over the pre-mapped dataset.
for _ in full_dataloader_01: pass

CPU times: user 1min 21s, sys: 58.1 s, total: 2min 19s
Wall time: 2min 22s


In [None]:
%%time
# Full pass: torch DataLoader over the per-batch-mapping dataset.
for _ in full_dataloader_02: pass

CPU times: user 1min 28s, sys: 1min 5s, total: 2min 33s
Wall time: 2min 58s


In [None]:
class CustomDataLoader():
    """Minimal single-process loader.

    Iterating yields `dataset[indices]` for each batch of indices produced by
    `sampler`; `len()` reports the number of batches, as the torch DataLoader
    does.
    """
    def __init__(self, dataset, sampler):
        self.dataset = dataset
        self.sampler = sampler

    def __len__(self):
        "Number of batches per pass, i.e. the length of the sampler."
        return len(self.sampler)

    def __iter__(self):
        for indices in self.sampler:
            yield self.dataset[indices]

In [None]:
# Plain-Python loader over the pre-mapped dataset.
full_custom_dl = CustomDataLoader(full_dataset_01, full_sampler)

In [None]:
%%time
# Full pass with the loader currently bound to `full_custom_dl`.
for _ in full_custom_dl: pass

CPU times: user 4.94 s, sys: 49.5 ms, total: 4.99 s
Wall time: 3.35 s


In [None]:
# Rebinds `full_custom_dl` to the per-batch-mapping dataset.
full_custom_dl = CustomDataLoader(full_dataset_02, full_sampler)

In [None]:
%%time
# Full pass with the loader currently bound to `full_custom_dl`.
for _ in full_custom_dl: pass

CPU times: user 7min 57s, sys: 0 ns, total: 7min 57s
Wall time: 7min 55s


In [None]:
# Same dataset and sampler as full_dataloader_01, but with 4 workers instead of 6.
full_dataloader_03 = DataLoader(
    full_dataset_01, sampler=full_sampler, batch_size=None, num_workers=4)

In [None]:
%%time
for _ in full_dataloader_01: pass

CPU times: user 1min 16s, sys: 57.4 s, total: 2min 14s
Wall time: 2min 18s
