In [146]:
from datetime import datetime
import json
from collections import Counter

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import torch.nn as nn
import torch.functional as F
from torch.utils.data import DataLoader, Dataset
import torchmetrics

from deeprec.torch.trainer import Trainer, set_device
from deeprec import ROOT

In [69]:
# Load the dataset metadata. JSON is UTF-8 by specification, so the encoding is
# pinned explicitly instead of relying on the platform's locale default.
with open('../data/metadata.json', 'r', encoding='utf-8') as fp:
    meta = json.load(fp)

meta.keys()

dict_keys(['title_emb_size', 'string_na', 'genres', 'ages', 'occupations', 'user', 'movie', 'city', 'state'])

In [64]:
class Vocab(BaseEstimator, TransformerMixin):
    """Case-insensitive, frequency-ranked vocabulary encoder.

    Tokens are lower-cased and assigned integer indices in order of descending
    frequency (index 0 is the most frequent token). Tokens that were not seen
    during ``fit`` are encoded as ``unknown_index``.

    Parameters
    ----------
    unknown_index : int, default=99999
        Index returned by ``transform`` for tokens missing from the lookup.

    Attributes
    ----------
    lookup_ : dict
        Lower-cased token -> index mapping, set by ``fit``.
    """

    def __init__(self, unknown_index=99999):
        self.unknown_index = unknown_index

    def fit(self, x, y=None):
        """Build the token -> index lookup.

        Parameters
        ----------
        x : iterable of str, or mapping of str -> count
            Anything ``collections.Counter`` accepts: raw tokens are counted,
            a mapping is taken as pre-computed counts.
        y : ignored
            Present so the estimator can be used through ``fit_transform``.

        Returns
        -------
        self
        """
        # Merge case variants *before* ranking so that e.g. "Austin" and
        # "austin" share one index and the indices stay dense (0..n-1).
        counts = Counter()
        for token, n in Counter(x).items():
            counts[token.lower()] += n

        # sorted() is stable, so ties keep their first-seen order.
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        self.lookup_ = {token: index for index, (token, _) in enumerate(ranked)}
        return self

    def transform(self, x):
        """Encode each token in ``x`` as its index; unseen tokens get ``unknown_index``.

        Returns a plain list with one integer per input token.
        """
        return [self.lookup_.get(token.lower(), self.unknown_index) for token in x]

In [70]:
# One vocabulary per free-text categorical column, each fitted on the
# corresponding entry of the metadata file.
state_enc = Vocab()
city_enc = Vocab()

state_enc.fit(meta['state'])
city_enc.fit(meta['city'])

In [112]:
# Preview the training features: every column except the target.
df = pd.read_parquet('../data/train.parq.gzip').drop(columns='rating')
df.head()

Unnamed: 0_level_0,user,movie,hour,day_of_week,month,gender,age,occupation,city,state,...,embed_15,embed_16,embed_17,embed_18,embed_19,embed_20,embed_21,embed_22,embed_23,embed_24
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
956151,6036,3132,1,2,4,1,25,15,Gainesville,FL,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034
956152,6037,3132,3,2,4,1,45,1,Arlington,TX,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034
956149,5960,3132,17,5,4,1,45,0,Slidell,LA,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034
956150,6016,3132,20,2,4,0,45,1,Nashville,TN,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034
956146,5643,3132,6,6,5,1,35,1,Salt Lake City,UT,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034


In [118]:
# Preview the target: read only the `rating` column from the training file.
# Note: this rebinds `df`, replacing the feature preview from the cell above.
df = pd.read_parquet(
    '../data/train.parq.gzip',
    columns=['rating'],
)
df.head()

Unnamed: 0_level_0,rating
index,Unnamed: 1_level_1
956151,5
956152,4
956149,5
956150,3
956146,4


In [119]:
# Distinct rating values present in the training split.
set(df['rating'].tolist())

{1, 2, 3, 4, 5}

In [178]:
class MovieDataset(Dataset):
    """In-memory dataset of (features, target) tensors loaded from a parquet file.

    The file is read once; the target column is split off and the remaining
    columns become the features. The ``state`` and ``city`` columns are replaced
    by the integer codes produced by the supplied vocabularies, after which the
    whole feature frame is converted to a single float32 tensor.

    Parameters
    ----------
    filename : str or path-like
        Parquet file containing the feature columns and the target column.
    state_vocab, city_vocab : objects with ``transform(iterable) -> list``
        Already-fitted encoders for the ``state`` and ``city`` columns.
    target : str, default='rating'
        Name of the target column.

    Attributes
    ----------
    feature_names : pandas.Index
        Feature column names, in the same order as the columns of ``x``.
    x : torch.Tensor
        float32 feature tensor with one row per record.
    y : torch.Tensor
        float32 target tensor with one single-element row per record.
    """

    def __init__(self, filename, state_vocab, city_vocab, target='rating'):
        # Parse the file a single time and split it, instead of reading it
        # once for the features and a second time for the target.
        frame = pd.read_parquet(filename)
        y = frame[[target]]
        x = frame.drop(columns=target)

        x['state'] = state_vocab.transform(x['state'])
        x['city'] = city_vocab.transform(x['city'])

        self.feature_names = x.columns
        self.x = torch.tensor(x.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        """Number of records."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        return self.x[idx], self.y[idx]


In [179]:
# Build both splits with the same fitted vocabularies so that state/city
# codes mean the same thing in train and test.
train = MovieDataset('../data/train.parq.gzip', state_enc, city_enc)
test = MovieDataset('../data/test.parq.gzip', state_enc, city_enc)

In [180]:
# Sanity check: how many feature columns the dataset exposes, followed by the
# column names themselves (in the same order as the columns of `train.x`).
print(len(train.feature_names))
train.feature_names

54


Index(['user', 'movie', 'hour', 'day_of_week', 'month', 'gender', 'age',
       'occupation', 'city', 'state', 'year', 'genre_action',
       'genre_adventure', 'genre_animation', 'genre_childrens', 'genre_comedy',
       'genre_crime', 'genre_documentary', 'genre_drama', 'genre_fantasy',
       'genre_filmnoir', 'genre_horror', 'genre_musical', 'genre_mystery',
       'genre_romance', 'genre_scifi', 'genre_thriller', 'genre_war',
       'genre_western', 'embed_0', 'embed_1', 'embed_2', 'embed_3', 'embed_4',
       'embed_5', 'embed_6', 'embed_7', 'embed_8', 'embed_9', 'embed_10',
       'embed_11', 'embed_12', 'embed_13', 'embed_14', 'embed_15', 'embed_16',
       'embed_17', 'embed_18', 'embed_19', 'embed_20', 'embed_21', 'embed_22',
       'embed_23', 'embed_24'],
      dtype='object')

In [181]:
# Pull the first (features, target) pair to eyeball the tensor layout.
next(iter(train))

(tensor([ 6.0360e+03,  3.1320e+03,  1.0000e+00,  2.0000e+00,  4.0000e+00,
          1.0000e+00,  2.5000e+01,  1.5000e+01,  6.1000e+01,  9.0000e+00,
          1.9190e+03,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -3.2278e+00,
         -9.0525e-02,  1.2616e+00,  6.5476e-01,  5.8712e-01,  1.1163e+00,
          3.0645e+00, -3.9494e-01, -1.4985e+00,  7.4800e-01, -8.4274e-01,
          5.4927e-01, -1.1358e+01, -4.5558e-01, -5.6822e-01,  5.1167e-01,
          1.4649e+00, -2.4697e+00, -1.1962e+00, -7.9460e-01, -9.4620e-02,
          2.8478e+00,  2.1752e+00,  1.0343e+00, -7.5034e-01]),
 tensor([5.]))

In [182]:
# Scratch embedding table: one 25-dimensional row per state listed in the
# metadata, plus one extra row.
embd = nn.Embedding(len(meta['state']) + 1, 25)

In [183]:
# Look up the embedding row for index 9; the input is a 1x1 index tensor.
embd(torch.tensor([[9]]))

tensor([[[ 1.1727,  2.5632, -0.3023, -0.0697,  0.7881, -1.2798, -0.4087,
           0.3787,  0.8369, -1.0754,  0.1834,  0.6412, -0.3960, -0.8481,
          -0.5621,  1.2170,  1.1314,  0.4742,  2.6531,  1.4624, -0.4234,
          -0.2124, -0.5837,  1.1908, -0.2630]]], grad_fn=<EmbeddingBackward0>)

In [202]:
class RecModel(nn.Module):
    """Fully connected regressor: features -> 256 -> 256 -> 1 with ReLU in between.

    Parameters
    ----------
    metadata : any
        Stored on the instance as ``meta``; not used by the network itself.
    n_features : int, default=54
        Width of the input feature vector.

    Attributes
    ----------
    loss_func : nn.MSELoss
        Loss module attached to the model (presumably picked up by the
        trainer -- confirm against the Trainer implementation).
    model : nn.Sequential
        The stack of linear and ReLU layers.
    """

    def __init__(self, metadata, n_features=54):
        super().__init__()
        self.meta = metadata
        self.loss_func = nn.MSELoss()

        hidden = 256
        # Layers are created in input-to-output order so the Sequential keeps
        # the indices 0..4 (and therefore the same state_dict keys).
        layers = [
            nn.Linear(n_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw output."""
        return self.model(x)

    def predict(self, x):
        """Alias for calling the module on ``x``."""
        return self(x)

In [203]:
# Seed torch's global RNG before the model is created and before the loader
# starts shuffling, so that repeated runs of this cell are comparable.
SEED = 42
torch.manual_seed(SEED)

# Each run logs into its own timestamped directory under <ROOT>/runs.
NOW = datetime.now().strftime('%Y%m%d-%H%M')
LOG_DIR = ROOT.joinpath('runs', NOW)
BATCH = 10_000

# Only the training split is shuffled; evaluation order is left as stored.
train_loader = DataLoader(train, batch_size=BATCH, shuffle=True)
test_loader = DataLoader(test, batch_size=BATCH)

device = set_device()
mod = RecModel(metadata=meta)
opt = torch.optim.AdamW(mod.parameters(), lr=0.01)
trainer = Trainer(
    mod, epochs=15, device=device, log_dir=LOG_DIR, checkpoint_file=LOG_DIR.joinpath('model.pt'),
    optimizer=opt, score_funcs={'mse': torchmetrics.MeanSquaredError()}
)
trainer.fit(train_loader, test_loader)

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]
Batch:   0%|          | 0/96 [00:00<?, ?it/s][A
Batch:   1%|          | 1/96 [00:00<00:23,  4.05it/s][A
Batch:   2%|▏         | 2/96 [00:00<00:19,  4.75it/s][A
Batch:   3%|▎         | 3/96 [00:00<00:17,  5.23it/s][A
Batch:   4%|▍         | 4/96 [00:00<00:15,  5.87it/s][A
Batch:   5%|▌         | 5/96 [00:01<00:39,  2.30it/s][A
Batch:   7%|▋         | 7/96 [00:01<00:23,  3.82it/s][A
Batch:   9%|▉         | 9/96 [00:02<00:16,  5.27it/s][A
Batch:  11%|█▏        | 11/96 [00:02<00:13,  6.51it/s][A
Batch:  14%|█▎        | 13/96 [00:02<00:10,  7.58it/s][A
Batch:  15%|█▍        | 14/96 [00:02<00:18,  4.48it/s][A
Batch:  17%|█▋        | 16/96 [00:03<00:14,  5.62it/s][A
Batch:  18%|█▊        | 17/96 [00:03<00:13,  6.07it/s][A
Batch:  19%|█▉        | 18/96 [00:03<00:12,  6.35it/s][A
Batch:  20%|█▉        | 19/96 [00:03<00:11,  6.67it/s][A
Batch:  21%|██        | 20/96 [00:03<00:10,  7.02it/s][A
Batch:  22%|██▏       | 21/96 [00:03<00:10

<deeprec.torch.trainer.Trainer at 0x1defecac0>

In [204]:
# Per-epoch validation RMSE: the square root of each recorded validation MSE,
# keyed by epoch number.
dict(enumerate(torch.sqrt(torch.tensor(trainer.results['valid_mse']))))

{0: tensor(3.0250),
 1: tensor(1.8583),
 2: tensor(1.4569),
 3: tensor(1.3608),
 4: tensor(1.3319),
 5: tensor(1.2874),
 6: tensor(1.2636),
 7: tensor(1.2510),
 8: tensor(1.2395),
 9: tensor(1.2493),
 10: tensor(1.2354),
 11: tensor(1.2224),
 12: tensor(1.2147),
 13: tensor(1.2219),
 14: tensor(1.2104)}