In [1]:
from datetime import datetime
import json
from collections import Counter

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import torch.nn as nn
import torch.functional as F
from torch.utils.data import DataLoader, Dataset
import torchmetrics

from deeprec.torch.trainer import Trainer, set_device
from deeprec import ROOT

In [2]:
with open('../data/metadata.json', 'r') as fp:
    meta = json.load(fp)

meta.keys()

dict_keys(['title_emb_size', 'string_na', 'genres', 'ages', 'occupations', 'user', 'movie', 'city', 'state'])

In [27]:
class Vocab(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, x):
        c = Counter(x)
        self.lookup_ = {
            str(v).lower(): k for k, v in enumerate([x[0] for x in sorted(c.items(), key=lambda x: x[1], reverse=True)])
        }

    def transform(self, x):
        return [self.lookup_.get(str(xx).lower(), 99999) for xx in x]

In [28]:
state_enc = Vocab()
state_enc.fit(meta['state'])

city_enc = Vocab()
city_enc.fit(meta['city'])

user_enc = Vocab()
user_enc.fit(meta['user'])

In [5]:
df = pd.read_parquet('../data/train.parq.gzip').drop('rating', axis=1)
df.head()

Unnamed: 0_level_0,user,movie,hour,day_of_week,month,gender,age,occupation,city,state,...,embed_15,embed_16,embed_17,embed_18,embed_19,embed_20,embed_21,embed_22,embed_23,embed_24
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
956151,6036,3132,1,2,4,1,25,15,Gainesville,FL,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034
956152,6037,3132,3,2,4,1,45,1,Arlington,TX,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034
956149,5960,3132,17,5,4,1,45,0,Slidell,LA,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034
956150,6016,3132,20,2,4,0,45,1,Nashville,TN,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034
956146,5643,3132,6,6,5,1,35,1,Salt Lake City,UT,...,0.511667,1.46494,-2.46967,-1.196152,-0.7946,-0.09462,2.84776,2.17518,1.03427,-0.75034


In [6]:
df = pd.read_parquet('../data/train.parq.gzip', columns=['rating'])
df.head()

Unnamed: 0_level_0,rating
index,Unnamed: 1_level_1
956151,5
956152,4
956149,5
956150,3
956146,4


In [7]:
set(df['rating'])

{1, 2, 3, 4, 5}

In [24]:
class MovieDataset(Dataset):
    def __init__(self, filename, state_vocab, city_vocab, user_vocab):
        x = pd.read_parquet(filename).drop('rating', axis=1)
        y = pd.read_parquet(filename, columns=['rating'])

        x['state'] = state_vocab.transform(x['state'])
        x['city'] = city_vocab.transform(x['city'])
        x['user'] = user_vocab.transform(x['user'])

        self.feature_names = x.columns
        self.x = x.to_dict('records')
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


In [29]:
train = MovieDataset('../data/train.parq.gzip', state_vocab=state_enc, city_vocab=city_enc, user_vocab=user_enc)
test = MovieDataset('../data/test.parq.gzip', state_vocab=state_enc, city_vocab=city_enc, user_vocab=user_enc)

In [30]:
print(len(train.feature_names))
train.feature_names

54


Index(['user', 'movie', 'hour', 'day_of_week', 'month', 'gender', 'age',
       'occupation', 'city', 'state', 'year', 'genre_action',
       'genre_adventure', 'genre_animation', 'genre_childrens', 'genre_comedy',
       'genre_crime', 'genre_documentary', 'genre_drama', 'genre_fantasy',
       'genre_filmnoir', 'genre_horror', 'genre_musical', 'genre_mystery',
       'genre_romance', 'genre_scifi', 'genre_thriller', 'genre_war',
       'genre_western', 'embed_0', 'embed_1', 'embed_2', 'embed_3', 'embed_4',
       'embed_5', 'embed_6', 'embed_7', 'embed_8', 'embed_9', 'embed_10',
       'embed_11', 'embed_12', 'embed_13', 'embed_14', 'embed_15', 'embed_16',
       'embed_17', 'embed_18', 'embed_19', 'embed_20', 'embed_21', 'embed_22',
       'embed_23', 'embed_24'],
      dtype='object')

In [31]:
dl = DataLoader(train, 4)
next(iter(dl))

[{'user': tensor([  66, 1566, 1665,   60]),
  'movie': tensor([3132, 3132, 3132, 3132]),
  'hour': tensor([ 1,  3, 17, 20]),
  'day_of_week': tensor([2, 2, 5, 2]),
  'month': tensor([4, 4, 4, 4]),
  'gender': tensor([1, 1, 1, 0]),
  'age': tensor([25, 45, 45, 45]),
  'occupation': tensor([15,  1,  0,  1]),
  'city': tensor([   61,    34, 99999,    38]),
  'state': tensor([    9,     4, 99999, 99999]),
  'year': tensor([1919, 1919, 1919, 1919]),
  'genre_action': tensor([0, 0, 0, 0]),
  'genre_adventure': tensor([0, 0, 0, 0]),
  'genre_animation': tensor([0, 0, 0, 0]),
  'genre_childrens': tensor([0, 0, 0, 0]),
  'genre_comedy': tensor([1, 1, 1, 1]),
  'genre_crime': tensor([0, 0, 0, 0]),
  'genre_documentary': tensor([0, 0, 0, 0]),
  'genre_drama': tensor([0, 0, 0, 0]),
  'genre_fantasy': tensor([0, 0, 0, 0]),
  'genre_filmnoir': tensor([0, 0, 0, 0]),
  'genre_horror': tensor([0, 0, 0, 0]),
  'genre_musical': tensor([0, 0, 0, 0]),
  'genre_mystery': tensor([0, 0, 0, 0]),
  'genre_roman

In [36]:
z = next(iter(dl))

In [37]:
def stack_features(inputs, feat):
    return torch.stack([v for k, v in inputs.items() if feat in k], 1)


stack_features(z[0], 'genre')

tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [43]:
class RecModel(nn.Module):
    def __init__(self, metadata, n_features=54):
        super().__init__()
        self.meta = metadata
        self.embed_dims = {
            'large': 25,
            'med': 7,
            'small': 3
        }
        self.loss_func = nn.MSELoss()

        self.user_embeds = nn.Embedding(
            num_embeddings=len(meta['user'].keys()) + 1,
            embedding_dim=self.embed_dims['large']
        )

        self.city_embeds = nn.Embedding(
            num_embeddings=len(meta['city'].keys()) + 1,
            embedding_dim=self.embed_dims['med']
        )

        self.state_embeds = nn.Embedding(
            num_embeddings=len(meta['state'].keys()) + 1,
            embedding_dim=self.embed_dims['small']
        )

        self.age_embeds = nn.Embedding(
            num_embeddings=len(meta['ages']) + 1,
            embedding_dim=self.embed_dims['small']
        )

        self.occ_embeds = nn.Embedding(
            num_embeddings=len(meta['occupations']) + 1,
            embedding_dim=self.embed_dims['small']
        )

        self.user_model = nn.Sequential(
            nn.LazyLinear(out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU()
        )

        self.model = nn.Sequential(
            nn.LazyLinear(out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=1)
        )

    def forward(self, x):
        x_user = torch.concat(
            (
                self.user_embeds(x['user']),
                self.city_embeds(x['city']),
                self.state_embeds(x['state']),
                self.age_embeds(x['age']),
                self.occ_embeds([x['occupation']]),
                x['gender'],
                x['hour'],
                x['day_of_week'],
                x['month']
            ),
            dim=1
        )
        x = self.model(x_user)
        return x

    def predict(self, x):
        return self(x)

In [44]:
NOW = datetime.now().strftime('%Y%m%d-%H%M')
LOG_DIR = ROOT.joinpath('runs', NOW)
BATCH = 10_000

train_loader = DataLoader(train, batch_size=BATCH, shuffle=True)
test_loader = DataLoader(test, batch_size=BATCH)

device = set_device()
mod = RecModel(metadata=meta)
opt = torch.optim.AdamW(mod.parameters(), lr=0.01)
trainer = Trainer(
    mod, epochs=15, device=device, log_dir=LOG_DIR, checkpoint_file=LOG_DIR.joinpath('model.pt'),
    optimizer=opt, score_funcs={'mse': torchmetrics.MeanSquaredError()}
)
trainer.fit(train_loader, test_loader)

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]
Batch:   0%|          | 0/96 [00:00<?, ?it/s][A
Epoch:   0%|          | 0/15 [00:00<?, ?it/s][A


IndexError: index out of range in self

In [None]:
{e:v for e,v in enumerate(torch.sqrt(torch.tensor(trainer.results['valid_mse'])))}

In [48]:
dl = DataLoader(train, 32)
z = next(iter(dl))
z

[{'user': tensor([   66,  1566,  1665,    60,    30,  2355,  1692,    64,   934,  1067,
           1065, 99999,   566,  1637,   862,   129,   219,  1960,    17,    28,
            441,   327,    53,   431,    27,   812,  2028,   803,   137, 99999,
           1158,   682]),
  'movie': tensor([3132, 3132, 3132, 3132, 3132, 2821, 3132, 3132, 3132, 3132, 3132, 3132,
          3132, 3132, 3132, 3132, 3132, 3132, 3132, 2823, 3132, 3132, 3132, 3132,
          3132, 3132, 3132, 3132, 3132, 3132, 3132, 2823]),
  'hour': tensor([ 1,  3, 17, 20,  6, 21,  6,  7, 23, 23, 12, 21, 21, 21,  0,  0,  2, 14,
          15, 18,  4, 13, 16, 16, 17,  8, 21,  2,  3,  4,  6,  8]),
  'day_of_week': tensor([2, 2, 5, 2, 6, 2, 0, 5, 3, 3, 2, 5, 4, 5, 4, 5, 0, 3, 3, 0, 6, 3, 0, 1,
          5, 1, 2, 4, 1, 0, 2, 2]),
  'month': tensor([ 4,  4,  4,  4,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,
           8,  8,  9,  9,  9,  9,  9, 10, 10, 11, 11, 11, 11, 11]),
  'gender': tensor([1, 1, 1, 0, 1, 0, 1, 1,

In [49]:
user_embeds = nn.Embedding(
    num_embeddings=len(meta['user'].keys()) + 1,
    embedding_dim=25
)

In [53]:
user_embeds(torch.tensor([99999]))

IndexError: index out of range in self