In [1]:
import pandas as pd
import numpy as np
# Load the dataset from a local parquet file. Later cells take `len()` of the
# per-row values in feature columns (e.g. `region_name`, `url_host`), so each
# cell of those columns is expected to hold a sequence rather than a scalar.
q = pd.read_parquet('./data_mms/q_f2k_20unk.pq')

In [2]:
# Sanity check of the loaded frame: (number of rows, number of columns).
q.shape

(415317, 41)

In [3]:
# Numeric per-event columns: five standalone columns followed by three
# statistics for each of the buckets 0..6 (grouped by bucket).
_BUCKET_STATS = ('men_bucket_share', 'women_bucket_share', 'people_in_bucket')
num_features = ['request_cnt', 'price', 'diff_days', 'men_share', 'women_share'] + [
    f'{stat}_{bucket}' for bucket in range(7) for stat in _BUCKET_STATS
]

# Categorical columns paired with the (num_embeddings, embedding_dim) used to
# build their nn.Embedding tables. Keeping name and size side by side ensures
# the two derived lists below always stay aligned.
_CAT_FEATURE_SPECS = [
    ('region_name', (81, 3)),
    ('city_name', (985, 3)),
    ('cpe_manufacturer_name', (37, 6)),
    ('cpe_model_name', (599, 32)),
    ('cpe_type_cd', (4, 2)),
    ('cpe_model_os_type', (3, 2)),
    ('part_of_day', (4, 2)),
    ('url_host', (23761, 200)),
    ('dow', (7, 3)),
]
cat_features = [name for name, _ in _CAT_FEATURE_SPECS]
cat_embs_size = [size for _, size in _CAT_FEATURE_SPECS]

In [4]:
import torch
import numpy as np
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
import gc
import tqdm
from torch.utils.data import DataLoader
import math

class AlphaDataset(Dataset):
    """Row-per-sample dataset that yields a shuffled event history and a multi-hot URL target.

    Every row of ``df`` must hold, for each column listed in the module-level
    ``cat_features`` and ``num_features`` lists, a sequence of per-event values;
    all sequences of one row must have the same length (they are stacked into a
    single matrix). ``url_host`` values are used as integer indices into the
    target vector.

    Args:
        df: source frame; it is copied so later mutations of the caller's frame
            do not affect the dataset.
        is_train: stored on the instance; not read by this class.
        n_urls: length of the multi-hot target vector (defaults to 23761, the
            value that was previously hard-coded).
    """

    # Fraction of a row's (shuffled) events kept as model input.
    HISTORY_FRACTION = 0.8

    def __init__(self, df, is_train = False, n_urls = 23761):
        self.df = df.copy()
        self.is_train = is_train
        self.n_urls = n_urls

    def __len__(self):
        """Number of rows (samples) in the dataset."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Build one sample from row ``idx``.

        Returns:
            A tuple ``(cat_features_vals, num_features_vals, target)`` where the
            first two are ``(hist_len, n_columns)`` arrays of the retained events
            and ``target`` is a float vector of length ``n_urls`` with ones at
            the selected URL ids.
        """
        s = self.df.iloc[idx]

        # Stack the per-column sequences into (n_events, n_columns) matrices.
        cat_features_vals = np.stack(s[cat_features].values).T
        num_features_vals = np.stack(s[num_features].values).T

        # One random permutation is applied to features and URLs alike, so the
        # event order is discarded but rows stay aligned.
        n = len(s['url_host'])
        idxs = np.random.permutation(n)
        urls = np.asarray(s['url_host'])[idxs]

        hist_len = math.ceil(self.HISTORY_FRACTION * n)
        # NOTE(review): the target window has the same length as the history
        # window, so the first `hist_len` and the last `target_len` shuffled
        # events overlap. Kept as in the original; confirm this is intended
        # (a disjoint hold-out would use `n - hist_len`).
        target_len = hist_len

        cat_features_vals = cat_features_vals[idxs][:hist_len]
        num_features_vals = num_features_vals[idxs][:hist_len]

        target = np.zeros(self.n_urls)
        target[urls[-target_len:]] = 1

        return cat_features_vals, num_features_vals, target
    

def pad_matrix(mat, max_len):
    """Return ``mat`` as a tensor with exactly ``max_len`` entries along axis 0.

    Longer inputs keep only their last ``max_len`` rows; shorter inputs are
    padded at the end with zeros. Works for 1-D vectors and for arrays with any
    number of trailing dimensions.

    The padding uses the dtype of ``mat``, so integer id matrices stay integer
    whether they were truncated or padded (previously padded outputs were
    promoted to float while truncated ones were not).

    Args:
        mat: numpy array whose first axis is the sequence axis.
        max_len: desired length of the first axis.

    Returns:
        A ``torch.Tensor`` of shape ``(max_len, *mat.shape[1:])``.
    """
    n = mat.shape[0]
    if max_len <= n:
        # `n - max_len` instead of `-max_len`: a zero-length request must yield
        # an empty tensor, whereas `mat[-0:]` would return the whole array.
        return torch.tensor(mat[n - max_len:])
    values = torch.tensor(mat)
    padding = torch.zeros((max_len - n, *mat.shape[1:]), dtype=values.dtype)
    return torch.cat([values, padding])

def collate_batch(batch):
    """Collate ``(cat_features, num_features, target)`` samples into batch tensors.

    The common sequence length is the longest first element in the batch,
    capped at 1200; every categorical and numeric matrix is brought to that
    length with ``pad_matrix``. Targets are stacked unchanged.

    Returns:
        A tuple ``(cat_batch, num_batch, target_batch)`` of stacked tensors.
    """
    longest = max((sample[0].shape[0] for sample in batch), default=0)
    max_len = min(longest, 1200)

    cat_batch = torch.stack([pad_matrix(sample[0], max_len) for sample in batch])
    num_batch = torch.stack([pad_matrix(sample[1], max_len) for sample in batch])
    target_batch = torch.stack([torch.tensor(sample[2]) for sample in batch])
    return cat_batch, num_batch, target_batch
      
#dataloader = DataLoader(ds_test, batch_size=64, collate_fn=collate_batch, shuffle=False)


In [5]:
# Per-row sequence length, taken from `region_name`; BySequenceLengthSampler
# reads this column to group rows of similar length into the same batch.
q['data_len'] = q['region_name'].map(len)

In [6]:
import numpy as np
from random import shuffle
from torch.utils.data import Sampler

class BySequenceLengthSampler(Sampler):
    """Batch sampler that groups samples of similar sequence length.

    Samples are assigned to length buckets whose boundaries are the
    10 %, 20 %, ..., 80 % quantiles of ``data_source['data_len']``. On every
    iteration each bucket is shuffled and split into batches, and the batches
    of all buckets are then shuffled together. Each yielded item is a list of
    integer positions, so the object is meant to be passed to
    ``DataLoader(batch_sampler=...)``.

    Args:
        data_source: object whose ``['data_len'].values`` gives one length per
            sample (e.g. a pandas DataFrame with a ``data_len`` column).
        batch_size: target batch size. A bucket of ``k`` samples is split into
            ``max(1, k // batch_size)`` near-equal batches, so batches can be
            somewhat larger than ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """

    def __init__(self, data_source, batch_size=64):
        if batch_size < 1:
            raise ValueError(f'batch_size must be a positive integer, got {batch_size}')
        lengths = np.asarray(data_source['data_len'].values)
        # (position, length) pairs, kept as a public attribute for compatibility.
        self.ind_n_len = list(enumerate(lengths))
        self.bucket_boundaries = np.quantile(lengths, [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8])
        self.batch_size = batch_size

        # Lengths never change, so bucket membership is computed once here
        # (vectorised) instead of element by element on every epoch.
        bucket_ids = np.searchsorted(self.bucket_boundaries, lengths, side='right')
        self._buckets = [np.flatnonzero(bucket_ids == b) for b in np.unique(bucket_ids)]

    def _n_batches(self, bucket_size):
        """Number of batches a bucket of ``bucket_size`` samples is split into."""
        # At least one batch: a bucket smaller than batch_size would otherwise
        # ask np.array_split for zero sections, which raises.
        return max(1, bucket_size // self.batch_size)

    def __iter__(self):
        batches = []
        for members in self._buckets:
            shuffled = np.random.permutation(members)
            batches.extend(np.array_split(shuffled, self._n_batches(len(members))))
        # Shuffle all the batches so they are not ordered by bucket.
        shuffle(batches)
        for batch in batches:
            yield batch.tolist()

    def __len__(self):
        """Number of batches produced by one pass of ``__iter__``."""
        return sum(self._n_batches(len(members)) for members in self._buckets)

    def element_to_bucket_id(self, x, seq_length):
        """Return the bucket index for a sequence of length ``seq_length``.

        Bucket ``i`` covers ``boundaries[i-1] <= seq_length < boundaries[i]``,
        with the first and last buckets open-ended. ``x`` (the sample position)
        is accepted for interface compatibility and is not used.
        """
        return int(np.searchsorted(self.bucket_boundaries, seq_length, side='right'))

In [7]:
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

class CFG:
    """Hyper-parameter namespace handed to the model constructor.

    Within this notebook only ``fts_len``, ``hidden_size`` and ``nlayers`` are
    read (by ``GRUBaseModel``); the remaining fields are not referenced by the
    visible cells.
    """

    # --- optimisation / schedule ---
    learning_rate = 1.0e-3
    batch_size = 64
    num_workers = 4
    print_freq = 100
    test_freq = 1
    start_epoch = 0
    num_train_epochs = 3
    warmup_steps = 30
    max_grad_norm = 1000
    gradient_accumulation_steps = 1
    weight_decay = 0.01

    # --- architecture ---
    dropout = 0.0
    emb_size = 100
    hidden_size = 160
    nlayers = 2
    nheads = 8
    seq_len = 1200
    target_size = 7
    num_fts_len = 64

    # Width of one encoder input step: all categorical embedding dims
    # concatenated, plus one scalar per numeric feature.
    fts_len = sum(dim for _, dim in cat_embs_size) + len(num_features)
    
class GRUBaseModel(nn.Module):
    """Bidirectional GRU encoder over per-event features with two linear heads.

    Each categorical feature is embedded with its own table (sizes come from
    the module-level ``cat_embs_size``), the embeddings are concatenated with
    the numeric features, and the sequence is fed to a multi-layer
    bidirectional GRU. The final hidden states of every layer and direction
    are concatenated, projected to ``cfg.hidden_size`` and passed to:

    * ``male_age`` - a 7-way linear head;
    * ``unsuper``  - a 23761-way linear head.

    Args:
        cfg: object providing ``fts_len`` (input width per step),
            ``hidden_size`` and ``nlayers``.
    """

    def __init__(self, cfg):
        super().__init__()

        self.cfg = cfg

        # One embedding table per categorical column, in column order.
        self.embeds = torch.nn.ModuleList([
            nn.Embedding(a, b) for a, b in cat_embs_size
        ])

        self.encoder = nn.GRU(input_size = cfg.fts_len,
                              hidden_size = cfg.hidden_size,
                              num_layers = cfg.nlayers,
                              batch_first = True,
                              bidirectional = True
                             )

        # The GRU returns one final hidden vector per (layer, direction), so
        # the flattened summary is 2 * nlayers * hidden_size wide. This equals
        # the former hard-coded 4 * hidden_size when nlayers == 2.
        pooled_size = 2 * cfg.nlayers * cfg.hidden_size
        self.reg_layer = nn.Sequential(
            nn.Linear(pooled_size, cfg.hidden_size),
            nn.LayerNorm(cfg.hidden_size),
            nn.Mish()
        )
        self.male_age = nn.Linear(cfg.hidden_size, 7)
        self.unsuper  = nn.Linear(cfg.hidden_size, 23761)

    def forward(self, cat_features_vals, num_features_vals):
        """Encode a batch of sequences.

        Args:
            cat_features_vals: integer tensor ``(batch, seq, n_cat)``; column
                ``i`` is looked up in ``self.embeds[i]``.
            num_features_vals: float tensor ``(batch, seq, n_num)``.

        Returns:
            ``(male_age, unsuper)`` logits of shape ``(batch, 7)`` and
            ``(batch, 23761)``.

        No padding mask is applied: every time step, including padded ones, is
        processed by the GRU.
        """
        embedded = [emb(cat_features_vals[:, :, i])
                    for i, emb in enumerate(self.embeds)]
        seq_emb = torch.cat(embedded + [num_features_vals], dim=-1)

        # h_n: (nlayers * 2, batch, hidden) -> (batch, nlayers * 2 * hidden)
        _, h_n = self.encoder(seq_emb)
        pooled = h_n.transpose(0, 1).flatten(start_dim=1)
        emb = self.reg_layer(pooled)

        male_age = self.male_age(emb)
        unsuper = self.unsuper(emb)

        return male_age, unsuper

In [None]:
# Continue training the GRU model on the URL-prediction ("unsuper") head,
# starting from an earlier checkpoint, and keep the weights of the epoch with
# the lowest mean training loss.
bs = 256
ds_train = AlphaDataset(q)
sampler = BySequenceLengthSampler(q, bs)


def _seed_numpy_worker(worker_id):
    """Give every DataLoader worker its own NumPy RNG state.

    AlphaDataset shuffles with ``np.random``; without reseeding, worker
    processes can start from identical NumPy state and repeat each other's
    shuffles. ``torch.initial_seed()`` already differs per worker.
    """
    np.random.seed(torch.initial_seed() % 2**32)


# `batch_sampler` yields complete index batches, so `batch_size`, `shuffle`
# and `drop_last` are left at their defaults (they are mutually exclusive
# with `batch_sampler`).
train_dl = DataLoader(ds_train,
                      batch_sampler=sampler,
                      num_workers=4,
                      collate_fn=collate_batch,
                      worker_init_fn=_seed_numpy_worker,
                      pin_memory=False)

model = GRUBaseModel(CFG).cuda()
model.load_state_dict(torch.load('./ckps_mms/base_model_20_54ep.pth'))
# NOTE(review): targets are multi-hot float vectors, which CrossEntropyLoss
# interprets as (unnormalised) class probabilities. Confirm this is intended
# rather than a per-class BCEWithLogitsLoss.
loss_ce = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
model.train()
optimizer.zero_grad()
best_loss = 1e9
for epoch in range(60):
    losses = []
    for x in train_dl:
        # x = (categorical ids, numeric features, multi-hot URL target)
        _, out = model(x[0].long().cuda(), x[1].float().cuda())
        loss = loss_ce(out, x[2].float().cuda())
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    mean_loss = np.mean(losses)
    print(epoch, mean_loss)
    if mean_loss < best_loss:
        # Record the new best; otherwise every epoch passes the check above
        # and a later, worse epoch would overwrite the best checkpoint.
        best_loss = mean_loss
        torch.save(model.state_dict(), './ckps_mms/base_model_20_60ep.pth')