### 02MOD.ipynb

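This notebook shows off my solution: a two-tower BERT model that scores text queries against image region (ROI) features and is validated with NDCG@5.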

05/27/2020

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '3'
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Load the pre-extracted train / validation / test frames in one pass.
tra_df, val_df, test_df = map(
    pd.read_pickle,
    ('extracted/train.pkl', 'extracted/valid.pkl', 'extracted/testB.pkl'),
)

### Dataset Preprocessing

In [5]:
def print_longest_token():
    """Print the longest `query_tokenid` length found in the train, valid and test frames."""
    longest = [max(frame['query_tokenid'].str.len())
               for frame in (tra_df, val_df, test_df)]
    print(*longest)
    
def print_longest_roi():
    """Print the largest number of ROI rows (`features.shape[0]`) in the train, valid and test frames."""
    longest = [max(frame['features'].apply(lambda feats: feats.shape[0]))
               for frame in (tra_df, val_df, test_df)]
    print(*longest)
print_longest_token()
print_longest_roi()

# Report the training-set size before and after dropping samples
# that carry more than 40 ROIs.
print(len(tra_df))
tra_df_mask = tra_df['features'].apply(lambda feats: feats.shape[0]).le(40)
tra_df = tra_df.loc[tra_df_mask]
print(len(tra_df))

29 15 15
91 30 33
2999528
2999043


In [6]:
def pad_token(x,l=30):
    """Right-pad (or truncate) a token-id sequence to a fixed length.

    Args:
        x: sequence of integer token ids.
        l: length of the returned arrays.

    Returns:
        (tokens, attns): `tokens` is an int64 array of length `l` holding the
        ids followed by zeros; `attns` is a float64 array of length `l` with
        1 at positions holding a real token and 0 at padded positions.
    """
    # Explicit sized dtypes: the `np.long` / `np.float` aliases were removed
    # from NumPy, so they fail on current releases.
    tokens = np.zeros(l, dtype=np.int64)
    attns = np.zeros(l, dtype=np.float64)
    # Clip so an over-long sequence is truncated instead of raising a
    # broadcast error.
    n = min(len(x), l)
    tokens[:n] = x[:n]
    attns[:n] = 1
    return tokens, attns

def pad_rois(x,l=40,dim=2048):
    """Right-pad (or truncate) a stack of ROI feature vectors to `l` rows.

    Args:
        x: 2-D array-like of shape (n_rois, dim).
        l: number of rows in the returned feature matrix.
        dim: width of each feature vector (defaults to 2048).

    Returns:
        (rois, attns): `rois` is a float64 array of shape (l, dim) with the
        input rows followed by zero rows; `attns` is a float64 array of
        length `l` with 1 for real rows and 0 for padded rows.
    """
    # `np.float` was removed from NumPy; float64 is the dtype it used to mean.
    rois = np.zeros([l, dim], dtype=np.float64)
    attns = np.zeros(l, dtype=np.float64)
    # Clip so inputs with more than `l` rows are truncated rather than
    # raising a broadcast error.
    n = min(len(x), l)
    if n:  # an empty list cannot be broadcast into a (0, dim) slice
        rois[:n, :] = x[:n]
        attns[:n] = 1
    return rois, attns

### Dataset Definition

In [7]:
class TrainSet(Dataset):
    """Map-style dataset yielding one padded (query, image) pair per row.

    Args:
        df: frame with `query_tokenid` and `features` columns. When omitted,
            the notebook-level `tra_df` is used, so `TrainSet()` keeps working.
    """
    def __init__(self, df=None):
        super().__init__()
        # Resolve the global lazily so an explicit frame never needs `tra_df`.
        self.df = tra_df if df is None else df
    def __len__(self):
        """Number of rows (training pairs) in the frame."""
        return len(self.df)
    def __getitem__(self, idx):
        """Return (tokens, token_mask, rois, roi_mask) for the row at position `idx`."""
        row = self.df.iloc[idx]
        tokens, attn1 = pad_token(row['query_tokenid'])
        rois,   attn2 = pad_rois (row['features'])
        return tokens, attn1, rois, attn2
# Notebook-level training dataset; MyModel.train_dataloader reads this global.
tra = TrainSet()

In [8]:
class InferSet(Dataset):
    """Map-style dataset yielding one query together with all of its candidate rows.

    Args:
        df: frame with `query_id`, `query_tokenid`, `features` and
            `product_id` columns; every row is one (query, product) candidate.
    """
    def __init__(self,df):
        super().__init__()
        self.df = df
        self.qids = df['query_id'].unique().tolist()
        # Positional row indices per query, computed once so __getitem__ does
        # not rescan the whole frame with a boolean mask for every query.
        self._row_positions = df.groupby('query_id', sort=False).indices
    def __len__(self):
        """Number of distinct queries."""
        return len(self.qids)
    def __getitem__(self, idx):
        """Return (tokens, token_mask, rois, roi_mask, query_id, product_ids).

        `rois` and `roi_mask` carry a leading axis with one entry per
        candidate row of the query, in frame order.
        """
        qid = self.qids[idx]
        rows = self.df.iloc[self._row_positions[qid]]

        # The query text is taken from the first candidate row of the group.
        first_row = rows.iloc[0]
        tokens, attn1 = pad_token(first_row['query_tokenid'])

        padded = [pad_rois(feats) for feats in rows['features']]
        rois = np.stack([p[0] for p in padded])
        attn2 = np.stack([p[1] for p in padded])
        return tokens, attn1, rois, attn2, first_row['query_id'], rows['product_id'].values
val = InferSet(val_df)
print(len(val))
# Smoke check: pull the first sample only and report the array shapes.
for tokens, attn1, rois, attn2, qid, pid in val:
    shapes = (tokens.shape, attn1.shape, rois.shape, attn2.shape, pid.shape)
    print(*shapes)
    break

496
(30,) (30,) (30, 40, 2048) (30, 40) (30,)


### Metric Definition

In [9]:
import numpy as np
import json

def dcg_at_k(r, k):
    """Discounted cumulative gain of the first `k` relevance scores.

    Args:
        r: relevance scores in ranked order (best-ranked first).
        k: number of leading results to take into account.

    Returns:
        r[0] + sum(r[i] / log2(i + 2)) for i >= 1 over the first `k` scores,
        or 0.0 when no scores remain after truncation.
    """
    # `np.asfarray` was removed in NumPy 2.0; asarray with a float dtype is
    # the equivalent call.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        # Discounts log2(3), log2(4), ... apply to ranks 2, 3, ...; rank 1
        # is undiscounted.
        return r[0] + np.sum(r[1:] / np.log2(np.arange(3, r.size + 2)))
    return 0.

def get_ndcg(r, ref, k):
    """Normalised DCG@k: DCG of `r` divided by the DCG of the reference ranking `ref`.

    Returns 0.0 when the reference DCG is zero.
    """
    ideal = dcg_at_k(ref, k)
    if ideal:
        return dcg_at_k(r, k) / ideal
    return 0.

class Valid():
    """Collects (query, product, score) predictions and scores them with mean NDCG@5.

    Args:
        answer_path: JSON file mapping each query id to the list of its
            correct product ids.
    """
    def __init__(self, answer_path='dataset/valid_answer.json'):
        self.result = {}
        # Context manager so the file handle is closed deterministically.
        with open(answer_path) as fh:
            parsed_json = json.load(fh)
        # both k and v are strings
        self.answer = {k: [str(x) for x in v] for k, v in parsed_json.items()}
    def add_prediction(self, qid, pid, score):
        """Record `score` for product `pid` under query `qid` (ids are stored as strings)."""
        qid,pid = str(qid),str(pid)
        self.result.setdefault(qid, []).append([pid,score])
    def cal_ndcg5(self):
        """Return the mean NDCG@5 over every query that has predictions.

        Returns 0.0 when no prediction has been added. Raises KeyError when a
        predicted query id is absent from the answer file.
        """
        if not self.result:
            # Avoid dividing by zero when nothing was predicted.
            return 0.0
        total = 0.0
        for qid, preds in self.result.items():
            gt = set(self.answer[qid])
            top5 = sorted(preds, key=lambda x:x[1], reverse=True)[:5]
            pred_vec = [1.0 if pid in gt else 0.0 for pid, _ in top5]
            # The reference ranking is always five hits, as in the original metric.
            total += get_ndcg(pred_vec, [1.0] * 5, 5)
        return total / len(self.result)

### Model Definition

In [10]:
from transformers import BertConfig, BertModel

# Two independent, randomly initialised 4-layer BERT backbones:
# the first encodes query text, the second encodes image ROIs.
bert_text, bert_img = [BertModel(BertConfig(num_hidden_layers=4)) for _ in range(2)]

INFO:transformers.file_utils:PyTorch version 1.4.0+cu100 available.


In [11]:
class TextEncoder(nn.Module):
    """Encodes padded token ids into one pooled vector per query using `bert_text`."""
    def __init__(self):
        super(TextEncoder, self).__init__()
        self.bert_text = bert_text
    def forward(self, tokens, attns):
        """Return the pooled BERT output for `tokens`, using `attns` as the attention mask."""
        outputs = self.bert_text(
            input_ids=tokens,
            attention_mask=attns)
        # Index rather than unpack into exactly two names: unpacking fails if
        # the model returns extra items, and (per the transformers docs) newer
        # releases return a ModelOutput object whose iteration yields keys.
        # Position 1 is the pooled output in both cases.
        return outputs[1]

class ImageEncoder(nn.Module):
    """Encodes padded ROI features into one pooled vector per image using `bert_img`.

    The ROI vectors are linearly projected to the backbone width and
    layer-normalised, then fed straight to the BERT encoder stack and pooler;
    the word/position embedding lookup of the backbone is bypassed.
    """
    def __init__(self):
        super(ImageEncoder, self).__init__()
        self.bert_img = bert_img
        # Project to the backbone's hidden width (768 for the default config)
        # instead of hard-coding it.
        self.roi_encoder = nn.Linear(2048, self.bert_img.config.hidden_size)
    def forward(self, rois, attns):
        """Return the pooled encoder output.

        Args:
            rois: ROI features whose last axis has size 2048.
            attns: per-ROI mask, 1 for real ROIs and 0 for padding; indexed
                as a 2-D (batch, n_rois) tensor.
        """
        embedding_output = self.bert_img.embeddings.LayerNorm(self.roi_encoder(rois))
        # Turn the 0/1 mask into an additive mask broadcastable over the
        # attention scores: 0 for real ROIs, -10000 for padding.
        extended_attention_mask = attns[:, None, None, :].to(dtype=embedding_output.dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        # One (empty) head mask per encoder layer, sized from the config
        # rather than an arbitrary constant.
        num_layers = self.bert_img.config.num_hidden_layers
        encoder_outputs = self.bert_img.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=[None]*num_layers)
        sequence_output = encoder_outputs[0]
        pooled_output = self.bert_img.pooler(sequence_output)
        return pooled_output

In [12]:
class MyModel(pl.LightningModule):
    """Two-tower query/image matching model trained with in-batch negatives.

    Each training batch holds matching (query, image) pairs. The loss is
    binary cross-entropy over the full query-by-image similarity matrix,
    with the diagonal (the matching pairs) as positives.

    Args:
        bs: training batch size used by `train_dataloader`.
    """
    def __init__(self, bs=512):
        super(MyModel, self).__init__()
        self.txt_enc = TextEncoder()
        self.img_enc = ImageEncoder()
        self.bs = bs
    def forward(self, tokens, attn1, rois, attn2):
        """Return the training loss for a batch of matching query/image pairs."""
        txt_emb = self.txt_enc(tokens, attn1)
        img_emb = self.img_enc(rois, attn2)
        # Compute the loss in float32 regardless of the training precision.
        sim = txt_emb.mm(img_emb.t()).float()
        # Build the identity target per batch, on the logits' device and
        # dtype. The previous fixed float64 CUDA tensor created in __init__
        # required a GPU at construction time and a batch of exactly `bs`.
        target = torch.eye(sim.size(0), dtype=sim.dtype, device=sim.device)
        return F.binary_cross_entropy_with_logits(sim, target)
    def predict(self, tokens, attn1, rois, attn2):
        """Score every candidate image of a single query.

        Expects a leading batch axis of size 1 on `rois` and `attn2` (one
        query per prediction); returns the query-by-candidate similarities.
        """
        txt_emb = self.txt_enc(tokens, attn1)
        # Drop only the batch axis: a bare squeeze() would also collapse the
        # candidate axis when a query has a single candidate.
        img_emb = self.img_enc(rois.squeeze(0), attn2.squeeze(0))
        sim = txt_emb.mm(img_emb.t())
        return sim

    # train stuff
    def training_step(self, batch, batch_nb):
        """Compute and log the loss for one training batch."""
        tokens, attn1, rois, attn2 = batch
        loss = self(tokens, attn1, rois, attn2)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}
    def configure_optimizers(self):
        """Adam over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=2e-05, eps=1e-08)
    def train_dataloader(self):
        """Shuffled loader over the notebook-level `tra` dataset."""
        return DataLoader(tra, batch_size=self.bs, shuffle=True,
                          num_workers=16, pin_memory=True, drop_last=True)

    # valid and test stuff
    def validation_step(self, batch, batch_nb):
        """Score the candidates of one validation query."""
        tokens, attn1, rois, attn2, qid, pid = batch
        y_hat = self.predict(tokens, attn1, rois, attn2)
        return {'y_hat': y_hat, 'qid':qid, 'pid':pid}
    def validation_epoch_end(self, outputs):
        """Aggregate per-query scores into NDCG@5; `val_loss` is 1 - NDCG@5."""
        valid = Valid()
        for o in outputs:
            # Index 0 strips the size-1 batch axis added by the loader.
            qid = o['qid'].cpu().numpy()[0]
            pid = o['pid'].cpu().numpy()[0]
            scores = o['y_hat'].cpu().numpy()[0]
            for idx, p in enumerate(pid):
                valid.add_prediction(qid, p, scores[idx])
        # Compute the metric once and reuse it for both the log and the loss.
        ndcg5 = valid.cal_ndcg5()
        tensorboard_logs = {'ndcg5': ndcg5}
        return {'val_loss': torch.tensor(1-ndcg5), 'log': tensorboard_logs}
    def val_dataloader(self):
        """One query per batch over the notebook-level `val` dataset."""
        return DataLoader(val, batch_size=1)
model = MyModel()

In [None]:
# To resume from a saved checkpoint instead of the freshly built model:
# model = MyModel.load_from_checkpoint('lightning_logs/version_4/checkpoints/epoch=13.ckpt')
from pytorch_lightning.callbacks import ModelCheckpoint

# Checkpoint callback that monitors the minimum `val_loss`.
checkpoint_cb = ModelCheckpoint(
    filepath='./checkpoint1/' + '{epoch:02d}-{val_loss:.2f}',
    verbose=True,
    monitor='val_loss',
    mode='min',
)
trainer = pl.Trainer(
    gpus=1,
    precision=16,
    val_check_interval=0.5,
    checkpoint_callback=checkpoint_cb,
)
trainer.fit(model)

INFO:lightning:GPU available: True, used: True
INFO:lightning:VISIBLE GPUS: 0
INFO:lightning:Using 16bit precision.
INFO:lightning:
    | Name                                                         | Type              | Params
-----------------------------------------------------------------------------------------------
0   | txt_enc                                                      | TextEncoder       | 52 M  
1   | txt_enc.bert_text                                            | BertModel         | 52 M  
2   | txt_enc.bert_text.embeddings                                 | BertEmbeddings    | 23 M  
3   | txt_enc.bert_text.embeddings.word_embeddings                 | Embedding         | 23 M  
4   | txt_enc.bert_text.embeddings.position_embeddings             | Embedding         | 393 K 
5   | txt_enc.bert_text.embeddings.token_type_embeddings           | Embedding         | 1 K   
6   | txt_enc.bert_text.embeddings.LayerNorm                       | LayerNorm         | 1 K   
7   

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic




HBox(children=(FloatProgress(value=0.0, description='Validation sanity check', layout=Layout(flex='2'), max=5.…



HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …

