In [1]:
# Install PyTorch Lightning from the master branch of its GitHub repository;
# installer output is redirected to /dev/null.
# NOTE(review): this is an unpinned install from master, so the version can
# differ between runs — consider pinning a release.
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade &> /dev/null

In [2]:
# Install Hugging Face transformers from its GitHub repository (stdout is discarded).
# NOTE(review): unpinned install from the default branch — consider pinning a release.
!pip install git+https://github.com/huggingface/transformers.git > /dev/null

  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-yc70mo2g


In [1]:
from transformers import AutoTokenizer, AutoModel

In [2]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm
import pandas as pd
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from torch.nn import functional as F

In [3]:
import pytorch_lightning as pl

In [4]:
# Run-wide configuration shared by the dataset, data loaders and model.
MAX_LEN = 256  # passed to the tokenizer as max_length (inputs are truncated to it)
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS sizes the scheduler step count in BertClassifier.setup; keep the
# Trainer's max_epochs in agreement with it.
EPOCHS = 2
MODEL_PATH = "model.bin"  # file the trained state_dict is written to
BERT_PATH = "DeepPavlov/rubert-base-cased-sentence"  # checkpoint used for both tokenizer and encoder
# NOTE(review): do_lower_case=True is requested for a checkpoint whose name
# says "cased" — confirm that lowercasing the input is intended.
TOKENIZER = AutoTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)

In [7]:
# Copy the test and train parquet files from the Drive folder
# (/content/drive/MyDrive/PandemicHack) into the current working directory.
!cp "/content/drive/MyDrive/PandemicHack/bert_test.pq" "bert_test.pq"
!cp "/content/drive/MyDrive/PandemicHack/bert_train.pq" "bert_train.pq"

In [5]:
def padder(batch):
    """Collate a list of per-sample dicts into a single dict for one batch.

    Every key of the first sample is gathered across the batch. The 'ids'
    and 'mask' sequences are right-padded with 0 to the longest sequence in
    the batch (batch dimension first), and 'targets' are stacked along a new
    leading dimension. Any other key is left as a plain Python list.
    """
    collated = {}
    for key in batch[0]:
        collated[key] = [sample[key] for sample in batch]

    for seq_key in ('ids', 'mask'):
        collated[seq_key] = pad_sequence(
            collated[seq_key], batch_first=True, padding_value=0
        )
    collated['targets'] = torch.stack(collated['targets'], dim=0)
    return collated

In [6]:
class Dataset:
    """Map-style dataset that tokenizes one text per item and pairs it with a target.

    Args:
        text: indexable collection of texts (each item is converted with ``str``).
        target: indexable collection of numeric targets aligned with ``text``.

    Each item is a dict with:
        "ids":     1-D tensor of token ids (unpadded, truncated to ``max_len``),
        "mask":    1-D attention mask matching "ids",
        "targets": 0-D float tensor holding the target value.

    Sequences are deliberately left unpadded here; padding to the longest
    sequence in the batch is done by the collate function.
    """

    def __init__(self, text, target):
        self.text = text
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(str(self.text[item]).split())

        # Use the tokenizer's __call__ API rather than the legacy encode_plus.
        # No per-sample padding is requested: the original combination of
        # padding=True (a no-op for a single sequence) and the legacy
        # pad_to_max_length=False flag is replaced by an explicit padding=False.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='only_first',
            padding=False,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension of size 1; drop it.
        return {
            "ids": encoded["input_ids"][0],
            "mask": encoded["attention_mask"][0],
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }


In [7]:
class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error: sqrt(MSE(log1p(pred), log1p(actual)) + eps).

    Args:
        eps: small constant added under the square root so the gradient stays
            finite when the error is exactly zero. Defaults to 1e-6, the value
            that was previously hard-coded; exposed as a parameter to match
            the RMSELoss signature.

    Note:
        Inputs must be greater than -1; the logarithm is undefined otherwise
        and the loss becomes NaN.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, pred, actual):
        # log1p(x) is log(x + 1) computed without first rounding x + 1, so
        # small values are not lost to floating-point cancellation.
        log_mse = self.mse(torch.log1p(pred), torch.log1p(actual))
        return torch.sqrt(log_mse + self.eps)

In [7]:
class RMSELoss(nn.Module):
    """Root mean squared error with a small stabilising constant.

    Computes sqrt(MSE(yhat, y) + eps); ``eps`` (default 1e-6) is added
    under the square root.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        mean_sq_err = self.mse(yhat, y)
        return (mean_sq_err + self.eps).sqrt()

In [8]:
def loss_fn(outputs, targets):
    """Return the RMSE between ``outputs`` and ``targets``.

    ``targets`` is reshaped to a single column (``view(-1, 1)``) before the
    comparison.
    """
    column_targets = targets.view(-1, 1)
    criterion = RMSELoss()
    return criterion(outputs, column_targets)

In [76]:
import pdb

In [9]:
class BertClassifier(pl.LightningModule):
    """Pretrained encoder with a mean+max pooled linear head producing one value per sample.

    The encoder named by BERT_PATH produces per-token states; these are
    mean- and max-pooled over the sequence (ignoring padded positions),
    concatenated, passed through dropout and projected to a single output.
    Training and validation both use ``loss_fn``.
    """

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.1)
        # The head consumes [mean ; max] pooled states, hence twice the
        # encoder width (taken from the config instead of a hard-coded 768).
        self.out = nn.Linear(self.bert.config.hidden_size * 2, 1)

    def forward(self, ids, mask):
        """Return one prediction per sample, shaped (batch, 1).

        Args:
            ids: padded token ids, one row per sample.
            mask: attention mask aligned with ``ids`` (1 = real token, 0 = padding).
        """
        # assumes the first returned element holds per-token states shaped
        # (batch, seq_len, hidden) — the pooling below reduces over dim 1.
        o1, _ = self.bert(
            ids,
            attention_mask=mask,
            return_dict=False
        )

        # Pool over real tokens only. Pooling over padded positions as well
        # would make a sample's prediction depend on how much padding its
        # batch happened to need.
        keep = mask.unsqueeze(-1).to(o1.dtype)
        token_counts = keep.sum(dim=1).clamp(min=1.0)
        mean_pooling = (o1 * keep).sum(dim=1) / token_counts
        lowest = torch.finfo(o1.dtype).min
        max_pooling, _ = o1.masked_fill(keep == 0, lowest).max(dim=1)
        cat = torch.cat((mean_pooling, max_pooling), 1)

        return self.out(self.bert_drop(cat))

    def _shared_step(self, batch):
        """Run the model on one collated batch; return (loss, outputs, targets)."""
        targets = batch["targets"]
        outputs = self(ids=batch["ids"], mask=batch["mask"])
        return loss_fn(outputs, targets), outputs, targets

    def training_step(self, batch, batch_idx):
        loss, _, _ = self._shared_step(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, outputs, targets = self._shared_step(batch)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return {'loss': loss, "preds": outputs, "labels": targets}

    def validation_epoch_end(self, outputs):
        # Recompute the metric over the whole validation set at once: the
        # mean of per-batch RMSE values is not the RMSE of all samples.
        preds = torch.cat([x['preds'] for x in outputs])
        targets = torch.cat([x['labels'] for x in outputs])
        self.log('rmse', loss_fn(preds, targets), prog_bar=True)

    def setup(self, stage):
        if stage == 'fit':
            train_loader = self.train_dataloader()
            # Optimizer steps for the whole run: full batches per epoch times
            # EPOCHS. Keep the Trainer's max_epochs in agreement with EPOCHS,
            # otherwise the schedule will not span the actual training run.
            steps_per_epoch = len(train_loader.dataset) // TRAIN_BATCH_SIZE
            self.total_steps = int(steps_per_epoch * EPOCHS)

    def configure_optimizers(self):
        """Build AdamW (no weight decay on bias/LayerNorm parameters) with a linear-decay schedule."""
        no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
        decayed, undecayed = [], []
        for name, param in self.named_parameters():
            if any(tag in name for tag in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)

        optimizer = AdamW(
            [
                {"params": decayed, "weight_decay": 0.001},
                {"params": undecayed, "weight_decay": 0.0},
            ],
            lr=1e-5,
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=self.total_steps
        )
        # The scheduler was previously created but never returned, so the
        # learning rate stayed constant. It is stepped once per optimizer
        # step because total_steps counts batches, not epochs.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
      

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Load the train and test tables copied into /content.
df_train_full = pd.read_parquet('/content/bert_train.pq')
df_test = pd.read_parquet('/content/bert_test.pq')

# 70/30 train/validation split. random_state is fixed so that the split is
# the same on every run; without it the validation rows change each time
# the notebook is executed.
df_train, df_valid = train_test_split(
    df_train_full, train_size=0.7, random_state=0
)

# Re-index from 0 so positional lookups by the dataset line up with rows.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

In [12]:
def merg(x):
    """Join the non-None elements of ``x`` into one space-separated string.

    Each kept element is converted with ``str``. Returns an empty string
    when ``x`` is empty or contains only None values.
    """
    return " ".join(str(part) for part in x if part is not None)

In [13]:
# Build the model input text for every split by joining the three free-text columns.
for _frame in (df_train, df_valid, df_test):
    _frame['txt'] = _frame[['position', 'responsibilities', 'achievements']].apply(merg, axis=1)

In [14]:
# The regression target is log(salary + 1) for the two labelled splits.
for _frame in (df_train, df_valid):
    _frame['target'] = np.log(_frame['salary'] + 1)

In [15]:

# Training data: reshuffled every epoch so batches differ between epochs.
# drop_last=True keeps every training batch at TRAIN_BATCH_SIZE, which is
# what BertClassifier.setup assumes when counting optimizer steps.
train_dataset = Dataset(
    text=df_train.txt.values, target=df_train.target.values
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    collate_fn=padder,
    pin_memory=False,
    drop_last=True
)

# Validation data: fixed order, and the final partial batch is kept so that
# every validation row contributes to the reported metric.
valid_dataset = Dataset(
    text=df_valid.txt.values, target=df_valid.target.values
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    collate_fn=padder,
    pin_memory=False,
    drop_last=False
)

In [20]:
# The test split has no labels: zeros are supplied as placeholder targets so
# the same Dataset / padder pipeline can be reused for inference.
test_dataset = Dataset(text=df_test.txt.values, target=[0.0] * len(df_test))

test_data_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    collate_fn=padder,
    batch_size=32,
    num_workers=4,
    pin_memory=False,
    drop_last=False,
)

In [17]:
from pytorch_lightning import Trainer, seed_everything
# Seed the random number generators through Lightning's helper.
# NOTE(review): this cell comes after the train/validation split cell, so
# this seed does not influence that split.
seed_everything(0)

0

In [17]:
# Build the model; BertClassifier.__init__ loads the pretrained encoder named by BERT_PATH.
model = BertClassifier()

In [19]:
# Train for EPOCHS epochs — the same value BertClassifier.setup uses to size
# the learning-rate schedule — instead of a separate hard-coded count.
# Use one GPU when available, otherwise fall back to the Trainer default (CPU).
trainer = Trainer(
    gpus=1 if torch.cuda.is_available() else None,
    progress_bar_refresh_rate=20,
    max_epochs=EPOCHS,
)
trainer.fit(model, train_data_loader, valid_data_loader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | bert      | BertModel | 177 M 
1 | bert_drop | Dropout   | 0     
2 | out       | Linear    | 1.5 K 
----------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [20]:
# Persist only the model weights (state_dict) to MODEL_PATH.
torch.save(model.state_dict(), MODEL_PATH)

In [19]:
# Reload the weights from the same path they were saved to (MODEL_PATH)
# rather than a duplicated string literal.
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [23]:
# Back up the saved weights to the Drive folder under a checkpoint-specific name.
!cp 'model.bin' "/content/drive/MyDrive/PandemicHack/rubert-base-cased-sentence.bin"

In [24]:
# Predict on the test split. The device falls back to CPU when CUDA is not
# available instead of failing on a hard-coded "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(DEVICE)
model.eval()  # disable dropout for inference
model.to(device)

result = []  # one [value] list per test row, in loader order
with torch.no_grad():
    for batch in tqdm(test_data_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        # No detach() is needed: nothing is tracked under no_grad.
        result += model(ids=ids, mask=mask).cpu().numpy().tolist()

HBox(children=(FloatProgress(value=0.0, max=4102.0), HTML(value='')))




In [25]:
import joblib

In [26]:
# Flatten the per-row [value] lists into a 1-D array and write it to pred.bin.
joblib.dump(value=torch.tensor(result).reshape(-1).numpy(), filename='pred.bin')

['pred.bin']

In [27]:
# Copy the test predictions to the Drive folder as pred2.bin.
!cp 'pred.bin' "/content/drive/MyDrive/PandemicHack/pred2.bin"

In [29]:
# Score the complete training table too: build its text column, then wrap it
# with placeholder zero targets so the inference pipeline can be reused.
df_train_full['txt'] = (
    df_train_full[['position', 'responsibilities', 'achievements']]
    .apply(merg, axis=1)
)
full_train_dataset = Dataset(
    text=df_train_full.txt.values, target=[0.0] * len(df_train_full)
)

full_train_data_loader = torch.utils.data.DataLoader(
    dataset=full_train_dataset,
    collate_fn=padder,
    batch_size=64,
    num_workers=4,
    pin_memory=False,
    drop_last=False,
)

In [30]:
# Predict on the full training table. The device falls back to CPU when CUDA
# is not available instead of failing on a hard-coded "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(DEVICE)
model.eval()  # disable dropout for inference
model.to(device)

result = []  # one [value] list per row, in loader order
with torch.no_grad():
    for batch in tqdm(full_train_data_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        # No detach() is needed: nothing is tracked under no_grad.
        result += model(ids=ids, mask=mask).cpu().numpy().tolist()

HBox(children=(FloatProgress(value=0.0, max=4786.0), HTML(value='')))




In [32]:
# Flatten the per-row [value] lists into a 1-D array and write it to train_pred.bin.
joblib.dump(value=torch.tensor(result).reshape(-1).numpy(), filename='train_pred.bin')

['train_pred.bin']

In [33]:
# Copy both prediction files to the Drive folder.
# NOTE(review): the pred.bin copy repeats one already made in an earlier cell.
!cp 'pred.bin' "/content/drive/MyDrive/PandemicHack/pred2.bin"
!cp 'train_pred.bin' "/content/drive/MyDrive/PandemicHack/train_pred2.bin"