<a href="https://colab.research.google.com/github/chandrakanta-chaudhury/Misc/blob/master/Jigsaw_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
# Use .get(): indexing os.environ raises KeyError when the variable is missing,
# which would hide the helpful assertion message below.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [43]:
# Select the PyTorch/XLA build to install; VERSION is forwarded to the setup
# script through its --version flag.
VERSION = "1.5"  #@param ["1.5" , "20200325", "nightly"]
# Download the environment setup script from the pytorch/xla repository and run it.
# NOTE(review): the script is fetched from the `master` branch, so its contents
# are not pinned and may change between runs.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  3727  100  3727    0     0  22587      0 --:--:-- --:--:-- --:--:-- 22865
Updating TPU and VM. This may take around 2 minutes.
Updating TPU runtime to pytorch-1.5 ...
Uninstalling torch-1.5.0a0+ab660ae:
  Successfully uninstalled torch-1.5.0a0+ab660ae
Uninstalling torchvision-0.6.0a0+3c254fb:
  Successfully uninstalled torchvision-0.6.0a0+3c254fb
Copying gs://tpu-pytorch/wheels/torch-1.5-cp36-cp36m-linux_x86_64.whl...
\ [1 files][ 79.0 MiB/ 79.0 MiB]                                                
Operation completed over 1 objects/79.0 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-1.5-cp36-cp36m-linux_x86_64.whl...
- [1 files][106.7 MiB/106.7 MiB]                                                
Operation comp

In [0]:
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
import sys
from sklearn import metrics, model_selection

import warnings
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils
import warnings

warnings.filterwarnings("ignore")


In [45]:
# Install Hugging Face transformers (imported by the import cell of this notebook).
# NOTE(review): this cell sits after the cell that does `import transformers`,
# so on a fresh kernel it has to be executed first — consider moving it up.
# NOTE(review): no version is pinned; the installed release may differ between runs.
!pip install transformers



In [0]:
class AverageMeter:
    """Running-mean tracker.

    Attributes (all set to 0 by `reset`):
        val: the value passed to the most recent `update` call.
        sum: weighted total of every value seen so far.
        count: total weight seen so far.
        avg: `sum / count`, refreshed on every `update`.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Put every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` with weight `n` and recompute the running mean."""
        self.val = val
        self.sum += n * val if isinstance(val, (int, float)) else val * n
        self.count += n
        self.avg = self.sum / self.count

In [0]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by a mean+max pooled, single-logit linear head.

    Args:
        bert_path: Name or path handed to `transformers.BertModel.from_pretrained`.
    """

    def __init__(self, bert_path):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        # The head consumes the mean-pooled and max-pooled vectors concatenated,
        # hence twice the encoder width (768 for BERT-base).
        self.out = nn.Linear(self.bert.config.hidden_size * 2, 1)

    def forward(
            self,
            ids,
            mask,
            token_type_ids
    ):
        """Return one raw (pre-sigmoid) logit per example.

        Args:
            ids: Token-id tensor passed to the encoder as its first input.
            mask: Attention mask forwarded as `attention_mask`.
            token_type_ids: Segment ids forwarded as `token_type_ids`.

        Returns:
            Tensor with a trailing dimension of size 1 produced by the linear head.
        """
        encoded = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)

        # Index rather than tuple-unpack: indexing works both when the encoder
        # returns a plain tuple and when it returns a dict-like output object
        # (whose iteration yields key names instead of tensors).
        sequence_output = encoded[0]

        # Pool over dimension 1 of the per-token output. Every position
        # contributes, i.e. `mask` is not applied to the pooling itself.
        apool = torch.mean(sequence_output, 1)
        mpool, _ = torch.max(sequence_output, 1)
        cat = torch.cat((apool, mpool), 1)

        return self.out(self.bert_drop(cat))


In [0]:
class BERTDatasetTraining:
    """Map-style dataset that turns (comment, target) pairs into fixed-length BERT inputs.

    Args:
        comment_text: Indexable collection of comments (each is coerced with `str`).
        targets: Indexable collection of targets aligned with `comment_text`.
        tokenizer: Object exposing `encode_plus` that returns a mapping with the
            keys "input_ids", "token_type_ids" and "attention_mask".
        max_length: Exact length (>= 1) of every returned sequence.
    """

    def __init__(self, comment_text, targets, tokenizer, max_length):
        self.comment_text = comment_text
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.targets = targets

    def __len__(self):
        return len(self.comment_text)

    def _fit(self, values):
        """Clip or right-pad `values` with zeros to exactly `max_length` entries."""
        values = list(values)
        if len(values) > self.max_length:
            # The tokenizer did not truncate for us. Keep the final element
            # (the closing special token added by add_special_tokens=True)
            # so the clipped sequence is still properly terminated.
            values = values[: self.max_length - 1] + values[-1:]
        return values + [0] * (self.max_length - len(values))

    def __getitem__(self, item):
        """Return the tensors for example `item`.

        Returns:
            dict with long tensors 'ids', 'mask', 'token_type_ids' (each of
            length `max_length`) and a float tensor 'targets'.
        """
        # Collapse every run of whitespace into a single space.
        comment_text = " ".join(str(self.comment_text[item]).split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
        )

        # Without this, an over-long encoding yields a negative padding length
        # (so no padding) and items of differing lengths, which cannot be
        # stacked into a batch.
        ids = self._fit(inputs["input_ids"])
        mask = self._fit(inputs["attention_mask"])
        token_type_ids = self._fit(inputs["token_type_ids"])

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.float)
        }


In [0]:
def _run():
    def loss_fn(outputs, targets):
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
        model.train()
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            loss = loss_fn(outputs, targets)
            if bi % 10 == 0:
                xm.master_print(f'bi={bi}, loss={loss}')

            loss.backward()
            xm.optimizer_step(optimizer)
            if scheduler is not None:
                scheduler.step()

    def eval_loop_fn(data_loader, model, device):
        model.eval()
        fin_targets = []
        fin_outputs = []
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            targets_np = targets.cpu().detach().numpy().tolist()
            outputs_np = outputs.cpu().detach().numpy().tolist()
            fin_targets.extend(targets_np)
            fin_outputs.extend(outputs_np)    

        return fin_outputs, fin_targets

In [0]:
# Training configuration shared by the cells below.
MAX_LEN = 192  # passed as max_length to both the training and validation datasets
TRAIN_BATCH_SIZE = 128  # batch size of the training DataLoader (the validation loader sets its own)
EPOCHS = 100  # iterations of the epoch loop; also scales num_train_steps for the LR schedule

In [0]:
# Install the Kaggle CLI. The competition data is fetched later with:
#   kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification
!pip install -q kaggle

In [0]:
from google.colab import files


In [0]:
!mkdir .kaggle

In [0]:
!mkdir ~/.kaggle

In [0]:
# Make sure the credentials file exists before it is written and chmod-ed below.
!touch ~/.kaggle/kaggle.json

In [0]:
import getpass
import json
import os

# SECURITY: never hard-code the Kaggle API key in the notebook. Any key that
# was ever committed here must be treated as leaked and revoked from the
# Kaggle account page. Credentials are taken from the environment when
# available, otherwise requested interactively (the key is not echoed).
api_token = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
    "key": os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: "),
}

# Resolve the home directory rather than assuming /root.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")

with open(kaggle_json_path, "w") as token_file:
    json.dump(api_token, token_file)

# Restrict the credentials file to owner read/write (same as `chmod 600`).
os.chmod(kaggle_json_path, 0o600)

In [19]:
# Download all files of the competition as .zip archives / CSVs into the
# current working directory (requires the Kaggle credentials file).
!kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification

Downloading test.csv.zip to /content
 40% 5.00M/12.4M [00:00<00:00, 15.0MB/s]
100% 12.4M/12.4M [00:00<00:00, 31.3MB/s]
Downloading jigsaw-toxic-comment-train-processed-seqlen128.csv.zip to /content
 90% 72.0M/79.6M [00:00<00:00, 138MB/s]
100% 79.6M/79.6M [00:00<00:00, 180MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/612k [00:00<?, ?B/s]
100% 612k/612k [00:00<00:00, 132MB/s]
Downloading jigsaw-toxic-comment-train.csv.zip to /content
 80% 30.0M/37.3M [00:00<00:00, 44.2MB/s]
100% 37.3M/37.3M [00:00<00:00, 77.0MB/s]
Downloading jigsaw-unintended-bias-train.csv.zip to /content
 95% 277M/292M [00:02<00:00, 152MB/s]
100% 292M/292M [00:02<00:00, 125MB/s]
Downloading test-processed-seqlen128.csv.zip to /content
 77% 23.0M/29.8M [00:00<00:00, 48.4MB/s]
100% 29.8M/29.8M [00:00<00:00, 97.8MB/s]
Downloading validation.csv.zip to /content
  0% 0.00/1.35M [00:00<?, ?B/s]
100% 1.35M/1.35M [00:00<00:00, 92.1MB/s]
Downloading jigsaw-unintended-bias-train-processed-seqlen128.csv.zip to /c

In [20]:
!unzip /content/jigsaw-toxic-comment-train.csv.zip

Archive:  /content/jigsaw-toxic-comment-train.csv.zip
  inflating: jigsaw-toxic-comment-train.csv  


In [21]:
!unzip /content/jigsaw-unintended-bias-train.csv.zip

Archive:  /content/jigsaw-unintended-bias-train.csv.zip
  inflating: jigsaw-unintended-bias-train.csv  


In [0]:
# Fixed seed so the shuffled 400k-row training subset is identical on every run.
SEED = 42

# Only the comment text and the `toxic` column are needed; missing values become "none".
df_train1 = pd.read_csv("/content/jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"]).fillna("none")
df_train2 = pd.read_csv("/content/jigsaw-unintended-bias-train.csv", usecols=["comment_text", "toxic"]).fillna("none")
df_train_full = pd.concat([df_train1, df_train2], axis=0).reset_index(drop=True)
# Shuffle all rows reproducibly, then keep the first 400000.
df_train = df_train_full.sample(frac=1, random_state=SEED).reset_index(drop=True).head(400000)
    

In [23]:
!unzip /content/validation.csv.zip

Archive:  /content/validation.csv.zip
  inflating: validation.csv          


In [0]:
# Validation split; its `comment_text` and `toxic` columns are consumed by later cells.
df_valid = pd.read_csv('/content/validation.csv')

In [0]:
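# Get the pretrained multilingual BERT files (Kaggle dataset abhishek/bert-base-multilingual-uncased) and place them under /content/bert-base-multilingual-uncased before running the cells below.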
#!wget https://www.kaggle.com/abhishek/bert-base-multilingual-uncased

In [56]:
# Load the tokenizer from the local pretrained directory (the same location the
# model is loaded from later); do_lower_case=True matches the uncased checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained("/content/bert-base-multilingual-uncased", do_lower_case=True)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [0]:
# Pull the `toxic` column of each split out as a plain array of targets.
train_targets = df_train["toxic"].values
valid_targets = df_valid["toxic"].values

   

In [0]:
 # Training dataset: comments and targets of df_train, encoded to MAX_LEN tokens.
 train_dataset = BERTDatasetTraining(
        comment_text=df_train.comment_text.values,
        targets=train_targets,
        tokenizer=tokenizer,
        max_length=MAX_LEN
    )

In [0]:
# Sampler that gives this replica (rank = xm.get_ordinal()) its share of the
# training set, out of xm.xrt_world_size() replicas, in shuffled order.
train_sampler = torch.utils.data.distributed.DistributedSampler(
          train_dataset,
          num_replicas=xm.xrt_world_size(),
          rank=xm.get_ordinal(),
          shuffle=True)

# Training batches of TRAIN_BATCH_SIZE; drop_last=True discards a final
# incomplete batch so every batch has the same size.
train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        sampler=train_sampler,
        drop_last=True,
        num_workers=4
    )

# Validation dataset built with the same tokenizer and MAX_LEN as training.
valid_dataset = BERTDatasetTraining(
        comment_text=df_valid.comment_text.values,
        targets=valid_targets,
        tokenizer=tokenizer,
        max_length=MAX_LEN
    )


In [0]:
# Per-replica sampler for the validation set; shuffle=False keeps a fixed order.
valid_sampler = torch.utils.data.distributed.DistributedSampler(
          valid_dataset,
          num_replicas=xm.xrt_world_size(),
          rank=xm.get_ordinal(),
          shuffle=False)

# Validation batches of 64; drop_last=False so the final partial batch is
# still evaluated.
valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=64,
        sampler=valid_sampler,
        drop_last=False,
        num_workers=1
    )

In [61]:
# Put the model on the XLA device.
device = xm.xla_device()
model = BERTBaseUncased(bert_path="/content/bert-base-multilingual-uncased/").to(device)

# Two optimizer groups: parameters whose name contains one of the `no_decay`
# fragments get weight_decay 0.0, everything else gets 0.001.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']


def _skips_decay(param_name):
    """True when `param_name` contains any of the `no_decay` fragments."""
    return any(fragment in param_name for fragment in no_decay)


optimizer_grouped_parameters = [
    {
        'params': [param for name, param in param_optimizer if not _skips_decay(name)],
        'weight_decay': 0.001,
    },
    {
        'params': [param for name, param in param_optimizer if _skips_decay(name)],
        'weight_decay': 0.0,
    },
]

# Learning rate is scaled by the number of replicas; the step count feeds the
# linear decay schedule below.
lr = 3e-5 * xm.xrt_world_size()
num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)
xm.master_print(f'num_train_steps = {num_train_steps}, world_size={xm.xrt_world_size()}')

optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
# Linear decay to zero over num_train_steps, with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

OSError: ignored

In [0]:
# Train, evaluate and checkpoint once per epoch.
for epoch in range(EPOCHS):
    # A shuffling DistributedSampler derives its permutation from the epoch
    # number; without set_epoch every epoch would iterate in the same order.
    train_sampler.set_epoch(epoch)

    para_loader = pl.ParallelLoader(train_data_loader, [device])
    train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=scheduler)

    para_loader = pl.ParallelLoader(valid_data_loader, [device])
    o, t = eval_loop_fn(para_loader.per_device_loader(device), model, device)
    # Overwrite the checkpoint with the latest weights.
    xm.save(model.state_dict(), "model.bin")
    # Targets are binarised at 0.5; the raw model outputs serve as the ranking scores.
    auc = metrics.roc_auc_score(np.array(t) >= 0.5, o)
    xm.master_print(f'AUC = {auc}')
