In [None]:
!pip install torch torchvision torchaudio neptune-client tqdm --quiet > /dev/null 
!pip install transformers==2.1.1 folium==0.2.1 pytorch-lightning --quiet > /dev/null
!apt install git git-lfs > /dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[31mERROR: Operation cancelled by user[0m




In [None]:
# Fetch the project sources and the pretrained CodeBERT checkpoint.
!git clone https://github.com/karlfroldan/prototype.git
!git clone https://huggingface.co/microsoft/codebert-base
# Move the contents of the project checkout into the working directory, then
# delete what is left of the checkout folder.
# NOTE(review): `prototype/*` does not match dotfiles, and re-running this cell
# will hit paths that already exist - presumably it is only run on a fresh runtime.
!mv prototype/* . 
!rm -rf prototype 
# Later cells load the tokenizer and model from `./codebert`.
!mv codebert-base codebert

In [None]:
# Run Git LFS inside the CodeBERT checkout, then return to the parent directory.
# NOTE(review): presumably the plain clone only holds LFS pointer files for the
# weights until `git lfs pull` runs - confirm.
%cd codebert 
!git lfs install
!git lfs pull 
%cd ..

In [None]:
import pandas as pd

import random
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader, SubsetRandomSampler

import numpy as np
import sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import MultiLabelBinarizer
from pytorch_lightning.loggers.neptune import NeptuneLogger

import neptune.new as neptune

import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor

import transformers
from transformers import RobertaTokenizer, RobertaModel

from torchvision.ops import sigmoid_focal_loss

from prototype_dataloader import get_datasets

from sklearn.metrics import f1_score, hamming_loss
import warnings

from tqdm.notebook import tqdm

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'using device: {device}')

def seed_everything(seed=42):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, configures cuDNN for deterministic behaviour and
    finally delegates to `pl.seed_everything`.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects hash randomisation of processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, so it
    # is switched off explicitly alongside `deterministic = True`.
    torch.backends.cudnn.benchmark = False
    pl.seed_everything(seed)

# Set the RNG
seed_everything(1729)


#torch.manual_seed(1729) # A tribute to Srinivasa Ramanujan
# quicksort          0
# mergesort          0
# selectionsort      0
# insertionsort      0
# bubblesort         1
# linearsearch       0
# binarysearch       0
# linkedlist         0
# hashmap            0
def get_labels(arr):
    """
    Translate a label indicator vector into the names of the active classes.

    Args:
        arr: Any object exposing `.tolist()`; entries are paired positionally
            with the fixed class-name list below.

    Returns:
        List of class names whose paired entry compares equal to 1, in
        class-list order.
    """
    class_names = (
        "quicksort", "mergesort", "selectionsort", "insertionsort", "bubblesort",
        "linearsearch", "binarysearch", "linkedlist", "hashmap",
    )
    flags = arr.tolist()
    # zip stops at the shorter sequence, so surplus entries on either side are ignored.
    return [name for name, flag in zip(class_names, flags) if flag == 1]

### Data Pre-processing

In [None]:
# Load the CodeBERT tokenizer and encoder from the local `./codebert` checkout.
tokenizer = RobertaTokenizer.from_pretrained("./codebert")
model = RobertaModel.from_pretrained("./codebert")

# Table describing the prototype dataset.
# NOTE(review): column layout is defined by `prototype_dataloader` - not visible here.
data_csv = pd.read_csv("prototype.csv")
# split=0.1 => split=1.0 because we want to use k-fold cross validation instead
# Previous hold-out variant, kept for reference:
#train_data, test_data = get_datasets(data_csv, tokenizer, split=0.1, data_folder='./data/prototype')
# With split=1.0 only the first returned dataset is used; the second is discarded.
data, _empty = get_datasets(data_csv, tokenizer, split=1.0, data_folder='./data/prototype')

### Model

In [None]:
class OurModel(pl.LightningModule):
    """
    Multi-label classification head on top of a frozen CodeBERT encoder.

    The encoder output is flattened and passed through `fc1`, an optional
    stack of hidden linear layers (each followed by ReLU) and a final linear
    layer that emits one raw logit per label. No sigmoid is applied in
    `forward`; the loss is expected to work on logits.

    Args:
        codebert: Pretrained encoder module; all its parameters are frozen.
        loss: Callable `loss(logits, targets)` returning a scalar tensor.
        input: Size of the flattened encoder output fed to the first linear
            layer (default 768 * 512 = 393_216).
        hidden: Optional iterable of hidden layer widths placed between the
            first linear layer and the output layer.
        labels: Number of output logits.
        train_rate: Learning rate for Adam.
        device: Unused; kept for backward compatibility. Lightning moves the
            module and its registered sub-modules to the right device.
    """

    def __init__(self, codebert, loss, input=393_216, hidden=None, labels=9, train_rate=1e-3, device='cuda'):
        super().__init__()

        self.transformer = codebert

        # Disable the gradients of codebert
        for param in self.transformer.parameters():
            param.requires_grad = False

        self.loss = loss
        self.train_rate = train_rate

        self.fc1 = nn.Linear(input, 420)
        self.hidden_is_none = hidden is None
        last = 420
        # nn.ModuleList registers the layers as sub-modules, so their weights
        # are returned by `parameters()`, follow the module across devices and
        # are saved in checkpoints. A plain Python list does none of that.
        hidden_layers = []
        for width in (hidden or []):
            hidden_layers.append(nn.Linear(last, width))
            last = width
        self.hidden = nn.ModuleList(hidden_layers)

        self.output = nn.Linear(last, labels)

    def get_preds(self, y):
        """Threshold a tensor of probabilities at 0.5 into 0/1 integer labels."""
        return (y >= 0.5).long()

    def get_preds_numpy(self, y):
        """Threshold a NumPy array of probabilities at 0.5 into 0/1 integer labels."""
        return (y >= 0.5).astype(int)

    def forward(self, x):
        """
        Compute raw logits for a batch of token ids.

        Args:
            x: Token id tensor passed straight to the encoder.
               NOTE(review): no attention mask is forwarded - confirm that
               attending to padding tokens is intended.

        Returns:
            Tensor of logits, one per label and sample. Apply a sigmoid to
            obtain probabilities.
        """
        # Take the first encoder output (the per-token hidden states) by index
        # rather than by unpacking, so the exact number/type of returned
        # values does not matter.
        out = self.transformer(x)[0]
        out = torch.flatten(out, 1)
        out = F.relu(self.fc1(out))
        for layer in self.hidden:
            out = F.relu(layer(out))
        return self.output(out)

    def configure_optimizers(self):
        """Adam over the trainable parameters (the frozen encoder is skipped)."""
        trainable = [p for p in self.parameters() if p.requires_grad]
        optimizer = torch.optim.Adam(trainable, lr=self.train_rate)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        """Compute and log the training loss for one batch."""
        X, y = train_batch
        X = X['input_ids']
        y_hat = self(X)

        loss = self.loss(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Log the loss, Hamming loss and micro/macro F1 for one validation batch."""
        X, y = val_batch
        X = X['input_ids']
        y_hat = self(X)
        loss = self.loss(y_hat, y)
        self.log('validation loss', loss)

        # Keep both arrays 2-D (samples x labels). A plain squeeze() would turn
        # a batch holding a single sample into a 1-D vector, which sklearn
        # then scores as a different (single-label) problem.
        n_labels = y_hat.shape[-1]
        y_prob = torch.sigmoid(y_hat)
        y_true = y.detach().reshape(-1, n_labels).cpu().numpy()
        y_pred = self.get_preds(y_prob).detach().reshape(-1, n_labels).cpu().numpy()

        hamming = hamming_loss(y_true, y_pred)
        f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=1)
        f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=1)

        self.log('hamming loss', hamming)
        self.log('Micro F1', f1_micro)
        self.log('Macro F1', f1_macro)
        
        

### The Loss Function

The criterion that this model will use is the **Focal Loss** which is defined as an extension of the **Cross-entropy loss**. 

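We know that the binary cross-entropy loss for a single label is defined as

$$
\mathrm{CE}(p_t) = -\log(p_t), \qquad
p_t = \begin{cases} p & \text{if } y = 1 \\ 1 - p & \text{otherwise} \end{cases}
$$

where $p$ is the predicted probability of the label. The focal loss multiplies it by a modulating factor so that well-classified examples contribute less:

$$
\mathrm{FL}(p_t) = -\alpha_t \, (1 - p_t)^{\gamma} \, \log(p_t)
$$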

In [None]:
class FocalLoss(nn.Module):
    """
    Focal loss on raw logits for binary / multi-label targets.

    Each element contributes `alpha_t * (1 - p_t) ** gamma * BCE`, where
    `p_t = exp(-BCE)` is the probability the model assigns to the true value
    of that element. The element-wise losses are averaged.

    Args:
        gamma: Focusing exponent; larger values down-weight easy examples more.
        alpha: Weight applied to elements whose target is 0; elements whose
            target is 1 are weighted by `1 - alpha`.
            NOTE(review): this is the reverse of the convention used by
            `torchvision.ops.sigmoid_focal_loss` - confirm it is intended.
        device: Device the weight table is placed on. A CUDA request is
            ignored when no GPU is available so the loss still works on CPU.
    """

    def __init__(self, gamma=4, alpha=0.1, device='cuda'):
        super(FocalLoss, self).__init__()
        # Registered as a (non-persistent) buffer so it follows `.to()` /
        # `.cuda()` of a parent module without changing saved checkpoints.
        self.register_buffer(
            'alpha',
            torch.tensor([alpha, 1 - alpha], dtype=torch.float32),
            persistent=False,
        )
        self.gamma = gamma

        self.bce = nn.BCEWithLogitsLoss(reduction='none')

        target_device = torch.device(device)
        if target_device.type == 'cuda' and torch.cuda.is_available():
            self.to(target_device)

    def forward(self, y_hat, y):
        """
        Args:
            y_hat: Logits.
            y: Targets with the same shape as `y_hat`, holding 0/1 values.

        Returns:
            Scalar tensor: mean focal loss over all elements.
        """
        y_prime = y.type(torch.float32)
        b = self.bce(y_hat, y_prime)

        # Look up the per-element class weight; the result has the shape of
        # `y`, so any number of labels is supported.
        alpha_t = self.alpha.to(b.device)[y.long()]

        # exp(-BCE) is the probability of the true value and never exceeds 1.
        # The clamp guards against rounding so the modulating factor cannot
        # go negative (which would yield NaN for fractional gamma).
        p_t = torch.exp(-b)
        modulating = (1 - p_t).clamp(min=0) ** self.gamma

        F_loss = alpha_t * modulating * b
        return F_loss.mean()

class CEWithLogitsLoss(nn.Module):
    """
    Mean binary cross-entropy on raw logits with optional per-label weights.

    Args:
        device: Device the weight tensor is placed on. A CUDA request is
            ignored when no GPU is available so the loss still works on CPU.
        weights: Optional sequence with one weight per label. Defaults to
            nine weights of 1, i.e. unweighted loss over nine labels.
    """

    def __init__(self, device='cuda', weights=None):
        super(CEWithLogitsLoss, self).__init__()
        if weights is None:
            weights = [1, 1, 1, 1, 1, 1, 1, 1, 1]
        class_weights = torch.tensor(weights, dtype=torch.float32)
        # BCEWithLogitsLoss keeps `weight` as a buffer, so moving this module
        # (or a parent module) moves the weights as well.
        self.bce = nn.BCEWithLogitsLoss(weight=class_weights, reduction='none')

        target_device = torch.device(device)
        if target_device.type == 'cuda' and torch.cuda.is_available():
            self.to(target_device)

    def forward(self, y_hat, y):
        """
        Args:
            y_hat: Logits.
            y: Targets with the same shape as `y_hat`; cast to float32.

        Returns:
            Scalar tensor: mean weighted binary cross-entropy.
        """
        y_prime = y.type(torch.float32)
        return self.bce(y_hat, y_prime).mean()

### Training

In [None]:
# Build both candidate criteria up front; the training cell picks one of them
# when it constructs the model.
focal_loss = FocalLoss(gamma=5, alpha=0.25, device='cuda')
crossentropy_loss = CEWithLogitsLoss(device='cuda')

In [None]:
# Some important stuff in our k-fold validation
# Folds are trained one per run (see `which_split`), so fold membership has to
# be identical every time: pin the shuffle with an explicit random_state
# instead of relying on the state of NumPy's global RNG.
kfold = KFold(n_splits=8, shuffle=True, random_state=1729)
print(f'kfold n-splits: {kfold.get_n_splits(data)}')

# Manual. Valid values are 0 to n_splits - 1 (0 to 7 for 8 splits).
which_split = 7

In [None]:
# Train on exactly one fold per run: training every fold in a single session
# is too expensive, so `which_split` selects the fold to use.
fold_splits = list(kfold.split(data))
if not 0 <= which_split < len(fold_splits):
    # Fail loudly instead of silently training nothing and leaving `m` and
    # `valset` undefined for the evaluation cell.
    raise ValueError(
        f'which_split={which_split} is out of range; expected 0..{len(fold_splits) - 1}'
    )
train_idxs, test_idxs = fold_splits[which_split]

train_sampler = SubsetRandomSampler(train_idxs)
test_sampler = SubsetRandomSampler(test_idxs)
trainset = DataLoader(data, batch_size=32, sampler=train_sampler)
# `valset` and `m` are module-level names reused by the evaluation cell.
valset = DataLoader(data, batch_size=32, sampler=test_sampler)

neptune_logger = NeptuneLogger(
    # Read the token from the environment so it never lands in the notebook;
    # falls back to the previous empty placeholder when the variable is unset.
    api_key=os.environ.get("NEPTUNE_API_TOKEN", ""),
    project="pancit-canton/Optimus",  # "<WORKSPACE/PROJECT>"
)

m = OurModel(model, loss=crossentropy_loss, train_rate=1e-5, hidden=None, device=device)
trainer = pl.Trainer(gpus=1, precision=16, max_epochs=2, log_every_n_steps=6, logger=neptune_logger)
trainer.fit(m, trainset, valset)

The hamming loss is defined as 
$$
\frac{1}{|N|\cdot|L|}\sum_{i=1}^{|N|}\sum_{j=1}^{|L|}\left(\widehat{y}_{i,j}\oplus y_{i,j}\right)
$$

In [None]:
import pickle as pkl


def get_preds(ys):
    """Threshold probabilities at 0.5 into hard 0/1 predictions."""
    return (ys >= 0.5).long()


def subset_accuracy(y_hat, y):
    """Exact-match indicator per sample: 1.0 where every label agrees, else 0.0."""
    return torch.all(y == y_hat, dim=-1).float()


def subset_pred(y_hat, y):
    """Exact-match indicator per sample, starting from predicted probabilities."""
    return subset_accuracy(get_preds(y_hat), y)


prediction = []
real = []
exact_matches = 0.0
n_samples = 0

# Inference mode: disables dropout in the encoder and skips autograd bookkeeping.
m.eval()
with torch.no_grad():
    for X, y in tqdm(valset):
        # The model may live on the GPU after training; send the inputs to it
        # and bring the outputs back to the CPU before converting to NumPy.
        input_ids = X['input_ids'].to(m.device)
        y_hat = torch.sigmoid(m(input_ids)).cpu()
        y = y.cpu()

        prediction.append(get_preds(y_hat).numpy())
        real.append(y.numpy())

        # Subset accuracy counts samples (not batches, not labels) whose whole
        # label vector is predicted correctly.
        exact_matches += subset_pred(y_hat, y).sum().item()
        n_samples += len(y)

subset_acc = exact_matches / n_samples if n_samples else float('nan')
print(f'Subset accuracy: {subset_acc}')

prediction = np.vstack(prediction)
real = np.vstack(real)

# Persist predictions and ground truth for offline analysis.
for fname, array in zip(['prediction.pkl', 'real.pkl'], [prediction, real]):
    with open(fname, 'wb') as f:
        pkl.dump(array, f)
        print(f'Dumped {fname}')