In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
from torch import nn

from transformers import BertTokenizer
from transformers import BertModel
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from catalyst import dl
from catalyst import dl, utils

In [None]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample Jaccard similarity between true and predicted label sets.

    For every sample the score is ``|true ∩ pred| / |true ∪ pred|`` where the
    sets hold the indices of the non-zero entries of the row. A sample with no
    true and no predicted labels scores 1.

    Args:
        y_true: array-like of shape (n_samples, n_labels) with truthy entries
            marking the labels that apply (ndarray, nested list, DataFrame...).
        y_pred: array-like with the same layout holding the predictions.
        normalize: accepted for signature compatibility; currently ignored.
        sample_weight: accepted for signature compatibility; currently ignored.

    Returns:
        The average of the per-sample scores (``nan`` when there are no samples).

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` have a different number of rows.
    """
    # Coerce to ndarray so that row access is positional. Without this a nested
    # list has no ``.shape`` and a DataFrame would index *columns* by label.
    true_rows = np.asarray(y_true)
    pred_rows = np.asarray(y_pred)
    if true_rows.shape[0] != pred_rows.shape[0]:
        raise ValueError(
            "y_true and y_pred must have the same number of samples, got "
            f"{true_rows.shape[0]} and {pred_rows.shape[0]}"
        )

    per_sample = []
    for true_row, pred_row in zip(true_rows, pred_rows):
        true_labels = set(np.where(true_row)[0])
        pred_labels = set(np.where(pred_row)[0])
        union = true_labels | pred_labels
        if not union:
            # Nothing expected and nothing predicted counts as a perfect match.
            per_sample.append(1)
        else:
            per_sample.append(len(true_labels & pred_labels) / float(len(union)))
    return np.mean(per_sample)

In [None]:
from util import *

In [None]:
# Load the train/test splits. `lines=True` makes pandas parse each file as
# newline-delimited JSON (one record per line).
# NOTE(review): the paths are relative to the kernel's working directory.
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")
testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

In [None]:
# Label columns to predict: every entry of `all_tiers_100` except
# "PersonalizedProduct".
# NOTE(review): `all_tiers_100` is presumably provided by `from util import *`
# and presumably holds column-name strings - confirm.
# The names are sorted because iterating a set of strings follows per-process
# hash order: the column order (and so the meaning of each model output)
# could otherwise change between kernel restarts and stop matching a saved
# checkpoint.
subset = sorted(set(all_tiers_100) - {"PersonalizedProduct"})
subset

In [None]:
# Collapse the per-label columns listed in `subset` into one list-valued
# 'labels' column (one 0/1 integer per label, in `subset` order) on each split.
for frame in (training_set, testing_set):
    frame['labels'] = frame[subset].astype(int).values.tolist()

In [None]:
# training_set['label'] = training_set.AnalysisAndModeling.astype(int)
# testing_set['label'] = testing_set.AnalysisAndModeling.astype(int)

In [None]:
#training_set.label

In [None]:
#training_data = Dataset.from_pandas(training_set, split="training")
#testing_data = Dataset.from_pandas(testing_set, split="testing")

In [None]:
# --- Run configuration -------------------------------------------------------
MAX_LEN = 1600  # tokenizer max_length; handed to MultiLabelDataset as `max_len`
TRAIN_BATCH_SIZE = 6  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the testing/validation DataLoader
EPOCHS = 1  # NOTE(review): verify the training cell reads this rather than a literal
LEARNING_RATE = 1e-05  # NOTE(review): the optimizer cell passes its own literal lr - confirm which value is intended
SEED = 17  # NOTE(review): no visible cell applies this seed
PRED_THRES = 0.4  # threshold given to dl.MultiLabelAccuracyCallback
ACCUM_STEPS = 10  # accumulation_steps given to dl.OptimizerCallback

In [None]:
# Hugging Face checkpoint id, shared by the tokenizer here and by the model
# cells below.
model_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class MultiLabelDataset(Dataset):
    """Dataset that tokenizes the `claims` text and pairs it with `labels`.

    Args:
        dataframe: frame with a `claims` column (text) and a `labels` column
            (one list of 0/1 values per row).
        tokenizer: object exposing a Hugging Face style `encode_plus` method.
        max_len: every sample is padded / truncated to exactly this many tokens.

    Each item is a dict with `input_ids`, `attention_mask`, `token_type_ids`
    (long tensors of length `max_len`) and `targets` (float tensor).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.claims
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # DataLoader samplers produce positions 0..len-1, so look rows up by
        # position: label-based `series[index]` breaks as soon as the frame's
        # index is not a clean 0..n-1 range (e.g. after filtering).
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit, non-deprecated spelling of "always return max_len tokens":
            # fixed-size samples are what lets the default collate stack a batch.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Wrap both splits in MultiLabelDataset, sharing the tokenizer and MAX_LEN.
training_dataset = MultiLabelDataset(training_set, tokenizer, MAX_LEN)
testing_dataset = MultiLabelDataset(testing_set, tokenizer, MAX_LEN)

In [None]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation must keep the dataset order: the prediction cell stacks the
# outputs of this loader and compares them row by row with
# `testing_set[subset]`, so a shuffled loader would score every prediction
# against some other sample's labels.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_dataset, **train_params)
testing_loader = DataLoader(testing_dataset, **test_params)

In [None]:
# Loaders keyed by name for `runner.train`; the prediction cell reads
# loaders["valid"].
loaders = {"train": training_loader, "valid": testing_loader}

In [None]:
# NOTE(review): `base_model` is not referenced by any other visible cell - the
# `Model` class loads its own encoder through AutoModel. Confirm whether this
# second copy of the checkpoint is still needed.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, gradient_checkpointing=True)

In [None]:
class Model(torch.nn.Module):
    """Pretrained encoder followed by a two-layer multi-label classification head.

    The hidden state of the first token is passed through
    Linear -> sigmoid -> dropout -> Linear and returned as raw logits, one per
    entry of `subset` (pair with a logits-based loss such as BCEWithLogitsLoss).
    """

    def __init__(self):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name, gradient_checkpointing=True)
        # Read the width from the checkpoint's config instead of hard-coding 768
        # so that a differently sized `model_name` keeps working.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(hidden_size, len(subset))
        # Only Longformer encoders understand `global_attention_mask`.
        self._uses_global_attention = (
            getattr(self.l1.config, "model_type", None) == "longformer"
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return logits of shape (batch, len(subset)).

        `token_type_ids` is accepted because the runner feeds it by key, but it
        is not forwarded to the encoder.
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if self._uses_global_attention:
            # Longformer attends locally by default. Give the first token, whose
            # state is pooled below, global attention so that it can see the
            # whole sequence, as the Longformer docs advise for classification.
            global_attention_mask = torch.zeros_like(attention_mask)
            global_attention_mask[:, 0] = 1
            encoder_inputs["global_attention_mask"] = global_attention_mask

        output_1 = self.l1(**encoder_inputs)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]  # state of the first token of every sequence
        pooler = self.pre_classifier(pooler)
        pooler = torch.sigmoid(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = Model()

In [None]:
# NOTE(review): `device` is not referenced by any other visible cell - confirm
# whether it is still needed.
device = utils.get_device()

In [None]:
# The model returns raw logits, which is the input BCEWithLogitsLoss expects.
criterion = torch.nn.BCEWithLogitsLoss()
# NOTE(review): this literal lr (1e-4) differs from LEARNING_RATE (1e-05) in the
# config cell - confirm which one is intended. Left unchanged on purpose.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
#scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
#lrfinder = dl.LRFinder(final_lr=1)

# `input_key` names the batch entries that are passed to `model.forward`.
runner = dl.SupervisedRunner(input_key=("input_ids", "attention_mask", "token_type_ids"))
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    logdir="./logdir",
    # Taken from the config cell so that editing EPOCHS actually changes the run.
    num_epochs=EPOCHS,
    callbacks=[
        dl.MultiLabelAccuracyCallback(threshold=PRED_THRES),
        # NOTE(review): patience=2 can never trigger while only one epoch runs.
        dl.EarlyStoppingCallback(patience=2, metric="loss", minimize=True),
        dl.TensorboardLogger(),
        dl.CheckpointCallback(),
        dl.OptimizerCallback(accumulation_steps=ACCUM_STEPS),
        dl.ValidationManagerCallback(),
        #lrfinder
        #dl.MetricManagerCallback(num_classes=len(subset), ),
    ],
    fp16=True,
    verbose=True
)

In [None]:
# (The dangling `runner.` attribute access that used to be here was an
# incomplete expression - a SyntaxError that stopped "Restart & Run All".)

In [None]:
# Run the best checkpoint over the validation loader and stack the per-batch
# logits into a single array; rows follow the loader's iteration order.
batch_outputs = runner.predict_loader(
    loader=loaders["valid"],
    resume=f"./logdir/checkpoints/best.pth",
)
predictions = np.vstack(
    [batch["logits"].cpu().numpy() for batch in batch_outputs]
)

In [None]:
# Peek at the ground-truth label columns that the predictions are scored against.
testing_set[subset].head()

In [None]:
# Turn logits into probabilities and binarise them with the same PRED_THRES
# that the training-time accuracy callback uses, so that the final report and
# the logged accuracy refer to the same decision rule (this line used a
# literal 0.5 before).
binary_predictions = torch.sigmoid(torch.from_numpy(predictions)) > PRED_THRES

In [None]:
# NOTE(review): this star import is what brings `classification_report` (used
# here) and `hamming_loss` (used in the next cell) into scope; prefer explicit
# names in the top import cell.
from sklearn.metrics import *
# Per-label precision / recall / F1 on the test split, labelled with `subset`.
print(classification_report(testing_set[subset].astype(int), binary_predictions, target_names=subset))

In [None]:
# Hamming loss over the test split, using sklearn's `hamming_loss` (from the
# star import in the previous cell).
# NOTE(review): unlike the classification report, the truth columns are not
# cast with `.astype(int)` here - confirm that their dtype is accepted as is.
hamming_loss(testing_set[subset], binary_predictions)