## Imports

In [2]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader,Dataset
from transformers import AutoModel,AutoTokenizer
from transformers import get_scheduler

from stabilizer.dataset import TextLabelDataset
from stabilizer.llrd import get_optimizer_parameters_with_llrd
from stabilizer.model import PoolerClassifier
from stabilizer.reinitialize import reinit_autoencoder_model
from stabilizer.trainer import train_step,evaluate_step

2021-09-27 09:48:57.887343: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


## Setting SEED

In [3]:

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic execution.

    Args:
        seed (int): Seed value applied to every generator.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only influences
    # processes launched after this call, not the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # The CUDA seeding calls are silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, so it
    # has to be disabled for reproducible results.
    torch.backends.cudnn.benchmark = False

## Training configurations

In [None]:
# Run settings read by the data-loading and training cells of this notebook.
config = {
    # Checkpoint name handed to AutoTokenizer / AutoModel.from_pretrained.
    'pretrained_model':'roberta-base',
    # Output size of the classification head (one logit for a binary target).
    'num_classes':1,
    'batch_size':32,
    # Use the GPU when one is present, otherwise fall back to CPU so the
    # notebook still runs end to end.
    'device_name':torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'lr':1e-5,
    # Passed to get_optimizer_parameters_with_llrd when 'llrd' is enabled.
    # The key is spelled the way the training function reads it.
    'multiplicative_lr':0.95,
    'llrd':False,
    # Passed to reinit_autoencoder_model; a falsy value skips re-initialisation.
    # Presumably the number of top layers to reset - TODO confirm.
    'reinit':2,
    'epochs':3,
    # Validation runs whenever the training-step counter is a multiple of this.
    'valid_every':15,
    # Scheduler name passed to transformers.get_scheduler.
    'scheduler':'linear',
    'seed':1000
}

# Seed all RNGs up front so every later cell runs from the same random state.
seed_everything(seed=config['seed'])

## Read data

In [None]:
# Load the labelled training split and the unlabelled test split from disk.
train_df = pd.read_csv(filepath_or_buffer="../input/nlp-getting-started/train.csv")
test_df = pd.read_csv(filepath_or_buffer="../input/nlp-getting-started/test.csv")

In [None]:
# Report how many rows were loaded, then preview the first five.
print(f"train dataset has {len(train_df)} samples")
train_df.head(n=5)

## Data preparation

In [None]:
# Hold out 20% of the labelled rows for validation, stratified on the target.
# random_state pins the partition so re-running this cell always yields the
# same split instead of depending on the current global NumPy RNG state.
train,valid = train_test_split(train_df,test_size=0.2,stratify=train_df['target'],
                               random_state=config['seed'])

In [None]:
# Prepare data to create dataset.
# Texts and targets come from the stratified `train` / `valid` partitions, not
# from the full `train_df`, so validation is measured on rows the model never
# trains on.
train_tweets = train['text'].tolist()
valid_tweets = valid['text'].tolist()
# Targets become float32 column vectors of shape (n, 1).
train_targets = torch.from_numpy(train['target'].to_numpy().reshape(-1, 1)).type(torch.float32)
valid_targets = torch.from_numpy(valid['target'].to_numpy().reshape(-1, 1)).type(torch.float32)

In [None]:
# Wrap the raw texts and label tensors in datasets, then batch them.
train_dataset = TextLabelDataset(labels=train_targets, text_excerpts=train_tweets)
valid_dataset = TextLabelDataset(labels=valid_targets, text_excerpts=valid_tweets)

# Only the training loader shuffles; the validation loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config['batch_size'])
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=config['batch_size'])


## Loss function and competition metric

In [None]:
# Binary cross-entropy computed on raw logits is the training / validation loss.
loss_fn = torch.nn.BCEWithLogitsLoss()
# F1 score is the quality metric reported during validation.
metric = f1_score


## Train and Evaluate

In [None]:
def _evaluate(model, tokenizer, dataloader, device):
    """Score ``model`` on every batch of ``dataloader``.

    Args:
        model: Classifier passed to ``evaluate_step``.
        tokenizer: Tokenizer used to encode each batch's ``'text_excerpt'``.
        dataloader: Iterable of batches with ``'text_excerpt'`` and ``'label'``.
        device: Device the encoded inputs and targets are moved to.

    Returns:
        tuple: ``(f1, mean_loss)`` where ``f1`` is computed once over all
        predictions of the pass and ``mean_loss`` is the per-batch loss
        averaged over the number of batches. Both are ``nan`` when the
        dataloader yields no batches.
    """
    total_loss = 0.0
    batch_targets, batch_predictions = [], []
    for batch in dataloader:
        inputs = tokenizer(batch['text_excerpt'],padding=True, truncation=True,return_tensors='pt').to(device)
        targets = batch['label'].to(device)
        outputs = evaluate_step(model=model, inputs=inputs, targets=targets, loss_fn=loss_fn)
        # The model emits logits; squash to probabilities and threshold by rounding.
        probabilities = torch.sigmoid(outputs['predictions']).detach().cpu().numpy()
        batch_predictions.append(np.round(probabilities))
        batch_targets.append(targets.detach().cpu().numpy())
        # Accumulate a plain float so no tensor (or graph) is kept alive.
        total_loss += float(outputs['loss'])

    if not batch_targets:
        return float('nan'), float('nan')

    # Score the whole pass at once: averaging per-batch F1 values is not the
    # F1 of the full set (and over-weights a smaller final batch).
    # NOTE(review): on binary labels average='micro' equals accuracy; switch
    # to average='binary' if positive-class F1 is what should be reported.
    f1 = metric(np.concatenate(batch_targets), np.concatenate(batch_predictions), average='micro')
    return f1, total_loss / len(batch_targets)


def train_and_eval(train_dataloader,valid_dataloader):
    """Fine-tune the configured transformer classifier with periodic validation.

    Builds the tokenizer and ``PoolerClassifier`` from
    ``config['pretrained_model']``, optionally re-initialises the model
    (``config['reinit']``), sets up AdamW with either layer-wise learning-rate
    decay (``config['llrd']``) or two weight-decay groups, and trains for
    ``config['epochs']`` epochs with a scheduler that warms up over the first
    10% of the steps. Validation F1 and loss are printed whenever the
    training-step counter is a multiple of ``config['valid_every']``, and the
    mean training loss is printed after every epoch.

    Args:
        train_dataloader: Yields batches with ``'text_excerpt'`` and ``'label'``
            used for optimisation.
        valid_dataloader: Yields batches of the same form used for validation.

    Returns:
        None. Progress is reported through ``print``.
    """
    tokenizer = AutoTokenizer.from_pretrained(config['pretrained_model'])
    transformer = AutoModel.from_pretrained(config['pretrained_model'])
    model = PoolerClassifier(transformer=transformer,
                             transformer_output_size=transformer.config.hidden_size,
                             transformer_output_dropout_prob=transformer.config.hidden_dropout_prob,
                             num_classes=config['num_classes']
                        )
    device = torch.device(config['device_name'])

    # Re-initialise BEFORE collecting optimizer parameter groups, so the
    # optimizer is guaranteed to hold the final parameter tensors even if the
    # helper replaces modules rather than resetting them in place.
    if config['reinit']:
        model = reinit_autoencoder_model(model,config['reinit'])
    model.to(device)

    if config['llrd']:
        # Accept the historical misspelling of the key as well, so this
        # function works with either form of the config dict.
        lr_key = 'multiplicative_lr' if 'multiplicative_lr' in config else 'mutliplicative_lr'
        parameters = get_optimizer_parameters_with_llrd(model,config['lr'],config[lr_key])
    else:
        # Hugging Face modules name the layer norm 'LayerNorm' (capital L);
        # the substring match is case-sensitive.
        no_decay = ['bias','LayerNorm.weight']
        decay_params, no_decay_params = [], []
        for name, param in model.named_parameters():
            if any(key in name for key in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)
        parameters = [{'params':decay_params,'weight_decay':0.01,'lr':config['lr']},
                      {'params':no_decay_params,'weight_decay':0.00,'lr':config['lr']}]

    optimizer = AdamW(parameters)

    num_training_steps = config['epochs'] * len(train_dataloader)
    scheduler = get_scheduler(name=config['scheduler'],num_training_steps=num_training_steps,
                              num_warmup_steps=int(0.1*num_training_steps),optimizer=optimizer)
    num_iter = 0
    for epoch in range(config['epochs']):
        train_loss = 0.0
        for batch in train_dataloader:
            inputs = tokenizer(batch['text_excerpt'],padding=True, truncation=True,return_tensors='pt').to(device)
            targets = batch['label'].to(device)
            train_outputs = train_step(model=model, inputs=inputs, targets=targets, loss_fn=loss_fn, optimizer=optimizer,
                                        scheduler=scheduler)
            # Store a float rather than the loss tensor itself.
            train_loss += float(train_outputs['loss'])
            # The counter starts at 0, so the first validation pass happens
            # right after the very first optimisation step.
            if num_iter % config['valid_every'] == 0:
                valid_f1, valid_loss = _evaluate(model, tokenizer, valid_dataloader, device)
                print("validation f1 score",valid_f1)
                print("validation loss",valid_loss)
            num_iter += 1

        print(f"Train epoch {epoch} loss {train_loss/len(train_dataloader)}")

In [None]:
# Launch fine-tuning with the loaders built above.
train_and_eval(train_dataloader=train_dataloader, valid_dataloader=valid_dataloader)