In [22]:
# !pip install torch torchvision transformers emoji==0.6.0

In [23]:
from emoji import demojize
from nltk.tokenize import TweetTokenizer
import pandas as pd
import numpy as np
from datetime import datetime
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support

In [24]:
# https://huggingface.co/course/chapter0/1?fw=pt
# huggingface tutorial

In [25]:
train = pd.read_csv('train_label.csv')
dev = pd.read_csv('dev_label.csv')
test = pd.read_csv('test.csv')

In [26]:
train_len = len(train)
dev_len = len(dev)
test_len = len(test)

In [27]:
# https://huggingface.co/docs/transformers/v4.18.0/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase
# tokenizer encode_plus

def tokenise(df):
    tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-covid19-base-uncased', use_fast = False, normalization = True) # normalisation = True for raw tweets
    
    token_ids = []
    attn_masks = []

    for tweet in df.text:
        batch_encoding = tokenizer.encode_plus(tweet, padding = 'max_length', truncation = True, max_length = 128, return_tensors = 'pt', return_attention_mask = True)
        token_ids.append(batch_encoding['input_ids'])
        attn_masks.append(batch_encoding['attention_mask'])     

    token_ids = torch.cat(token_ids, 0)
    attn_masks = torch.cat(attn_masks, 0)
    
    if 'label' in df.columns:
        labels = torch.tensor(df.label)
    else:
        labels = None
            
    return token_ids, attn_masks, labels

In [28]:
# https://discuss.pytorch.org/t/what-do-tensordataset-and-dataloader-do/107017/2
# https://blog.paperspace.com/dataloaders-abstractions-pytorch/
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
# for TensorDataset & DataLoaders

def data_loader(train, dev, test):
    
    train_token_ids, train_attn_masks, train_labels = tokenise(train)
    train_set = TensorDataset(train_token_ids, train_attn_masks, train_labels)
    train_loader = DataLoader(train_set, batch_size = 32, num_workers = 2)
    
    dev_token_ids, dev_attn_masks, dev_labels = tokenise(dev)
    dev_set = TensorDataset(dev_token_ids, dev_attn_masks, dev_labels)
    dev_loader = DataLoader(dev_set, batch_size = 32, num_workers = 2)
    
    test_token_ids, test_attn_masks, _ = tokenise(test)
    test_set = TensorDataset(test_token_ids, test_attn_masks)
    test_loader = DataLoader(test_set, batch_size = 32, num_workers = 2)
     
    return train_loader, dev_loader, test_loader

In [29]:
train_loader, dev_loader, test_loader = data_loader(train, dev, test)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
num_epochs = 20
num_training_steps = num_epochs * len(train_loader)
num_labels = 2

pos = sum(train.label)
neg = train_len - pos
pos_weight = torch.tensor([pos/neg, 1])

In [31]:
# https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html#torch.optim.AdamW
# https://huggingface.co/docs/transformers/main_classes/optimizer_schedules
  
def model_setup(base):
    
    model = AutoModelForSequenceClassification.from_pretrained(base, num_labels = num_labels)

    # lr = 1e-3, eps = 1e-8, weight_decay = 0.01
    optimiser = AdamW(model.parameters(), lr = 1e-4)

    scheduler = get_linear_schedule_with_warmup(optimiser, num_warmup_steps = 0, num_training_steps = num_training_steps)

    return model, optimiser, scheduler

model, optimiser, scheduler = model_setup('vinai/bertweet-covid19-base-uncased')

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

In [32]:
def prediction(logits):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = probs.cpu().detach().numpy() 
    
    pred = []
    for item in soft_probs:
        pred.append(np.argmax(item))

    return pred

In [33]:
def get_accuracy(preds, labels):   
    
    preds = np.concatenate(preds)
    labels = np.concatenate(labels)
    accuracy = sum(preds == labels)

    return accuracy

In [34]:
def get_scores(preds, labels):
    
    preds = np.concatenate(preds)
    labels = np.concatenate(labels)
        
    y_pred = np.array(preds)
    y_true = np.array(labels)
    precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_pred)
    
    return precision[0], recall[0], fscore[0]

## Modified training & validation to incorporate class weights

In [35]:
def validate(model, dev_loader, dev_len, best_f, pos_weight, i):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    
    preds = []
    labs = []
    val_loss = 0
    for batch in dev_loader:
        
        
        batch_token_ids, batch_attn_masks, batch_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
    
        with torch.no_grad():

            outputs = model(input_ids = batch_token_ids, token_type_ids = None, attention_mask = batch_attn_masks)

            logits = outputs.logits
            
            criterion = nn.CrossEntropyLoss(weight = pos_weight).to(device)
            loss = criterion(logits.squeeze(-1), batch_labels)
            
        val_loss += loss.item()
        
        pred = prediction(logits)
        preds.append(pred)
        labs.append(batch_labels.cpu().detach().numpy())
    
    val_acc = get_accuracy(preds, labs)    
    precision, recall, fscore = get_scores(preds, labs)
    print(f'Average validation loss = {val_loss/dev_len}')
    print(f'Average validation accuracy = {val_acc/dev_len * 100}')
    print(f'Precision = {precision * 100}')
    print(f'Recall = {recall * 100}')
    print(f'F1 Socre = {fscore * 100}')
    
    if fscore >= best_f:
        best_f = fscore
        torch.save(model.state_dict(), f'bertweet_test_{i}.dat')
        
    return best_f
      

In [36]:
# model.load_state_dict(torch.load('bertweet_test_10.dat'))

In [37]:
# https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
# https://discuss.huggingface.co/t/class-weights-for-bertforsequenceclassification/1674/5 (class weights)

def train(model, optimiser, scheduler, train_loader, dev_loader, num_epochs, train_len, dev_len, pos_weight):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    
    best_f = 0
    total_start = datetime.now()
    for i in range(num_epochs):
        print('=' * 25)
        print(f'epoch {i + 1} / {num_epochs}')

        
        training_loss = 0
        training_acc = 0
        epoch_start = datetime.now()
        model.train()
        for num, batch in enumerate(train_loader):
            batch_preds = []
            batch_labs = []
            
            optimiser.zero_grad()
                
            batch_token_ids, batch_attn_masks, batch_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)

            outputs = model(input_ids = batch_token_ids, token_type_ids = None, attention_mask = batch_attn_masks)
 
            logits = outputs.logits
            pred = prediction(logits)
            batch_preds.append(pred)
            
            criterion = nn.CrossEntropyLoss(weight = pos_weight).to(device)
            loss = criterion(logits.squeeze(-1), batch_labels)
            training_loss += loss.item()

            batch_labs.append(batch_labels.cpu().detach().numpy())
            
            batch_acc = get_accuracy(batch_preds, batch_labs)
            training_acc += batch_acc
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimiser.step()
            scheduler.step()
                      
            if num > 0 and num % 10 == 0:
                time = datetime.now() - epoch_start
                print(f'batch {num} / {len(train_loader)} Time: {time}')
        
        
        epoch_time = datetime.now() - epoch_start
        print(f'Average training loss = {training_loss/train_len}')
        print(f'Average training accuracy = {training_acc/train_len * 100}')
        print(f'Epoch time = {epoch_time}')
        
        best_fscore = validate(model, dev_loader, dev_len, best_f, pos_weight, i)
        best_f = best_fscore
    
    total_time = datetime.now() - total_start
    print(f'Total time = {total_time}')

train(model, optimiser, scheduler, train_loader, dev_loader, num_epochs, train_len, dev_len, pos_weight)

epoch 1 / 20
batch 10 / 50 Time: 0:00:06.357470
batch 20 / 50 Time: 0:00:11.080929
batch 30 / 50 Time: 0:00:15.806401
batch 40 / 50 Time: 0:00:20.561426
Average training loss = 0.015358556835514724
Average training accuracy = 74.71337579617834
Epoch time = 0:00:24.615217
Average validation loss = 0.009205790746478395
Average validation accuracy = 92.17877094972067
Precision = 95.23809523809523
Recall = 94.7867298578199
F1 Socre = 95.01187648456056
epoch 2 / 20
batch 10 / 50 Time: 0:00:06.295013
batch 20 / 50 Time: 0:00:11.100699
batch 30 / 50 Time: 0:00:15.877753
batch 40 / 50 Time: 0:00:20.667286
Average training loss = 0.0055734406545710794
Average training accuracy = 94.71337579617834
Epoch time = 0:00:24.747463
Average validation loss = 0.005700916343228111
Average validation accuracy = 96.8342644320298
Precision = 97.42388758782201
Recall = 98.5781990521327
F1 Socre = 97.99764428739694
epoch 3 / 20
batch 10 / 50 Time: 0:00:06.349091
batch 20 / 50 Time: 0:00:11.132605
batch 30 / 50

In [38]:
def predict(model, test_loader):
    model.eval()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    preds = []
    for batch in test_loader:
        batch_token_ids, batch_attn_masks = batch[0].to(device), batch[1].to(device)

        with torch.no_grad():
            outputs = model(input_ids=batch_token_ids, token_type_ids=None, attention_mask=batch_attn_masks)

            logits = outputs.logits
            pred = prediction(logits)
            preds.append(pred)

    preds = np.hstack(preds)

    return preds

In [39]:
def get_results(model, test_loader, model_path, save_path):
    model.load_state_dict(torch.load(model_path))
    
    preds = predict(model, test_loader)

    results = pd.DataFrame({'Predicted': preds})
    results.index.name = 'Id'
    results.to_csv(save_path)

In [41]:
get_results(model, test_loader, 'bertweet_test_7.dat', 'bertweet_test_v4.csv')