In [18]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)
import os

In [19]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:

# ____________________________________________________________
# Load the  News Dataset

parent_path = os.path.join(os.getcwd(), os.pardir)
data_path = os.path.join(parent_path, 'data')
vaccination_recovery_news = pd.read_csv(os.path.join(data_path, 'vaccination-recovery-news-data.csv'))
vaccination_recovery_news['synthetic'] = False

# Create a DataFrame for original articles with label 1
vaccination_recovery_news_train = vaccination_recovery_news[['body_text', 'reliability']].copy()
vaccination_recovery_news_train.columns = ['text', 'target']
vaccination_recovery_news_train['synthetic'] = False
df = vaccination_recovery_news_train

# Print the ratio between the classes

print("Ratio of target value:\n",df['target'].value_counts(normalize = True))


Ratio between reliable and unreliable data:  target
1    0.652361
0    0.347639
Name: proportion, dtype: float64


In [None]:
# Defining key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
test_size = 0.2
train_size = 0.8
EPOCHS = 4
LEARNING_RATE = 1e-05
PADDING = "max_length"

# Tokenize and encode the sentences. We define SentimentData class to tokenize

tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base', truncation=True, do_lower_case=True)

class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.target
        self.max_len = max_len

    def __len__(self):
        return len(self.text)


    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=PADDING,  # Updated to the new syntax
            truncation=True,       # Ensure the text is truncated if it's too long
            return_tensors="pt"    # Return PyTorch tensors
        )

        # Remove the batch dimension added by return_tensors="pt"
        ids = inputs['input_ids'].squeeze(0)
        mask = inputs['attention_mask'].squeeze(0)

        return {
            'ids': ids,  # Already a tensor of shape (seq_length)
            'mask': mask,
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:


train_data, test_data = train_test_split(df, test_size=test_size, stratify=df['target'], random_state=200)
train_data = train_data.shuffle().reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (699, 3)
TRAIN Dataset: (559, 3)
TEST Dataset: (140, 3)


In [36]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [37]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("distilroberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
    
model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [None]:
# Creating the loss function and optimizer
loss_function = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

def calcuate_accuracy(preds, targets): #Used in the training loop in the next cell
    n_correct = (preds==targets).sum().item()
    return n_correct



In [39]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask).squeeze(1)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Apply sigmoid to convert logits to probabilities
        probs = torch.sigmoid(outputs)

        # Predict class: 1 if probability > 0.5, else 0
        big_idx = (probs > 0.5).long()
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        output_every = 50
        
        if _%output_every==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per {output_every} steps: {loss_step}")
            print(f"Training Accuracy per {output_every} steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [None]:

for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Training Loss per 50 steps: 0.7224007844924927
Training Accuracy per 50 steps: 25.0


50it [05:56,  7.56s/it]

Training Loss per 50 steps: 0.6459068568313823
Training Accuracy per 50 steps: 62.009803921568626


70it [08:25,  7.23s/it]


The Total Accuracy for Epoch 0: 63.1484794275492
Training Loss Epoch: 0.6119360902479717
Training Accuracy Epoch: 63.1484794275492


0it [00:00, ?it/s]

Training Loss per 50 steps: 0.22250166535377502
Training Accuracy per 50 steps: 100.0


50it [08:18,  9.52s/it]

Training Loss per 50 steps: 0.3728890049399114
Training Accuracy per 50 steps: 87.25490196078431


70it [11:29,  9.85s/it]


The Total Accuracy for Epoch 1: 88.72987477638641
Training Loss Epoch: 0.33298698881907124
Training Accuracy Epoch: 88.72987477638641


0it [00:00, ?it/s]

Training Loss per 50 steps: 0.08609210699796677
Training Accuracy per 50 steps: 100.0


50it [08:20, 10.47s/it]

Training Loss per 50 steps: 0.19052918087325843
Training Accuracy per 50 steps: 93.87254901960785


70it [10:46,  9.24s/it]


The Total Accuracy for Epoch 2: 92.4865831842576
Training Loss Epoch: 0.209235532315714
Training Accuracy Epoch: 92.4865831842576


0it [00:00, ?it/s]

Training Loss per 50 steps: 0.04695649445056915
Training Accuracy per 50 steps: 100.0


50it [07:16, 11.08s/it]

Training Loss per 50 steps: 0.1845797813844447
Training Accuracy per 50 steps: 94.36274509803921


70it [10:50,  9.30s/it]

The Total Accuracy for Epoch 3: 94.81216457960645
Training Loss Epoch: 0.18568526144538608
Training Accuracy Epoch: 94.81216457960645





In [48]:
output_model_file = 'pytorch_roberta_MMCoVaR.bin'
output_vocab_file = 'results_ROBERTA'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

Vocabulary path (results_ROBERTA) should be a directory


All files saved
This tutorial is completed


In [None]:
from sklearn.metrics import precision_score, recall_score

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    all_targets = []
    all_predictions = []
    
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            # token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            
            outputs = model(ids, mask).squeeze(1)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            
            targets = data['targets'].to(device, dtype=torch.int)
            # Apply sigmoid to convert logits to probabilities
            probs = torch.sigmoid(outputs)

            # Predict class: 1 if probability > 0.5, else 0
            big_idx = (probs > 0.5).long()
            n_correct += (big_idx == targets).sum().item()
            
            # Store all targets and predictions for precision and recall
            all_targets.extend(targets.cpu().numpy())
            all_predictions.extend(big_idx.cpu().numpy())
            
            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)
            
            output_every = 50
            
            if _ % output_every == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss per {output_every} steps: {loss_step}")
                print(f"Validation Accuracy per {output_every} steps: {accu_step}")


    epoch_accu = (n_correct * 100) / nb_tr_examples
    epoch_precision = precision_score(all_targets, all_predictions, average="binary")
    epoch_recall = recall_score(all_targets, all_predictions, average="binary")
    
    return epoch_accu, epoch_precision, epoch_recall


In [50]:
acc, precision, recall = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)
print("Precision on test data = %0.2f%%" % (precision * 100))
print("Recall on test data = %0.2f%%" % (recall * 100))

1it [00:00,  1.31it/s]

Validation Loss per 50 steps: 0.020363613963127136
Validation Accuracy per 50 steps: 100.0


35it [00:28,  1.24it/s]

all_targets: [1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0]
all_predictions: [1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0]
Validation Loss Epoch: 0.14900902195700577
Validation Accuracy Epoch: 95.0
Validation Precision Epoch: 0.9468085106382979
Valida


