# Train a CARDS classifier with RoBERTa

In [None]:
## Author: Mirjam Nanko
## Date Created: 2021-02-01
## Email: m.nanko@exeter.ac.uk

## Setup

In [2]:
# Load the required packages

# Dataframes
import pandas as pd

# Regular expressions
import re

# Unidecoder
import unicodedata

# Timestamp / time measurment
import time

# Simpletransformers classifier
from simpletransformers.classification import ClassificationModel

# Label encode
from sklearn.preprocessing import LabelEncoder

# Class weights
from sklearn.utils.class_weight import compute_class_weight

# Model performance scores
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# PyTorch: enable GPU access
import torch

# If you want to select a specific GPU, set it here:
# gpu = 0
# torch.cuda.set_device(gpu) 

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use GPU {}:'.format(torch.cuda.current_device()), torch.cuda.get_device_name(torch.cuda.current_device()))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

2022-09-29 19:30:17.685713: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-29 19:30:18.593807: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-09-29 19:30:18.593876: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64


There are 1 GPU(s) available.
We will use GPU 0: NVIDIA GeForce RTX 3080 Laptop GPU


In [3]:
# Define required functions

# Define additional model performance scores (F1)
def f1_multiclass_macro(labels, preds):
    return f1_score(labels, preds, average='macro')
def f1_multiclass_micro(labels, preds):
    return f1_score(labels, preds, average='micro')
def f1_multiclass_weighted(labels, preds):
    return f1_score(labels, preds, average='weighted')
def f1_class(labels, preds):
    return f1_score(labels, preds, average=None)
def precision(labels, preds):
    return precision_score(labels, preds, average='macro')
def recall(labels, preds):
    return recall_score(labels, preds, average='macro')

# Define text pre-processing functions
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
def remove_non_ascii(text):
    """Remove non-ASCII characters from list of tokenized words"""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
def strip_underscores(text):
    return re.sub(r'_+', ' ', text)
def remove_multiple_spaces(text):
    return re.sub(r'\s{2,}', ' ', text)

# Merge text pre-processing functions
def denoise_text(text):
    text = remove_between_square_brackets(text)
    text = remove_non_ascii(text)
    text = strip_underscores(text)
    text = remove_multiple_spaces(text)
    return text.strip()

## Data Preparation

In [4]:
# Load and pre-process the text data

# Load the data
train = pd.read_csv('../../data/training/training.csv').rename(columns={"claim": "labels_orig"})
valid = pd.read_csv('../../data/training/validation.csv').rename(columns={"claim": "labels_orig"})
test = pd.read_csv('../../data/training/test.csv').rename(columns={"claim": "labels_orig"})

# Pre-process the text
train['text'] = train['text'].astype(str).apply(denoise_text)
valid['text'] = valid['text'].astype(str).apply(denoise_text)
test['text'] = test['text'].astype(str).apply(denoise_text)

# Load the label encoder
label_encoder = LabelEncoder()

# Encode the labels
train['labels'] = label_encoder.fit_transform(train.labels_orig)
valid['labels'] = label_encoder.transform(valid.labels_orig)
test['labels'] = label_encoder.transform(test.labels_orig)

## Weights creation

In [8]:
# Check the distribution of categories
print(round(train.labels.value_counts(normalize=True),2))
# Calculate weights
weights = compute_class_weight(
    class_weight='balanced', 
    classes=train.labels.unique(), 
    y=train.labels
)
weights = [*weights]
print(weights)

0     0.70
16    0.06
17    0.04
7     0.03
4     0.02
6     0.02
8     0.01
10    0.01
1     0.01
12    0.01
11    0.01
14    0.01
3     0.01
9     0.01
13    0.01
5     0.01
15    0.01
2     0.01
Name: labels, dtype: float64
[0.9482884195193008, 0.0798675009201325, 3.90990990990991, 1.6522842639593909, 5.314285714285714, 4.030959752321982, 3.9695121951219514, 6.852631578947369, 3.0491803278688523, 3.863501483679525, 2.6956521739130435, 1.2840236686390532, 3.84070796460177, 7.153846153846154, 6.888888888888889, 5.685589519650655, 8.857142857142858, 6.2898550724637685]


## RoBERTa classifier training

In [9]:
%%time

# Create a ClassificationModel
model = ClassificationModel('roberta', 'roberta-large', 
                            num_labels = 18, weight = weights,
                            args={'reprocess_input_data': True, 
                                  'overwrite_output_dir': False,
                                  'output_dir': 'models/new_model/',
                                  'best_model_dir': 'models/new_model/best_model/',
                                  # Hyperparameters
                                  'train_batch_size': 6,
                                  'num_train_epochs': 3, 
                                  'learning_rate': 1e-5,
                                  # Text processing
                                  'max_seq_length': 256,
                                  'sliding_window': True,
                                  'stride': 0.6,
                                  'do_lower_case': False,
                                  # Evaluation
                                  'evaluate_during_training': True,
                                  'evaluate_during_training_verbose': True,
                                  'evaluate_during_training_steps': -1,
                                  # Saving
                                  'save_model_every_epoch': True,
                                  'save_eval_checkpoints': True,
                                  'weight_decay': 0
                                  })

# Train and evaluate the model
model.train_model(train, eval_df = valid,
                  f1_macro = f1_multiclass_macro, 
                  f1_micro = f1_multiclass_micro, 
                  f1_weighted = f1_multiclass_weighted, 
                  acc = accuracy_score, 
                  f1_class = f1_class)

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/23436 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/3907 [00:00<?, ?it/s]



RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.80 GiB total capacity; 6.61 GiB already allocated; 14.81 MiB free; 6.62 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## RoBERTa classifier performance evaluation

In [10]:
%%time

# Evaluate the classifier performance on the validation data
result, model_outputs, wrong_predictions = model.eval_model(valid, 
                                                            f1_macro = f1_multiclass_macro,
                                                            precision = precision, 
                                                            recall = recall,
                                                            acc = accuracy_score,
                                                            f1_micro = f1_multiclass_micro, 
                                                            f1_weighted = f1_multiclass_weighted, 
                                                            f1_class = f1_class)

print('\n\nThese are the results when testing the model on the validation data set:\n')
print(result)

  0%|          | 0/2605 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/326 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.80 GiB total capacity; 6.61 GiB already allocated; 14.81 MiB free; 6.62 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [11]:
%%time

# Evaluate the classifier performance on the testing data
result_test, model_outputs_test, wrong_predictions_test = model.eval_model(test, 
                                                                           f1_macro = f1_multiclass_macro,
                                                                           precision = precision, 
                                                                           recall = recall,
                                                                           acc = accuracy_score,
                                                                           f1_micro = f1_multiclass_micro, 
                                                                           f1_weighted = f1_multiclass_weighted,
                                                                           f1_class = f1_class)
print('\n\nThese are the results when testing the model on the testing data set:\n')
print(result_test)

  0%|          | 0/2904 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1662 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1072 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (856 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (671 > 512). Running this sequence through the model will result in indexing errors


Running Evaluation:   0%|          | 0/378 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.80 GiB total capacity; 6.61 GiB already allocated; 14.81 MiB free; 6.62 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF