# Train a CARDS classifier with RoBERTa

In [1]:
## Author: Mirjam Nanko
## Date Created: 2021-02-01
## Email: m.nanko@exeter.ac.uk

## Setup

In [2]:
# Load the required packages

# Dataframes
import pandas as pd

# Regular expressions
import re

# Unidecoder
import unicodedata

# Timestamp / time measurment
import time

# Simpletransformers classifier
from simpletransformers.classification import ClassificationModel

# Label encode
from sklearn.preprocessing import LabelEncoder

# Class weights
from sklearn.utils.class_weight import compute_class_weight

# Model performance scores
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# PyTorch: enable GPU access
import torch

# If you want to select a specific GPU, set it here:
# gpu = 0
# torch.cuda.set_device(gpu) 

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use GPU {}:'.format(torch.cuda.current_device()), torch.cuda.get_device_name(torch.cuda.current_device()))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  warn(


There are 1 GPU(s) available.
We will use GPU 0: NVIDIA GeForce GTX 1080 Ti


In [3]:
# Define required functions

# Define additional model performance scores (F1)
def f1_multiclass_macro(labels, preds):
    return f1_score(labels, preds, average='macro')
def f1_multiclass_micro(labels, preds):
    return f1_score(labels, preds, average='micro')
def f1_multiclass_weighted(labels, preds):
    return f1_score(labels, preds, average='weighted')
def f1_class(labels, preds):
    return f1_score(labels, preds, average=None)
def precision(labels, preds):
    return precision_score(labels, preds, average='macro')
def recall(labels, preds):
    return recall_score(labels, preds, average='macro')

# Define text pre-processing functions
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
def remove_non_ascii(text):
    """Remove non-ASCII characters from list of tokenized words"""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
def strip_underscores(text):
    return re.sub(r'_+', ' ', text)
def remove_multiple_spaces(text):
    return re.sub(r'\s{2,}', ' ', text)

# Merge text pre-processing functions
def denoise_text(text):
    text = remove_between_square_brackets(text)
    text = remove_non_ascii(text)
    text = strip_underscores(text)
    text = remove_multiple_spaces(text)
    return text.strip()

## Data Preparation

In [4]:
# Load and pre-process the text data

# Load the data
train = pd.read_csv('training/train.csv').rename(columns={"claim": "labels_orig"})
valid = pd.read_csv('training/valid.csv').rename(columns={"claim": "labels_orig"})
test = pd.read_csv('training/test.csv').rename(columns={"claim": "labels_orig"})

# Pre-process the text
train['text'] = train['text'].astype(str).apply(denoise_text)
valid['text'] = valid['text'].astype(str).apply(denoise_text)
test['text'] = test['text'].astype(str).apply(denoise_text)

# Load the label encoder
label_encoder = LabelEncoder()

# Encode the labels
train['labels'] = label_encoder.fit_transform(train.labels_orig)
valid['labels'] = label_encoder.fit_transform(valid.labels_orig)
test['labels'] = label_encoder.fit_transform(test.labels_orig)

In [5]:
train

Unnamed: 0,text,labels_orig,labels
0,What do you do if you are a global warming ala...,5_1,16
1,(2.) A sun-blocking volcanic aerosols componen...,0_0,0
2,"Now, I am very interested in the AMO, since it...",1_1,1
3,Dr. Christy addressed recent challenges to the...,0_0,0
4,After a brief protest from Massachusetts Repub...,0_0,0
...,...,...,...
23431,Mrner and Parker conclude that the Fremantle t...,1_6,5
23432,"Siegel, Jeremy J., The Concise Encyclopedia of...",0_0,0
23433,According to Goklany's careful empirical analy...,0_0,0
23434,"In light of these several findings, it can rea...",4_4,14


In [6]:
train.labels.unique()

array([16,  0,  1,  7, 14, 11, 12, 13,  6, 10,  4, 17,  8, 15,  5,  3,  2,
        9])

In [7]:
le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(le_name_mapping)

{'0_0': 0, '1_1': 1, '1_2': 2, '1_3': 3, '1_4': 4, '1_6': 5, '1_7': 6, '2_1': 7, '2_3': 8, '3_1': 9, '3_2': 10, '3_3': 11, '4_1': 12, '4_2': 13, '4_4': 14, '4_5': 15, '5_1': 16, '5_2': 17}


## Weights creation

In [8]:
# Check the distribution of categories
print(round(train.labels.value_counts(normalize=True),2))
# Calculate weights
weights = compute_class_weight(class_weight='balanced', classes= train.labels.unique(), y=train.labels)
weights = [*weights]
print(weights)

labels
0     0.70
16    0.06
17    0.04
7     0.03
4     0.02
6     0.02
8     0.01
10    0.01
1     0.01
12    0.01
11    0.01
14    0.01
3     0.01
9     0.01
13    0.01
5     0.01
15    0.01
2     0.01
Name: proportion, dtype: float64
[0.9482884195193008, 0.0798675009201325, 3.90990990990991, 1.6522842639593909, 5.314285714285714, 4.030959752321982, 3.9695121951219514, 6.852631578947369, 3.0491803278688523, 3.863501483679525, 2.6956521739130435, 1.2840236686390532, 3.84070796460177, 7.153846153846154, 6.888888888888889, 5.685589519650655, 8.857142857142858, 6.2898550724637685]


## RoBERTa classifier training

In [9]:
%%time

# Create a ClassificationModel
model = ClassificationModel('roberta', 'roberta-large', 
                            num_labels = 18, weight = weights,
                            args={'reprocess_input_data': True, 
                                  'overwrite_output_dir': False,
                                  'output_dir': 'models/new_model/',
                                  'best_model_dir': 'models/new_model/best_model/',
                                  # Hyperparameters
                                  'train_batch_size': 6,
                                  'num_train_epochs': 3, 
                                  'learning_rate': 1e-5,
                                  # Text processing
                                  'max_seq_length': 256,
                                  'sliding_window': True,
                                  'stride': 0.6,
                                  'do_lower_case': False,
                                  # Evaluation
                                  'evaluate_during_training': True,
                                  'evaluate_during_training_verbose': True,
                                  'evaluate_during_training_steps': -1,
                                  # Saving
                                  'save_model_every_epoch': True,
                                  'save_eval_checkpoints': True,
                                  'weight_decay': 0
                                  })

# Train and evaluate the model
model.train_model(train, eval_df = valid,
                  f1_macro = f1_multiclass_macro, 
                  f1_micro = f1_multiclass_micro, 
                  f1_weighted = f1_multiclass_weighted, 
                  acc = accuracy_score, 
                  f1_class = f1_class)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/23436 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/3907 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/3907 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/3907 [00:00<?, ?it/s]

CPU times: user 1h 42min 40s, sys: 3min 33s, total: 1h 46min 13s
Wall time: 1h 47min 45s


(11721,
 defaultdict(list,
             {'global_step': [3907, 7814, 11721],
              'train_loss': [0.004569053649902344,
               0.0010551713639870286,
               0.008465169928967953],
              'mcc': [0.6919310410498666,
               0.7193987797589124,
               0.7539247258975573],
              'f1_macro': [0.698093010076865,
               0.7419281816247714,
               0.7721193478866935],
              'f1_micro': [0.83531669865643,
               0.8395393474088292,
               0.8690978886756238],
              'f1_weighted': [0.8395721691375037,
               0.849336173530304,
               0.8721242638620107],
              'acc': [0.83531669865643,
               0.8395393474088292,
               0.8690978886756238],
              'f1_class': [array([0.90739167, 0.64516129, 0.64516129, 0.74576271, 0.63576159,
                      0.86363636, 0.81553398, 0.71856287, 0.7012987 , 0.63636364,
                      0.85365854, 0.9577464

## RoBERTa classifier performance evaluation

In [10]:
%%time

# Evaluate the classifier performance on the validation data
result, model_outputs, wrong_predictions = model.eval_model(valid, 
                                                            f1_macro = f1_multiclass_macro,
                                                            precision = precision, 
                                                            recall = recall,
                                                            acc = accuracy_score,
                                                            f1_micro = f1_multiclass_micro, 
                                                            f1_weighted = f1_multiclass_weighted, 
                                                            f1_class = f1_class)

print('\n\nThese are the results when testing the model on the validation data set:\n')
print(result)

  0%|          | 0/2605 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/27 [00:00<?, ?it/s]



These are the results when testing the model on the validation data set:

{'mcc': 0.7539247258975573, 'f1_macro': 0.7721193478866935, 'precision': 0.7493412638868281, 'recall': 0.8022745001257632, 'acc': 0.8690978886756238, 'f1_micro': 0.8690978886756238, 'f1_weighted': 0.8721242638620107, 'f1_class': array([0.92259887, 0.90410959, 0.8       , 0.82142857, 0.8       ,
       0.9047619 , 0.84615385, 0.78125   , 0.74358974, 0.73913043,
       0.88607595, 0.97142857, 0.525     , 0.5106383 , 0.61538462,
       0.72340426, 0.73652695, 0.66666667]), 'eval_loss': 1.0783989347789127}
CPU times: user 51.8 s, sys: 1.36 s, total: 53.1 s
Wall time: 1min 10s


In [11]:
%%time

# Evaluate the classifier performance on the testing data
result_test, model_outputs_test, wrong_predictions_test = model.eval_model(test, 
                                                                           f1_macro = f1_multiclass_macro,
                                                                           precision = precision, 
                                                                           recall = recall,
                                                                           acc = accuracy_score,
                                                                           f1_micro = f1_multiclass_micro, 
                                                                           f1_weighted = f1_multiclass_weighted,
                                                                           f1_class = f1_class)
print('\n\nThese are the results when testing the model on the testing data set:\n')
print(result_test)

  0%|          | 0/2904 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1662 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1072 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (856 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (671 > 512). Running this sequence through the model will result in indexing errors


Running Evaluation:   0%|          | 0/31 [00:00<?, ?it/s]



These are the results when testing the model on the testing data set:

{'mcc': 0.778054540578735, 'f1_macro': 0.7736175971410385, 'precision': 0.7968518165578657, 'recall': 0.7568648091287078, 'acc': 0.865358126721763, 'f1_micro': 0.865358126721763, 'f1_weighted': 0.8615224032424584, 'f1_class': array([0.92556454, 0.73684211, 0.68421053, 0.80645161, 0.79136691,
       0.87272727, 0.86614173, 0.8503937 , 0.70212766, 0.81632653,
       0.86868687, 0.9375    , 0.56410256, 0.5862069 , 0.73972603,
       0.70422535, 0.79638009, 0.67613636]), 'eval_loss': 0.9732772377070843}
CPU times: user 1min, sys: 1.7 s, total: 1min 1s
Wall time: 1min 22s
