## Import Libraries

In [12]:
!pip install transformers
!pip install scikit-learn
!pip install sklearn
!pip install pandas
!pip install tensorflow

import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

[0mCollecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2936 sha256=24171862a295d1d197d1eeb4313cfe9f5cf4b135a4d153a9b3eba4d2a6905f62
  Stored in directory: /root/.cache/pip/wheels/db/9f/0b/772886b624f84c138a5febb6966c89d374ab58c62bd65d109e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post1
[0m

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3090'

## Load and Preprocess Training Data

Dataset will be tokenized then split into training and validation sets. The validation set will be used to monitor training. For testing a separate test set will be loaded for analysis.

Load the pretrained tokenizer that corresponds to your choice in model. e.g.,

```
BERT:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 

XLNet:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=False) 

RoBERTa:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
```


In order to avoid memory issues with Google Colab, I enforce a max_length of 100 tokens. Note that some sentences may not adequately represent each label because of this.

In [22]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, XLNetForSequenceClassification, XLMRobertaTokenizer, XLNetTokenizer,BertForSequenceClassification,BertTokenizer, RobertaForSequenceClassification,RobertaTokenizer

In [10]:
def make_dataframe(input_folder, labels_fn=None):
    #MAKE TXT DATAFRAME
    text = []
    for fil in tqdm(filter(lambda x: x.startswith('a') and x.endswith('.txt'), os.listdir(input_folder))):
        iD = fil[7:].split('.')[0]
        lines = list(enumerate(open(input_folder+fil,'r',encoding='utf-8').read().splitlines(),1))
        text.extend([(iD,) + line for line in lines])

    df_text = pd.DataFrame(text, columns=['id','line','text'])
    df_text.id = df_text.id.apply(int)
    df_text.line = df_text.line.apply(int)
    df_text = df_text[df_text.text.str.strip().str.len() > 0].copy()
    df_text = df_text.set_index(['id','line'])
    
    df = df_text

    if labels_fn:
        #MAKE LABEL DATAFRAME
        labels = pd.read_csv(labels_fn,sep='\t',encoding='utf-8',header=None)
        labels = labels.rename(columns={0:'id',1:'line',2:'labels'})
        labels = labels.set_index(['id','line'])
        labels = labels[labels.labels.notna()].copy()

        #JOIN
        df = labels.join(df_text)[['text','labels']]

    return df

In [14]:
en_folder_train = 'data/en/train-articles-subtask-3/'
en_folder_dev = 'data/en/dev-articles-subtask-3/'
en_labels_train_fn = 'data/en/train-labels-subtask-3.txt'
en_labels_dev_fn = 'data/en/dev-labels-subtask-3.txt'

it_folder_train = 'data/it/train-articles-subtask-3/'
it_folder_dev = 'data/it/dev-articles-subtask-3/'
it_labels_train_fn = 'data/it/train-labels-subtask-3.txt'
it_labels_dev_fn = 'data/it/dev-labels-subtask-3.txt'

fr_folder_train = 'data/fr/train-articles-subtask-3/'
fr_folder_dev = 'data/fr/dev-articles-subtask-3/'
fr_labels_train_fn = 'data/fr/train-labels-subtask-3.txt'
fr_labels_dev_fn = 'data/fr/dev-labels-subtask-3.txt'

ru_folder_train = 'data/ru/train-articles-subtask-3/'
ru_folder_dev = 'data/ru/dev-articles-subtask-3/'
ru_labels_train_fn = 'data/ru/train-labels-subtask-3.txt'
ru_labels_dev_fn = 'data/ru/dev-labels-subtask-3.txt'

po_folder_train = 'data/po/train-articles-subtask-3/'
po_folder_dev = 'data/po/dev-articles-subtask-3/'
po_labels_train_fn = 'data/po/train-labels-subtask-3.txt'
po_labels_dev_fn = 'data/po/dev-labels-subtask-3.txt'

ge_folder_train = 'data/ge/train-articles-subtask-3/'
ge_folder_dev = 'data/ge/dev-articles-subtask-3/'
ge_labels_train_fn = 'data/ge/train-labels-subtask-3.txt'
ge_labels_dev_fn = 'data/ge/dev-labels-subtask-3.txt'

out_fn = 'output/'

In [15]:
import os
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

print('Loading dataset...')
en_train = make_dataframe(en_folder_train, en_labels_train_fn)
en_test = make_dataframe(en_folder_dev, en_labels_dev_fn)

it_train = make_dataframe(it_folder_train, it_labels_train_fn)
it_test = make_dataframe(it_folder_dev, it_labels_dev_fn)

fr_train = make_dataframe(fr_folder_train, fr_labels_train_fn)
fr_test = make_dataframe(fr_folder_dev, fr_labels_dev_fn)

ge_train = make_dataframe(ge_folder_train, ge_labels_train_fn)
ge_test = make_dataframe(ge_folder_dev, ge_labels_dev_fn)

ru_train = make_dataframe(ru_folder_train, ru_labels_train_fn)
ru_test = make_dataframe(ru_folder_dev, ru_labels_dev_fn)

po_train = make_dataframe(po_folder_train, po_labels_train_fn)
po_test = make_dataframe(po_folder_dev, po_labels_dev_fn)

train = pd.concat([en_train, it_train, fr_train, ge_train, ru_train, po_train])
test = pd.concat([en_test, it_test, fr_test, ge_test, ru_test, po_test])

Loading dataset...


446it [00:00, 15768.06it/s]
90it [00:00, 19636.25it/s]
227it [00:00, 21363.02it/s]
76it [00:00, 23087.35it/s]
158it [00:00, 18469.90it/s]
53it [00:00, 16750.67it/s]
132it [00:00, 13740.89it/s]
45it [00:00, 10065.79it/s]
143it [00:00, 14134.88it/s]
48it [00:00, 15765.59it/s]
145it [00:00, 16170.54it/s]
49it [00:00, 15071.93it/s]


In [16]:
import os
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

#print('Loading dataset...')
#train = make_dataframe(folder_train, labels_train_fn)
#test = make_dataframe(folder_dev, labels_dev_fn)

X_train = train['text'].values
Y_train = train['labels'].fillna('').str.split(',').values

X_test = test['text'].values
Y_test = test['labels'].fillna('').str.split(',').values

In [17]:
multibin = MultiLabelBinarizer() #use sklearn binarizer
Y_train = multibin.fit_transform(Y_train)
Y_test = multibin.fit_transform(Y_test)

l = list(multibin.classes_)
id2label = {idx:label for idx, label in enumerate(l)}
label2id = {label:idx for idx, label in enumerate(l)}

Y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
comments = X_train

In [19]:
labels = Y_train
num_labels = len(labels[0])

In [20]:
num_labels

23

In [21]:
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,labels
id,line,Unnamed: 2_level_1,Unnamed: 3_level_1
111111111,3,Geneva - The World Health Organisation chief o...,Doubt
111111111,5,"""The next transmission could be more pronounce...",Appeal_to_Authority
111111111,13,"But Tedros voiced alarm that ""plague in Madaga...",Repetition
111111111,17,He also pointed to the presence of the pneumon...,Appeal_to_Fear-Prejudice
111111111,19,He praised the rapid response from WHO and Mad...,Appeal_to_Fear-Prejudice


In [24]:
!pip install sentencepiece 

Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
[0m

In [31]:
max_length = 512
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True) # tokenizer
encodings = tokenizer.batch_encode_plus(comments,max_length=max_length,pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "l

tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [32]:
input_ids = encodings['input_ids'] # tokenized and encoded sentences
token_type_ids = encodings['token_type_ids'] # token type ids
attention_masks = encodings['attention_mask'] # attention masks

In [33]:
# Identifying indices of 'one_hot_labels' entries that only occur once - this will allow us to stratify split our training data later
label_counts = train.labels.astype(str).value_counts()
one_freq = label_counts[label_counts==1].keys()
one_freq_idxs = sorted(list(train[train.labels.astype(str).isin(one_freq)].index), reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [(999001293, 14), (832916508, 4), (798244842, 26), (798244842, 19), (795693029, 4), (789454337, 3), (789370909, 17), (789121265, 12), (788056108, 7), (787668628, 3), (785801366, 14), (783702663, 82), (783702663, 72), (783702663, 71), (783702663, 16), (783702663, 14), (783702663, 9), (783702663, 7), (782149032, 4), (780414700, 10), (780414700, 5), (779309765, 14), (778664280, 29), (778139122, 20), (778094905, 17), (777488669, 31), (777488669, 18), (776345502, 13), (776126299, 29), (774145019, 15), (772947654, 7), (772836731, 19), (772836731, 6), (771655795, 7), (770877978, 15), (770376380, 13), (769962328, 22), (766942310, 8), (766632016, 15), (765982381, 18), (765913191, 33), (765385479, 50), (765385479, 25), (765197039, 18), (765197039, 17), (765197039, 15), (765197039, 13), (765197039, 4), (765197039, 3), (764664283, 19), (764664283, 9), (763440871, 16), (763280007, 10), (763280007, 6), (763260610, 29), (763260610, 26), (763260610, 11), (7625

In [34]:
one_freq_idxs

[(999001293, 14),
 (832916508, 4),
 (798244842, 26),
 (798244842, 19),
 (795693029, 4),
 (789454337, 3),
 (789370909, 17),
 (789121265, 12),
 (788056108, 7),
 (787668628, 3),
 (785801366, 14),
 (783702663, 82),
 (783702663, 72),
 (783702663, 71),
 (783702663, 16),
 (783702663, 14),
 (783702663, 9),
 (783702663, 7),
 (782149032, 4),
 (780414700, 10),
 (780414700, 5),
 (779309765, 14),
 (778664280, 29),
 (778139122, 20),
 (778094905, 17),
 (777488669, 31),
 (777488669, 18),
 (776345502, 13),
 (776126299, 29),
 (774145019, 15),
 (772947654, 7),
 (772836731, 19),
 (772836731, 6),
 (771655795, 7),
 (770877978, 15),
 (770376380, 13),
 (769962328, 22),
 (766942310, 8),
 (766632016, 15),
 (765982381, 18),
 (765913191, 33),
 (765385479, 50),
 (765385479, 25),
 (765197039, 18),
 (765197039, 17),
 (765197039, 15),
 (765197039, 13),
 (765197039, 4),
 (765197039, 3),
 (764664283, 19),
 (764664283, 9),
 (763440871, 16),
 (763280007, 10),
 (763280007, 6),
 (763260610, 29),
 (763260610, 26),
 (7632606

In [35]:
# # Gathering single instance inputs to force into the training set after stratified split
# one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
# one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
# one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
# one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

Be sure to handle all classes during validation using "stratify" during train/validation split:

In [36]:
# Use train_test_split to split our data into train and validation sets
from sklearn.model_selection import train_test_split

train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(input_ids, labels, token_type_ids, attention_masks,
                                                            random_state=42, test_size=0.10, 
#                                                             stratify = labels
                                                            )

# Add one frequency data to train data
# train_inputs.extend(one_freq_input_ids)
# train_labels.extend(one_freq_labels)
# train_masks.extend(one_freq_attention_masks)
# train_token_types.extend(one_freq_token_types)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_token_types = torch.tensor(train_token_types)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
validation_token_types = torch.tensor(validation_token_types)

In [37]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [38]:
torch.save(validation_dataloader,'validation_data_loader')
torch.save(train_dataloader,'train_data_loader')

## Load Model & Set Params

Load the appropriate model below, each model already contains a single dense layer for classification on top.



```
BERT:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

XLNet:
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=num_labels)

RoBERTa:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
```



In [39]:
# Load model, the pretrained model will include a single linear classification layer on top for classification. 
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=num_labels, id2label=id2label,
                                                           label2id=label2id)
model.cuda()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Appeal_to_Authority",
    "1": "Appeal_to_Fear-Prejudice",
    "2": "Appeal_to_Hypocrisy",
    "3": "Appeal_to_Popularity",
    "4": "Appeal_to_Time",
    "5": "Appeal_to_Values",
    "6": "Causal_Oversimplification",
    "7": "Consequential_Oversimplification",
    "8": "Conversation_Killer",
    "9": "Doubt",
    "10": "Exaggeration-Minimisation",
    "11": "False_Dilemma-No_Choice",
    "12": "Flag_Waving",
    "13": "Guilt_by_Association",
    "14": "Loaded_Language",
    "15": "Name_Calling-Labeling",
    "16": "Obfuscation-Vagueness-

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/pytorch_model.bin
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly ide

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

Setting custom optimization parameters for the AdamW optimizer https://huggingface.co/transformers/main_classes/optimizer_schedules.html

In [40]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [41]:
from torch.optim import AdamW

optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

## Train Model

In [77]:
%%time
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from tqdm import trange
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 5

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # Training

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0 #running loss
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # # Forward pass for multiclass classification
        # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # loss = outputs[0]
        # logits = outputs[1]

        # Forward pass for multilabel classification
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        loss_func = BCEWithLogitsLoss() 
        loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
        # loss_func = BCELoss() 
        # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
        train_loss_set.append(loss.item())    

        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # scheduler.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Variables to gather full output
    logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

    # Predict
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        with torch.no_grad():
            # Forward pass
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred)

            b_logit_pred = b_logit_pred.detach().cpu().numpy()
            pred_label = pred_label.to('cpu').numpy()
            b_labels = b_labels.to('cpu').numpy()

        tokenized_texts.append(b_input_ids)
        logit_preds.append(b_logit_pred)
        true_labels.append(b_labels)
        pred_labels.append(pred_label)

    # Flatten outputs
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Calculate Accuracy
    threshold = 0.50
    pred_bools = [pl>threshold for pl in pred_labels]
    true_bools = [tl==1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 0.21714162281373653


Epoch:  20%|██        | 1/5 [03:41<14:44, 221.08s/it]

F1 Validation Accuracy:  31.637578908280734
Flat Validation Accuracy:  9.606587374199451
Train loss: 0.197626120215509


Epoch:  40%|████      | 2/5 [07:22<11:03, 221.20s/it]

F1 Validation Accuracy:  40.10519395134779
Flat Validation Accuracy:  14.181152790484905
Train loss: 0.1764393051223057


Epoch:  60%|██████    | 3/5 [11:03<07:22, 221.21s/it]

F1 Validation Accuracy:  42.655809583463835
Flat Validation Accuracy:  16.37694419030192
Train loss: 0.1552811233129928


Epoch:  80%|████████  | 4/5 [14:44<03:41, 221.20s/it]

F1 Validation Accuracy:  46.799884158702575
Flat Validation Accuracy:  18.298261665141812
Train loss: 0.1350859608470909


Epoch: 100%|██████████| 5/5 [18:25<00:00, 221.20s/it]

F1 Validation Accuracy:  44.13004214328717
Flat Validation Accuracy:  18.48124428179323
CPU times: user 9min 21s, sys: 9min 4s, total: 18min 25s
Wall time: 18min 26s





In [79]:
torch.save(model.state_dict(), 'SemEval23_task3_subtask3_multi')

## Load and Preprocess Test Data

In [80]:
# test_df = pd.read_csv('test.csv')
# test_labels_df = pd.read_csv('test_labels.csv')
# test_df = test_df.merge(test_labels_df, on='id', how='left')
# test_label_cols = list(test_df.columns[2:])
# print('Null values: ', test_df.isnull().values.any()) #should not be any null sentences or labels
# print('Same columns between train and test: ', label_cols == test_label_cols) #columns should be the same
# test_df.head()

In [81]:
# test_df = test_df[~test_df[test_label_cols].eq(-1).any(axis=1)] #remove irrelevant rows/comments with -1 values
# test_df['one_hot_labels'] = list(test_df[test_label_cols].values)
# test_df.head()

In [82]:
# # Gathering input data
# test_labels = list(test_df.one_hot_labels.values)
# test_comments = list(test_df.comment_text.values)

In [47]:
_test = make_dataframe(folder_dev)

_X_test = _test['text'].values


NameError: name 'folder_dev' is not defined

In [83]:
test_comments = X_test

In [84]:
test_labels = Y_test

In [85]:
len(test_comments)

3294

In [86]:
# Encoding input data
test_encodings = tokenizer.batch_encode_plus(test_comments,max_length=max_length,pad_to_max_length=True)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']



In [87]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
# test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)
# Create test dataloader
test_data = TensorDataset(test_inputs, test_masks,  test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
# Save test dataloader
torch.save(test_dataloader,'test_data_loader')

In [88]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)
# Create test dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
# Save test dataloader
torch.save(test_dataloader,'test_data_loader')

## Prediction and Metics

In [89]:
# Test

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
#     b_input_ids, b_input_mask, b_labels, b_token_types = batch
    b_input_ids, b_input_mask, b_token_types = batch
    with torch.no_grad():
        # Forward pass
        outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        b_logit_pred = outs[0]
        pred_label = torch.sigmoid(b_logit_pred)

        b_logit_pred = b_logit_pred.detach().cpu().numpy()
        pred_label = pred_label.to('cpu').numpy()
#         b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
#     true_labels.append(b_labels)
    pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

ValueError: too many values to unpack (expected 3)

In [90]:
# Test

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
 #   b_input_ids, b_input_mask, b_token_types = batch
    with torch.no_grad():
        # Forward pass
        outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        b_logit_pred = outs[0]
        pred_label = torch.sigmoid(b_logit_pred)

        b_logit_pred = b_logit_pred.detach().cpu().numpy()
        pred_label = pred_label.to('cpu').numpy()
        b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

In [91]:
true_labels

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [92]:
len(pred_labels)

3294

In [93]:
# label2id

We need to threshold our sigmoid function outputs which range from [0, 1]. Below I use 0.50 as a threshold.

In [94]:
import pickle

pred_bools = [pl>0.50 for pl in pred_labels] #boolean output after thresholding

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=label2id)
pickle.dump(clf_report, open('classification_report.txt','wb')) #save report
print(clf_report)

Test F1 Accuracy:  0.4004438168246924
Test Flat Accuracy:  0.1344869459623558 

                                  precision    recall  f1-score   support

             Appeal_to_Authority       0.38      0.17      0.23       170
        Appeal_to_Fear-Prejudice       0.46      0.25      0.32       379
             Appeal_to_Hypocrisy       0.33      0.01      0.02       221
            Appeal_to_Popularity       0.00      0.00      0.00       110
                  Appeal_to_Time       0.00      0.00      0.00        42
                Appeal_to_Values       0.46      0.11      0.18       193
       Causal_Oversimplification       0.67      0.02      0.04       111
Consequential_Oversimplification       0.67      0.02      0.04        95
             Conversation_Killer       0.37      0.08      0.13       241
                           Doubt       0.50      0.42      0.46       865
       Exaggeration-Minimisation       0.39      0.12      0.18       347
         False_Dilemma-No_Choic

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Output Dataframe

In [75]:
print(id2label)

{0: 'Appeal_to_Authority', 1: 'Appeal_to_Fear-Prejudice', 2: 'Appeal_to_Hypocrisy', 3: 'Appeal_to_Popularity', 4: 'Causal_Oversimplification', 5: 'Conversation_Killer', 6: 'Doubt', 7: 'Exaggeration-Minimisation', 8: 'False_Dilemma-No_Choice', 9: 'Flag_Waving', 10: 'Guilt_by_Association', 11: 'Loaded_Language', 12: 'Name_Calling-Labeling', 13: 'Obfuscation-Vagueness-Confusion', 14: 'Red_Herring', 15: 'Repetition', 16: 'Slogans', 17: 'Straw_Man', 18: 'Whataboutism'}


In [91]:
pred_bools = [pl>0.25 for pl in pred_labels] #boolean output after thresholding


In [85]:
# Getting indices of where boolean one hot vector true_bools is True so we can use idx2label to gather label names
true_label_idxs, pred_label_idxs=[],[]
# for vals in true_bools:
#     true_label_idxs.append(np.where(vals)[0].flatten().tolist())
for vals in pred_bools:
    pred_label_idxs.append(np.where(vals)[0].flatten().tolist())

In [92]:
# Gathering vectors of label names using idx2label
true_label_texts, pred_label_texts = [], []
# for vals in true_label_idxs:
#     if vals:
#         true_label_texts.append([id2label[val] for val in vals])
#     else:
#         true_label_texts.append(vals)

for vals in pred_label_idxs:
    if vals:
        pred_label_texts.append([id2label[val] for val in vals])
    else:
        pred_label_texts.append(vals)

In [87]:
# Decoding input ids to comment text
comment_texts = [tokenizer.decode(text,skip_special_tokens=True,clean_up_tokenization_spaces=False) for text in tokenized_texts]

In [88]:
len(comment_texts)

3127

In [93]:
len(pred_label_texts)

3127

In [94]:
# Converting lists to df
# comparisons_df = pd.DataFrame({'comment_text': comment_texts, 'true_labels': true_label_texts, 'pred_labels':pred_label_texts})
comparisons_df = pd.DataFrame({'comment_text': comment_texts, 'pred_labels':pred_label_texts})
comparisons_df.to_csv('comparisons.csv')
comparisons_df.head(50)

Unnamed: 0,comment_text,pred_labels
0,george iii lost america .,"[Loaded_Language, Name_Calling-Labeling]"
1,theresa may could lose the united kingdom over...,"[Loaded_Language, Name_Calling-Labeling]"
2,britain is locked in the most serious peacetim...,"[Loaded_Language, Name_Calling-Labeling]"
3,brexit has shown the world a british parliamen...,"[Loaded_Language, Name_Calling-Labeling]"
4,one veteran of margaret thatcher ’ s cabinet s...,"[Loaded_Language, Name_Calling-Labeling]"
5,we speak of the oldest parliamentary democracy...,"[Loaded_Language, Name_Calling-Labeling]"
6,since 1721 it has seen 74 prime ministers of h...,"[Loaded_Language, Name_Calling-Labeling]"
7,the ultimate test of a prime minister in a cri...,"[Loaded_Language, Name_Calling-Labeling]"
8,the answer to that depends on a combination of...,"[Loaded_Language, Name_Calling-Labeling]"
9,quite often they are skills that politicians d...,"[Loaded_Language, Name_Calling-Labeling]"


In [99]:
test.shape

(3127, 1)

In [105]:
test['pred'] = comparisons_df['pred_labels'].tolist()

In [110]:
out = test
del out['text']
out.to_csv('test.txt', sep='\t', header=False)

## Bonus - Optimizing threshold value for micro F1 score

Doing this may result in a trade offs between precision, flat accuracy and micro F1 accuracy. You may tune the threshold however you want.

In [95]:
# Calculate Accuracy - maximize F1 accuracy by tuning threshold values. First with 'macro_thresholds' on the order of e^-1 then with 'micro_thresholds' on the order of e^-2

macro_thresholds = np.array(range(1,10))/10

f1_results, flat_acc_results = [], []
for th in macro_thresholds:
    pred_bools = [pl>th for pl in pred_labels]
    test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
    test_flat_accuracy = accuracy_score(true_bools, pred_bools)
    f1_results.append(test_f1_accuracy)
    flat_acc_results.append(test_flat_accuracy)

best_macro_th = macro_thresholds[np.argmax(f1_results)] #best macro threshold value

micro_thresholds = (np.array(range(10))/100)+best_macro_th #calculating micro threshold values

f1_results, flat_acc_results = [], []
for th in micro_thresholds:
    pred_bools = [pl>th for pl in pred_labels]
    test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
    test_flat_accuracy = accuracy_score(true_bools, pred_bools)
    f1_results.append(test_f1_accuracy)
    flat_acc_results.append(test_flat_accuracy)

best_f1_idx = np.argmax(f1_results) #best threshold value

# Printing and saving classification report
print('Best Threshold: ', micro_thresholds[best_f1_idx])
print('Test F1 Accuracy: ', f1_results[best_f1_idx])
print('Test Flat Accuracy: ', flat_acc_results[best_f1_idx], '\n')

best_pred_bools = [pl>micro_thresholds[best_f1_idx] for pl in pred_labels]
clf_report_optimized = classification_report(true_bools,best_pred_bools, target_names=label2id)
pickle.dump(clf_report_optimized, open('classification_report_optimized.txt','wb'))
print(clf_report_optimized)

Best Threshold:  0.24000000000000002
Test F1 Accuracy:  0.4468802698145026
Test Flat Accuracy:  0.1284153005464481 

                                  precision    recall  f1-score   support

             Appeal_to_Authority       0.32      0.32      0.32       170
        Appeal_to_Fear-Prejudice       0.32      0.37      0.34       379
             Appeal_to_Hypocrisy       0.27      0.16      0.20       221
            Appeal_to_Popularity       0.00      0.00      0.00       110
                  Appeal_to_Time       0.00      0.00      0.00        42
                Appeal_to_Values       0.30      0.33      0.32       193
       Causal_Oversimplification       0.13      0.15      0.14       111
Consequential_Oversimplification       0.22      0.15      0.18        95
             Conversation_Killer       0.25      0.30      0.27       241
                           Doubt       0.45      0.63      0.53       865
       Exaggeration-Minimisation       0.27      0.29      0.28     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [72]:
true_bools

[array([False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False,  True, False, False, False,
        False, False, False, False, False]),
 array([False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False,  True,  True, False, False,
        False, False, False, False, False]),
 array([False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False]),
 array([False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False, False,
         True, False, False, False, False]),
 array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False]),
 array([False, False, False, False, False, False, False, False, F

In [73]:
len(best_pred_bools)

3294

In [74]:
output_test = make_dataframe(folder_dev)

NameError: name 'folder_dev' is not defined

In [75]:
output_test.shape

NameError: name 'output_test' is not defined

In [76]:
output_test.head()

NameError: name 'output_test' is not defined

In [None]:
_X_test = output_test['text'].values

In [None]:
_X_test.shape

In [None]:
# Encoding input data
test_encodings = tokenizer.batch_encode_plus(_X_test,max_length=max_length,pad_to_max_length=True)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

In [None]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)
# Create test dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
# Save test dataloader
torch.save(test_dataloader,'test_data_loader')

## save to hub

In [97]:
pip install huggingface_hub

[0mNote: you may need to restart the kernel to use updated packages.


In [98]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.huggingface/token
Login successful


In [100]:
#pt_model = BertForSequenceClassification.from_pretrained("SemEval23_task3_subtask3_multi", from_tf=True)
#pt_model.save_pretrained("SemEval23_task3_subtask3_multi")
model.push_to_hub("SemEval23_task3_subtask3_multi")

Configuration saved in /tmp/tmpbf98y_zn/config.json
Model weights saved in /tmp/tmpbf98y_zn/pytorch_model.bin
Uploading the following files to franfj/SemEval23_task3_subtask3_multi: config.json,pytorch_model.bin


CommitInfo(commit_url='https://huggingface.co/franfj/SemEval23_task3_subtask3_multi/commit/0fd740a0b2efa815d10ed765074eedd598f8b25b', commit_message='Upload BertForSequenceClassification', commit_description='', oid='0fd740a0b2efa815d10ed765074eedd598f8b25b', pr_url=None, pr_revision=None, pr_num=None)

In [101]:
tokenizer.push_to_hub("SemEval23_task3_subtask3_multi")

tokenizer config file saved in /tmp/tmpezcm9233/tokenizer_config.json
Special tokens file saved in /tmp/tmpezcm9233/special_tokens_map.json
Uploading the following files to franfj/SemEval23_task3_subtask3_multi: tokenizer_config.json,special_tokens_map.json,vocab.txt


CommitInfo(commit_url='https://huggingface.co/franfj/SemEval23_task3_subtask3_multi/commit/9c3fdd9e4acf79718b33e807cd9a8f10b932a56b', commit_message='Upload tokenizer', commit_description='', oid='9c3fdd9e4acf79718b33e807cd9a8f10b932a56b', pr_url=None, pr_revision=None, pr_num=None)