# Subtask A

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
texts_df = pd.read_csv('subtaskA_train.csv')

In [4]:
texts_df.head()

Unnamed: 0,comment_text,conspiratorial
0,⚡Se non ci fossero soldati non ci sarebbero gu...,0
1,"21/08/21]( [PRE-PRINT]\n\n📄__ ""Shedding of Inf...",1
2,PAURA E DELIRIO ALLA CNN: IL MINISTERO DELLA V...,1
3,L'Aspirina non aumenta la sopravvivenza dei pa...,0
4,L'Italia non puo' dare armi lo vieta la Costit...,0


In [5]:
texts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1845 entries, 0 to 1844
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   comment_text    1842 non-null   object
 1   conspiratorial  1845 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 29.0+ KB


In [6]:
texts_df[texts_df['comment_text'].isna()]

Unnamed: 0,comment_text,conspiratorial
244,,0
263,,0
665,,0


Delete rows with NaN text

In [7]:
texts_df = texts_df[texts_df.comment_text.notna()]

In [8]:
texts_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1842 entries, 0 to 1844
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   comment_text    1842 non-null   object
 1   conspiratorial  1842 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 43.2+ KB


Count positive and negatie samples

In [9]:
texts_df.conspiratorial.value_counts()

1    925
0    917
Name: conspiratorial, dtype: int64

## Preprocessing

Substitute '\n' with ' '.

In [10]:
texts_df.comment_text = texts_df.comment_text.apply(lambda text: text.replace('\n\n', ' ').replace('\n', ' '))

## Extraction

Extracting number of emojis for each text.

In [11]:
import emoji

texts_df['emoji_count'] = texts_df.apply(lambda row: emoji.emoji_count(row.comment_text), result_type='expand', axis=1)

In [12]:
texts_df.head()

Unnamed: 0,comment_text,conspiratorial,emoji_count
0,⚡Se non ci fossero soldati non ci sarebbero gu...,0,1
1,"21/08/21]( [PRE-PRINT] 📄__ ""Shedding of Infect...",1,6
2,PAURA E DELIRIO ALLA CNN: IL MINISTERO DELLA V...,1,0
3,L'Aspirina non aumenta la sopravvivenza dei pa...,0,0
4,L'Italia non puo' dare armi lo vieta la Costit...,0,0


Extracting ratio of full uppercase words for each text.

In [13]:
def uppercase_ratio(text):
    words = text.split()
    num_upper_words = 0
    for w in words:
        num_upper_words += w.isupper()
    return num_upper_words/len(words)

In [14]:
texts_df['upper_case_ratio'] = texts_df.apply(lambda row: uppercase_ratio(row.comment_text), result_type='expand', axis=1)

In [15]:
texts_df.head(10)

Unnamed: 0,comment_text,conspiratorial,emoji_count,upper_case_ratio
0,⚡Se non ci fossero soldati non ci sarebbero gu...,0,1,0.0
1,"21/08/21]( [PRE-PRINT] 📄__ ""Shedding of Infect...",1,6,0.01087
2,PAURA E DELIRIO ALLA CNN: IL MINISTERO DELLA V...,1,0,0.040936
3,L'Aspirina non aumenta la sopravvivenza dei pa...,0,0,0.0
4,L'Italia non puo' dare armi lo vieta la Costit...,0,0,0.0
5,Ma non siete stufi di essere presi in giro??,1,0,0.0
6,**Ritengo questo audio piuttosto importante**:...,1,3,0.0
7,‼️ DIFFERENZA TRA SANGUE VACCINATO E NON VACCI...,1,5,0.205128
8,Panzana pazzesca del leghista Siri: le misure ...,1,0,0.0
9,Tesla voleva portare a tutti energia gratuita ...,0,0,0.0


Extracting ratio of bold words for each text.

In [16]:
import re

def bold_ratio(text):
    num_words = len(text.split())


    split_bolds = text.split('**')
    split_bolds.pop()
    count_bold = 0

    for i, s in enumerate(split_bolds):
        if(i%2 != 0):
            count_bold += len(s.split())

    return count_bold/num_words

In [17]:
texts_df['bold_ratio'] = texts_df.apply(lambda row: bold_ratio(row.comment_text), result_type='expand', axis=1)

In [18]:
texts_df.head(10)

Unnamed: 0,comment_text,conspiratorial,emoji_count,upper_case_ratio,bold_ratio
0,⚡Se non ci fossero soldati non ci sarebbero gu...,0,1,0.0,0.0
1,"21/08/21]( [PRE-PRINT] 📄__ ""Shedding of Infect...",1,6,0.01087,0.184783
2,PAURA E DELIRIO ALLA CNN: IL MINISTERO DELLA V...,1,0,0.040936,0.0
3,L'Aspirina non aumenta la sopravvivenza dei pa...,0,0,0.0,0.0
4,L'Italia non puo' dare armi lo vieta la Costit...,0,0,0.0,0.0
5,Ma non siete stufi di essere presi in giro??,1,0,0.0,0.0
6,**Ritengo questo audio piuttosto importante**:...,1,3,0.0,0.067568
7,‼️ DIFFERENZA TRA SANGUE VACCINATO E NON VACCI...,1,5,0.205128,0.0
8,Panzana pazzesca del leghista Siri: le misure ...,1,0,0.0,0.0
9,Tesla voleva portare a tutti energia gratuita ...,0,0,0.0,0.0


## Train-Validation split

In [19]:
from sklearn.model_selection import StratifiedShuffleSplit

In [20]:
# Split the data into training and testing datasets using stratified sampling
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, eval_index in split.split(texts_df, texts_df.conspiratorial):
    train_df_full, eval_df_full = texts_df.iloc[train_index], texts_df.iloc[eval_index]

In [21]:
print(train_df_full.info())
print(train_df_full.conspiratorial.value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1473 entries, 1512 to 1771
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   comment_text      1473 non-null   object 
 1   conspiratorial    1473 non-null   int64  
 2   emoji_count       1473 non-null   int64  
 3   upper_case_ratio  1473 non-null   float64
 4   bold_ratio        1473 non-null   float64
dtypes: float64(2), int64(2), object(1)
memory usage: 69.0+ KB
None
1    740
0    733
Name: conspiratorial, dtype: int64


In [22]:
print(eval_df_full.info())
print(eval_df_full.conspiratorial.value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 363 to 670
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   comment_text      369 non-null    object 
 1   conspiratorial    369 non-null    int64  
 2   emoji_count       369 non-null    int64  
 3   upper_case_ratio  369 non-null    float64
 4   bold_ratio        369 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 17.3+ KB
None
1    185
0    184
Name: conspiratorial, dtype: int64


The dataset seems balanced in term of positive and negative samples.

## Model

In [23]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [25]:
train_df_bert = train_df_full[['comment_text', 'conspiratorial']]
train_df_bert.columns = ["text", "labels"]
eval_df_bert = eval_df_full[['comment_text', 'conspiratorial']]
eval_df_bert.columns = ["text", "labels"]

In [26]:
eval_df_bert.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369 entries, 363 to 670
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    369 non-null    object
 1   labels  369 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.6+ KB


In [27]:
import torch

cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [28]:
eval_labels = eval_df_bert.labels.tolist()

## Bert-based models 

In [29]:
def train_validate_bert_clf(model_hgf_name, model_class, cased, eval_labels):
    
    batch_size = 8

    # Num steps in epoch = num training samples / batch size
    steps_per_epoch = int(np.ceil(len(train_df_bert) / float(batch_size)))

    print('Each epoch will have {:,} steps.'.format(steps_per_epoch))


    # Optional model configuration
    model_args = ClassificationArgs(num_train_epochs=20, do_lower_case=cased, evaluate_during_training=True, evaluate_during_training_verbose=True, # Main options
                                    use_multiprocessing=False, use_multiprocessing_for_evaluation=False, overwrite_output_dir=True,  # System configurations
                                    output_dir='out_'+model_hgf_name,
                                    eval_batch_size=batch_size, train_batch_size=batch_size, evaluate_during_training_steps=steps_per_epoch, # Batch sizes and steps
                                    use_early_stopping=True, early_stopping_metric='eval_loss', early_stopping_patience=2, early_stopping_metric_minimize=True, # Early stopping
                                    early_stopping_delta=0.01, early_stopping_consider_epochs=True
                                    )

    # Create a ClassificationModel
    model = ClassificationModel(model_class, model_hgf_name, args=model_args, use_cuda=cuda_available)

    # Train the model
    model.train_model(train_df_bert, eval_df=eval_df_bert)

    # Predict on train
    full_pred_tr = model.predict(train_df_bert.text.tolist())
    pred_tr = full_pred_tr[0]
    raw_pred_tr = full_pred_tr[1]

    # Predict on evaluation
    full_pred_eval = model.predict(eval_df_bert.text.tolist())
    pred_eval = full_pred_eval[0]
    raw_pred_eval = full_pred_eval[1]

    # Make classification report
    clf_report = classification_report(eval_labels, pred_eval, target_names=['non-conspiratorial', 'conspiratorial'], digits=4)

    return model, clf_report, raw_pred_tr, raw_pred_eval

Train and evaluate various BERT based models

Models list

In [30]:
# class, huggingface name, cased

bert_models_list = [
    ("bert", "dbmdz/bert-base-italian-cased", True),
    ("distilbert", "indigo-ai/BERTino", False),
    ("bert", "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0", False),
    ("bert", "dbmdz/bert-base-italian-xxl-cased", True)
    
]

In [31]:
trained_bert_models_list = []
eval_report_list = []
raw_pred_list_tr = []
raw_pred_list_val = []

for model_class, model_hgf_name, cased in bert_models_list:

    model, clf_report, raw_pred_tr, raw_pred_val = train_validate_bert_clf(model_hgf_name, model_class, cased, eval_labels)
    trained_bert_models_list.append(model)
    eval_report_list.append(clf_report)
    raw_pred_list_tr.append(raw_pred_tr)
    raw_pred_list_val.append(raw_pred_val)

    # Print model stats
    print('#################################')
    print('----', model_hgf_name, '----')
    print(clf_report)

    
   

Each epoch will have 185 steps.


Some weights of the model checkpoint at dbmdz/bert-base-italian-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model c

#################################
---- dbmdz/bert-base-italian-cased ----
                    precision    recall  f1-score   support

non-conspiratorial     0.6822    0.8750    0.7667       184
    conspiratorial     0.8271    0.5946    0.6918       185

          accuracy                         0.7344       369
         macro avg     0.7546    0.7348    0.7292       369
      weighted avg     0.7548    0.7344    0.7291       369

Each epoch will have 185 steps.


Some weights of the model checkpoint at indigo-ai/BERTino were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at indigo-ai/BERTino and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre

#################################
---- indigo-ai/BERTino ----
                    precision    recall  f1-score   support

non-conspiratorial     0.7315    0.8587    0.7900       184
    conspiratorial     0.8301    0.6865    0.7515       185

          accuracy                         0.7724       369
         macro avg     0.7808    0.7726    0.7707       369
      weighted avg     0.7809    0.7724    0.7707       369

Each epoch will have 185 steps.


Some weights of the model checkpoint at m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

#################################
---- m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0 ----
                    precision    recall  f1-score   support

non-conspiratorial     0.6381    0.9293    0.7566       184
    conspiratorial     0.8713    0.4757    0.6154       185

          accuracy                         0.7019       369
         macro avg     0.7547    0.7025    0.6860       369
      weighted avg     0.7550    0.7019    0.6858       369

Each epoch will have 185 steps.


Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

#################################
---- dbmdz/bert-base-italian-xxl-cased ----
                    precision    recall  f1-score   support

non-conspiratorial     0.7927    0.8315    0.8117       184
    conspiratorial     0.8239    0.7838    0.8033       185

          accuracy                         0.8076       369
         macro avg     0.8083    0.8077    0.8075       369
      weighted avg     0.8083    0.8076    0.8075       369






In [32]:
# Print model stats

for i, clf_report in enumerate(eval_report_list):

    model_hgf_name = bert_models_list[i][1]
    print('\n')
    print('----', model_hgf_name, '----')
    print(clf_report)



---- dbmdz/bert-base-italian-cased ----
                    precision    recall  f1-score   support

non-conspiratorial     0.6822    0.8750    0.7667       184
    conspiratorial     0.8271    0.5946    0.6918       185

          accuracy                         0.7344       369
         macro avg     0.7546    0.7348    0.7292       369
      weighted avg     0.7548    0.7344    0.7291       369



---- indigo-ai/BERTino ----
                    precision    recall  f1-score   support

non-conspiratorial     0.7315    0.8587    0.7900       184
    conspiratorial     0.8301    0.6865    0.7515       185

          accuracy                         0.7724       369
         macro avg     0.7808    0.7726    0.7707       369
      weighted avg     0.7809    0.7724    0.7707       369



---- m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0 ----
                    precision    recall  f1-score   support

non-conspiratorial     0.6381    0.9293    0.7566       184
    co

## Classification over Bert SoftMax + extracted features

Normalize extracted features and convert to numpy 2d array.

In [43]:
from sklearn.preprocessing import StandardScaler

extracted_cols = ['emoji_count', 'upper_case_ratio', 'bold_ratio']
scaler = StandardScaler()

extracted_features_tr = scaler.fit_transform(train_df_full[extracted_cols].to_numpy())
extracted_features_val = scaler.fit_transform(eval_df_full[extracted_cols].to_numpy())


print(extracted_features_tr.shape)
print(extracted_features_val.shape)

(1473, 3)
(369, 3)


Combine extracted features with raw BERT predictions

In [44]:
raw_pred_tr = raw_pred_list_tr[3]
raw_pred_val = raw_pred_list_val[3]

pred_with_extrac_tr = np.concatenate((raw_pred_tr, extracted_features_tr), axis=1)
pred_with_extrac_val = np.concatenate((raw_pred_val, extracted_features_val), axis=1)

print(pred_with_extrac_tr.shape)
print(pred_with_extrac_val.shape)

(1473, 5)
(369, 5)


In [45]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import callbacks

In [53]:
# Function that builds a NN from a list configuration containing layer neurons
def build_nn(layers_config, input, droput=0.0):
    model = Sequential()

    for i, layer_neurons in enumerate(layers_config):
        # Add hidden layers with the current number of neurons
        if i==0: # First layer
            model.add(Dense(layer_neurons, activation='relu', input_dim=input.shape[1]))
        else:
            model.add(Dropout(droput))
            model.add(Dense(layer_neurons, activation='relu', input_dim=layers_config[i-1]))
    
    # Add the output layer
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [60]:
# Define a list of possible lists of numbers of neurons for each layer
layers_config_list = [[8], [12], [16], [24]]

history_list = []
models_list = []
val_acc_list = []

# Iterate over all possible combinations of number of layers and neurons
for layers_config in layers_config_list:
    # Builds NN architecture
    model = build_nn(layers_config, input=pred_with_extrac_tr, droput=0.3)

    # Add early stopping that checks val loss
    earlystopping = callbacks.EarlyStopping(monitor ="val_loss",  mode ="min", patience = 2, restore_best_weights = True)
    
    # Train the model on the training set
    history = model.fit(pred_with_extrac_tr, train_df_bert.labels, validation_data = (pred_with_extrac_val,eval_df_bert.labels),
        epochs=50, batch_size=32, verbose=0, callbacks=[earlystopping])
    
    # Evaluate the model on the validation set
    _, accuracy = model.evaluate(pred_with_extrac_val, eval_df_bert.labels, verbose=0)

    # Save and show results
    print('NN with ', layers_config, 'configuration - Validation Score:', accuracy,
        '- Epochs:', earlystopping.stopped_epoch)
    history_list.append(history)
    models_list.append(model)
    val_acc_list.append(accuracy)

NN with  [8] configuration - Validation Score: 0.8021680116653442 - Epochs: 6
NN with  [12] configuration - Validation Score: 0.8048780560493469 - Epochs: 2
NN with  [16] configuration - Validation Score: 0.7859078645706177 - Epochs: 5
NN with  [24] configuration - Validation Score: 0.8048780560493469 - Epochs: 4
