# MSc Thesis Applied Data Science

Because the data come from a sensitive source, they are not included in this notebook or in the repository. This code was used to train the classifier presented in the paper.

## Initialisation


In [None]:
# Import modules
import torch, evaluate, re
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.utils.class_weight import compute_class_weight
from imblearn.under_sampling import RandomUnderSampler

# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Import tokenizer
# Loaded from the same "pdelobelle/robbert-v2-dutch-base" checkpoint name that the model is loaded from further down
tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

## Data

In [None]:
# Load Data
# 'dataset' stands in for the real CSV path: the data are not distributed with this notebook.
# The rest of the notebook reads the 'text' and 'label' columns of this frame.
df = pd.read_csv('dataset')

# Preprocess Data
# Class names in label-id order: the position of a name in this list is its integer label
label_names = [
    'Niets',
    'Dehumaniserend',
    'Demoniserend',
    '(Gewelds)bedreiging',
    'Doodsbedreiging',
    'Elite-complot',
]


def l2t(label: int) -> str:
    """Return the text label stored at position `label` of `label_names`."""
    label_text = label_names[label]
    return label_text


def t2l(label_text: str) -> int:
    """Return the integer position of `label_text` in `label_names`.

    Raises ValueError (from `list.index`) when the text is not a known label.
    """
    label = label_names.index(label_text)
    return label

def clean_labels(labels,
                 convert_labels = False):
    """Split a doccano label string into a list of individual labels.

    Args:
        labels: Label cell in doccano format, i.e. label names joined by '#'.
            A missing value (None, or the float NaN that pandas uses for an
            empty cell) is treated as the single label 'Niets'.
        convert_labels: When True, return int labels (looked up with `t2l`)
            instead of text labels.

    Returns:
        A list of text labels, or a list of int labels when `convert_labels`
        is True.

    Raises:
        ValueError: If `convert_labels` is True and a label is not known to `t2l`.
    """
    # pandas yields float NaN (not None) for empty cells, and NaN has no .split();
    # NaN is the only float that is not equal to itself.
    is_missing = labels is None or (isinstance(labels, float) and labels != labels)
    if is_missing:
        labels = 'Niets'

    # Split labels into list
    split_labels = labels.split('#')
    if convert_labels:
        return [t2l(lab) for lab in split_labels]
    return split_labels
    
def preproc_message(text: str):
    """
    Clean a raw message in three ordered passes:
    1. strip hyperlinks,
    2. collapse runs of two or more double quotes into a single double quote,
    3. rewrite every @mention as @user.
    """
    substitutions = (
        (r'(http|https|ftp|www|\w+\.\w+/)\S+', ''),  # hyperlinks
        (r'"{2,}', '"'),  # repeated quotation marks
        (r'@\S+', '@user'),  # @mentions
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Clean the tweets
# Run preproc_message over every entry of the 'text' column and store the result back in the same column
df['text'] = df['text'].apply(preproc_message)

## Undersampling the data

In [None]:
# Randomly under-sample the data with a fixed seed so the resampling is reproducible
rus = RandomUnderSampler(random_state=123)
X, Y = df[['text']], df[['label']]  # Select 'text' and 'label' columns
reX, reY = rus.fit_resample(X, Y)

# Rebuild a single frame from the resampled features and labels
dfRe = pd.DataFrame({'text': reX['text'], 'label': reY['label']})
dfRe = dfRe.dropna(axis=0)
dfRe['label'] = dfRe['label'].astype(int)
# dfRe only holds 'text' and 'label' (no 'id' column is carried through the
# resampling), so selecting 'id' here raised a KeyError.
dataset = Dataset.from_pandas(dfRe[['text', 'label']], preserve_index=True)

## Split data

In [None]:
# First cut: 75% training data, 25% held out
trainData = dataset.train_test_split(test_size=0.25, seed=123)
# Second cut on the held-out part: 40% validation, 60% test (i.e. 10% / 15% of all rows)
valData = trainData['test'].train_test_split(test_size=0.6, seed=123)

finalDataset = DatasetDict({
    "train": trainData['train'],
    "val": valData['train'],
    "test": valData['test'],
})

## Compute Class Weights

In [None]:
# Labels of the training split only
y = finalDataset['train']['label']
# One 'balanced' weight per distinct label value that occurs in y
classWeights = compute_class_weight(class_weight= 'balanced', classes = np.unique(y), y=y)
# NOTE(review): these weights are later given to CrossEntropyLoss for a model with num_labels=5,
# so this array presumably needs exactly 5 entries — confirm every class occurs in the training split
classWeights

## Initialize Padding

In [None]:

# Padding collator built from the tokenizer; it is handed to the trainer as `data_collator` further down
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model


In [None]:
# Label ids used by the classification head
# NOTE(review): label_names earlier in the notebook lists a sixth class, 'Elite-complot',
# that has no id here — confirm it is intentionally excluded from the model
id2label = {
    0: 'Niets',
    1: 'Dehumaniserend',
    2: 'Demoniserend',
    3: '(Gewelds)bedreiging',
    4: 'Doodsbedreiging',
}
# Reverse lookup, derived from id2label so the two mappings cannot drift apart
label2id = {name: idx for idx, name in id2label.items()}

model = RobertaForSequenceClassification.from_pretrained(
    "pdelobelle/robbert-v2-dutch-base",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the module-level tokenizer, with truncation enabled."""
    return tokenizer(examples["text"], truncation=True)
# Apply the tokenization to every split of finalDataset, in batched mode
tokenizedDataset = finalDataset.map(preprocess_function, batched=True)

### Metrics


In [None]:
# Metric objects are loaded once and reused on every evaluation pass
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (model scores, reference labels). The predicted
    class is the argmax over axis 1 of the scores. Precision, recall and F1
    are weighted averages; accuracy is computed without averaging.

    Returns a dict with the keys "precision", "recall", "f1-score" and "accuracy".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    pair = {"predictions": predicted, "references": references}
    return {
        "precision": precision_metric.compute(**pair, average='weighted')["precision"],
        "recall": recall_metric.compute(**pair, average='weighted')["recall"],
        "f1-score": f1_metric.compute(**pair, average='weighted')["f1"],
        'accuracy': accuracy_metric.compute(**pair)["accuracy"],
    }

## Define and train the model

In [None]:
class MyTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the model's logits."""

    def __init__(self, class_weights, *args, **kwargs):
        """
        Args:
            class_weights: Per-class weights handed to torch.nn.CrossEntropyLoss as
                `weight` (a float tensor with one entry per class), or None for an
                unweighted loss.
            *args, **kwargs: Forwarded unchanged to `Trainer.__init__`.
        """
        super().__init__(*args, **kwargs)
        # You pass the class weights when instantiating the Trainer
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the training/evaluation loss for one batch.

        Mirrors the stock Trainer logic, except that the model's own loss is replaced
        by a class-weighted CrossEntropyLoss when no label smoother is configured.

        Args:
            model: The model being trained; called as `model(**inputs)`.
            inputs: Batch dict; must contain "labels" on the weighted-loss path.
            return_outputs: When True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: Accepted only so the override stays call-compatible
                with newer Trainer versions that pass this keyword; it is not used.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            # Label smoothing takes precedence; the class weights are not applied on this path
            loss = self.label_smoother(outputs, labels)
        else:
            logits = outputs['logits']
            weights = self.class_weights
            if weights is not None:
                # Keep the weights on the logits' device so the loss never mixes devices
                weights = weights.to(logits.device)
            criterion = torch.nn.CrossEntropyLoss(weight=weights)
            loss = criterion(logits, inputs['labels'])

        return (loss, outputs) if return_outputs else loss

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run
# NOTE(review): no metric_for_best_model is set, so which checkpoint counts as "best" for
# load_best_model_at_end presumably falls back to the library default — confirm
training_args = TrainingArguments(
    output_dir="Classifier",  # output directory for the run
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluation schedule: per epoch ...
    save_strategy="epoch",  # ... and the same schedule for saving
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [None]:
# Wire model, data, metrics and class weights into the custom trainer
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=  tokenizedDataset['train'],
    eval_dataset= tokenizedDataset['val'],  # validation split; the 'test' split is not given to the trainer
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # class weights converted to a float tensor and moved to the device selected at the top of the notebook
    class_weights=torch.tensor(classWeights,dtype=torch.float).to(device)
)

In [None]:
# Run the fine-tuning with the trainer configured in the previous cell
trainer.train()

In [None]:
# Called without a dataset argument — presumably this scores the 'val' split passed as eval_dataset; confirm.
# The 'test' split is not evaluated anywhere in this notebook.
trainer.evaluate()

## Testing model

In [None]:
# Build a text-classification pipeline from a saved model; "/path/to/classifier" looks like a placeholder to replace with the real model directory
classifier = pipeline("text-classification", model="/path/to/classifier")

In [None]:
# Classify a single example sentence with the loaded pipeline
classifier("Test Sentence")