## **Prerequisites**

Google Colab runtime with extended RAM needed for k fold cross-validation

In [1]:
# install needed packages simpletransformers
!pip install simpletransformers scikit-learn jedi Counter lxml openpyxl


Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m194.6/250.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting jedi
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Counter
  Downloading Counter-1.0.0.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets (from simpletransformers)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m21.9 MB/s[0m eta [36m0

In [2]:
# import needed modules
import random as rn
import numpy as np
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from collections import Counter
import openpyxl
import gc

In [None]:
# load packages to make progress bar of simpletransformers in vs code work
#from tqdm import tqdm
#from ipywidgets import interact
#import ipywidgets as widgets

In [3]:
# mount GDrive to be able to import data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# set global seed for reproducibility of results
seed = 1337
np.random.seed(seed)

## **Import training data**

In [5]:
pd.set_option('display.max_columns', None)

# use dropna() to remove empty excel rows
df = pd.read_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierMig/articles_mig_final.xlsx")

df = df.dropna(subset=["Text"])

print(df.head())

        Date                                              Title  \
0 2023-07-10          86 Migranten südlich der Kanaren gerettet   
1 2023-07-10  Seit Tagen vermisstes Flüchtlingsboot offenbar...   
2 2023-07-09    IfW-Chef: "Wir brauchen eine Million Migranten"   
3 2023-06-30  Ungarn und Polen blockieren Konsens zu EU-Asyl...   
4 2023-06-27   Deutschland verzeichnet 2022 Zuwanderungs-Rekord   

                                                Text  \
0  Der Atlantik zählt zu den gefährlichsten Fluch...   
1  Drei Boote aus dem Senegal mit Hunderten Migra...   
2  Der demografische Wandel und der Fachkräfteman...   
3  Anfang Juni handeln die EU-Innenminister mühsa...   
4  Der Krieg in der Ukraine sorgt in Deutschland ...   

                                                Lead  \
0  Der Atlantik zählt zu den gefährlichsten Fluch...   
1  Drei Boote aus dem Senegal mit Hunderten Migra...   
2  Der demografische Wandel und der Fachkräfteman...   
3  Anfang Juni handeln die EU-Innenm

In [None]:
# construct equally disributed sample
#df = pd.concat([
#    df[df['migration'] == 0].sample(100),
#    df[df['migration'] == 1].sample(100)
#])
#df

In [6]:
# get length of imported dataset
len(df)

45864

In [7]:
# check distribution of 1s = migration piece / 0s = descriptive article
print(Counter(df['migration'].values))

Counter({1: 23132, 0: 22732})


In [8]:
# Zähle die Anzahl der Zeilen mit migration = 0 und 1
counts = df['migration'].value_counts()

# Bestimme die minimale Anzahl von Zeilen, die benötigt wird
min_count = min(counts)

# Gleichverteilter Datensatz erstellen
df = pd.concat([
    df[df['migration'] == 0].sample(n=min_count, replace=True),
    df[df['migration'] == 1].sample(n=min_count, replace=True)
])

# Das ausgeglichene DataFrame balanced_data enthält jetzt die gewünschte Verteilung


# **Data preparation**

In [9]:
# split dataset into 5 folds
kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

# test if it works
i = 0

for train_index, val_index in kf.split(df, df["migration"]):
    i = i+1

    train_df = df[["Text", "migration"]].iloc[train_index]
    val_df  = df[["Text", "migration"]].iloc[val_index]

    print(f"Train {i} {len(train_df)} and Test {i} {len(val_df)}")
    print(f"Train {i} {train_df.index} and Test {i} {val_df.index}")

Train 1 36371 and Test 1 9093
Train 1 Int64Index([ 3501, 24535, 42765, 45119, 13948, 27641, 44012, 29862, 31449,
            30088,
            ...
             1348, 34514, 39350, 21630,  2817, 21153,  9030,  8116, 10908,
            21548],
           dtype='int64', length=36371) and Test 1 Int64Index([ 5864, 42253, 29496, 42944, 15634,  4524,  3543, 24833, 14884,
            14853,
            ...
            20822,  7717,  1898, 33704, 34883, 35999,  9807, 30703, 33649,
            34856],
           dtype='int64', length=9093)
Train 2 36371 and Test 2 9093
Train 2 Int64Index([ 5864,  3501, 24535, 42765, 45119, 42253, 13948, 29496, 44012,
            29862,
            ...
             9807, 22708, 30703, 20351, 33649,  2817, 21153,  9030,  8116,
            34856],
           dtype='int64', length=36371) and Test 2 Int64Index([27641, 42536, 27185, 24158,  1069, 26684, 23475, 32214,  5294,
            31729,
            ...
            16577, 34760, 19847, 19419,  1348, 34514, 3935

# **Define settings for the training process/model**


In [10]:
# define hyperparameters for model (https://simpletransformers.ai/docs/usage/)

# example for understanding batch size and epochs:
# Assume you have a dataset with 200 samples (rows of data) and you choose a batch size of 5 and 1,000 epochs.
# This means that the dataset will be divided into 40 batches, each with five samples. The model weights will be updated after each batch of five samples.
# This also means that one epoch will involve 40 batches or 40 updates to the model.
# With 1,000 epochs, the model will be exposed to or pass through the whole dataset 1,000 times. That is a total of 40,000 batches during the entire training process.


train_args ={"reprocess_input_data": True, # True needed for k fold cross validation (since we use different training sets)!!! If True, the input data will be reprocessed even if a cached file of the input data exists in the cache_dir.
             "overwrite_output_dir": True, # If True, the trained model will be saved to the ouput_dir and will overwrite existing saved models in the same directory.
             "use_cached_eval_features": False, # False needed for k fold cross validation (since we use different evaluation sets)!!! tokenize validation set not again and again when ever a validation is conducted
             "no_cache": True,
             "output_dir": "outputs", # The directory where all outputs will be stored. This includes model checkpoints and evaluation results.
             "fp16": True, # fp16 = True when graphic card is avaliable otherwise fp16 = False
             "max_seq_length": 512, # maximum number of tokens that a sequence can contain. Any tokens that appear after the max_seq_length will be truncated (max value: 512)
             "num_train_epochs": 1, # defines the number times that the learning algorithm will work through the entire training dataset. Take a high number since early stopping will stop the model automatically when its enough
             "train_batch_size": 16, # defines the number of samples to work through before updating the internal model parameters (smaller = better / 32 common for use / see: https://wandb.ai/ayush-thakur/dl-question-bank/reports/What-s-the-Optimal-Batch-Size-to-Train-a-Neural-Network---VmlldzoyMDkyNDU)
            # use the following if the machine has not enough gpu ram for bigger batch sizes:
            # "gradient_accumulation_steps": 2, # e.g. 16 batch size * 2 gradient accumulation = 32 batch size (uses batch size 16 but updates first internal model parameters after 2 batches are worked through)
            # when using gradient_acc use for evaluate_during_training_steps -> batch size/len(data) / gradient = steps per epoch
             "use_early_stopping": True, # prevent model from overfitting
             "early_stopping_metric": 'eval_loss', # choose evaluation metric for early stopping (other metric e.g. mcc) -> eval_loss: how good can the model extrapolate to not seen data
             "early_stopping_delta": 0.01, # stop if eval_loss cannot get better by 0.01
             "early_stopping_metric_minimze": True, # eval_loss should be minimized (note: if mcc is used, it should get maximizied!)
             "evaluate_during_training": True, # evaluation will be performed during training to monitor the training process closely in order to find best model
             "evaluate_during_training_steps": 455, # Perform evaluation at every specified number of steps. In this case evaluate twice every epoch (steps_per_epoch/2)
             "early_stopping_patience": 15, # Terminate training after this many evaluations without an improvement in the evaluation metric greater then early_stopping_delta
             "evaluate_during_training_verbose": True, # Print results from evaluation during training.
             "manual_seed": seed, # for reproducible results
             "use_multiprocessing": False, # !!! False needed for use with extended RAM in Google Colab otherwise the training process will not start
             "use_multiprocessing_for_evaluation": False, # !!! False needed for use with extended RAM in Google Colab otherwise the training process will not start
             "save_steps": -1,
             "sliding_window": False} # dont save checkpoint every 2000 steps by default


In [11]:
train_args
# model.args

{'reprocess_input_data': True,
 'overwrite_output_dir': True,
 'use_cached_eval_features': False,
 'no_cache': True,
 'output_dir': 'outputs',
 'fp16': True,
 'max_seq_length': 512,
 'num_train_epochs': 1,
 'train_batch_size': 16,
 'use_early_stopping': True,
 'early_stopping_metric': 'eval_loss',
 'early_stopping_delta': 0.01,
 'early_stopping_metric_minimze': True,
 'evaluate_during_training': True,
 'evaluate_during_training_steps': 455,
 'early_stopping_patience': 15,
 'evaluate_during_training_verbose': True,
 'manual_seed': 1337,
 'use_multiprocessing': False,
 'use_multiprocessing_for_evaluation': False,
 'save_steps': -1,
 'sliding_window': False}

In [12]:
# check how many steps per epoch will be conducted using a batch size of 16 and folds with a proportion of 80% training data and 20% validation data
steps_per_epoch = (len(df)*0.8)/float(train_args['train_batch_size'])
steps_per_epoch

2273.2000000000003

In [13]:
# evaluate 5 times per epoch
steps_per_epoch/5

454.64000000000004

# **Perform k-fold Cross validation**

In [16]:
# prepare excel file in which results of k-fold cross-validation are written

# evaluation metrices
eval_metrics = ['auprc', 'auroc', 'eval_loss', 'fn', 'fp', 'mcc', 'tn', 'tp']

# prepare excel file
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows

excel_file = 'k_fold_cross_validation_results_mig_NOSW.xlsx'
wb = Workbook(write_only=True) # use write only of openpyxl to reduce memory usage in following loop. Otherwise you ll run into an out of ram error

# Create worksheet for results
result_sheet = wb.create_sheet(title='Results')
result_sheet.append(['Fold'] + eval_metrics)

Exception ignored in: <generator object WorksheetWriter.get_stream at 0x7d993e82a570>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/openpyxl/worksheet/_writer.py", line 289, in get_stream
  File "src/lxml/serializer.pxi", line 1834, in lxml.etree._FileWriterElement.__exit__
  File "src/lxml/serializer.pxi", line 1570, in lxml.etree._IncrementalFileWriter._write_end_element
lxml.etree.LxmlSyntaxError: inconsistent exit action in context manager
Exception ignored in: <generator object WriteOnlyWorksheet._write_rows at 0x7d993e82a5e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/openpyxl/worksheet/_write_only.py", line 66, in _write_rows
  File "src/lxml/serializer.pxi", line 1834, in lxml.etree._FileWriterElement.__exit__
  File "src/lxml/serializer.pxi", line 1568, in lxml.etree._IncrementalFileWriter._write_end_element
lxml.etree.LxmlSyntaxError: not in an element


In [17]:
# Perform K-fold Cross-Validation
for fold, (train_idx, val_idx) in enumerate(kf.split(df, df['migration'])):
    # Create folds out of dataset
    train_data = df[["Text", "migration"]].iloc[train_idx]
    val_data = df[["Text", "migration"]].iloc[val_idx]

    # Load pretrained pretrained german BERT model (cased -> takes into account lowercase and uppercase letters)
    # Models are imported from huggingface (see for a list: https://huggingface.co/transformers/v3.3.1/pretrained_models.html)
    model = ClassificationModel(
    "bert", "bert-base-german-cased",
    num_labels=2,
    args=train_args,
    use_cuda = True
    )

    # Initialize and train model
    model.train_model(train_data, eval_df=val_data)

    # free RAM after training has finished
    del(model)
    del(train_data)
    gc.collect()

    # load the best model from the best_model folder of the previous training process
    model = ClassificationModel(
    "bert", "/content/outputs/best_model",
    num_labels=2,
    args=train_args,
    use_cuda = True
    )

    # Use the best model to perform validation
    results, model_outputs, wrong_predictions = model.eval_model(val_data)

    # Save results in result worksheet
    result_sheet.append([fold + 1] + [results.get(metric, None) for metric in eval_metrics])

    # Create worksheet for misclassified predictions
    wrong_pred_sheet = wb.create_sheet(title=f'Fold {fold+1} - Wrongly classified predictions')
    wrong_pred_sheet.append(['Index number in whole dataset', 'Row number in validation dataset', 'Text', 'Predicted class'])

    # Save misclassified predictions to misclassified worksheet
    for pred in wrong_predictions:
        index_in_data = val_data.iloc[pred.guid].name
        wrong_pred_sheet.append([index_in_data, pred.guid, pred.text_a, pred.label])

    # free RAM after evaluation again
    del(results)
    del(model_outputs)
    del(wrong_predictions)
    del(model)
    del(val_data)
    gc.collect()


# save k-fold cross-validation results file
wb.save(excel_file)

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Exception ignored in: <generator object WorksheetWriter.get_stream at 0x7d993e82a810>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/openpyxl/worksheet/_writer.py", line 289, in get_stream
    with xf.element("worksheet", xmlns=SHEET_MAIN_NS):
  File "src/lxml/serializer.pxi", line 1834, in lxml.etree._FileWriterElement.__exit__
  File "src/lxml/serializer.pxi", line 1570, in lxml.etree._IncrementalFileWriter._write_end_element
lxml.etree.LxmlSyntaxError: inconsistent exit action in context manager
Exception ignored in: <generator object WriteOnlyWorksheet._write_rows at 0x7d993e82a730>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/openpyxl/worksheet/_write_only.py", line 66, in _write_rows
    with xf.element("sheetData"):
  File "src/lxml/serializer.pxi", line 1834, in lxml.etree._FileWriterElement.__exit__
  File "src/lxml/serializer.pxi", line 1568, in lxml.etree._IncrementalFileWriter._write_end_element
lxml.

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2274 [00:00<?, ?it/s]



Running Evaluation:   0%|          | 0/1137 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2274 [00:00<?, ?it/s]



Running Evaluation:   0%|          | 0/1137 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2274 [00:00<?, ?it/s]



Running Evaluation:   0%|          | 0/1137 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2274 [00:00<?, ?it/s]



Running Evaluation:   0%|          | 0/1137 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2274 [00:00<?, ?it/s]



Running Evaluation:   0%|          | 0/1137 [00:00<?, ?it/s]

