In [None]:
# install simpletransformers
!pip install simpletransformers scikit-learn jedi Counter wandb


Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting jedi
  Downloading jedi-0.19.0-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Counter
  Downloading Counter-1.0.0.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wandb
  Downloading wandb-0.15.8-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.31.0 (from simpletransformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from simpletransf

In [None]:
import sys
import numpy as np
import random as rn
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from collections import Counter
import logging # for seeing validation metrics while training the model in real time
import wandb # for visualization of training (User account required, best to use Google account as wandb recognises Colab sessios)
from sklearn.model_selection import StratifiedKFold, KFold
import openpyxl

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#wandb.login()

In [None]:
# Quieten the "transformers" logger so only warnings and errors get through,
# while everything else logs at INFO (validation metrics during training).
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [None]:
seed = 1337

In [None]:
# Show every column when a frame is printed.
pd.set_option('display.max_columns', None)

# Load the labelled articles and drop rows without article text
# (this removes empty Excel rows).
df = pd.read_excel(
    "/content/drive/MyDrive/Masterarbeit/BertClassifierMig/articles_mig_final_edited.xlsx"
).dropna(subset=["Text"])

print(df.head())

        Date                                              Title  \
0 2023-07-10          86 Migranten südlich der Kanaren gerettet   
1 2023-07-10  Seit Tagen vermisstes Flüchtlingsboot offenbar...   
2 2023-07-09    IfW-Chef: "Wir brauchen eine Million Migranten"   
3 2023-06-30  Ungarn und Polen blockieren Konsens zu EU-Asyl...   
4 2023-06-27   Deutschland verzeichnet 2022 Zuwanderungs-Rekord   

                                                Text  \
0  Der Atlantik zählt zu den gefährlichsten Fluch...   
1  Drei Boote aus dem Senegal mit Hunderten Migra...   
2  Der demografische Wandel und der Fachkräfteman...   
3  Anfang Juni handeln die EU-Innenminister mühsa...   
4  Der Krieg in der Ukraine sorgt in Deutschland ...   

                                                Lead  \
0  Der Atlantik zählt zu den gefährlichsten Fluch...   
1  Drei Boote aus dem Senegal mit Hunderten Migra...   
2  Der demografische Wandel und der Fachkräfteman...   
3  Anfang Juni handeln die EU-Innenm

In [None]:
len(df)

45845

# Prepare data for training of final model

In [None]:
# construct train dataset (90%) and test dataset (10%), stratified by the "migration" label
split = train_test_split(df[["Text", "migration"]], test_size = 0.1, stratify= df['migration'], random_state = seed)

In [None]:
# `split` holds the two partitions in the order [train, test].
train_data, test_data = split
train_data

Unnamed: 0,Text,migration
25430,Im Internet kursieren Videos mit einem Transpo...,0
44070,"Sheeeeeeeee-it. Um den Fall von Baltimore, die...",0
2066,Berlin. Der geschäftsführende Außenminister He...,1
41847,Unionskanzlerkandidat Armin Laschet hat die al...,0
15494,Andrea Nahles warnt vor sozialen Verwerfungen....,0
...,...,...
18188,"„Don't worry, be happy!“ von Bobby McFerrin er...",1
39033,"Die Welt: Herr Haseloff, 25 Jahre nach der Wen...",1
25433,Der Bundestag hat dem Beitritt Schwedens und F...,0
14860,In der Abendsonne am Echinger Bürgerplatz komm...,0


In [None]:
test_data

In [None]:
#check balance of constructed datasets
print(Counter(train_data['migration'].values))

Counter({1: 20794, 0: 20466})


In [None]:
#check balance of constructed datasets
print(Counter(test_data['migration'].values))

# **Define settings for the final training process/model**

In [None]:
# define hyperparameters for model (https://simpletransformers.ai/docs/usage/)

# example for understanding batch size and epochs:
# Assume you have a dataset with 200 samples (rows of data) and you choose a batch size of 5 and 1,000 epochs.
# This means that the dataset will be divided into 40 batches, each with five samples. The model weights will be updated after each batch of five samples.
# This also means that one epoch will involve 40 batches or 40 updates to the model.
# With 1,000 epochs, the model will be exposed to or pass through the whole dataset 1,000 times. That is a total of 40,000 batches during the entire training process.


# Hyperparameters for the final training run.
train_args = {
    "reprocess_input_data": True,  # re-process the input data even if a cached file exists in cache_dir
    "overwrite_output_dir": True,  # save the trained model to output_dir, overwriting models already stored there
    "use_cached_eval_features": False,  # per the simpletransformers docs, False recomputes the evaluation features at every evaluation instead of reusing cached ones
    "output_dir": "outputs",  # directory for all outputs (model checkpoints and evaluation results)
    "fp16": True,  # True when a graphics card is available, otherwise False
    "max_seq_length": 512,  # maximum tokens per sequence; anything beyond is truncated (max value: 512)
    "sliding_window": False,  # do not use the sliding-window technique for sequences longer than 512 tokens
    "num_train_epochs": 15,  # upper bound on passes over the training data; early stopping is expected to end training sooner
    "train_batch_size": 16,  # samples processed before the model parameters are updated (see: https://wandb.ai/ayush-thakur/dl-question-bank/reports/What-s-the-Optimal-Batch-Size-to-Train-a-Neural-Network---VmlldzoyMDkyNDU)
    # use the following if the machine has not enough gpu ram for bigger batch sizes:
    # "gradient_accumulation_steps": 2,  # e.g. 16 batch size * 2 gradient accumulation = 32 effective batch size
    # when using gradient accumulation, steps per epoch = len(data) / batch size / gradient_accumulation_steps
    "use_early_stopping": True,  # prevent the model from overfitting
    "early_stopping_metric": 'eval_loss',  # metric watched by early stopping (alternative: e.g. mcc)
    "early_stopping_delta": 0.01,  # an evaluation only counts as an improvement if eval_loss gets better by more than 0.01
    # Fixed spelling: the library option is "early_stopping_metric_minimize"; the former
    # key "early_stopping_metric_minimze" is not the option the library reads.
    "early_stopping_metric_minimize": True,  # eval_loss should be minimized (note: mcc would have to be maximized!)
    "evaluate_during_training": True,  # evaluate during training to monitor the run and find the best model
    "evaluate_during_training_steps": 516,  # evaluate every 516 steps, i.e. about five times per epoch (steps_per_epoch/5)
    "early_stopping_patience": 10,  # stop after this many evaluations without an improvement greater than early_stopping_delta
    "evaluate_during_training_verbose": True,  # print results from evaluation during training
    "manual_seed": seed,  # for reproducible results
    "save_steps": -1,  # NOTE(review): -1 is understood to disable step-based checkpoint saving - confirm against the simpletransformers docs
}


# Create a BERT ClassificationModel using the pretrained german BERT model (cased -> takes into account lowercase and uppercase letters)
# models are imported from huggingface (see for a list: https://huggingface.co/transformers/v3.3.1/pretrained_models.html)
model = ClassificationModel(
    "bert", "bert-base-german-cased",
    num_labels=2,
    args=train_args,
    use_cuda = True
)


Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

In [None]:
model.args

ClassificationArgs(adafactor_beta1=None, adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_eps=(1e-30, 0.001), adafactor_relative_step=True, adafactor_scale_parameter=True, adafactor_warmup_init=True, adam_betas=(0.9, 0.999), adam_epsilon=1e-08, best_model_dir='outputs/best_model', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=0, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0.01, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=10, encoding=None, eval_batch_size=8, evaluate_during_training=True, evaluate_during_training_silent=True, evaluate_during_training_steps=2015, evaluate_during_training_verbose=True, evaluate_each_epoch=True, fp16=True, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, loss_type=None, loss_args={}, manual_see

In [None]:
# check how many steps per epoch will be conducted using a batch size of 16 when 90% of the data is used for training (10% is held out as test data)
steps_per_epoch = (len(df)*0.9)/float(train_args['train_batch_size'])
steps_per_epoch

2578.78125

In [None]:
steps_per_epoch/5

515.75625

# **Train final model**

In [None]:
# fine-tune the pretrained bert-base-german-cased model with our final training dataset
# to use early stopping, eval_df has to be defined here
# NOTE(review): test_data serves as eval_df here and is evaluated again further down, so the
# test metrics are not computed on data unseen during model selection - confirm this is intended.
model.train_model(train_data, eval_df = test_data)



  0%|          | 0/41260 [00:00<?, ?it/s]

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Running Epoch 0 of 15:   0%|          | 0/2579 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]

Running Epoch 1 of 15:   0%|          | 0/2579 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]

Running Epoch 2 of 15:   0%|          | 0/2579 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]

Running Epoch 3 of 15:   0%|          | 0/2579 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]

Running Epoch 4 of 15:   0%|          | 0/2579 [00:00<?, ?it/s]



  0%|          | 0/4585 [00:00<?, ?it/s]

(10320,
 defaultdict(list,
             {'global_step': [516,
               1032,
               1548,
               2064,
               2579,
               2580,
               3096,
               3612,
               4128,
               4644,
               5158,
               5160,
               5676,
               6192,
               6708,
               7224,
               7737,
               7740,
               8256,
               8772,
               9288,
               9804,
               10316,
               10320],
              'train_loss': [0.32471656799316406,
               0.21952980756759644,
               0.29263418912887573,
               0.06859558820724487,
               0.025036176666617393,
               0.08817046880722046,
               0.12700510025024414,
               0.015980392694473267,
               0.03907155990600586,
               0.05035965144634247,
               0.004143039230257273,
               0.0077250003814697266,
 

# **Test model and evaluate performance**

In [None]:
# evaluate model performance using test_data dataframe

# evaluation metrics:
# mcc - matthews  correlation coefficient (range -1:1): close to 1 good / close to 0 no difference / under 0 bad performance -> doing the opposite as expected
# tp: true positive
# tn: true negative
# fp: false positive
# fn: false negative
# for the following: https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc?hl=de
# auroc (range 0:1): is the area under the curve where x is false positive rate (FPR) and y is true positive rate. Tells whether model has good discriminatory ability: 0.70 – 0.80 is good performance, greater than 0.8 is excellent performance
# auprc: is the area under the curve where x is recall and y is precision.
# eval_loss: loss on the evaluation data (lower is better); this is the metric configured for early stopping

# NOTE: the name wrong_predictions is re-bound by a later cell (rows of test_data with a wrong predicted label)
results, model_outputs, wrong_predictions = model.eval_model(test_data)

In [None]:
test_inputs = list(test_data['Text'])

# Run the model's predictions on the test texts
predictions, raw_outputs = model.predict(test_inputs)

In [None]:
# Build a DataFrame that pairs each test text with its true and predicted label
results1 = pd.DataFrame(
    {
        'Text': test_data['Text'],
        'true_label': test_data['migration'],
        'predicted_label': predictions,
    }
)

# Keep only the rows the model predicted incorrectly
mismatch_mask = results1['true_label'].ne(results1['predicted_label'])
wrong_predictions = results1[mismatch_mask]

In [None]:
wrong_predictions

In [None]:
# Now look at evaluation data
# NOTE(review): val_data is not defined in any cell visible in this notebook; it has to be
# loaded before this cell can run on a fresh kernel - confirm where it comes from.
val_inputs = list(val_data['Text'])

# Run the model's predictions on the evaluation texts
predictions, raw_outputs = model.predict(val_inputs)


In [None]:
# Build a DataFrame that pairs each evaluation text with its true and predicted label
results2 = pd.DataFrame(
    {
        'Text': val_data['Text'],
        'true_label': val_data['migration'],
        'predicted_label': predictions,
    }
)

# Keep only the rows the model predicted incorrectly
mismatch_mask2 = results2['true_label'].ne(results2['predicted_label'])
wrong_predictions2 = results2[mismatch_mask2]

In [None]:
# Show the misclassified rows of the evaluation data (wrong_predictions2);
# wrong_predictions holds the test_data errors and was already displayed above.
wrong_predictions2

In [None]:
# Accuracy on the evaluation data: share of val_data rows that were labelled correctly.
# The error count must come from wrong_predictions2 (val_data errors); wrong_predictions
# holds the test_data errors and does not belong to the same data set as len(val_data).
accuracy = 1 - (len(wrong_predictions2) / len(val_data))
accuracy

0.9569816643159379

In [None]:
# Count false positives and false negatives of the evaluation dataset
label_pairs = list(zip(wrong_predictions2['true_label'], wrong_predictions2['predicted_label']))
# false positive: true label 0, predicted 1
false_positives = sum(1 for truth, guess in label_pairs if truth == 0 and guess == 1)
# false negative: true label 1, predicted 0
false_negatives = sum(1 for truth, guess in label_pairs if truth == 1 and guess == 0)

# Print false positives and false negatives
print("False positives:", false_positives)
print("False negatives:", false_negatives)

In [None]:
# Save the wrongly predicted examples to an Excel file
# NOTE(review): this writes wrong_predictions (test_data errors), not wrong_predictions2
# (evaluation-data errors) - confirm that this is the intended frame.
file_path = '/content/drive/MyDrive/Masterarbeit/BertTopicModel/falsch_vorhergesagte_beispiele.xlsx'
wrong_predictions.to_excel(file_path, index=False)

**Save model**

In [None]:
# save files of model written on colab to /content/output

import os
import tarfile

def save_model(model_path='',file_name=''):
  """Pack the files directly inside ``model_path`` into ``<file_name>.tar.gz``.

  Only the non-directory entries at the top level of ``model_path`` are added;
  sub-directories are not descended into. Each member is stored under the
  name ``<model_path>/<file>``.

  Args:
    model_path: Directory that contains the model files.
    file_name: Target archive path without the ``.tar.gz`` suffix.

  Raises:
    FileNotFoundError: If ``model_path`` cannot be listed as a directory
      (previously this surfaced as an opaque ``IndexError``).
  """
  # Only the first os.walk() entry (the directory itself) is needed, so stop
  # after it instead of walking the whole tree.
  top_level = next(os.walk(model_path), None)
  if top_level is None:
    raise FileNotFoundError(f'model directory not found: {model_path!r}')
  _, _, file_names = top_level
  # The archive is opened only after the directory check, so a bad path does
  # not leave an empty archive behind.
  with tarfile.open(file_name + '.tar.gz', 'w:gz') as archive:
    for entry in file_names:
      archive.add(f'{model_path}/{entry}')

In [None]:
save_model('outputs/checkpoint-5158-epoch-2','classifier_migrational_topic')

In [None]:
!tar -zxvf ./classifier_migrational_topic.tar.gz

outputs/checkpoint-5158-epoch-2/special_tokens_map.json
outputs/checkpoint-5158-epoch-2/eval_results.txt
outputs/checkpoint-5158-epoch-2/vocab.txt
outputs/checkpoint-5158-epoch-2/tokenizer.json
outputs/checkpoint-5158-epoch-2/config.json
outputs/checkpoint-5158-epoch-2/pytorch_model.bin
outputs/checkpoint-5158-epoch-2/optimizer.pt
outputs/checkpoint-5158-epoch-2/training_args.bin
outputs/checkpoint-5158-epoch-2/tokenizer_config.json
outputs/checkpoint-5158-epoch-2/model_args.json
outputs/checkpoint-5158-epoch-2/scheduler.pt


In [None]:
#!rm -rf outputs

In [None]:
import os
import tarfile

def unpack_model(model_name=''):
  """Extract ``<model_name>.tar.gz`` into the current working directory.

  Args:
    model_name: Archive path without the ``.tar.gz`` suffix.
  """
  # The context manager closes the archive even if extraction raises.
  # extractall() writes whatever member paths the archive contains, so only
  # use this on archives you created yourself (e.g. with save_model).
  with tarfile.open(f"{model_name}.tar.gz", "r:gz") as tar:
    tar.extractall()

unpack_model('classifier_migrational_topic')

In [None]:
# download files or move folder outputs to /content/drive/MyDrive to be able to download files
from google.colab import files
files.download("/content/outputs/config.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Load models and compare on evaluation dataset**

For Model Batch size = 16, No SW

In [None]:
# strategy for dealing with texts containing > 512 tokens: https://simpletransformers.ai/docs/classification-specifics/#dealing-with-long-text
train_args = {
    "reprocess_input_data": True,  # re-process the input data even if a cached file exists in cache_dir
    "overwrite_output_dir": True,  # save the trained model to output_dir, overwriting models already stored there
    "use_cached_eval_features": False,  # per the simpletransformers docs, False recomputes the evaluation features at every evaluation instead of reusing cached ones
    "output_dir": "outputs",  # directory for all outputs (model checkpoints and evaluation results)
    "fp16": True,  # True when a graphics card is available, otherwise False
    "max_seq_length": 512,  # maximum tokens per sequence; anything beyond is truncated (max value: 512)
    "sliding_window": False,  # do not use the sliding-window technique for sequences longer than 512 tokens
    "num_train_epochs": 15,  # upper bound on passes over the training data; early stopping is expected to end training sooner
    "train_batch_size": 16,  # samples processed before the model parameters are updated
    # use the following if the machine has not enough gpu ram for bigger batch sizes:
    # "gradient_accumulation_steps": 2,  # e.g. 16 batch size * 2 gradient accumulation = 32 effective batch size
    "use_early_stopping": True,  # prevent the model from overfitting
    "early_stopping_metric": 'eval_loss',  # metric watched by early stopping (alternative: e.g. mcc)
    "early_stopping_delta": 0.01,  # an evaluation only counts as an improvement if eval_loss gets better by more than 0.01
    # Fixed spelling: the library option is "early_stopping_metric_minimize"; the former
    # key "early_stopping_metric_minimze" is not the option the library reads.
    "early_stopping_metric_minimize": True,  # eval_loss should be minimized (note: mcc would have to be maximized!)
    "evaluate_during_training": True,  # evaluate during training to monitor the run and find the best model
    "evaluate_during_training_steps": 516,  # evaluate every 516 steps, i.e. about five times per epoch (steps_per_epoch/5)
    "early_stopping_patience": 10,  # stop after this many evaluations without an improvement greater than early_stopping_delta
    "evaluate_during_training_verbose": True,  # print results from evaluation during training
    "manual_seed": seed,  # for reproducible results
    "save_steps": -1,  # NOTE(review): -1 is understood to disable step-based checkpoint saving - confirm against the simpletransformers docs
}



# load the fine-tuned model (batch size 16, no sliding window) from the Google Drive outputs folder
model16NSW = ClassificationModel(
    "bert", "/content/drive/MyDrive/Masterarbeit/BertClassifierMig/outputs/FINALMig_B16NoSW",
    num_labels=2,
    args=train_args
)

In [None]:
# Now look at evaluation data
# NOTE(review): val_data is not defined in any cell visible in this notebook; it has to be
# loaded before this cell can run on a fresh kernel - confirm where it comes from.
val_inputs = list(val_data['Text'])

# Run the model's predictions on the evaluation texts
predictions, raw_outputs = model16NSW.predict(val_inputs)


In [None]:
# Build a DataFrame that pairs each evaluation text with its true and predicted label
results2 = pd.DataFrame(
    {
        'Text': val_data['Text'],
        'true_label': val_data['migration'],
        'predicted_label': predictions,
    }
)

# Keep only the rows the model predicted incorrectly
mismatch_mask2 = results2['true_label'].ne(results2['predicted_label'])
wrong_predictions2 = results2[mismatch_mask2]

NameError: ignored

In [None]:
wrong_predictions2

In [None]:
accuracy = 1 - (len(wrong_predictions2) / len(val_data))
accuracy

0.9580394922425952

In [None]:
# Count false positives and false negatives of the evaluation dataset
label_pairs = list(zip(wrong_predictions2['true_label'], wrong_predictions2['predicted_label']))
# false positive: true label 0, predicted 1
false_positives = sum(1 for truth, guess in label_pairs if truth == 0 and guess == 1)
# false negative: true label 1, predicted 0
false_negatives = sum(1 for truth, guess in label_pairs if truth == 1 and guess == 0)

# Print false positives and false negatives
print("False positives:", false_positives)
print("False negatives:", false_negatives)

False positives: 77
False negatives: 42


For Model Batch size = 32, No SW

In [None]:
# strategy for dealing with texts containing > 512 tokens: https://simpletransformers.ai/docs/classification-specifics/#dealing-with-long-text
train_args = {
    "reprocess_input_data": True,  # re-process the input data even if a cached file exists in cache_dir
    "overwrite_output_dir": True,  # save the trained model to output_dir, overwriting models already stored there
    "output_dir": "outputs",  # directory for all outputs (model checkpoints and evaluation results)
    "fp16": True,  # True when a graphics card is available, otherwise False
    "max_seq_length": 512,  # maximum tokens per sequence; anything beyond is truncated (max value: 512)
    "num_train_epochs": 15,  # upper bound on passes over the training data; early stopping is expected to end training sooner
    "train_batch_size": 16,  # samples processed per batch
    "gradient_accumulation_steps": 2,  # 16 batch size * 2 accumulation steps = effective batch size 32
    "use_early_stopping": True,  # prevent the model from overfitting
    "early_stopping_metric": 'eval_loss',  # metric watched by early stopping (alternative: e.g. mcc)
    "early_stopping_delta": 0.01,  # an evaluation only counts as an improvement if eval_loss gets better by more than 0.01
    # Fixed spelling: the library option is "early_stopping_metric_minimize"; the former
    # key "early_stopping_metric_minimze" is not the option the library reads.
    "early_stopping_metric_minimize": True,  # eval_loss should be minimized (note: mcc would have to be maximized!)
    "early_stopping_patience": 15,  # stop after this many evaluations without an improvement greater than early_stopping_delta
    "use_cached_eval_features": False,  # per the simpletransformers docs, False recomputes the evaluation features at every evaluation instead of reusing cached ones
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,  # print results from evaluation during training
    "evaluate_during_training_steps": 142,
    "sliding_window": False,
    "manual_seed": seed,
}


# Load the fine-tuned "batch size 32, no sliding window" model from Google Drive.
# The model is bound to model32NSW because that is the name the following prediction
# cell uses (it was bound to model16SW before, which left model32NSW undefined).
# NOTE(review): the folder name below still reads "FINALMig_B16SW" and the recorded run
# failed with an OSError on it - confirm the correct folder of the batch-size-32 model.
model32NSW = ClassificationModel(
    "bert", "/content/drive/MyDrive/Masterarbeit/BertClassifierMig/outputs/FINALMig_B16SW",
    num_labels=2,
    args=train_args
)

OSError: ignored

In [None]:
# Now look at evaluation data
# NOTE(review): val_data is not defined in any cell visible in this notebook; it has to be
# loaded before this cell can run on a fresh kernel - confirm where it comes from.
val_inputs = list(val_data['Text'])

# Run the model's predictions on the evaluation texts
predictions, raw_outputs = model32NSW.predict(val_inputs)


  0%|          | 0/2836 [00:00<?, ?it/s]

  0%|          | 0/355 [00:00<?, ?it/s]

In [None]:
# Build a DataFrame that pairs each evaluation text with its true and predicted label
results2 = pd.DataFrame(
    {
        'Text': val_data['Text'],
        'true_label': val_data['migration'],
        'predicted_label': predictions,
    }
)

# Keep only the rows the model predicted incorrectly
mismatch_mask2 = results2['true_label'].ne(results2['predicted_label'])
wrong_predictions2 = results2[mismatch_mask2]

In [None]:
wrong_predictions2

In [None]:
accuracy = 1 - (len(wrong_predictions2) / len(val_data))
accuracy

0.9555712270803949

In [None]:
# Count false positives and false negatives of the evaluation dataset
label_pairs = list(zip(wrong_predictions2['true_label'], wrong_predictions2['predicted_label']))
# false positive: true label 0, predicted 1
false_positives = sum(1 for truth, guess in label_pairs if truth == 0 and guess == 1)
# false negative: true label 1, predicted 0
false_negatives = sum(1 for truth, guess in label_pairs if truth == 1 and guess == 0)

# Print false positives and false negatives
print("False positives:", false_positives)
print("False negatives:", false_negatives)

# Predict data from Survey

In [None]:
# strategy for dealing with texts containing > 512 tokens: https://simpletransformers.ai/docs/classification-specifics/#dealing-with-long-text
train_args = {
    "reprocess_input_data": True,  # re-process the input data even if a cached file exists in cache_dir
    "overwrite_output_dir": True,  # save the trained model to output_dir, overwriting models already stored there
    "output_dir": "outputs",  # directory for all outputs (model checkpoints and evaluation results)
    "fp16": True,  # True when a graphics card is available, otherwise False
    "max_seq_length": 512,  # maximum tokens per sequence; anything beyond is truncated (max value: 512)
    "num_train_epochs": 15,  # upper bound on passes over the training data; early stopping is expected to end training sooner
    "train_batch_size": 16,  # samples processed before the model parameters are updated
    # "gradient_accumulation_steps": 2,
    "use_early_stopping": True,  # prevent the model from overfitting
    "early_stopping_metric": 'eval_loss',  # metric watched by early stopping (alternative: e.g. mcc)
    "early_stopping_delta": 0.01,  # an evaluation only counts as an improvement if eval_loss gets better by more than 0.01
    # Fixed spelling: the library option is "early_stopping_metric_minimize"; the former
    # key "early_stopping_metric_minimze" is not the option the library reads.
    "early_stopping_metric_minimize": True,  # eval_loss should be minimized (note: mcc would have to be maximized!)
    "early_stopping_patience": 15,  # stop after this many evaluations without an improvement greater than early_stopping_delta
    "use_cached_eval_features": False,  # per the simpletransformers docs, False recomputes the evaluation features at every evaluation instead of reusing cached ones
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,  # print results from evaluation during training
    "evaluate_during_training_steps": 284,
    "use_multiprocessing": False,  # !!! False needed for use with extended RAM in Google Colab otherwise the training process will not start
    "use_multiprocessing_for_evaluation": False,  # !!! False needed for use with extended RAM in Google Colab otherwise the training process will not start
    "sliding_window": False,
    "manual_seed": seed,
}


# load the fine-tuned model (batch size 16, no sliding window) from the Google Drive outputs folder
model16NSW = ClassificationModel(
    "bert", "/content/drive/MyDrive/Masterarbeit/BertClassifierMig/outputs/FINALMig_B16NoSW",
    num_labels=2,
    args=train_args
)

In [None]:
# Load the news articles and drop rows without text (this removes empty Excel rows)
news1 = pd.read_excel(
    "/content/drive/MyDrive/Masterarbeit/BertClassifierMig/News/all_news_clean.xlsx"
).dropna(subset=["text"])

In [None]:
# use the complete cleaned news data set (the sub-sampling line below is disabled)
news = news1
#news = news.sample(frac=0.1, random_state=42)
len(news)


140025

In [None]:
# The data is too big to process in one go: split it into two halves, run the
# notebook on the first 50%, free the RAM, run it on the second 50% and combine
# the results afterwards.

# Number of rows in total and in the first half
total_rows = len(news)
half_rows = int(total_rows * 0.5)

# Positional split into first and second half
first_half = news.iloc[:half_rows]
second_half = news.iloc[half_rows:]


# Show the result
#print(first_half)
print(second_half)

In [None]:
# choose which half to use (switch the two assignments below for the second run)
# NOTE: this re-binds `news`; re-running the split cell afterwards would split the chosen half again

news = first_half
#news = second_half
print(news)

In [None]:
# Predict the migration label for every news text
news_texts = news["text"].tolist()
predictions, _ = model16NSW.predict(news_texts)

# Alternative for the sliding-window model:
#predictions, _ = model16SW.predict(news["text"].tolist())

# Store the predictions next to the texts (shares the index of `news`)
results = pd.DataFrame({"text": news["text"], "migration": predictions})

# Attach the predicted label as a new "migration" column
news_categorised = pd.concat([news, results["migration"]], axis=1)

# print
print(news_categorised)

  0%|          | 0/8752 [00:00<?, ?it/s]

                                                link  \
0  https://abendblatt.de/archiv/2001/article20480...   
1  https://abendblatt.de/article233357889/Maenner...   
2  https://abendblatt.de/article233426519/feuerwe...   
3  https://abendblatt.de/article234092307/impfen-...   
4  https://abendblatt.de/hamburg-tipps/kinder/kin...   

                                                text  \
0  "Bettler bringt Jungpastoren das Betteln bei",...   
1  Hamburg. Gut einen Monat, nachdem zwei unbekan...   
2  Hamburg. In der Nacht zum Sonntag musste die F...   
3  Hamburg. Lange Warteschlangen vor Impfzentren,...   
4  Blau, Weiß, Schwarz, das sind die Farben des H...   

                                               title  \
0                                      "LESERBRIEFE"   
1  Täter zünden Obdachlosen an: Polizei sucht bes...   
2  Großbrand zerstört Lagerhalle – Feuerwehr im D...   
3  Hier gibt es noch Termine für Erst- und Booste...   
4               Wie entstand die bekannte HSV-

In [None]:
# Create a new column "tags_migration": 1 if the "tags" value contains the substring "migra", 0 otherwise
# (missing tags count as 0). The match is case-insensitive so that capitalised German tags such as
# "Migration" or "Migranten" are found too; the former case-sensitive check could not match them.
tags_lower = news_categorised["tags"].fillna("").astype(str).str.lower()
news_categorised["tags_migration"] = tags_lower.str.contains("migra", regex=False).astype(int)

# Print the first rows of the data set with the new column
print(news_categorised.head())

In [None]:
# Create new column "identical": 1 where "migration" (model prediction) and "tags_migration"
# (label derived from the tags) hold the same value, 0 otherwise.
# Vectorised comparison instead of a row-wise apply, which is slow on a frame of this size.
news_categorised['identical'] = (news_categorised['migration'] == news_categorised['tags_migration']).astype(int)


In [None]:
# Select the rows in which "tags_migration" has the value 1
rows_with_tags_migration_1 = news_categorised[news_categorised["tags_migration"] == 1]

# Print the result
print(rows_with_tags_migration_1)

Empty DataFrame
Columns: [link, text, title, lead, tags, source, language, is_alt_news, word_count, migration, tags_migration, identical]
Index: []


In [None]:
# Select the rows in which "migration" and "tags_migration" do not agree
mismatched_rows = news_categorised[news_categorised["migration"] != news_categorised["tags_migration"]]

# Print the result
print(mismatched_rows)

In [None]:
# Count false positives / false negatives of the model prediction ("migration"),
# taking the tag-based label ("tags_migration") as reference.
# Vectorised counts replace the former row-by-row iterrows() loop over the whole frame.
predicted = news_categorised["migration"]
tagged = news_categorised["tags_migration"]

# false negative: predicted 0 although the tags say 1
false_negatives = int(((predicted == 0) & (tagged == 1)).sum())
# false positive: predicted 1 although the tags say 0
false_positives = int(((predicted == 1) & (tagged == 0)).sum())

# Print false positives and false negatives
print("False positives:", false_positives)
print("False negatives:", false_negatives)

False positives: 2562
False negatives: 0


In [None]:
# Save the mismatching rows; which line is active depends on which half is being processed
# (file suffix 1 for the first run, 2 for the second)
mismatched_rows.to_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierMig/News/missmatches_tags_pred1.xlsx", index=False)
#mismatched_rows.to_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierMig/News/missmatches_tags_pred2.xlsx", index=False)

In [None]:
# Keep just the migration articles
# (all rows with predicted migration = 1)
news_categorised_justMig = news_categorised[news_categorised["migration"] == 1]

# Print the result
print(news_categorised_justMig)

In [None]:
# Save the migration articles of the half that was processed in this run
# which line is active depends on which half is being processed (file suffix 1 or 2)
news_categorised_justMig.to_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierMig/News/newsMig_categorised1.xlsx", index=False)
#news_categorised_justMig.to_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierMig/News/newsMig_categorised2.xlsx", index=False)

In [None]:
# Combine both halves into the final set
# Read the two per-half Excel files
df1 = pd.read_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierMig/News/newsMig_categorised1.xlsx")
df2 = pd.read_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierMig/News/newsMig_categorised2.xlsx")

# Concatenate the data frames (with a fresh, continuous index)
news_categorised_final_justMig = pd.concat([df1, df2], ignore_index=True)

# Write the combined set of migration articles to Excel
news_categorised_final_justMig.to_excel("/content/drive/MyDrive/Masterarbeit/BertClassifierMig/News/news_categorised_final_justMig.xlsx", index=False)