In [5]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

In [6]:
# Root directory passed to output_and_store_endmodel_results; the results table and
# the per-sample prediction CSV are written below it.
output_path = '/pfs/data5/home/ma/ma_ma/ma_dschweim/Thesis/'
# Model name of the current run; stored in the results table and used in the
# predictions filename.
classifier = 'BERT'

In [15]:
import datetime
import os
def output_and_store_endmodel_results(output_path, classifier, feature, Y_test, Y_pred, X_test, hyperparameters):
    """
    Print the test metrics and persist them together with the per-sample predictions.

    Two CSV files are written:

    * ``<output_path>/Results/End_Model/results.csv`` -- one row per (model, vectorization)
      pair. If the file already exists, the new row is merged into it and replaces an
      older row with the same (model, vectorization) key.
    * ``<output_path>/<classifier>_<feature>_preds.csv`` -- individual predictions with
      the corresponding sample information.

    Paths are built with ``os.path.join`` so that the nested directories are used on
    every platform (hard-coded backslashes are not path separators on POSIX systems).
    The results directory is created if it does not exist yet.

    The metric functions ``accuracy_score``, ``precision_score``, ``recall_score`` and
    ``f1_score`` (``sklearn.metrics``) must be imported in the notebook before this
    function is called.

    :param output_path: path to data output
    :type output_path: str
    :param classifier: model used in current run
    :type classifier: str
    :param feature: vectorization used in current run
    :type feature: str
    :param Y_test: ground truth labels of test set (a plain list is accepted for
        ``classifier == 'BERT'``; otherwise an object providing ``astype``)
    :type Y_test: pd.Series or list
    :param Y_pred: predicted labels of test set
    :type Y_pred: np.ndarray
    :param X_test: test set; must provide the columns ``content``, ``POPULIST_PeopleCent``,
        ``POPULIST_AntiElite``, ``POPULIST_Sovereign``, ``Sample_Country`` and ``Sample_Type``
    :type X_test: pd.DataFrame
    :param hyperparameters: hyperparameters retrieved from model
    :type hyperparameters: dict
    :return: None
    :rtype: None
    """

    # Compute every metric exactly once and reuse it for console output and storage
    accuracy = accuracy_score(Y_test, Y_pred)
    precision = precision_score(Y_test, Y_pred)
    recall = recall_score(Y_test, Y_pred)
    f1 = f1_score(Y_test, Y_pred, average='binary')

    print('---------------------------------------')
    print(f"Model: {classifier}, Feature: {feature}")
    print(f"Model Test Accuracy: {accuracy}")
    print(f"Model Test Precision: {precision}")
    print(f"Model Test Recall: {recall}")
    print(f"Model Test F1: {f1}")

    # Save results
    timestamp = datetime.datetime.now().strftime("%d-%m-%Y %H-%M-%S")

    results_df = pd.DataFrame({'model': [classifier],
                               'vectorization': [feature],
                               'hyperparameters': [hyperparameters],
                               'accuracy': [accuracy],
                               'precision': [precision],
                               'recall': [recall],
                               'f1': [f1],
                               'timestamp': [timestamp]
                               })
    results_df = results_df.set_index(['model', 'vectorization'])

    results_dir = os.path.join(output_path, 'Results', 'End_Model')
    os.makedirs(results_dir, exist_ok=True)
    results_file = os.path.join(results_dir, 'results.csv')

    # If results file exists, merge previous results into the new ones
    if os.path.isfile(results_file):
        prev_results = pd.read_csv(results_file, index_col=['model', 'vectorization'])

        # New row first: duplicated() marks later occurrences, so the newest run wins
        results_df = pd.concat([results_df, prev_results])

        # only keep newest run
        results_df = results_df[~results_df.index.duplicated()].sort_index()

    results_df.to_csv(results_file)

    # For BERT the labels are stored as passed in; for all other models they are
    # cast to int first.
    y_true = Y_test if classifier == 'BERT' else Y_test.astype(int)

    # Save individual predictions and corresponding information
    model_preds = pd.DataFrame({'Content': X_test.content,
                                'POPULIST_PeopleCent': X_test.POPULIST_PeopleCent,
                                'POPULIST_AntiElite': X_test.POPULIST_AntiElite,
                                'POPULIST_Sovereign': X_test.POPULIST_Sovereign,
                                'Country': X_test.Sample_Country,
                                'Category': X_test.Sample_Type,
                                'Y_test': y_true,
                                'Y_pred': Y_pred})

    model_preds.to_csv(os.path.join(output_path, f'{classifier}_{feature}_preds.csv'))


In [8]:
class Dataset(torch.utils.data.Dataset):
    """
    Torch dataset wrapping tokenizer encodings and (optionally) labels.

    :param encodings: mapping from feature name to an indexable collection holding one
        entry per sample; must contain the key ``"input_ids"``
    :param labels: indexable collection with one label per sample (list, numpy array, ...),
        or None / empty for unlabeled data such as a test set
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features (and the label under ``"labels"``, if available) of sample ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of truthiness: ``if labels:`` raises for
        # multi-element numpy arrays, pandas Series and tensors.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the ``"input_ids"`` encoding."""
        return len(self.encodings["input_ids"])


In [20]:
def run_transformer(X, y, X_test, model_name, val_size=0.2, seed=42):
    """
    Fine-tune a pre-trained transformer for binary classification and predict test labels.

    The labeled data is split into a training and a validation part, tokenized, and
    used to fine-tune ``model_name`` with the default ``TrainingArguments``. The
    fine-tuned model is then applied to ``X_test``.

    :param X: training texts
    :type X: list of str
    :param y: training labels (one per text in ``X``)
    :type y: list of int
    :param X_test: test texts to predict labels for
    :type X_test: list of str
    :param model_name: name or path of the pre-trained model passed to ``from_pretrained``
    :type model_name: str
    :param val_size: share of the labeled data held out as validation set
    :type val_size: float
    :param seed: seed used for the train/validation split and passed to the Trainer,
        so that repeated runs give the same split
    :type seed: int
    :return: predicted test labels and the training settings that were used
    :rtype: tuple(np.ndarray, dict)
    """

    # Instantiate Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Hold out a validation set (test data is encoded later); seeded for reproducibility
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=seed)

    # Tokenize
    X_train_tokenized = tokenizer(X_train, padding='max_length', truncation=True)
    X_val_tokenized = tokenizer(X_val, padding='max_length', truncation=True)

    # Create torch dataset
    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)

    # Instantiate model
    transformer_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Define Trainer
    # The Trainer moves the model to the GPU itself if one is available, so no manual
    # device handling is needed here.
    # NOTE(review): with default TrainingArguments no periodic evaluation appears to be
    # run, so val_dataset is only used if trainer.evaluate() is called -- confirm intent.
    training_args = TrainingArguments("test_trainer", seed=seed)
    trainer = Trainer(
        model=transformer_model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset
    )

    # Train pre-trained model
    trainer.train()

    # ----- 3. Predict -----#
    # Load test data
    X_test_tokenized = tokenizer(X_test, padding='max_length', truncation=True)

    # Create torch dataset (no labels)
    test_dataset = Dataset(X_test_tokenized)

    # Make prediction
    raw_pred = trainer.predict(test_dataset)

    # Preprocess raw predictions: class with the highest logit per sample
    y_pred = raw_pred.predictions.argmax(-1)

    # Report the training settings that were used (no hyperparameter search is performed)
    hyperparameters = {
        "batch_size": trainer.args.per_device_train_batch_size,
        "learning_rate": trainer.args.learning_rate,
        "num_epochs": trainer.args.num_train_epochs}

    return y_pred, hyperparameters


In [21]:
# Read the labeled training split and the test split from disk
train = pd.read_csv('/pfs/data5/home/ma/ma_ma/ma_dschweim/Thesis/datasets/labeled_df_train_threshold_None.csv')
test = pd.read_csv('/pfs/data5/home/ma/ma_ma/ma_dschweim/Thesis/datasets/labeled_df_test.csv')

# Ground-truth labels of the test split
Y_test = test['label'].tolist()

# Fine-tune the German BERT checkpoint on the training split and predict the test labels
Y_pred, hyperparameters = run_transformer(
    X=train['content'].tolist(),
    y=train['label'].tolist(),
    X_test=test['content'].tolist(),
    model_name='bert-base-german-cased',
)

loading configuration file https://huggingface.co/bert-base-german-cased/resolve/main/config.json from cache at /home/ma/ma_ma/ma_dschweim/.cache/huggingface/transformers/98877e98ee76b3977d326fe4f54bc29f10b486c317a70b6445ac19a0603b00f0.1f2afedb22f9784795ae3a26fe20713637c93f50e2c99101d952ea6476087e5e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading file https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt from cache at /home/ma/ma_m

Step,Training Loss
500,0.0397


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 692
  Batch size = 8


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute the test metrics first, then print the results
test_accuracy = accuracy_score(Y_test, Y_pred)
test_precision = precision_score(Y_test, Y_pred)
test_recall = recall_score(Y_test, Y_pred)
test_f1 = f1_score(Y_test, Y_pred, average='binary')

print(f"Model Test Accuracy: {test_accuracy}")
print(f"Model Test Precision: {test_precision}")
print(f"Model Test Recall: {test_recall}")
print(f"Model Test F1: {test_f1}")

Model Test Accuracy: 0.3208092485549133
Model Test Precision: 0.3183139534883721
Model Test Recall: 0.9954545454545455
Model Test F1: 0.48237885462555063


In [23]:
# Print the test metrics of the current run and write them, together with the
# per-sample predictions, to CSV files below output_path.
# feature='-' is presumably a placeholder because no separate vectorization step
# is used for the transformer model -- confirm.
output_and_store_endmodel_results(output_path=output_path, classifier=classifier, feature='-',
                                  Y_test=Y_test, Y_pred=Y_pred, X_test=test,
                                  hyperparameters=hyperparameters)

---------------------------------------
Model: BERT, Feature: -
Model Test Accuracy: 0.3208092485549133
Model Test Precision: 0.3183139534883721
Model Test Recall: 0.9954545454545455
Model Test F1: 0.48237885462555063
