In [1]:
!pip install transformers
!pip install datasets
!pip install sklearn
!pip install scikit-multilearn

Collecting transformers
  Using cached transformers-4.20.1-py3-none-any.whl (4.4 MB)
Collecting filelock
  Using cached filelock-3.7.1-py3-none-any.whl (10 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Using cached tokenizers-0.12.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Collecting regex!=2019.12.17
  Using cached regex-2022.7.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (765 kB)
Installing collected packages: filelock, tokenizers, regex, huggingface-hub, transformers
Successfully installed filelock-3.7.1 huggingface-hub-0.8.1 regex-2022.7.9 tokenizers-0.12.1 transformers-4.20.1
You should consider upgrading via the '/opt/conda/bin/python3.8 -m pip install --upgrade pip' command.[0m
Collecting datasets
  Using cached datasets-2.3.2-py3-none-any.whl (362 kB)
Collecting xxhash
  Using cached xxhash-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manyl

In [2]:
import transformers
import datasets
import sklearn
import pandas as pd
import numpy as np

### Annotationslabels encodieren

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

def encode(labels):
    """Binarise the annotation labels of a dataset.

    The classification problem is inferred from the set of label values that
    occur in ``labels``: that set must be equal to exactly one of the known
    label inventories defined below.

    Input:
        labels: annotation labels, one entry per example. Either strings for a
            two-class problem, e.g. ["KEINE", "KEINE", "VVH", "KEINE", ...],
            or lists of strings for a multi-label problem, e.g.
            [["KeineGruppe"], ["Politische Einstellung", "Geschlecht"], ...]

    Output: tuple (labels_encoded, namen)
        labels_encoded: for two classes a 1-D integer array (1 = phenomenon
            present, 0 = absent); otherwise the indicator matrix produced by
            MultiLabelBinarizer with one column per entry of ``namen``.
        namen: the matched label inventory, i.e. the class names in column order.

    Raises:
        ValueError: if ``labels`` is empty or its label set matches none of the
            known inventories (previously this silently produced a meaningless
            encoding with an empty class list).
    """

    # Known label inventories
    labels_vvh = ["KEINE", "VVH"]
    labels_gruppe = ["KeineGruppe", "Gruppe"]
    labels_handlung = ["KeineHandlung", "Handlung"]
    labels_gruppe_det = ["KeineGruppe", "Nationalit√§t", 'ethnische Herkunft / "Rasse"', "Religion / Weltanschauung",
        "Politische Einstellung", "Geschlecht", "Anderes Merkmal"]
    labels_handlung_det = ["KeineHandlung", "Aufstachelung zu Hass", "Aufforderung zu Gewalt- oder Willk√ºrma√ünahmen", "Angriff der Menschenw√ºrde"]
    labels_comb = labels_gruppe_det + labels_handlung_det
    label_namen = [labels_vvh, labels_gruppe, labels_handlung, labels_gruppe_det, labels_handlung_det, labels_comb]

    labels = list(labels)
    if len(labels) == 0:
        raise ValueError("encode() needs at least one label to identify the classification problem")

    # Determine the set of labels that actually occur
    if isinstance(labels[0], str):
        # Case 1: plain strings (binary problem)
        labels_flat = set(labels)
    else:
        # Case 2: lists of labels (multi-label problem)
        labels_flat = {label for entry in labels for label in entry}

    # Find the inventory whose label set equals the observed one
    namen = next((kandidat for kandidat in label_namen if set(kandidat) == labels_flat), None)
    if namen is None:
        raise ValueError(
            "Label set %r does not match any known classification problem" % sorted(labels_flat)
        )

    # Labels that are encoded as 1 in the two-class case
    eins = ["NEG", "VVH", "Gruppe", "Handlung"]
    if len(namen) == 2:
        labels_eins = np.array([1 if label in eins else 0 for label in labels])
        return (labels_eins, namen)

    # More than two classes: one indicator column per class, in the order of namen
    binarizer = MultiLabelBinarizer(classes=namen)
    label_array = binarizer.fit_transform(labels)
    return (label_array, namen)

### Stratifizierter Train/Test-Split

In [4]:
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split,IterativeStratification
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from collections import Counter

def train_test_split_multilabel(daten, ziele, test_size=0.3):
    """Stratified train/test split for multi-label targets.

    The multi-label stratifier (http://scikit.ml/stratification.html) expects
    an (examples x features) array. Features are only computed after the
    split, so the index values of ``daten`` (the corpus IDs) are passed in
    their place and the tweets are looked up by ID afterwards.

    Input: daten (DataFrame with a "tweet" column), ziele (label matrix passed
           through to the stratifier), test_size (share of the test set)
    Output: ((X_train_tweets, X_test_tweets, y_train, y_test), (X_train, X_test))
            where X_train / X_test are the ID arrays returned by the stratifier
    """
    # One single-element row per example, holding that example's corpus ID
    ids = np.array([np.array([corpus_id]) for corpus_id in daten.index])

    ids_train, y_train, ids_test, y_test = iterative_train_test_split(ids, ziele, test_size=test_size)

    def tweets_for(id_rows):
        # Map each ID row back to the tweet text stored under that index value
        return [daten.loc[row[0]]["tweet"] for row in id_rows]

    tweets_train = tweets_for(ids_train)
    tweets_test = tweets_for(ids_test)

    return ((tweets_train, tweets_test, y_train, y_test), (ids_train, ids_test))


def split(daten, ziele, labels, test_size=0.3, random_state=36):
    """Stratified train/test split for binary or multi-label targets.

    Input: daten (Pandas DataFrame with an index and a "tweet" column),
           ziele (target annotation in binary format),
           labels (class names; exactly two names select the binary split),
           test_size (share of the data used for testing),
           random_state (seed for the binary split; the multi-label split
                         does not take a seed)
    Output: (X_train, y_train), (X_test, y_test)
            Only the split is performed here; feature extraction and
            normalisation happen later in the pipelines.
    """

    if len(labels) == 2:
        # Two classes: scikit-learn's stratified split on the tweet column
        X_train, X_test, y_train, y_test = train_test_split(
            daten["tweet"], ziele, test_size=test_size, random_state=random_state, stratify=ziele
        )
    else:
        # More than two classes: iterative multi-label stratification;
        # only the (tweets, targets) part of its result is needed here
        X_train, X_test, y_train, y_test = train_test_split_multilabel(daten, ziele, test_size=test_size)[0]

    return (X_train, y_train), (X_test, y_test)


### Tokenizer für das Preprocessing laden

In [5]:
from transformers import AutoTokenizer

# Tokenizer for the scikit-learn preprocessing: no padding
tokenizer = AutoTokenizer.from_pretrained('deepset/gelectra-large', truncation=True, padding=False)

def bert_tokenize(inputs):
    """Tokenize a string with the module-level ``tokenizer``.

    Output: list of tokens, obtained by converting the "input_ids" of the
            encoding back to their token strings
    """
    encoding = tokenizer(inputs)
    return tokenizer.convert_ids_to_tokens(encoding["input_ids"])

# Tokenizer for the BERT pipelines.
# truncation / padding / return_tensors are arguments of the tokenizer *call*;
# passing them to from_pretrained() does not configure later calls, so they are
# not passed here. Truncation is requested in preprocess(); padding is done per
# batch by the DataCollatorWithPadding that is built from this tokenizer.
tokenizer_bert = AutoTokenizer.from_pretrained('bert-base-german-cased')

def preprocess(data):
    """Tokenize the "text" field of a dataset batch for BERT.

    Input: data (mapping with a "text" entry, as handed over by Dataset.map)
    Output: the tokenizer encoding; inputs longer than the model's maximum
            length are truncated so they cannot exceed the position embeddings
    """
    return tokenizer_bert(data["text"], truncation=True)

In [None]:
# TODO: remove this scratch cell (manual check of the tokenizer on lower-cased example tweets)
tweet1 = "@user Das meiste geht eh wieder f√ºr die Asylanten drauf ü§Æ"
tweet2 = "@user Die ganze Bande muss weg!"
tweet1 = tweet1.lower()
tweet2 = tweet2.lower()
print(tweet2)
tweettoks = tokenizer(tweet2)
print(tokenizer.convert_ids_to_tokens(tweettoks["input_ids"]))

### Evaluationsmetriken vorbereiten

In [6]:
from sklearn import metrics
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve, multilabel_confusion_matrix, matthews_corrcoef

def eval(y_test, predicted, labels):
    '''Compute the evaluation metrics for one vector of predictions.

    Note: this name shadows Python's built-in eval() within the notebook.

    Input: y_test (true labels), predicted (predicted labels),
           labels (class names; currently not passed on to the metrics)
    Output: dict with the keys "f1", "rep" (classification report),
            "confusion" (confusion matrix) and "mcc"; every metric is called
            with its scikit-learn defaults
    '''
    # TODO: add the remaining metrics (precision, recall, accuracy)
    return {
        "f1": metrics.f1_score(y_test, predicted),
        "rep": metrics.classification_report(y_test, predicted),  # , labels=labels)
        "confusion": metrics.confusion_matrix(y_test, predicted),  # , labels=labels)
        "mcc": metrics.matthews_corrcoef(y_test, predicted),
    }

def eval_tofile(evaldata, labels, path):
    """Write the labels and the evaluation data to a UTF-8 text file.

    The first line holds str(labels); str(evaldata) follows without a
    trailing newline. An existing file at ``path`` is overwritten.

    Output: always True
    """
    with open(path, mode="w", encoding="utf-8") as outfile:
        # print() writes str(labels) followed by a single newline
        print(labels, file=outfile)
        outfile.write(str(evaldata))
    return True

### Klassifikationsmethode Nr. 1: Logistische Regression

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

def features_tfidf(X_train, X_test):
    '''Turn raw texts into scaled TF-IDF feature matrices.

    Vectorizer and scaler are fitted on the training texts only and then
    applied to the test texts.

    Input: X_train, X_test (texts of the training and the test split)
    Output: (X_train_features, X_test_features) after TF-IDF extraction and
            max-abs scaling
    '''
    # Preprocessing and feature selection.
    # Other parameters that could be tried: min_df = 2, a different max_features
    vectorizer_options = dict(
        analyzer='word',
        ngram_range=(1, 2),
        lowercase=True,
        tokenizer=bert_tokenize,
        strip_accents='unicode',
        max_features=1000,
        sublinear_tf=True,  # replace tf with 1 + log(tf)
    )
    vectorizer = TfidfVectorizer(**vectorizer_options)
    # Normalisation
    scaler = preprocessing.MaxAbsScaler()

    train_features = scaler.fit_transform(vectorizer.fit_transform(X_train))
    test_features = scaler.transform(vectorizer.transform(X_test))

    return train_features, test_features


def pipeline(train, test, labels, save=False):
    '''Train and evaluate a logistic regression on TF-IDF features (two classes).

    Input: train, test (each a pair (texts, targets)),
           labels (class names; labels[1] is used in the model file name),
           save (when True, the fitted model is dumped below ../models/)
    Output: evaluation dict as returned by eval()
    '''
    train_texts, test_texts = train[0], test[0]
    X_train, X_test = features_tfidf(train_texts, test_texts)
    y_train, y_test = train[1], test[1]

    # Training
    model = LogisticRegression()
    model.fit(X_train, y_train)
    if save == True:
        model_path = f'../models/lr-{labels[1]!s}.joblib'
        dump(model, model_path)

    return eval(y_test, model.predict(X_test), labels)


def multilabel_pipeline(train, test, labels, save=False):
    """Multi-label classification with one logistic regression per class.

    For every class a separate binary model is trained on the shared TF-IDF
    features and evaluated on the matching column of the target matrix. The
    per-class results and the multi-label confusion matrix are always written
    to a text file below ../Ergebnisse/.

    Input: train, test (each a pair (texts, target matrix)),
           labels (class names, one per column of the target matrices),
           save (when True, every fitted model is dumped below ../models/)
    Output: (eval_gsmmlt, confusion, ergebnisse_gsmmlt), i.e. the per-class
            "mcc" and "f1" lists, the multi-label confusion matrices and the
            list of per-class evaluation dicts
    """
    # 1. Shared features; then one train/evaluate run per class
    X_train, X_test = features_tfidf(train[0], test[0])
    y_train, y_test = train[1], test[1]

    # Results of the i-th class are stored at position i
    ergebnisse_gsmmlt = []
    predicted_gsmmlt = []
    for i, label in enumerate(labels):
        # Label list adapted to the current class
        labels_i = ["Keine", label]
        y_train_i, y_test_i = y_train[:, i], y_test[:, i]

        # Training
        model = LogisticRegression()
        model.fit(X=X_train, y=y_train_i)
        if save == True:
            dump(model, f'../models/lr-{labels[0][5:]!s}{i!s}.joblib')

        predicted = model.predict(X_test)
        ergebnisse_gsmmlt.append(eval(y_test_i, predicted, labels=labels_i))
        predicted_gsmmlt.append(predicted)

    # 2. Evaluate the collected predictions:
    #    matrix of shape entries (rows) x classes (columns)
    predicted_gsmmlt = np.array(predicted_gsmmlt).T
    confusion = multilabel_confusion_matrix(y_test, predicted_gsmmlt)  # , labels=labels)

    eval_gsmmlt = {
        "mcc": [ergebnis["mcc"] for ergebnis in ergebnisse_gsmmlt],
        "f1": [ergebnis["f1"] for ergebnis in ergebnisse_gsmmlt],
    }

    ergebnis_pfad = '../Ergebnisse/Ergebnisse-lr-' + str(labels[0][5:]) + ".txt"
    eval_tofile((ergebnisse_gsmmlt, confusion), labels, ergebnis_pfad)

    return eval_gsmmlt, confusion, ergebnisse_gsmmlt

### Klassifikationsmethode Nr. 2: Transfer Learning

In [8]:
from datasets import load_dataset, Dataset
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


# Batch collator built from tokenizer_bert; shared by all BERT pipelines below
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_bert)

def pipeline_bert(train, test, labels, save=False):
    '''Two-class classification by fine-tuning 'bert-base-german-cased'.

    Input: train, test (each a pair (texts, targets)),
           labels (class names; labels[1] is used in the model directory name),
           save (when True, the fine-tuned model is stored below ../models/)
    Output: evaluation dict as returned by eval(), computed on the test split
    '''
    X_train, y_train = train
    X_test, y_test = test

    def tokenized_dataset(texts, targets):
        # Build a Dataset, tokenize it in batches and drop the raw text column
        dataset = Dataset.from_dict({"text": texts, "labels": targets})
        return dataset.map(preprocess, batched=True).remove_columns("text")

    train_tokenized = tokenized_dataset(X_train, y_train)
    test_tokenized = tokenized_dataset(X_test, y_test)

    # Fine-tuning
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-german-cased', num_labels=2)

    training_options = dict(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        evaluation_strategy='no',
    )
    trainer = Trainer(
        model=model,
        args=TrainingArguments(**training_options),
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        tokenizer=tokenizer_bert,
        data_collator=data_collator,
    )
    trainer.train()

    if save == True:
        model.save_pretrained(save_directory=f'../models/bert-{labels[1]!s}')

    # The first element of the prediction output holds the model outputs per
    # class; the predicted class is the arg-max over the last axis
    pred = trainer.predict(test_dataset=test_tokenized)
    predicted = np.argmax(pred[0], axis=-1)

    return eval(y_test, predicted, labels)


def multilabel_pipeline_bert(train, test, labels, save=False):
    '''Multi-label classification with one fine-tuned BERT model per class.

    For every class a fresh 'bert-base-german-cased' model is fine-tuned on
    the matching column of the target matrix and evaluated on the test split.
    The per-class results and the multi-label confusion matrix are always
    written to a text file below ../Ergebnisse/.

    Input: train, test (each a pair (texts, target matrix)),
           labels (class names, one per column of the target matrices),
           save (when True, every fine-tuned model is stored below ../models/)
    Output: (eval_gsmmlt, confusion, ergebnisse_gsmmlt), i.e. the per-class
            "mcc" and "f1" lists, the multi-label confusion matrices and the
            list of per-class evaluation dicts
    '''
    X_train, y_train = train
    X_test, y_test = test

    def tokenized_dataset(texts, targets):
        # Build a Dataset, tokenize it in batches and drop the raw text column
        dataset = Dataset.from_dict({"text": texts, "labels": targets})
        return dataset.map(preprocess, batched=True).remove_columns("text")

    train_tokenized = tokenized_dataset(X_train, y_train)
    test_tokenized = tokenized_dataset(X_test, y_test)

    # Results of the i-th class are stored at position i
    ergebnisse_gsmmlt = []
    predicted_gsmmlt = []
    for i, label in enumerate(labels):
        # Label list adapted to the current class
        labels_i = ["Keine", label]

        # Replace the previous "labels" column with the i-th column of the label matrix
        train_tokenized = train_tokenized.remove_columns("labels").add_column("labels", y_train[:, i])
        test_tokenized = test_tokenized.remove_columns("labels").add_column("labels", y_test[:, i])

        # Fine-tuning
        model = AutoModelForSequenceClassification.from_pretrained('bert-base-german-cased', num_labels=2)

        training_options = dict(
            output_dir="./results",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=10,
            weight_decay=0.01,
            evaluation_strategy='no',
        )
        trainer = Trainer(
            model=model,
            args=TrainingArguments(**training_options),
            train_dataset=train_tokenized,
            eval_dataset=test_tokenized,
            tokenizer=tokenizer_bert,
            data_collator=data_collator,
        )
        trainer.train()

        if save == True:
            model.save_pretrained(save_directory=f'../models/bert-{labels[0][5:]!s}{i!s}')

        pred = trainer.predict(test_dataset=test_tokenized)
        predicted = np.argmax(pred[0], axis=-1)

        ergebnisse_gsmmlt.append(eval(y_test[:, i], predicted, labels=labels_i))
        predicted_gsmmlt.append(predicted)

    # 2. Evaluate the collected predictions:
    #    matrix of shape entries (rows) x classes (columns)
    predicted_gsmmlt = np.array(predicted_gsmmlt).T
    confusion = multilabel_confusion_matrix(y_test, predicted_gsmmlt)  # , labels=labels)

    eval_gsmmlt = {
        "mcc": [ergebnis["mcc"] for ergebnis in ergebnisse_gsmmlt],
        "f1": [ergebnis["f1"] for ergebnis in ergebnisse_gsmmlt],
    }

    ergebnis_pfad = '../Ergebnisse/Ergebnisse-bert-' + str(labels[0][5:]) + ".txt"
    eval_tofile((ergebnisse_gsmmlt, confusion), labels, ergebnis_pfad)

    return eval_gsmmlt, confusion, ergebnisse_gsmmlt

### Entscheidungsbaum: Klassifikation Volksverhetzung Ja/Nein

In [37]:
def _predict_any_positive(model_paths, test_tokenized):
    """Return per test example 1 if any of the given binary models predicts class 1, else 0.

    Input: model_paths (paths of saved two-class sequence classification models),
           test_tokenized (tokenized test dataset without the "text" column)
    Output: 1-D array with one 0/1 entry per test example
    Raises: ValueError if model_paths is empty
    """
    if len(model_paths) == 0:
        raise ValueError("at least one model path is required")

    per_model = []
    for model_path in model_paths:
        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

        # The Trainer is only used for prediction here, so only the
        # evaluation-related arguments are set
        training_args = TrainingArguments(
            output_dir="./results",
            per_device_eval_batch_size=16,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            tokenizer=tokenizer_bert,
            data_collator=data_collator,
        )

        pred = trainer.predict(test_dataset=test_tokenized)
        per_model.append(np.argmax(pred[0], axis=-1))

    # Rows = examples, columns = models; the row maximum is 1 as soon as one model fires
    return np.array(per_model).transpose().max(axis=1)


def vvh_entscheidungsbaum(modelle_gruppe, modelle_handlung, test, labels):
    """Decision tree for the VVH yes/no classification based on saved BERT models.

    A tweet is predicted as positive (1) only if at least one of the group
    models AND at least one of the action models predicts class 1 for it.
    The evaluation is also written to '../Ergebnisse/Ergebnisse-VVH-BERT.txt'.

    Input: modelle_gruppe, modelle_handlung (paths of the saved binary models),
           test (pair (texts, binary target labels)),
           labels (class names, passed on to the evaluation)
    Output: evaluation dict as returned by eval()
    Raises: ValueError if one of the model lists is empty
    """
    X_test, y_test = test

    test_dataset = Dataset.from_dict({"text": X_test})
    test_tokenized = test_dataset.map(preprocess, batched=True)
    test_tokenized = test_tokenized.remove_columns("text")

    # Group yes/no and action yes/no, each derived from the respective set of models
    predicted_gruppe = _predict_any_positive(modelle_gruppe, test_tokenized)
    predicted_hndl = _predict_any_positive(modelle_handlung, test_tokenized)

    # Combine both decisions: 1 only if both are 1
    predicted_vvh = [
        1 if hndl + gruppe == 2 else 0
        for hndl, gruppe in zip(predicted_hndl, predicted_gruppe)
    ]

    # Evaluation
    evaluation = eval(y_test, predicted_vvh, labels=labels)
    eval_tofile(evaluation, labels, '../Ergebnisse/Ergebnisse-VVH-BERT.txt')

    return evaluation


### Datensatz laden

In [10]:
# Paths ending in _off point to a local (Windows-style) copy of the corpus and
# are not read below; raw strings keep their backslashes literal instead of
# relying on invalid escape sequences such as "\K". Paths ending in _on are
# the ones that are loaded.
vvh_off = r"..\Korpora\Referenzdatensatz_HateSpeech_Deutsch\HateSpeechDe_HATE_VVH.json"
vvh_on = "../data_bert/HateSpeechDe_HATE_VVH.json"

gruppen_off = r"..\Korpora\Referenzdatensatz_HateSpeech_Deutsch\HateSpeechDe_HATE_GruppeDetail.json"
gruppen_on = "../data_bert/HateSpeechDe_HATE_GruppeDetail.json"

hndl_off = r"..\Korpora\Referenzdatensatz_HateSpeech_Deutsch\HateSpeechDe_HATE_HandlungDetail.json"
hndl_on = "../data_bert/HateSpeechDe_HATE_HandlungDetail.json"


vvh = pd.read_json(vvh_on)
vvh = vvh.set_index(keys="corpus_id")
gruppen = pd.read_json(gruppen_on)
gruppen = gruppen.set_index(keys="corpus_id")
hndl = pd.read_json(hndl_on)
hndl = hndl.set_index(keys="corpus_id")

# Combine the annotations: group labels + action labels per row.
# The rows are combined by position, not by corpus_id.
# NOTE(review): this assumes all three files list the tweets in the same order - confirm.
gruppen_labels = list(gruppen["label"])
hndl_labels = list(hndl["label"])
vvh["labels_comb"] = [gruppen_labels[i] + hndl_labels[i] for i in range(len(vvh))]
vvh

Unnamed: 0_level_0,label,tweet,labels_comb
corpus_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1112521,KEINE,@user @user @user Weitaus schlimmer. Heute ist...,"[KeineGruppe, KeineHandlung]"
1114995,KEINE,Das Deutsche Kaiserreich soll wieder auferstehen,"[KeineGruppe, KeineHandlung]"
1110545,KEINE,Die BRD ist eine einzige Schande,"[KeineGruppe, KeineHandlung]"
1114326,KEINE,@user @user Die Gr√ºnen....besser kann man das ...,"[Politische Einstellung, KeineHandlung]"
4112169,KEINE,@user @user Scheiss deutsche Politiker! Mehr g...,"[KeineGruppe, KeineHandlung]"
...,...,...,...
1223336,KEINE,Unbequeme Wahrheit: Sexuelle Bel√§stigung ist...,"[KeineGruppe, KeineHandlung]"
1221963,KEINE,Vor was habt ihr Angst Liebe B√ºrger !!!???? St...,"[KeineGruppe, KeineHandlung]"
2220834,KEINE,@user Sch√∂n den Dummkopf #Oppermann von der Vo...,"[Politische Einstellung, KeineHandlung]"
1220580,KEINE,@user Hoffentlich sind die Nationalisten dan...,"[Anderes Merkmal, KeineHandlung]"


### Pipeline für eine oder mehrere Klassen laufen lassen

In [38]:
# Encode the labels in binary format and determine the class names
labels_encoded, label_namen = encode(list(vvh["labels_comb"]))
gruppe_namen = label_namen[:7]
handlung_namen = label_namen[7:]

# Split into training and test data
(
    (X_train_tweets, X_test_tweets, y_train_comb, y_test_comb),
    (X_train_IDs, X_test_IDs),
) = train_test_split_multilabel(vvh, labels_encoded)

# Assign the group / action label columns to train and test
y_train_gruppe, y_train_handlung = y_train_comb[:, :7], y_train_comb[:, 7:11]
y_test_gruppe, y_test_handlung = y_test_comb[:, :7], y_test_comb[:, 7:11]

# Look up the VVH labels of the test split by corpus ID
#y_train_vvh = [vvh.loc[index[0]]["label"] for index in X_train_IDs]
y_test_vvh = [vvh.loc[id_row[0]]["label"] for id_row in X_test_IDs]
vvh_encoded_test, vvh_namen = encode(y_test_vvh)


#gruppen_klassfikation = multilabel_pipeline_bert((X_train_tweets, y_train_gruppe), (X_test_tweets, y_test_gruppe), gruppe_namen, save=True)
#handlung_klassifikation = multilabel_pipeline_bert((X_train_tweets, y_train_handlung), (X_test_tweets, y_test_handlung), handlung_namen, save=True)

# Saved per-class models, numbered from 1 (the model with index 0 is not used)
modelle_gruppe = [f'../models/bert-Gruppe{nr}' for nr in range(1, 7)]
modelle_handlung = [f'../models/bert-Handlung{nr}' for nr in range(1, 4)]

print(
    vvh_entscheidungsbaum(
        modelle_gruppe,
        modelle_handlung,
        (X_test_tweets, vvh_encoded_test),
        vvh_namen,
    )
)



# Data analysis: number of entries per class / class combination
#print(label_namen)
#print(Counter(combination for row in get_combination_wise_output_matrix(train[1], order=1) for combination in row))
#print(Counter(combination for row in get_combination_wise_output_matrix(test[1], order=1) for combination in row))

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file ../models/bert-Gruppe1/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Gruppe1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Gruppe1/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized

loading configuration file ../models/bert-Gruppe2/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Gruppe2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Gruppe2/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized

loading configuration file ../models/bert-Gruppe3/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Gruppe3",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Gruppe3/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized

loading configuration file ../models/bert-Gruppe4/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Gruppe4",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Gruppe4/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized

loading configuration file ../models/bert-Gruppe5/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Gruppe5",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Gruppe5/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized

loading configuration file ../models/bert-Gruppe6/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Gruppe6",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Gruppe6/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized

loading configuration file ../models/bert-Handlung1/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Handlung1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Handlung1/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initi

loading configuration file ../models/bert-Handlung2/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Handlung2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Handlung2/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initi

loading configuration file ../models/bert-Handlung3/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert-Handlung3",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ../models/bert-Handlung3/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initi

{'f1': 0.3333333333333333, 'rep': '              precision    recall  f1-score   support\n\n           0       0.98      1.00      0.99       893\n           1       0.71      0.22      0.33        23\n\n    accuracy                           0.98       916\n   macro avg       0.85      0.61      0.66       916\nweighted avg       0.97      0.98      0.97       916\n', 'confusion': array([[891,   2],
       [ 18,   5]]), 'mcc': 0.3865479661057195}
