# Referenzdatensatz erstellen

In [49]:
# Besonderheit HASOC 2020 Daten: Zeilenumbrüche innerhalb der Tweets
# --> notwendig, die Dateien erst so aufzubereiten, dass nur ein Tweet pro Zeile steht

# HASOC 2020 Train: "..\Korpora\German_2020_hasoc\German\hasoc_2020_de_train_new.txt"
# HASOC 2020 Test: "..\Korpora\German_2020_hasoc\German\hasoc_2020_de_test_new.txt"

def rem_white(filename):
    """Rejoin tweets that span several lines so that each record is one line.

    Reads ``filename`` as UTF-16, drops the first (header) line and writes the
    repaired records as UTF-8 to a file whose name is the input name minus its
    last four characters plus ``_formatted.txt``.

    A line with exactly five tab-separated fields is copied unchanged. Any
    other line is treated as a fragment of a broken record:

    * a fragment containing "hasoc_2020_de_" closes the current record; the
      newlines inside the accumulated record are replaced by spaces and a
      single newline is appended,
    * a fragment starting with "11" opens a new record,
    * anything else (sometimes just a newline) is appended to the open record.

    Args:
        filename: Path of the UTF-16 encoded, tab-separated input file.

    Returns:
        True once the output file has been written.
    """
    with open(filename, mode="r", encoding="utf-16") as f:
        content = f.readlines()[1:]  # ignore the header line

    newcont = []
    # Start with an empty accumulator: a stray fragment that shows up before
    # any record opener must not raise UnboundLocalError.
    comp_line = ""
    for line in content:
        if len(line.split("\t")) == 5:
            # Case 1: the record is complete on this line.
            newcont.append(line)
        elif "hasoc_2020_de_" in line:
            # Case 2a: last fragment of a record -> flush the accumulator.
            comp_line = (comp_line + line).replace("\n", " ") + "\n"
            newcont.append(comp_line)
            comp_line = ""  # reset the accumulator
        elif line.startswith("11"):
            # Case 2b: first fragment of a record.
            comp_line = line
        else:
            # Case 2c: middle fragment, sometimes only "\n".
            comp_line += line

    nwfilename = filename[:len(filename) - 4] + "_formatted" + ".txt"
    with open(nwfilename, mode="w", encoding="utf-8") as outfile:
        outfile.writelines(newcont)
    return True

# Bereits formatiert, nicht nochmals durchführen
# rem_white("..\Korpora\German_2020_hasoc\German\hasoc_2020_de_train_new.txt")
# rem_white("..\Korpora\German_2020_hasoc\German\hasoc_2020_de_test_new.txt")

In [88]:
# Besonderheit des Covid19-Abusive-Datensatzes: noch nicht unterteilt in Trainings- und Testdaten

from sklearn.model_selection import train_test_split

# Read the whole corpus; the file handle is released before any processing.
# Raw string literals keep the Windows path byte-identical while avoiding
# invalid escape sequences such as "\K" or "\g".
with open(r"..\Korpora\german-abusive-language-covid-19-main\covid_2021_dataset.csv", mode="r", encoding="utf-8") as infile:
    content = infile.readlines()

# Header line for both output files: the first four column names.
explanation = content[0].split("\t")
explanation = "\t".join([explanation[0], explanation[1], explanation[2], explanation[3]]) + "\n"

# Data rows: the first three columns describe the tweet, the fourth is the label.
content = content[1:]
sep_content = [entry.split("\t") for entry in content]
tweets = [(entry[0], entry[1], entry[2]) for entry in sep_content]
labels = [entry[3] for entry in sep_content]

# Separate into train/test.
# 4960 tweets in total; according to the paper they are divided into a training
# set of 3485 tweets (70%), a validation set of 735 and a test set of 740 tweets.
# Here the split is 70/30 accordingly, so validation data would have to be
# split off from the test set if needed.
traintweets, testtweets, trainlabels, testlabels = train_test_split(tweets, labels, test_size=0.3, train_size=0.7, random_state=4, stratify=labels)

# Reassemble into file-writable content: one tab-separated line per tweet.
train = ["\t".join([tweet[0], tweet[1], tweet[2], label]) for tweet, label in zip(traintweets, trainlabels)]
test = ["\t".join([tweet[0], tweet[1], tweet[2], label]) for tweet, label in zip(testtweets, testlabels)]

with open(r"..\Korpora\german-abusive-language-covid-19-main\covid_2021_dataset_train.txt", mode="w", encoding="utf-8") as trainout:
    trainout.write(explanation)
    for train_point in train:
        trainout.write(train_point + "\n")

with open(r"..\Korpora\german-abusive-language-covid-19-main\covid_2021_dataset_test.txt", mode="w", encoding="utf-8") as testout:
    testout.write(explanation)
    for test_point in test:
        testout.write(test_point + "\n")

In [103]:
import re

def convert_to_refcorp(filename, corp_id, mod):
    """Convert one GermEval, HASOC or Covid19-Abusive file to the reference format.

    Args:
        filename: Path of a UTF-8 file with tab-separated values
            (txt, csv or tsv).
        corp_id: Corpus ID. "01" and "02" are parsed with ``format_germeval``,
            "03" and "04" with ``format_hasoc``, every other value with
            ``format_covidabusive``. For "03" and "05" the first line of the
            file is skipped.
        mod: "train" or "test".

    Returns:
        List of tuples ``(reference ID, tweet, label1, label2)``.

        - reference ID: ``corp_id``, followed by "11" for training data or
          "22" for test data, followed by the 0-based position of the entry
          among the processed lines, zero-padded to at least four digits;
          e.g. "01220034".
        - tweet: the tweet text in which every span from "https://" up to
          the next space (or the end of the text) is replaced by the generic
          Twitter URL "https://t.co".
        - label1, label2: the two tags returned by the formatting function,
          which is called with "NOT" as default for both.

    Raises:
        ValueError: If ``mod`` is neither "train" nor "test".
    """
    # Validate once and hoist out of the loop; previously an unknown value
    # only surfaced as an UnboundLocalError while building the first ID.
    if mod == "train":
        md_id = "11"
    elif mod == "test":
        md_id = "22"
    else:
        raise ValueError(f'mod must be "train" or "test", got {mod!r}')

    # Decide which formatting function is used for this corpus.
    if corp_id == "01" or corp_id == "02":
        form_func = format_germeval
    elif corp_id == "03" or corp_id == "04":
        form_func = format_hasoc
    else:
        form_func = format_covidabusive

    # Raw string: "\/" is not a valid escape sequence in a normal literal.
    url_pattern = re.compile(r'https:\/\/.*?(?: |$)')

    with open(filename, mode="r", encoding="utf-8") as f:
        text = f.readlines()

    # Skip the first line for Covid19 ("05") and HASOC2019 ("03").
    if corp_id == "05" or corp_id == "03":
        text = text[1:]

    newcorp = []
    for num, entry in enumerate(text):
        tweet, tag1, tag2 = form_func(entry.strip(), "NOT", "NOT")

        # Replace URLs with the generic Twitter URL; the trailing space of the
        # replacement keeps following words separated and is stripped again
        # when the URL ends the tweet.
        tweet = url_pattern.sub("https://t.co ", tweet).strip()

        # Build the ID and add the entry to the new collection.
        tweet_id = str(corp_id) + md_id + f'{num:04d}'
        newcorp.append((tweet_id, tweet, tag1, tag2))
    return newcorp

def format_germeval(entry, tag1, tag2):
    """Parse one GermEval line of the form ``tweet<TAB>label1<TAB>label2``.

    Args:
        entry: Line with exactly three tab-separated fields.
        tag1: Binary tag to return unless label1 is "OFFENSE".
        tag2: Fine tag to return unless label2 is one of the mapped values.

    Returns:
        Tuple ``(tweet, tag1, tag2)``: the tweet with every "|LBR|" marker
        replaced by a space, "NEG" for label1 "OFFENSE", and "INSOFF" /
        "PRFN" / "HATE" for label2 "INSULT" / "PROFANITY" / "ABUSE".
    """
    text, coarse, fine = entry.split("\t")
    fine_map = {"INSULT": "INSOFF", "PROFANITY": "PRFN", "ABUSE": "HATE"}
    binary_tag = "NEG" if coarse == "OFFENSE" else tag1
    fine_tag = fine_map.get(fine, tag2)
    return text.replace("|LBR|", " "), binary_tag, fine_tag

def format_covidabusive(entry, tag1, tag2):
    """Parse one tab-separated line of the Covid19-Abusive data.

    The tweet is read from the second field and the label from the fourth.

    Args:
        entry: Line with at least four tab-separated fields.
        tag1: Binary tag to return unless the label is "abuse".
        tag2: Ignored; the fine tag is always "NAN" for this corpus.

    Returns:
        Tuple ``(tweet, tag1, "NAN")`` where tag1 is "NEG" for label "abuse".
    """
    fields = entry.split("\t")
    text = fields[1]
    label = fields[3]
    binary_tag = "NEG" if label == "abuse" else tag1
    return text, binary_tag, "NAN"

def format_hasoc(entry, tag1, tag2):
    """Parse one tab-separated HASOC line.

    The tweet is read from the second field, the coarse label from the third
    and the fine label from the fourth.

    Args:
        entry: Line with at least four tab-separated fields.
        tag1: Binary tag to return unless the coarse label is "HOF".
        tag2: Fine tag to return unless the fine label is one of the mapped
            values.

    Returns:
        Tuple ``(tweet, tag1, tag2)`` with "NEG" for coarse label "HOF" and
        "HATE" / "INSOFF" / "PRFN" for fine label "HATE" / "OFFN" / "PRFN".
    """
    fields = entry.split("\t")
    text, coarse, fine = fields[1], fields[2], fields[3]
    fine_map = {"HATE": "HATE", "OFFN": "INSOFF", "PRFN": "PRFN"}
    binary_tag = "NEG" if coarse == "HOF" else tag1
    return text, binary_tag, fine_map.get(fine, tag2)

In [104]:
# GermEval2018
# Train: "..\Korpora\GermEval-2018-Data-master\germeval2018.training.txt"
# Test: "..\Korpora\GermEval-2018-Data-master\germeval2018.test.txt"
# Paths are raw string literals so the backslashes are never read as
# (invalid) escape sequences; the resulting path text is unchanged.
germeval2018train_converted = convert_to_refcorp(r"..\Korpora\GermEval-2018-Data-master\germeval2018.training.txt", "01", "train")
germeval2018test_converted = convert_to_refcorp(r"..\Korpora\GermEval-2018-Data-master\germeval2018.test.txt", "01", "test")

# GermEval2019
# Train: "..\Korpora\GermEval-2019-Data\germeval2019.training_subtask1_2_korrigiert.txt"
# Test: "..\Korpora\GermEval-2019-Data\germeval2019GoldLabelsSubtask1_2.txt"
germeval2019train_converted = convert_to_refcorp(r"..\Korpora\GermEval-2019-Data\germeval2019.training_subtask1_2_korrigiert.txt", "02", "train")
germeval2019test_converted = convert_to_refcorp(r"..\Korpora\GermEval-2019-Data\germeval2019GoldLabelsSubtask1_2.txt", "02", "test")

In [105]:
# HASOC 2019
# Train: Korpora\german_dataset_hasoc2019\german_dataset\german_dataset.tsv
# Test: Korpora\german_dataset_hasoc2019\german_dataset\hasoc_de_test_gold.tsv
# Paths are raw string literals so the backslashes are never read as
# (invalid) escape sequences; the resulting path text is unchanged.
hasoc2019train_converted = convert_to_refcorp(r"..\Korpora\german_dataset_hasoc2019\german_dataset\german_dataset.tsv", "03", "train")
hasoc2019test_converted = convert_to_refcorp(r"..\Korpora\german_dataset_hasoc2019\german_dataset\hasoc_de_test_gold.tsv", "03", "test")

# HASOC 2020
# Train: Korpora\German_2020_hasoc\German\hasoc_2020_de_train_new_formatted.txt
# Test: Korpora\German_2020_hasoc\German\hasoc_2020_de_test_new_formatted.txt
hasoc2020train_converted = convert_to_refcorp(r"..\Korpora\German_2020_hasoc\German\hasoc_2020_de_train_new_formatted.txt", "04", "train")
hasoc2020test_converted = convert_to_refcorp(r"..\Korpora\German_2020_hasoc\German\hasoc_2020_de_test_new_formatted.txt", "04", "test")

In [106]:
# Covid19 Abusive

# Original corpus: "..\Korpora\german-abusive-language-covid-19-main\covid_2021_dataset.csv"
# Train (new): "..\Korpora\german-abusive-language-covid-19-main\covid_2021_dataset_train.txt"
# Test (new): "..\Korpora\german-abusive-language-covid-19-main\covid_2021_dataset_test.txt"

# Paths are raw string literals so the backslashes are never read as
# (invalid) escape sequences; the resulting path text is unchanged.
covidabusivetrain_converted = convert_to_refcorp(r"..\Korpora\german-abusive-language-covid-19-main\covid_2021_dataset_train.txt", "05", "train")
covidabusivetest_converted = convert_to_refcorp(r"..\Korpora\german-abusive-language-covid-19-main\covid_2021_dataset_test.txt", "05", "test")

In [107]:
import random

# Referenzdatensatz zusammenstellen
# Bereits ausgeführt, nicht noch einmal ausführen

#refcorp_train = germeval2018train_converted + germeval2019train_converted + hasoc2019train_converted + hasoc2020train_converted + covidabusivetrain_converted
#random.shuffle(refcorp_train)

#refcorp_test = germeval2018test_converted + germeval2019test_converted + hasoc2019test_converted + hasoc2020test_converted + covidabusivetest_converted
#random.shuffle(refcorp_test)

#with open("..\Korpora\Referenzdatensatz_HateSpeech_Deutsch\RefKorpHateSpeechDe_Train.txt", mode="w", encoding="utf-8") as reftrainout:
#    reftrainout.write("corpus_id\ttweet\tbinarylabel\tfinelabel\n")
#    for reftweet in refcorp_train:
#        reftrainout.write("\t".join(reftweet)+"\n")
    
#with open("..\Korpora\Referenzdatensatz_HateSpeech_Deutsch\RefKorpHateSpeechDe_Test.txt", mode="w", encoding="utf-8") as reftestout:
#    reftestout.write("corpus_id\ttweet\tbinarylabel\tfinelabel\n")
#    for reftweet in refcorp_test:
#        reftestout.write("\t".join(reftweet)+"\n")

In [109]:
# Ausschnittdatensätze (Train, Test) erstellen, in dem nur die Einträge mit dem feinen Label "HATE" vorkommen
# Bereits erstellt, nicht nochmals ausführen

#with open("..\Korpora\Referenzdatensatz_HateSpeech_Deutsch\RefKorpHateSpeechDe_Test.txt", mode="r", encoding="utf-8") as hatetrainin:
#    all_cont = hatetrainin.readlines()
#    all_cont = all_cont[1:]
#    sep_cont = [entry.strip().split("\t") for entry in all_cont]
#    hate = []
#    for tweet in sep_cont:
#        if tweet[3] == "HATE": hate.append(tweet)

#with open("..\Korpora\Referenzdatensatz_HateSpeech_Deutsch\RefKorpHateSpeechDe_Test_HATE.txt", mode="w", encoding="utf-8") as hatetrainout:
#    for line in hate:
#        hatetrainout.write("\t".join(line)+"\n")
