In [2]:
import pandas as pd
import spacy
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import fasttext.util
from sklearn.model_selection import train_test_split

# Pipeline für Textklassifikation mit fasttext

## Datenvorbereitung

In [3]:
# load data
sample = pd.read_csv("data/labeled_unprocessed_sample_data.csv", sep=";")

In [4]:
# remove possible missing values and duplicates
sample = sample.dropna()
sample = sample.drop_duplicates(subset="answer")

In [5]:
nlp = spacy.load("de_core_news_sm")

def text_lemmatization(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_punct]
    return " ".join(lemmas)

In [6]:
def text_preprocessing(text):
    # remove links, punctuation, special letters
    text = re.sub(r"[^a-zA-öZÖäÄüÜß]|\bhttps?://\S*|&\w+;|[\.,]", " ", text)
    
    # replace single characters
    text = re.sub(r" [a-zA-Z] ", " ", text)
    
    # remove additional whitespaces
    text = re.sub(r"\s+", " ", text)
    
    # lemmatize texts
    text = text_lemmatization(text)

    # lower text
    text = text.lower()
    
    # tokenization of words
    text = text.split()
    
    # remove stopwords
    german_stopwords = set(stopwords.words("german"))
    text = [w for w in text if w not in german_stopwords]
    
    # return joined text
    return " ".join(text)

In [7]:
sample["clean_answer"] = sample["answer"].apply(text_preprocessing)

In [8]:
# show most frequent words
vectorizer = CountVectorizer(
)
dtm = vectorizer.fit_transform(sample["clean_answer"])

frequencies = dtm.sum(axis=0).tolist()[0]

df_freq = pd.DataFrame(
    dict(frequencies=frequencies,
         index=vectorizer.get_feature_names_out()
    )
)

df_freq.sort_values("frequencies", ascending=False).head(n=20)

Unnamed: 0,frequencies,index
9221,2012,geehrt
8766,1718,freundlich
24838,1712,vieler
11383,1698,herr
8594,1579,frage
4977,1279,dank
20807,971,sollen
15637,942,mensch
9138,900,geben
10810,898,gut


In [9]:
# remove stopwords with refined list and after lemmatization to catch 
def remove_individual_stopwords(text):
    individual_stopwords = [
        "geehrt",
        "frau",
        "vieler",
        "dank",
        "herr",
        "danke",
        "anfrage",
        "frage",
        "nachricht",
        "freundlich",
        "sollen",
        "müssen",
        "mehr",
        "grüße",
        "daher",
        "immer",
        "dafür",
        "frage"
    ]
    text = text.split()
    text = [w for w in text if w not in individual_stopwords]
    
    return " ".join(text)

In [10]:
sample["clean_answer"] = sample["clean_answer"].apply(remove_individual_stopwords)

In [11]:
# prepare data for processing by fasttext
sample["answer_encoded"] = sample["answer_encoded"].apply(lambda x: x.replace(" ", "_"))
sample["answer_encoded"] = sample["answer_encoded"].apply(lambda x: "__label__" + x)
sample["answer_encoding_combined"] = sample["answer_encoded"]+ " " + sample["clean_answer"]

In [12]:
# seperate sample data into training, test and validation sets (80/10/10)
training_sample, temp_df = train_test_split(sample, test_size=0.2, random_state=42)
testing_sample, validation_sample = train_test_split(temp_df, test_size=0.5, random_state=42)

In [13]:
# export datasets
training_sample["answer_encoding_combined"].to_csv("data/training_data.csv", index=False, header=False, sep=";")
testing_sample["answer_encoding_combined"].to_csv("data/testing_data.csv", index=False, header=False, sep=";")
validation_sample["answer_encoding_combined"].to_csv("data/validation_data.csv", index=False, header=False, sep=";")

## Modelltraining

### Ohne optimierte Hyperparamater

In [14]:
# train model without optimizing the hyperparameter
ft_model = fasttext.train_supervised(input="data/training_data.csv")

In [15]:
# test model
ft_model.test("data/testing_data.csv")

(228, 0.6578947368421053, 0.6578947368421053)

### Mit optimierten Hyperparametern

In [None]:
# train model with optimizing the hyperparameter
ft_model_optimized = fasttext.train_supervised(
    input="data/training_data.csv", 
    autotuneValidationFile="data/validation_data.csv"
    )

In [26]:
# test model
ft_model_optimized.test("data/testing_data.csv")

(228, 0.6798245614035088, 0.6798245614035088)

#### Testergebnisse für Label "answer"

In [None]:
# test model for label "answer"
ft_model_optimized.test_label("data/testing_data.csv")["__label__answer"]

{'precision': 0.6818181818181818,
 'recall': 0.9310344827586207,
 'f1score': 0.7871720116618076}

#### Testergebnisse für Label "evasive answer"

In [None]:
# test model for label "evasive answer"
ft_model_optimized.test_label("data/testing_data.csv")["__label__evasive_answer"]

{'precision': 0.6666666666666666,
 'recall': 0.24096385542168675,
 'f1score': 0.35398230088495575}

In [None]:
# save trained model
ft_model_optimized.save_model("model/classification.bin")