# Modeling

In [None]:
import fasttext
import fasttext.util
import numpy as np
import pandas as pd
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from pathlib import Path

from studienarbeit.config import party_encoding
from studienarbeit.utils.load import Load

In [None]:
# Project data loader pointed at the tweets data directory.
load = Load(data_dir="../../data/tweets")

# Pretrained German fastText model: either fetch the bin file with the command
# below, or get the text (vector) file from
# https://fasttext.cc/docs/en/crawl-vectors.html
# NOTE(review): the training call in this notebook references
# "cc.de.300.vec", i.e. the text (vector) file -- confirm it is present.
fasttext.util.download_model("de", if_exists="ignore")

## FastText

---

In [None]:
# Load only the listed columns from the cached, preprocessed tweets file.
tweet_columns = ["clean_text", "lemma_text", "filter_text", "party"]
df = load.load_dataframe("cache/prep_tweets_fast_full.parquet", columns=tweet_columns)

In [None]:
# Keyword arguments for fastText training: the training file and the
# pretrained word-vector file.
# Further options that could be added here:
#   "epoch": 50, "lr": 0.05, "wordNgrams": 2, "verbose": 2, "minCount": 1,
#   "loss": "ns", "lrUpdateRate": 100, "thread": 4, "ws": 5, "dim": 300
parameter = dict(input="train.txt", pretrainedVectors="cc.de.300.vec")

In [None]:
# Hold out 20 % of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
tweet_texts = df["filter_text"]
party_labels = df["party"]
X_train, X_test, y_train, y_test = train_test_split(
    tweet_texts,
    party_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
def _write_fasttext_file(path, texts, labels):
    """Write labelled examples to *path* in fastText's supervised format.

    Every example becomes exactly one ``__label__<label> <text>`` line.

    Args:
        path: Destination file; it is overwritten if it already exists.
        texts: Iterable of example texts.
        labels: Iterable of labels, paired with ``texts`` by position.
    """
    # fastText expects UTF-8 input, so do not depend on the platform's default
    # encoding (which is not UTF-8 everywhere).
    with open(path, "w", encoding="utf-8") as f:
        for text, label in zip(texts, labels):
            # A newline inside a text would end the example early and leave
            # the remainder on its own line without a label.
            single_line = str(text).replace("\n", " ")
            f.write(f"__label__{label} {single_line}\n")


# Texts and labels come from the same split, so they pair up by position.
_write_fasttext_file(Path("train.txt"), X_train, y_train)
_write_fasttext_file(Path("test.txt"), X_test, y_test)

In [None]:
# Train the supervised fastText classifier on the labelled training file,
# initialised from the pretrained word vectors.
# NOTE(review): dim is set to 300, presumably to match the dimension of the
# cc.de.300 vectors -- keep the two in sync.
model = fasttext.train_supervised(
    input="train.txt",
    pretrainedVectors="cc.de.300.vec",
    dim=300,
    epoch=5,
    lr=0.1,
    wordNgrams=2,
    loss="softmax",
)

In [None]:
# Evaluate on the held-out file. The result is indexed as
# (number of examples, precision, recall).
test_score = model.test("test.txt")
n_samples, precision, recall = test_score[:3]

# F1 is the harmonic mean of precision and recall; report 0.0 instead of
# raising ZeroDivisionError when both are 0.
pr_sum = precision + recall
f1_score = 2 * ((precision * recall) / pr_sum) if pr_sum else 0.0

print(f"Count of test data (N): {n_samples}")
print(f"F1 Score: {f1_score}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
def _predict_party(text):
    """Return the model's first predicted label for *text* as an int.

    The ``__label__`` prefix is stripped from the predicted label before the
    remainder is converted with ``int``.
    """
    predicted_labels = model.predict(text)[0]
    return int(predicted_labels[0].replace("__label__", ""))


# Collect the held-out texts with their true party and add the prediction.
df_test = pd.DataFrame({"text": X_test, "party": y_test})
df_test["prediction"] = df_test["text"].apply(_predict_party)

In [None]:
# Per-class precision / recall / F1 of the predictions on the held-out set.
true_parties = df_test["party"]
predicted_parties = df_test["prediction"]
report = classification_report(true_parties, predicted_parties)
print(report)

In [None]:
# Confusion matrix of true vs. predicted party; normalize="true" scales each
# row (true class) to sum to 1.
cm = confusion_matrix(df_test["party"], df_test["prediction"], normalize="true")

# Pass the tick labels as a concrete list rather than a dict view.
# NOTE(review): confusion_matrix orders rows/columns by the sorted label
# values found in the data, while these names follow the order of
# party_encoding. This assumes party_encoding lists the parties in ascending
# code order and that every party occurs in the test data -- confirm.
party_names = list(party_encoding.keys())
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=party_names)
disp.plot()