# Modeling (FastText)


In [None]:
from pathlib import Path

import fasttext
import fasttext.util
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

from studienarbeit.config import party_encoding
from studienarbeit.utils.load import EDataTypes, Load

# Notebook-wide seaborn theme: white background, muted palette, wide default figures.
sns.set(
    style="white",
    palette="muted",
    rc={"figure.figsize": (20, 8)},
)

# Fetch the pretrained German fastText model with this helper (skipped when it is
# already present), or download the text (vector) file manually from
# https://fasttext.cc/docs/en/crawl-vectors.html
# NOTE(review): the training cell below reads "cc.de.300.vec"; confirm that this
# helper provides that file and not only the binary model.
fasttext.util.download_model("de", if_exists="ignore")

In [None]:
# Which dataset to model: the data type and the preprocessed parquet file.
data_type = EDataTypes.TWEETS
file_name = "prep_tweets_fast_full.parquet"

# Project loader for that data type, plus the data folder used below for the
# cache and model files.
load = Load(data_type=data_type)
data_dir = Path("../../data/") / data_type.value

In [None]:
# Build the name parts ("fast" and/or a size tag) that are joined with "_" to name
# the cache and model files derived from `file_name`.
suffix = ["fast"] if "fast" in file_name else []

# Only the first size tag found in the file name is used, checked in this order.
size_tag = next((tag for tag in ("full", "sm", "md", "lg") if tag in file_name), None)
if size_tag is not None:
    suffix.append(size_tag)


---

In [None]:
# Load only the text variants and the party label from the selected file.
df = load.load_dataframe(
    file_name,
    columns=["clean_text", "lemma_text", "filter_text", "party"],
)


In [None]:
# Hold out 20 % of the samples for testing; the fixed seed keeps the split reproducible.
# Features are the "filter_text" column, targets the "party" column.
X_train, X_test, y_train, y_test = train_test_split(
    df["filter_text"],
    df["party"],
    test_size=0.2,
    random_state=42,
)

In [None]:
def _write_fasttext_file(path, texts, labels):
    """Write samples to *path* in fastText's supervised input format.

    Each sample becomes one line of the form ``__label__<label> <text>``.

    Args:
        path: Destination file; its parent directory is created if missing.
        texts: Iterable of sample texts.
        labels: Iterable of labels, in the same order as *texts*.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # fastText expects UTF-8 input. Without an explicit encoding the platform
    # default is used, which can corrupt German umlauts (e.g. cp1252 on Windows).
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"__label__{label} {text}\n" for text, label in zip(texts, labels))


# The features and labels of each split share one index order, so pairing them
# positionally keeps every text with its own label.
_write_fasttext_file(data_dir / f"cache/train_{'_'.join(suffix)}.txt", X_train, y_train)
_write_fasttext_file(data_dir / f"cache/test_{'_'.join(suffix)}.txt", X_test, y_test)


In [None]:
# Training file written by the export cell above.
train_path = data_dir / f"cache/train_{'_'.join(suffix)}.txt"

# Hyperparameters for the supervised classifier.
# NOTE(review): "cc.de.300.vec" is resolved relative to the working directory, and
# `dim` presumably has to match the dimension of those pretrained vectors - confirm.
hyperparameters = dict(
    epoch=5,
    lr=0.1,
    wordNgrams=2,
    loss="softmax",
    dim=300,
    pretrainedVectors="cc.de.300.vec",
)

model = fasttext.train_supervised(input=str(train_path), **hyperparameters)

In [None]:
# Evaluate on the held-out file. The path is converted to `str`, matching the
# train and save calls, because fastText's bindings take string paths, not `Path`.
test_score = model.test(path=str(data_dir / f"cache/test_{'_'.join(suffix)}.txt"))

# The result is unpacked as (sample count, precision, recall), as printed below.
n_samples, precision, recall = test_score[0], test_score[1], test_score[2]

# F1 is the harmonic mean of precision and recall; report 0.0 instead of dividing
# by zero when both are zero (or undefined).
if precision + recall > 0:
    f1_score = 2 * ((precision * recall) / (precision + recall))
else:
    f1_score = 0.0

print(f"Count of test data (N): {n_samples}")
print(f"F1 Score: {f1_score}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


In [None]:
def _predict_party(text):
    """Return the model's top predicted label for *text* as an integer."""
    top_label = model.predict(text)[0][0]
    # Labels come back with the "__label__" prefix used in the training file.
    return int(top_label.replace("__label__", ""))


# Test texts next to their true party, then one prediction per row.
df_test = pd.DataFrame({"text": X_test, "party": y_test})
df_test["prediction"] = df_test["text"].apply(_predict_party)


In [None]:
# Text report of the classification metrics: true party vs. predicted party.
report = classification_report(df_test["party"], df_test["prediction"])
print(report)


In [None]:
# Persist the trained model under the data folder, named after the same suffix
# parts as the cache files.
model_path = data_dir / f"models/fasttext_{'_'.join(suffix)}.bin"
model.save_model(str(model_path))


In [None]:
# Confusion matrix of true vs. predicted party, normalised with normalize="true".
cm = confusion_matrix(
    df_test["party"],
    df_test["prediction"],
    normalize="true",
)

# NOTE(review): assumes the key order of `party_encoding` matches the label order
# used by `confusion_matrix` - confirm against the encoding.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=party_encoding.keys(),
)
disp.plot()


In [None]:
# How often each label was predicted on the test split.
predicted_labels = df_test["prediction"]
predicted_labels.value_counts()


In [None]:
# How often each true label occurs in the test split, for comparison with the predictions.
true_labels = df_test["party"]
true_labels.value_counts()


In [None]:
# First ten test rows where the prediction differs from the true party.
is_misclassified = df_test["party"].ne(df_test["prediction"])
df_test.loc[is_misclassified].head(10)


In [None]:
def plot_pca(model, words):
    """Show an interactive 3D scatter plot of word vectors reduced with PCA.

    Args:
        model: Object exposing ``get_word_vector(word)``; it is called once per word.
        words: Iterable of words to plot. At least three are required.

    Raises:
        ValueError: If fewer than three words are given.
    """
    # Materialise once: the words are used for the vectors and again as hover
    # text, so a one-shot iterable would otherwise be empty on second use.
    words = list(words)
    if len(words) < 3:
        # PCA yields at most one component per sample, so the third axis would be
        # missing and indexing it below would fail with an opaque IndexError.
        raise ValueError(f"plot_pca needs at least 3 words for a 3D projection, got {len(words)}")

    word_vectors = np.array([model.get_word_vector(w) for w in words])
    # Keep only the first three principal components for the x/y/z axes.
    threedim = PCA().fit_transform(word_vectors)[:, :3]
    scatter = go.Scatter3d(x=threedim[:, 0], y=threedim[:, 1], z=threedim[:, 2], mode="markers", text=words)

    layout = go.Layout(
        title="3D PCA",
        showlegend=True,
        scene=dict(
            xaxis=dict(title="PC1"),
            yaxis=dict(title="PC2"),
            zaxis=dict(title="PC3"),
        ),
    )

    plot_figure = go.Figure(data=scatter, layout=layout)
    plot_figure.show()


# Party names and related terms to compare in the embedding space. Each word is
# listed once: a repeated word would add a duplicate point and weight that vector
# twice in the PCA fit.
plot_pca(
    model,
    ["afd", "weidel", "spd", "sozial", "grüne", "grünen", "union", "cdu", "csu", "linke", "fdp", "steuern"],
)