# Modeling


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from studienarbeit.config import party_encoding
from studienarbeit.utils.load import EDataTypes, Load

In [None]:
# Number of folds passed to cross_val_score in the model comparisons.
CV = 5

# Which kind of data is modeled and which prepared file is read.
data_type = EDataTypes.TWEETS
file_name = "prep_tweets_full.parquet"

# Relative path of the directory belonging to the selected data type.
data_dir = Path("../../data/").joinpath(data_type.value)

# Project loader configured for the selected data type.
load = Load(data_type=data_type)

In [None]:
# Load only the three text variants and the party label, then preview the first rows.
modeling_columns = ["clean_text", "lemma_text", "filter_text", "party"]
df_modeling = load.load_dataframe(file_name, columns=modeling_columns)
df_modeling.head()


In [None]:
# Number of rows per party label (shows how balanced the classes are).
party_counts = df_modeling["party"].value_counts()
party_counts

## Bag-of-Words (BoW)

---


In [None]:
# Targets: the party label of every row.
bow_labels = df_modeling["party"]

# Unigram token counts over the filtered text; the vectorizer is fitted on the
# whole corpus and the resulting matrix is used for the cross-validated comparison.
bow_corpus = df_modeling["filter_text"]
bow_vector = CountVectorizer(ngram_range=(1, 1))
bow_features = bow_vector.fit_transform(bow_corpus)

In [None]:
# Candidate classifiers, each scored with CV-fold cross-validation on the BoW features.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# Collect one (model_name, fold_idx, accuracy) row per model and fold.
entries = []
for model in models:
    fold_scores = cross_val_score(model, bow_features, bow_labels, scoring="accuracy", cv=CV)
    entries.extend((type(model).__name__, fold, score) for fold, score in enumerate(fold_scores))

df_cv = pd.DataFrame(entries, columns=["model_name", "fold_idx", "accuracy"])

# Box plot of the per-fold accuracies with the individual folds drawn on top.
shared_axes = {"x": "model_name", "y": "accuracy", "data": df_cv}
sns.boxplot(**shared_axes)
sns.stripplot(**shared_axes, size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()


### Support Vector Machine (SVM)

In [None]:
# Hold out 20% of the rows for the final evaluation of the linear SVM.
X_train, X_test, y_train, y_test = train_test_split(
    df_modeling["filter_text"], df_modeling["party"], test_size=0.2, random_state=42
)

# Fit a fresh vectorizer (same settings as `bow_vector`) on the training split only.
# Re-fitting the shared `bow_vector` here would replace its vocabulary and leave it
# out of sync with the `bow_features` matrix computed from the full corpus.
bow_train_vector = CountVectorizer(**bow_vector.get_params())
X_train_bow = bow_train_vector.fit_transform(X_train)  # vectorize once, reuse below
X_test_bow = bow_train_vector.transform(X_test)

svc = LinearSVC()
svc.fit(X_train_bow, y_train)

# cross_val_score works on clones of the estimator, so the fitted `svc` is untouched.
cross_val = cross_val_score(svc, X_train_bow, y_train, cv=CV)
print(f"Cross validation score: {cross_val.mean():.3f} +/- {cross_val.std():.3f}")
y_pred = svc.predict(X_test_bow)

# confusion_matrix orders rows/columns by sorted label value, which need not match
# the insertion order of `party_encoding`. Build the tick labels in exactly the
# order used for the matrix; labels without an entry in the encoding are shown as-is.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
party_names = {party_id: party for party, party_id in party_encoding.items()}

conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels, normalize="true")
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_mat,
    display_labels=[party_names.get(label, label) for label in class_labels],
)
disp.plot(cmap=plt.cm.Blues)

## Term Frequency-Inverse Document Frequency (TF-IDF)

---


In [None]:
# Targets: the party label of every row.
tfidf_labels = df_modeling["party"]

# Unigram + bigram TF-IDF features over the filtered text: sublinear term-frequency
# scaling, L2-normalised rows, terms appearing in fewer than 5 documents are dropped.
tfidf_vector = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm="l2",
    encoding="latin-1",
    ngram_range=(1, 2),
)
tfidf_corpus = df_modeling["filter_text"]
tfidf_features = tfidf_vector.fit_transform(tfidf_corpus)

In [None]:
# Number of top-ranked terms printed per party and n-gram size.
N = 5

# The vocabulary and the n-gram size of every term do not depend on the party,
# so compute them once instead of once per loop iteration.
vocabulary = np.asarray(tfidf_vector.get_feature_names_out())
ngram_sizes = np.array([len(term.split(" ")) for term in vocabulary])

for party, party_id in sorted(party_encoding.items()):
    # One-vs-rest chi-squared statistic of every term against "row belongs to this party".
    chi2_scores, _p_values = chi2(tfidf_features, tfidf_labels == party_id)

    # Ascending order: the most strongly associated terms end up last.
    indices = np.argsort(chi2_scores)
    ranked_terms = vocabulary[indices]
    ranked_sizes = ngram_sizes[indices]

    unigrams = ranked_terms[ranked_sizes == 1].tolist()
    bigrams = ranked_terms[ranked_sizes == 2].tolist()
    print(f"# {party}")
    print(f"\tMost correlated unigrams: {unigrams[-N:]}")
    print(f"\tMost correlated bigrams: {bigrams[-N:]}")


In [None]:
# Candidate classifiers, each scored with CV-fold cross-validation on the TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# Collect one (model_name, fold_idx, accuracy) row per model and fold.
entries = []
for model in models:
    fold_scores = cross_val_score(model, tfidf_features, tfidf_labels, scoring="accuracy", cv=CV)
    entries.extend((type(model).__name__, fold, score) for fold, score in enumerate(fold_scores))

df_cv = pd.DataFrame(entries, columns=["model_name", "fold_idx", "accuracy"])

# Box plot of the per-fold accuracies with the individual folds drawn on top.
shared_axes = {"x": "model_name", "y": "accuracy", "data": df_cv}
sns.boxplot(**shared_axes)
sns.stripplot(**shared_axes, size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()


### Support Vector Machine (SVM)

In [None]:
# Hold out 20% of the rows for the final evaluation of the linear SVM.
X_train, X_test, y_train, y_test = train_test_split(
    df_modeling["filter_text"], df_modeling["party"], test_size=0.2, random_state=42
)

# Fit a fresh vectorizer (same settings as `tfidf_vector`) on the training split only.
# Re-fitting the shared `tfidf_vector` here would replace its vocabulary, so its
# feature names would no longer line up with the columns of `tfidf_features`
# (e.g. when the chi2 cell is re-run after this one).
tfidf_train_vector = TfidfVectorizer(**tfidf_vector.get_params())
X_train_tfidf = tfidf_train_vector.fit_transform(X_train)  # vectorize once, reuse below
X_test_tfidf = tfidf_train_vector.transform(X_test)

svc = LinearSVC()
svc.fit(X_train_tfidf, y_train)

# cross_val_score works on clones of the estimator, so the fitted `svc` is untouched.
cross_val = cross_val_score(svc, X_train_tfidf, y_train, cv=CV)
print(f"Cross validation score: {cross_val.mean():.3f} +/- {cross_val.std():.3f}")
y_pred = svc.predict(X_test_tfidf)

# confusion_matrix orders rows/columns by sorted label value, which need not match
# the insertion order of `party_encoding`. Build the tick labels in exactly the
# order used for the matrix; labels without an entry in the encoding are shown as-is.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
party_names = {party_id: party for party, party_id in party_encoding.items()}

conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels, normalize="true")
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_mat,
    display_labels=[party_names.get(label, label) for label in class_labels],
)
disp.plot(cmap=plt.cm.Blues)
