Basierend auf https://data-dive.com/german-nlp-binary-text-classification-of-reviews-part1/

In [None]:
import re
import pickle
import sklearn
import pandas as pd
import numpy as np
from bokeh.io import output_notebook

# Configure Bokeh to render plots inline in the notebook output cells.
output_notebook()

from hvplot import pandas
from pathlib import Path

# Widen the pandas / NumPy display limits so that large frames and arrays are
# shown in full instead of being truncated in the notebook output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 300)
pd.set_option("display.max_colwidth", 100)
np.set_printoptions(threshold=2000)

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

In [None]:
# Locations of the two corpora (relative to the notebook's working directory).
data_dir_pp = Path("../../data/party_programs")
data_dir_speeches = Path("../../data/speeches")

# Load both corpora from their parquet files.
df_pp = pd.read_parquet(data_dir_pp / "party_programs.parquet")
df_speeches = pd.read_parquet(data_dir_speeches / "speeches.parquet")

# Stack both corpora with a fresh 0..n-1 index and draw a reproducible
# sample of 20,000 rows to work with in the rest of the notebook.
df_sample = pd.concat((df_pp, df_speeches), ignore_index=True).sample(n=20000, random_state=42)

In [None]:
from bokeh.models import NumeralTickFormatter

# Count how often every whitespace-separated token occurs across the sample;
# value_counts() returns the counts sorted from most to least frequent.
word_freq = pd.Series(" ".join(df_sample["tokenized_text"]).split()).value_counts()

# Plot positions 1..39 of that ranking (the single most frequent token at
# position 0 is left out) as a bar chart with thousands separators on the y-axis.
top_words = word_freq.iloc[1:40].rename("Word frequency of most common words in comments")
top_words.hvplot.bar(rot=45).opts(
    width=700,
    height=400,
    yformatter=NumeralTickFormatter(format="0,0"),
)

In [None]:
# TF-IDF features over word uni- and bigrams. Terms that occur in more than
# 30 % of the documents (max_df) or in fewer than 10 documents (min_df) are
# dropped; every document vector is L2-normalised.
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.3,
    norm="l2",
)
# NOTE(review): this fit uses the whole sample, i.e. it runs before the
# train/test split, so vocabulary and IDF weights also see the test documents.
vectorizer.fit(df_sample["tokenized_text"])

In [None]:
# Hold out 25 % of the sampled rows as a test set (shuffled, fixed seed).
train, test = train_test_split(df_sample, test_size=0.25, shuffle=True, random_state=1)

# Model input is the tokenised text, the target is the party label.
X_train, Y_train = train["tokenized_text"], train["party"]
X_test, Y_test = test["tokenized_text"], test["party"]

# Sanity check: number of documents in each split.
for split in (X_train, X_test):
    print(split.shape)

In [None]:
# Learn the TF-IDF vocabulary and IDF weights from the training split only and
# merely apply them to the test split. Re-fitting here overrides the earlier fit
# on the complete sample, which would otherwise leak information from the
# held-out documents into the features and make the test scores optimistic.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# (n_training_documents, n_features) of the resulting sparse matrix.
X_train_vec.shape

In [None]:
# Candidate models that are all trained on the same TF-IDF features and
# compared by their classification report.
# NOTE(review): `multi_class` on LogisticRegression is deprecated in recent
# scikit-learn releases — confirm against the version pinned for this project.
classifiers = [
    LogisticRegression(solver="sag", multi_class="multinomial"),
    LinearSVC(multi_class="crammer_singer"),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GradientBoostingClassifier(),
    MultinomialNB(),
    SGDClassifier(),
    # Small three-layer perceptron; with early stopping and
    # n_iter_no_change=1 training ends after the first epoch that fails to
    # improve the validation score.
    MLPClassifier(
        solver="adam",
        hidden_layer_sizes=(12, 12, 12),
        activation="relu",
        early_stopping=True,
        n_iter_no_change=1,
    ),
]

# Human-readable label per model: the estimator's class name. Taken directly
# from the type instead of regex-parsing the estimator's string representation.
names = [type(clf).__name__ for clf in classifiers]

In [None]:
# Train every candidate on the vectorised training data and keep its
# classification report on the test data, keyed by the classifier's name.
results = {}
for clf_name, model in zip(names, classifiers):
    print(f"Training classifier: {clf_name}")
    model.fit(X_train_vec, Y_train)
    results[clf_name] = sklearn.metrics.classification_report(Y_test, model.predict(X_test_vec))

In [None]:
# Print the stored classification report of each classifier, one after another.
for clf_name, clf_report in results.items():
    print(f"Results for {clf_name}:")
    print(f"{clf_report}\n")

In [None]:
# End-to-end pipeline: tokenised text -> TF-IDF features -> linear SVM.
# Keeping the vectorizer inside the pipeline means it is re-fitted on the
# training part of every cross-validation fold.
pipe = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("svc", LinearSVC())])

# First search: vectorizer settings only.
params = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__max_df=np.arange(0.3, 0.8, 0.2),  # 0.3, 0.5, 0.7
    tfidf__min_df=np.arange(5, 100, 45),  # 5, 50, 95
)

# Exhaustive search over the grid using all CPU cores, ranked by macro-averaged F1.
pipe_clf = GridSearchCV(estimator=pipe, param_grid=params, scoring="f1_macro", n_jobs=-1)
pipe_clf.fit(X_train, Y_train)

In [None]:
# Show the vectorizer settings that scored best in the grid search above.
print(pipe_clf.best_params_)

In [None]:
# Fresh (unfitted) pipeline with the same two steps as before.
pipe = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("svc", LinearSVC())])

# Second search: the vectorizer settings are pinned to single values and only
# the SVM regularisation strength C is varied.
params = dict(
    tfidf__ngram_range=[(1, 3)],
    tfidf__max_df=[0.5],
    tfidf__min_df=[5],
    svc__C=np.arange(0.2, 1, 0.15),  # 0.2, 0.35, ..., 0.95
)

# Exhaustive search over the grid using all CPU cores, ranked by macro-averaged F1.
pipe_svc_clf = GridSearchCV(estimator=pipe, param_grid=params, scoring="f1_macro", n_jobs=-1)
pipe_svc_clf.fit(X_train, Y_train)

In [None]:
# Keep the winning parameter combination of the second grid search so it can
# be applied to the pipeline, and print it for inspection.
best_params = pipe_svc_clf.best_params_
print(best_params)

In [None]:
# Configure the pipeline with the best parameters found and train it on the
# full training split.
pipe.set_params(**best_params)
pipe.fit(X_train, Y_train)

# Evaluate on the held-out test split.
pipe_pred = pipe.predict(X_test)
report = sklearn.metrics.classification_report(Y_test, pipe_pred)
print(report)

In [None]:
# Raw decision-function scores of the fitted pipeline for every test document.
# NOTE(review): presumably one column per party (multiclass case) — the
# row-wise max taken later relies on a 2-D result; confirm.
conf_score = pipe.decision_function(X_test)

In [None]:
# Copy of the test rows extended by the predicted party and the largest
# decision score of each row (assumes conf_score is 2-D, one column per class).
test_df = test.assign(pred_party=pipe_pred, pred_score=conf_score.max(axis=1))

# Show the ten misclassified documents the model was most confident about.
misclassified = test_df["party"] != test_df["pred_party"]
shown_columns = ["party", "pred_party", "clean_text", "pred_score"]
test_df.loc[misclassified, shown_columns].sort_values(by="pred_score", ascending=False).head(10)