# Tweets (Sältzer)


In [None]:
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
from studienarbeit.utils.plots import Plots

# Load environment variables via python-dotenv before any project helper is used.
load_dotenv()


In [None]:
# When True (and a cache file exists) the preprocessing cell loads the cached
# parquet instead of re-running the pipeline.
FAST_MODE = False

# Directory holding the raw tweet export and the cache/ sub-directory.
data_dir = Path("../../data/tweets")
# Integer label per party; CDU/CSU are merged into "UNION" during preprocessing.
party_encoding = {
    "AfD": 0,
    "FDP": 1,
    "DIE GRÜNEN": 2,
    "DIE LINKE": 3,
    "SPD": 4,
    "UNION": 5,
}
# Integer label per gender value.
gender_encoding = {
    "male": 0,
    "female": 1,
}

# Project plotting helper, configured for the tweet corpus.
plot = Plots(document_type="Tweets")


## Business Understanding

---

Lorem


## Data Understanding

---


### Import


In [None]:
# Read only the columns this notebook uses from the raw tweet export.
# NOTE(review): `use_nullable_dtypes` is deprecated in newer pandas releases in
# favour of `dtype_backend` — confirm against the pinned pandas version.
df_base = pd.read_parquet(
    data_dir / "tweets.parquet",
    columns=["screen_name", "created_at", "is_retweet", "text", "party", "birthyear", "gender"],
    use_nullable_dtypes=True,
)


In [None]:
# (rows, columns) of the raw frame.
df_base.shape


In [None]:
# Strings that stand for "no value" in the raw data; they are replaced by None
# so that isna()/dropna() treat them as missing.
na_placeholders = {"", "NA", "NA, NA", "NA, NA, NA, NA, NA, NA, NA, NA"}

for col in df_base.columns:
    # The isinstance guard keeps non-string values (numbers, timestamps, pd.NA)
    # untouched. A plain `x == "" or ...` chain raises TypeError on pd.NA because
    # its truth value is ambiguous.
    df_base[col] = df_base[col].apply(lambda x: None if isinstance(x, str) and x in na_placeholders else x)


Check for missing values


In [None]:
# Number of missing values per column.
df_base.isna().sum()


In the cell above we can see that there are about 11k missing values in the `text` column. Regarding the `is_retweet` column, about 3k entries have missing values.

Next, we drop the rows in which `text` or `is_retweet` is missing.


In [None]:
# Drop rows without tweet text or without a retweet flag.
df_base = df_base.dropna(subset=["text", "is_retweet"])


Next, we look at the number of unique values per column to see which columns represent categorical data.


In [None]:
# Number of distinct values per column.
df_base.nunique()


Remove duplicated rows (some tweets seem to have been scraped twice on different days).


In [None]:
# `created_at` is deliberately left out of the subset, so rows that differ only
# in their timestamp collapse into one; the last occurrence is kept.
df_base = df_base.drop_duplicates(
    subset=["screen_name", "is_retweet", "text", "party", "birthyear", "gender"], keep="last"
)


In [None]:
# Number of tweets per party.
df_base["party"].value_counts()


In [None]:
# Number of distinct accounts (screen names) per gender value.
df_base.groupby("gender")["screen_name"].nunique()


In [None]:
# Target dtype per column, applied in one step with DataFrame.astype.
convert_dict = {
    "screen_name": "category",
    "created_at": "datetime64[ns]",
    "is_retweet": "category",
    "text": "string[pyarrow]",
    "party": "category",
    # NOTE(review): birthyear is cast to a timestamp; if the raw values are plain
    # year numbers rather than date strings they would be interpreted as
    # nanoseconds since the epoch — confirm the raw format.
    "birthyear": "datetime64[ns]",
    "gender": "category",
}


In [None]:
# Cast every column to its target dtype.
df_base = df_base.astype(convert_dict)


In [None]:
# Column dtypes, non-null counts and deep memory usage.
df_base.info(verbose=True, memory_usage="deep")


In [None]:
# Summary statistics for all columns.
# NOTE(review): `datetime_is_numeric` only exists in pandas 1.x — confirm the pinned version.
df_base.describe(include="all", datetime_is_numeric=True)


In [None]:
# Preview of the first rows.
df_base.head()


## Data Preparation

---


In [None]:
from tqdm import tqdm
from pandarallel import pandarallel
from collections import Counter
import itertools
from nltk import ngrams
from studienarbeit.utils.cleaning import Cleaning
from studienarbeit.utils.sentiment import Sentiment

# Register tqdm's `progress_apply` on pandas objects (used for the sentiment step).
tqdm.pandas()
# Set up pandarallel so that `parallel_apply` is available on pandas objects.
pandarallel.initialize(progress_bar=False, verbose=1)


In [None]:
# Location of the cached preprocessing result.
cache_file = data_dir / "cache/tweets_prep.parquet"

# Project helpers used by the preprocessing pipeline.
clean = Cleaning()
sentiment = Sentiment()


In [None]:
# Run the cleaning helper on one example tweet (mention, hashtags, escaped
# emoji code point) to inspect its output.
clean.pipeline(
    "Ehemalige @AfD-Vorsitzende #Petry muss wegen Meineid vor Gericht. Kein Einzelfall: gegen circa 10% aller AfD-Abgeordneten bundesweit laufen oder liefen Strafverfahren. Kriminelle Asylbewerber? Fehlanzeige. Kriminelle AfD-Hetzer trifft den Nagel eher auf den Kopf <U+0001F602> #AfD"
)


### Cleaning


In [None]:
def prep_pipeline(df: pd.DataFrame, min_word_count: int = 5):
    """Clean, filter and enrich the raw tweet frame for modelling.

    Steps, in order: merge CDU/CSU into "UNION", turn the retweet flag into a
    real boolean, drop retweets and "Parteilos" authors, integer-encode party
    and gender, run the text cleaning steps, add word/character counts, drop
    tweets that are too short and attach a sentiment label.

    Args:
        df: Raw tweets with at least the columns ``party`` (categorical),
            ``is_retweet``, ``text`` and ``gender``. The ``party`` and
            ``is_retweet`` columns of the passed frame are modified in place,
            so pass a copy if the original must stay untouched.
        min_word_count: Minimum number of words (counted on ``clean_text``)
            a tweet needs in order to be kept.

    Returns:
        A new DataFrame with the encoded labels, the ``clean_text`` /
        ``lemma_text`` / ``filter_text`` columns, their count columns and
        ``sentiment``.
    """
    # Group CDU and CSU as Union
    df["party"] = df["party"].replace("CSU", "UNION")
    df["party"] = df["party"].replace("CDU", "UNION")
    df["party"] = df["party"].cat.remove_unused_categories()

    # Fix labels for retweets
    df["is_retweet"] = df["is_retweet"].replace("FALSE", False)
    df["is_retweet"] = df["is_retweet"].replace("TRUE", True)
    df["is_retweet"] = df["is_retweet"].astype("bool")

    # A tweet counts as a retweet if it is flagged as one or its text starts with "RT".
    retweet_mask = df["is_retweet"] | df["text"].str.startswith("RT")
    print(f"The dataset contains {len(df.loc[retweet_mask])} retweets...")

    # Remove tweets from parties that are not in the Bundestag and/or retweets.
    # The explicit copy makes the column assignments below operate on an
    # independent frame instead of a slice of the input.
    df = df.loc[(df["party"] != "Parteilos") & ~retweet_mask].copy()

    # Encode party and gender
    df["party"] = df["party"].map(party_encoding).astype("int8")
    df["gender"] = df["gender"].map(gender_encoding).astype("int8")

    # Apply cleaning pipeline
    df["clean_text"] = df["text"].parallel_apply(clean.clean_text).astype("string[pyarrow]")
    df["lemma_text"] = df["clean_text"].parallel_apply(clean.lemma_text).astype("string[pyarrow]")
    df["filter_text"] = df["lemma_text"].parallel_apply(clean.filter_text).astype("string[pyarrow]")

    # Count the number of words and characters in the tweet
    df["clean_word_count"] = df["clean_text"].parallel_apply(lambda x: len(x.split())).astype("int16")
    df["clean_symbol_count"] = df["clean_text"].parallel_apply(lambda x: len(x)).astype("int16")
    df["filter_word_count"] = df["filter_text"].parallel_apply(lambda x: len(x.split())).astype("int16")
    df["filter_symbol_count"] = df["filter_text"].parallel_apply(lambda x: len(x)).astype("int16")

    # Filter out tweets that are too short. The length check uses
    # `clean_word_count`; the previously referenced `lemma_word_count` column is
    # never created and raised a KeyError.
    long_enough = df["clean_word_count"] >= min_word_count
    print(f"Found {int((~long_enough).sum())} tweets with less than {min_word_count} words...")
    df = df.loc[long_enough].copy()

    # Calculate the sentiment of the tweets
    df["sentiment"] = df["clean_text"].progress_apply(sentiment.predict_sentiment).astype("category")

    return df


Either load the cached data or process the raw tweets


In [None]:
# Reuse the cached preprocessing result in FAST_MODE; otherwise recompute it
# from a reproducible random sample of the raw tweets and refresh the cache.
if FAST_MODE and cache_file.exists():
    df_prep = pd.read_parquet(cache_file)
else:
    # Cap the sample size: DataFrame.sample raises when asked for more rows
    # than the frame holds.
    sample_size = min(50000, len(df_base))
    df_prep = prep_pipeline(df_base.sample(sample_size, random_state=42).copy()).reset_index(drop=True)

    # parents/exist_ok make this work on a fresh checkout and on re-runs.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    df_prep.to_parquet(cache_file)


Check the most frequent bigrams


In [None]:
# The 50 most frequent word bigrams over all filtered tweets.
token_lists = df_prep["filter_text"].str.split()
bigram_stream = itertools.chain.from_iterable(ngrams(tokens, 2) for tokens in token_lists)
Counter(bigram_stream).most_common(50)


In [None]:
# Column dtypes, non-null counts and deep memory usage of the preprocessed frame.
df_prep.info(verbose=True, memory_usage="deep")


In [None]:
# Summary statistics for all columns of the preprocessed frame.
# NOTE(review): `datetime_is_numeric` only exists in pandas 1.x — confirm the pinned version.
df_prep.describe(include="all", datetime_is_numeric=True)


In [None]:
# Preview of the first ten preprocessed tweets.
df_prep.head(10)


### Plotting


In [None]:
# Reload the preprocessed tweets from the cache file so that the plotting
# section can be run without re-executing the preprocessing.
df_prep = pd.read_parquet(data_dir / "cache/tweets_prep.parquet")

In [None]:
# Plotting copy of the data in which the integer party codes are translated
# back to the party names (inverse of `party_encoding`).
df_plot = df_prep.assign(party=df_prep["party"].map(dict(zip(party_encoding.values(), party_encoding.keys()))))

In [None]:
# Project plot helper; presumably the number of tweets per party — see Plots.party_count.
plot.party_count(df_plot)


In [None]:
# Project plot helper; presumably the sentiment distribution — see Plots.sentiment.
plot.sentiment(df_plot)


In [None]:
# Word-count plot for the filtered text. The preprocessing pipeline produces
# `clean_word_count` and `filter_word_count`; there is no `stemm_word_count`.
plot.word_count(df_plot, "filter_word_count")


In [None]:
# Project plot helper; presumably the gender distribution — see Plots.gender.
plot.gender(df_plot)


In [None]:
# Project plot helper; presumably tweets per user — see Plots.user_count.
plot.user_count(df_plot)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Pairwise correlations between the numeric columns of the preprocessed data.
numeric_columns = df_prep.select_dtypes(exclude=["object", "category", "datetime64[ns]", "bool"])
corr = numeric_columns.corr(numeric_only=True)

# Hide only the strictly upper triangle (k=1), so the diagonal stays visible
# and every pair of columns is shown exactly once.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(10, 10))
heatmap_options = {
    "mask": mask,
    "cmap": "coolwarm",
    "center": 0,
    "square": True,
    "cbar_kws": {"shrink": 0.5},
    "annot": True,
    "annot_kws": {"fontsize": 10},
    "fmt": ".2f",
    "ax": ax,
}
corr_plot = sns.heatmap(corr, **heatmap_options)


## Modeling

---


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score


In [None]:
# Keep only the three text variants and the party label, and persist them for modelling.
df_modeling = df_prep[["clean_text", "lemma_text", "filter_text", "party"]]
df_modeling.to_parquet(data_dir / "cache/tweets_modeling.parquet")


In [None]:
# Reload the modelling frame from its cache file.
df_modeling = pd.read_parquet(data_dir / "cache/tweets_modeling.parquet")


In [None]:
# TF-IDF over unigrams and bigrams with sub-linear tf scaling and L2-normalised
# rows; terms occurring in fewer than 5 documents are dropped.
# NOTE(review): `encoding` only matters for bytes/file input, not for str input — confirm it is needed.
tfidf_vector = TfidfVectorizer(sublinear_tf=True, min_df=5, norm="l2", encoding="latin-1", ngram_range=(1, 2))
# Plain unigram term counts.
bow_vector = CountVectorizer(ngram_range=(1, 1))


In [None]:
# Bag-of-words feature matrix (kept as returned by the vectoriser) and the party labels.
bow_features = bow_vector.fit_transform(df_modeling["filter_text"])
bow_labels = df_modeling["party"]


In [None]:
# TF-IDF feature matrix and the party labels. The matrix is kept sparse:
# chi2 and the classifiers used here accept sparse input, while a dense copy of
# a uni+bigram matrix over the whole corpus can exhaust memory.
tfidf_features = tfidf_vector.fit_transform(df_modeling["filter_text"])
tfidf_labels = df_modeling["party"]


In [None]:
# For every party, print the N unigrams and N bigrams whose TF-IDF values are
# most strongly associated (chi-squared) with "tweet belongs to this party".
N = 5
vocabulary = np.array(tfidf_vector.get_feature_names_out())
for party, party_id in sorted(party_encoding.items()):
    chi2_scores = chi2(tfidf_features, tfidf_labels == party_id)[0]
    # Ascending order: the strongest terms end up at the tail.
    ranked_terms = vocabulary[np.argsort(chi2_scores)]
    unigrams = [term for term in ranked_terms if len(term.split(" ")) == 1]
    bigrams = [term for term in ranked_terms if len(term.split(" ")) == 2]
    print(f"# {party}")
    print(f"\tMost correlated unigrams: {unigrams[-N:]}")
    print(f"\tMost correlated bigrams: {bigrams[-N:]}")


### Bag-of-Words (BoW)


In [None]:
# Compare four classifiers on the bag-of-words features with 5-fold
# cross-validated accuracy and plot the per-fold scores.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

CV = 5
entries = []
for model in models:
    model_name = type(model).__name__
    accuracies = cross_val_score(model, bow_features, bow_labels, scoring="accuracy", cv=CV)
    # One (model, fold, accuracy) record per fold.
    entries.extend((model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies))

df_cv = pd.DataFrame(entries, columns=["model_name", "fold_idx", "accuracy"])

# Box plot for the spread, strip plot on top for the individual folds.
shared_axes = {"x": "model_name", "y": "accuracy", "data": df_cv}
sns.boxplot(**shared_axes)
sns.stripplot(**shared_axes, size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()


In [None]:
# Hold-out evaluation of a linear SVM on bag-of-words features.
X_train, X_test, y_train, y_test = train_test_split(
    df_modeling["filter_text"], df_modeling["party"], test_size=0.2, random_state=42
)

# Vectorise the training texts once and reuse the matrix for fitting and for
# cross-validation instead of transforming the same texts a second time.
X_train_bow = bow_vector.fit_transform(X_train)

svc = LinearSVC()
svc.fit(X_train_bow, y_train)
# NOTE(review): the vocabulary was fitted on the full training split, so the CV
# folds share it — the CV score is slightly optimistic.
cross_val = cross_val_score(svc, X_train_bow, y_train, cv=5)
print(f"Cross validation score: {cross_val.mean():.3f} +/- {cross_val.std():.3f}")
y_pred = svc.predict(bow_vector.transform(X_test))

# Row-normalised confusion matrix: each row shows how the tweets of one true
# party are distributed over the predicted parties.
conf_mat = confusion_matrix(y_test, y_pred, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=party_encoding.keys())
disp.plot(cmap=plt.cm.Blues)

### Term Frequency-Inverse Document Frequency (TF-IDF)


In [None]:
# Compare four classifiers on the TF-IDF features with 5-fold cross-validated
# accuracy and plot the per-fold scores.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

CV = 5
entries = []
for model in models:
    model_name = type(model).__name__
    accuracies = cross_val_score(model, tfidf_features, tfidf_labels, scoring="accuracy", cv=CV)
    # One (model, fold, accuracy) record per fold.
    entries.extend((model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies))

df_cv = pd.DataFrame(entries, columns=["model_name", "fold_idx", "accuracy"])

# Box plot for the spread, strip plot on top for the individual folds.
shared_axes = {"x": "model_name", "y": "accuracy", "data": df_cv}
sns.boxplot(**shared_axes)
sns.stripplot(**shared_axes, size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()


In [None]:
# Hold-out evaluation of a linear SVM on TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(
    df_modeling["filter_text"], df_modeling["party"], test_size=0.2, random_state=42
)

# Vectorise the training texts once and reuse the matrix for fitting and for
# cross-validation instead of transforming the same texts a second time.
X_train_tfidf = tfidf_vector.fit_transform(X_train)

svc = LinearSVC()
svc.fit(X_train_tfidf, y_train)
# NOTE(review): vocabulary and idf weights were fitted on the full training
# split, so the CV folds share them — the CV score is slightly optimistic.
cross_val = cross_val_score(svc, X_train_tfidf, y_train, cv=5)
print(f"Cross validation score: {cross_val.mean():.3f} +/- {cross_val.std():.3f}")
y_pred = svc.predict(tfidf_vector.transform(X_test))

# Row-normalised confusion matrix: each row shows how the tweets of one true
# party are distributed over the predicted parties.
conf_mat = confusion_matrix(y_test, y_pred, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=party_encoding.keys())
disp.plot(cmap=plt.cm.Blues)

In [None]:
# from lazypredict.Supervised import LazyClassifier
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(tfidf_features, tfidf_labels, test_size=0.2, random_state=42)

# clf = LazyClassifier(verbose=1, ignore_warnings=True, custom_metric=None)
# models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# print(models)


In [None]:
# Reproducible 80/20 split of the filtered texts and party labels (same seed
# as in the SVM cells).
X_train, X_test, y_train, y_test = train_test_split(
    df_modeling["filter_text"], df_modeling["party"], test_size=0.2, random_state=42
)


In [None]:
# Hold-out texts with their true party label; a prediction column is added later.
df_test = pd.DataFrame({"text": X_test, "party": y_test})

In [None]:
# Export the data in fastText's supervised format ("__label__<party> <text>").
#
# The files are written from the random train/test split (X_train/X_test) that
# `df_test` is built from. Writing the first 80% / last 20% of the rows instead
# would let most of the `df_test` tweets end up in train.txt, so the confusion
# matrix computed on `df_test` would be scored on training data.
fasttext_files = (
    ("train.txt", X_train, y_train),
    ("test.txt", X_test, y_test),
)
for file_name, texts, labels in fasttext_files:
    # Explicit UTF-8: the tweets contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open(Path(file_name), "w", encoding="utf-8") as f:
        f.writelines(f"__label__{label} {text}\n" for text, label in zip(texts, labels))


In [None]:
def print_results(N, p, r):
    """Print sample count, F1, precision@1 and recall@1 as tab-separated lines.

    Args:
        N: Number of evaluated samples.
        p: Precision at 1.
        r: Recall at 1.

    F1 is the harmonic mean of ``p`` and ``r``; it is reported as 0.0 when both
    are zero instead of raising ZeroDivisionError.
    """
    f1 = 2 * ((p * r) / (p + r)) if (p + r) != 0 else 0.0
    print(f"N\t{N}")
    print(f"F1\t{f1}")
    print(f"P@1\t{p:.3f}")
    print(f"R@1\t{r:.3f}")

In [None]:
import fasttext
import fasttext.util

# fasttext.util.download_model("de", if_exists="ignore")


In [None]:
# Single source of truth for the fastText training settings, so the dict cannot
# drift from the arguments actually passed to train_supervised.
# Other settings that were considered: "epoch": 50, "lr": 0.05, "verbose": 2,
# "minCount": 1, "loss": "ns", "lrUpdateRate": 100, "thread": 4, "ws": 5.
parameter = {
    "input": "train.txt",
    "epoch": 5,
    "lr": 0.1,
    "wordNgrams": 2,
    "loss": "softmax",
    "dim": 300,  # must match the dimension of the pretrained vectors
    "pretrainedVectors": "cc.de.300.vec",
}
model = fasttext.train_supervised(**parameter)

In [None]:
# Evaluate on the exported test file; the result is unpacked into
# print_results(N, p, r).
test_score = model.test("test.txt")
print_results(*test_score)

In [None]:
# Top-1 prediction per tweet: strip the "__label__" prefix from the first
# returned label and turn it back into the integer party code.
df_test["prediction"] = df_test["text"].apply(lambda x: int(model.predict(x)[0][0].replace("__label__", "")))

In [None]:
# Row-normalised confusion matrix of the fastText predictions, labelled with
# the party names.
cm = confusion_matrix(y_true=df_test["party"], y_pred=df_test["prediction"], normalize="true")
disp = ConfusionMatrixDisplay(cm, display_labels=party_encoding.keys())
disp.plot()