In [None]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from fasttext import load_model
from sklearn.model_selection import train_test_split

from studienarbeit.config import party_encoding
from studienarbeit.utils.load import EDataTypes, Load

In [None]:
# Notebook-wide configuration: dataset selection, paths and shared resources.
data_type = EDataTypes.TWEETS
file_name = "prep_tweets_fast_full.parquet"
data_dir = Path("../../data/") / data_type.value

# Inverse of party_encoding: its values become the keys and vice versa.
party_decoding = {encoded: party for party, encoded in party_encoding.items()}

load = Load(data_type=data_type)
nlp = spacy.load("de_core_news_lg")

In [None]:
# Read only the text variants and the party label from the tweet parquet file.
tweet_columns = ["clean_text", "stemm_text", "lemma_text", "party"]
df = load.load_dataframe("tweets.parquet", columns=tweet_columns)
df.head()

In [None]:
# fastText model used below to embed tweets; the path is relative to the working directory.
fasttext_model_file = "cc.de.300.bin"
model = load_model(fasttext_model_file)

In [None]:
# Embed every tweet as one fastText sentence vector.
# The frame is loaded with clean_text / stemm_text / lemma_text / party only, so the
# previously referenced "filter_text" column does not exist; the cleaned text is used instead.
# NOTE(review): switch to another loaded text column if a different preprocessing variant is intended.
# fastText's get_sentence_vector raises ValueError on embedded newlines, hence the replacement.
df["vectorized_text"] = df["clean_text"].apply(
    lambda text: model.get_sentence_vector(text.replace("\n", " "))
)
df.head()

In [None]:
# Number of tweets per party label.
party_counts = df["party"].value_counts()
party_counts

In [None]:
# 70/30 hold-out split of the sentence vectors and party labels, with a fixed seed.
features = df["vectorized_text"]
target = df["party"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.7,
    random_state=42,
)

print(f'Train dimensions: {len(X_train), len(y_train)}')
print(f'Test dimensions: {len(X_test), len(y_test)}')

# Check out target distribution
for split_labels in (y_train, y_test):
    print(split_labels.value_counts())

In [None]:
# Turn each Series of per-row vectors into a single array for the estimators below.
X_train_2d = np.array(list(X_train))
X_test_2d = np.array(list(X_test))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Gradient-boosted trees on the sentence vectors.
# The estimator is seeded (same seed as the train/test split) so repeated runs
# build the same model and report the same metrics.
clf = GradientBoostingClassifier(random_state=42).fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

In [None]:
# Per-class precision / recall / F1 on the hold-out set.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# confusion_matrix orders rows/columns by the sorted labels it finds in y_test / y_pred.
# Fix that order explicitly and derive the tick labels from it, instead of relying on
# the key order of party_encoding (which may differ in order or in number of classes).
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
# Show the decoded name where party_decoding has an entry; otherwise keep the raw label.
display_labels = [party_decoding.get(label, label) for label in class_labels]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with scikit-learn's default settings.
clf = KNeighborsClassifier()
clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

knn_report = classification_report(y_test, y_pred)
print(knn_report)

In [None]:
# confusion_matrix orders rows/columns by the sorted labels it finds in y_test / y_pred.
# Fix that order explicitly and derive the tick labels from it, instead of relying on
# the key order of party_encoding (which may differ in order or in number of classes).
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
# Show the decoded name where party_decoding has an entry; otherwise keep the raw label.
display_labels = [party_decoding.get(label, label) for label in class_labels]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot()

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with stochastic gradient descent on the sentence vectors.
# SGD shuffles the training data between epochs, so the estimator is seeded to keep the
# reported metrics reproducible (123 matches the seeded SGD model used for cross-validation).
clf = SGDClassifier(random_state=123).fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

print(classification_report(y_test, y_pred))

In [None]:
# Label the matrix the same way as the other classifiers' plots: fix the class order
# explicitly (sorted labels found in y_test / y_pred) and derive the tick labels from it.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
# Show the decoded name where party_decoding has an entry; otherwise keep the raw label.
display_labels = [party_decoding.get(label, label) for label in class_labels]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# X_train holds fastText sentence vectors, which a text vectorizer cannot tokenize.
# Look up the raw text of the same training rows through the index shared with df, so the
# resulting matrix stays row-aligned with y_train.
# NOTE(review): assumes df has a unique index — confirm.
train_texts = df.loc[X_train.index, "clean_text"]

# Create an instance of TfidfVectorizer
vectoriser = TfidfVectorizer()
# Fit to the data and transform to feature matrix
X_train_tfidf = vectoriser.fit_transform(train_texts)
X_train_tfidf.shape

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of a seeded SGD classifier on the TF-IDF features.
sgd_clf = SGDClassifier(random_state=123)
sgf_clf_scores = cross_val_score(
    estimator=sgd_clf,
    X=X_train_tfidf,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(sgf_clf_scores)

# Summarise as mean accuracy with a band of two standard deviations.
mean_accuracy = sgf_clf_scores.mean()
accuracy_band = sgf_clf_scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, accuracy_band))

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over the SGD classifier's main switches.
# "No regularisation" has to be given as None: the string 'none' is rejected by
# scikit-learn's parameter validation in current releases, so every candidate using it
# would fail to fit.
grid = {'fit_intercept': [True, False],
        'early_stopping': [True, False],
        'loss': ['hinge', 'log_loss', 'squared_hinge'],
        'penalty': ['l2', 'l1', None]}
search = GridSearchCV(estimator=sgd_clf, param_grid=grid, cv=5)
search.fit(X_train_tfidf, y_train)
search.best_params_

In [None]:
# Re-score the best estimator found by the grid search with 5-fold cross-validation.
best_sgd_clf = search.best_estimator_
grid_sgd_clf_scores = cross_val_score(best_sgd_clf, X_train_tfidf, y_train, cv=5)
print(grid_sgd_clf_scores)

# Summarise as mean score with a band of two standard deviations.
grid_mean = grid_sgd_clf_scores.mean()
grid_band = grid_sgd_clf_scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (grid_mean, grid_band))

In [None]:
## for data
import json
import pandas as pd
import numpy as np

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for processing
import re
import nltk

## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection

## for explainer
# from lime import lime_text

## for word embedding
import gensim
import gensim.downloader as gensim_api

## for deep learning
# from tensorflow.keras import models, layers, preprocessing as kprocessing
# from tensorflow.keras import backend as K

## for bert language model
import transformers

In [None]:
## Count (classic BoW)
vectorizer_bow = feature_extraction.text.CountVectorizer(max_features=10000, ngram_range=(1,2))

## Tf-Idf (advanced variant of BoW)
vectorizer_tfidf = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [None]:
# X_train holds fastText sentence vectors, which a text vectorizer cannot tokenize.
# Fit on the raw text of the same training rows instead, looked up through the index
# that X_train shares with df.
# NOTE(review): assumes df has a unique index — confirm.
vectorizer_tfidf.fit(df.loc[X_train.index, "clean_text"])
# Mapping from each learned term to its feature index.
dic_vocabulary = vectorizer_tfidf.vocabulary_

In [None]:
# Look up the vocabulary entry stored for the term "afd" (raises KeyError if the term is absent).
afd_entry = dic_vocabulary["afd"]
afd_entry

In [None]:
# Preview the first five training rows.
X_train.head(n=5)

In [None]:
# Load the pretrained vectors through gensim's downloader.
# Note: this rebinds `nlp`, which earlier in the notebook referred to the spaCy pipeline.
word2vec_name = "word2vec-google-news-300"
nlp = gensim_api.load(word2vec_name)

In [None]:
from sklearn.utils import shuffle

# `df_base` is not defined anywhere in this notebook, so the original call raised NameError.
# NOTE(review): shuffling the loaded tweet frame `df` is assumed to be the intent — confirm.
# shuffle returns a new object rather than working in place, so keep the result;
# the seed makes the row order reproducible.
df_shuffled = shuffle(df, random_state=42)
df_shuffled.head()

In [None]:
import gensim.downloader as api

# The same pretrained vectors are already loaded into `nlp` by an earlier cell; loading them
# a second time would hold a second full copy in memory. Reuse the existing object when it
# is a gensim KeyedVectors instance, and only fall back to loading otherwise (for example
# when `nlp` still refers to the spaCy pipeline).
if isinstance(nlp, gensim.models.KeyedVectors):
    model1 = nlp
else:
    model1 = api.load("word2vec-google-news-300")

In [None]:
# gensim's downloader returns KeyedVectors directly; in gensim 4 such objects have no `.wv`
# attribute, so fall back to the object itself when the attribute is missing.
word_vectors = getattr(model1, "wv", model1)

# most_similar raises KeyError for out-of-vocabulary words, so check membership first.
query_word = "grüne"
if query_word in word_vectors:
    similar_words = word_vectors.most_similar(query_word)
else:
    print(f"'{query_word}' is not in the model vocabulary")
    similar_words = []
similar_words