In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

# Load the subtask 1 training data (arb.csv).
arb = pd.read_csv('../dev_phase/subtask1/train/arb.csv')

# Hold out 20% of the rows as a test set. Stratifying on the label column
# keeps the class proportions equal in both partitions, and the fixed seed
# makes the split reproducible.
texts = arb["text"]
labels = arb["polarization"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

def build_text_clf(vectorizer):
    """Return a two-step pipeline: *vectorizer* ("vect") then logistic regression ("clf").

    The step names "vect" and "clf" are the prefixes used by the
    ``vect__*`` / ``clf__*`` keys of the hyper-parameter grids.
    """
    classifier = LogisticRegression(max_iter=2000, random_state=0)
    return Pipeline(steps=[("vect", vectorizer), ("clf", classifier)])


# Pipeline 1: TF-IDF weighted features
pipe_tfidf = build_text_clf(TfidfVectorizer())

# Pipeline 2: raw term counts
pipe_count = build_text_clf(CountVectorizer())

# Hyper-parameter grid for the TF-IDF pipeline.
#
# The grid is a list of two sub-grids so that only solver/penalty pairs that
# LogisticRegression supports are searched: 'lbfgs', 'newton-cg' and 'sag'
# handle only the L2 penalty, whereas 'liblinear' and 'saga' also handle L1.
# A single flat grid would schedule the unsupported pairs as well; each of
# those fits fails and is scored NaN, wasting search time.
vect_space_tfidf = {
    "vect__analyzer": ["word", "char"],
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3), (3, 5)],
    "vect__min_df": [1, 2, 5],
    # Floats are document-frequency *proportions*. The upper bound must be
    # the float 1.0: an int 1 is read as the absolute count "at most one
    # document", which discards nearly all terms and contradicts min_df >= 2.
    "vect__max_df": [0.9, 0.925, 0.95, 0.975, 1.0],
    "vect__sublinear_tf": [True, False],
}
clf_space_tfidf = {
    "clf__C": [0.01, 0.1, 1, 10, 100],
    "clf__class_weight": [None, "balanced"],
}
param_grid_tfidf = [
    # L2 penalty: supported by every listed solver.
    {
        **vect_space_tfidf,
        **clf_space_tfidf,
        "clf__solver": ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
        "clf__penalty": ["l2"],
    },
    # L1 penalty: only 'liblinear' and 'saga' support it.
    {
        **vect_space_tfidf,
        **clf_space_tfidf,
        "clf__solver": ["liblinear", "saga"],
        "clf__penalty": ["l1"],
    },
]

# Exhaustive search over the TF-IDF grid: 5-fold cross-validation on the
# training split, ranked by macro-averaged F1, run on all available cores.
grid_tfidf = GridSearchCV(
    estimator=pipe_tfidf,
    param_grid=param_grid_tfidf,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_tfidf.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("Beste Parameter:", grid_tfidf.best_params_)
print("CV F1 (macro):", grid_tfidf.best_score_)

# Evaluate the selected model on the held-out test split.
y_pred = grid_tfidf.predict(X_test)
tfidf_report = classification_report(y_test, y_pred, digits=4)
print(tfidf_report)

# GridSearch Count
# Hyper-parameter grid for the CountVectorizer pipeline.
#
# The grid is a list of two sub-grids so that only solver/penalty pairs that
# LogisticRegression supports are searched: 'lbfgs', 'newton-cg' and 'sag'
# handle only the L2 penalty, whereas 'liblinear' and 'saga' also handle L1.
# A single flat grid would schedule the unsupported pairs as well; each of
# those fits fails and is scored NaN, wasting search time.
vect_space_count = {
    "vect__analyzer": ["word", "char"],
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3), (3, 5)],
    "vect__min_df": [1, 2, 5],
    # Floats are document-frequency *proportions*. The upper bound must be
    # the float 1.0: an int 1 is read as the absolute count "at most one
    # document", which discards nearly all terms and contradicts min_df >= 2.
    "vect__max_df": [0.9, 0.925, 0.95, 0.975, 1.0],
    "vect__binary": [True, False],
}
clf_space_count = {
    "clf__C": [0.01, 0.1, 1, 10, 100],
    "clf__class_weight": [None, "balanced"],
}
param_grid_count = [
    # L2 penalty: supported by every listed solver.
    {
        **vect_space_count,
        **clf_space_count,
        "clf__solver": ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
        "clf__penalty": ["l2"],
    },
    # L1 penalty: only 'liblinear' and 'saga' support it.
    {
        **vect_space_count,
        **clf_space_count,
        "clf__solver": ["liblinear", "saga"],
        "clf__penalty": ["l1"],
    },
]

# Exhaustive search over the count-vectorizer grid: 5-fold cross-validation
# on the training split, ranked by macro-averaged F1, run on all cores.
grid_count = GridSearchCV(
    estimator=pipe_count,
    param_grid=param_grid_count,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_count.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("Beste Parameter:", grid_count.best_params_)
print("CV F1 (macro):", grid_count.best_score_)

# Evaluate the selected model on the held-out test split.
y_pred = grid_count.predict(X_test)
count_report = classification_report(y_test, y_pred, digits=4)
print(count_report)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
Beste Parameter: {'clf__C': 10, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'saga', 'vect__analyzer': 'char', 'vect__max_df': 0.9, 'vect__min_df': 2, 'vect__ngram_range': (3, 5), 'vect__sublinear_tf': True}
CV F1 (macro): 0.7511269901926774
              precision    recall  f1-score   support

           0     0.7471    0.6791    0.7115       374
           1     0.6429    0.7152    0.6771       302

    accuracy                         0.6953       676
   macro avg     0.6950    0.6972    0.6943       676
weighted avg     0.7005    0.6953    0.6961       676

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
Beste Parameter: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'vect__analyzer': 'char', 'vect__binary': False, 'vect__max_df': 0.9, 'vect__min_df': 2, 'vect__ngram_range': (3, 5)}
CV F1 (macro): 0.7407227731244775
              