In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import time
from tqdm import tqdm

# Zeitmessung starten
start_time = time.time()

# Ordnerpfad angeben
folder_path = 'trainingandtestdata'  # Update this to the folder containing your data files

# Dateien laden
train_file_path = f"{folder_path}/processed_training_data.csv"
test_file_path = f"{folder_path}/processed_test_data.csv"

# 1. Daten laden
print("Lade die Trainingsdaten...")
train_data = pd.read_csv(train_file_path)  
test_data = pd.read_csv(test_file_path)  
print(f"Trainingsdaten geladen. Anzahl der Zeilen: {len(train_data)}")
print(f"Testdaten geladen. Anzahl der Zeilen: {len(test_data)}")
print(f"Zeit zum Laden der Daten: {time.time() - start_time:.2f} Sekunden")

# NaN-Werte entfernen oder ersetzen
print("\nBereinige NaN-Werte...")
train_data['processed_text'] = train_data['processed_text'].fillna('')
test_data['processed_text'] = test_data['processed_text'].fillna('')
print("Bereinigung abgeschlossen.")

# 2. Trainings- und Testdaten aufteilen
print("\nTrainiere mit Trainingsdaten und evaluiere mit Testdaten...")
X_train = train_data['processed_text']
y_train = train_data['Sentiment']
X_test = test_data['processed_text']
y_test = test_data['Sentiment']

print(f"Trainingsdaten: {len(X_train)}, Testdaten: {len(X_test)}")
print(f"Zeit für die Datenaufteilung: {time.time() - start_time:.2f} Sekunden")

# 3. TF-IDF-Transformation
print("\nStarte TF-IDF-Transformation...")
tfidf_start = time.time()
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000, ngram_range=(1, 3)
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print(f"TF-IDF-Transformation abgeschlossen. Zeit: {time.time() - tfidf_start:.2f} Sekunden")

# 4. Hyperparameter-Tuning mit GridSearchCV
print("\nStarte GridSearch für Hyperparameter-Tuning...")
grid_start = time.time()
param_grid = [
    {'C': [0.1, 1, 10], 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},  # l1 und l2 mit liblinear
    {'C': [0.1, 1, 10], 'solver': ['saga'], 'penalty': ['l2']}  # Nur l2 mit saga
]
grid_search = GridSearchCV(
    LogisticRegression(max_iter=5000, class_weight='balanced'), 
    param_grid, 
    cv=3, 
    scoring='accuracy', 
    verbose=3
)
grid_search.fit(X_train_tfidf, y_train)
print(f"GridSearch abgeschlossen. Zeit: {time.time() - grid_start:.2f} Sekunden")

# Initialisiere bestes Modell und beste Metriken
best_epoch_model = None
best_epoch_params = None
best_f1_score = 0
n_epochs = 10

# Training über mehrere Epochen
print("\nTraining über 10 Epochen...")
for epoch in tqdm(range(1, n_epochs + 1), desc="Epochen"):
    print(f"\nEpoche {epoch} beginnt...")
    
    # Erstelle und trainiere das Modell mit den besten Parametern
    model = LogisticRegression(**grid_search.best_params_, max_iter=5000, class_weight='balanced')
    model.fit(X_train_tfidf, y_train)
    
    # Vorhersagen auf Testdaten
    y_pred = model.predict(X_test_tfidf)
    
    # Berechnung der Metriken
    epoch_accuracy = accuracy_score(y_test, y_pred)
    epoch_f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"  Accuracy: {epoch_accuracy:.4f}")
    print(f"  F1-Score: {epoch_f1:.4f}")
    
    # Speichere das Modell, falls es besser ist
    if epoch_f1 > best_f1_score:
        best_f1_score = epoch_f1
        best_epoch_model = model
        best_epoch_params = grid_search.best_params_
        print(f"  Bestes Modell aktualisiert in Epoche {epoch}. F1-Score: {best_f1_score:.4f}")

# Finales Modell evaluieren
print("\nEvaluierung des besten Modells...")
y_pred_final = best_epoch_model.predict(X_test_tfidf)

# Berechnung der endgültigen Metriken
final_accuracy = accuracy_score(y_test, y_pred_final)
final_precision = precision_score(y_test, y_pred_final, average='weighted')
final_recall = recall_score(y_test, y_pred_final, average='weighted')
final_f1 = f1_score(y_test, y_pred_final, average='weighted')

# Ergebnisse anzeigen
print(f"\nBeste Hyperparameter: {best_epoch_params}")
print(f"  Final Accuracy: {final_accuracy:.4f}")
print(f"  Final Precision: {final_precision:.4f}")
print(f"  Final Recall: {final_recall:.4f}")
print(f"  Final F1-Score: {final_f1:.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_final)}")

# Gesamtlaufzeit
print(f"\nGesamtlaufzeit: {time.time() - start_time:.2f} Sekunden")


Lade die Trainingsdaten...
Trainingsdaten geladen. Anzahl der Zeilen: 1600000
Testdaten geladen. Anzahl der Zeilen: 359
Zeit zum Laden der Daten: 1.02 Sekunden

Bereinige NaN-Werte...
Bereinigung abgeschlossen.

Trainiere mit Trainingsdaten und evaluiere mit Testdaten...
Trainingsdaten: 1600000, Testdaten: 359
Zeit für die Datenaufteilung: 1.17 Sekunden

Starte TF-IDF-Transformation...
TF-IDF-Transformation abgeschlossen. Zeit: 179.49 Sekunden

Starte GridSearch für Hyperparameter-Tuning...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END C=0.1, penalty=l1, solver=liblinear;, score=0.787 total time=   3.7s
[CV 2/3] END C=0.1, penalty=l1, solver=liblinear;, score=0.789 total time=   3.7s
[CV 3/3] END C=0.1, penalty=l1, solver=liblinear;, score=0.793 total time=   3.7s
[CV 1/3] END C=0.1, penalty=l2, solver=liblinear;, score=0.787 total time=   2.4s
[CV 2/3] END C=0.1, penalty=l2, solver=liblinear;, score=0.790 total time=   2.4s
[CV 3/3] END C=0.1, penalty=l2, so

Epochen:   0%|                                                                                                                                                                       | 0/10 [00:00<?, ?it/s]


Epoche 1 beginnt...


Epochen:  10%|███████████████▉                                                                                                                                               | 1/10 [00:15<02:16, 15.11s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920
  Bestes Modell aktualisiert in Epoche 1. F1-Score: 0.7920

Epoche 2 beginnt...


Epochen:  20%|███████████████████████████████▊                                                                                                                               | 2/10 [00:25<01:39, 12.47s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Epoche 3 beginnt...


Epochen:  30%|███████████████████████████████████████████████▋                                                                                                               | 3/10 [00:36<01:21, 11.71s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Epoche 4 beginnt...


Epochen:  40%|███████████████████████████████████████████████████████████████▌                                                                                               | 4/10 [00:47<01:07, 11.29s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Epoche 5 beginnt...


Epochen:  50%|███████████████████████████████████████████████████████████████████████████████▌                                                                               | 5/10 [00:57<00:54, 10.92s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Epoche 6 beginnt...


Epochen:  60%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 6/10 [01:07<00:42, 10.73s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Epoche 7 beginnt...


Epochen:  70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 7/10 [01:18<00:32, 10.81s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Epoche 8 beginnt...


Epochen:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 8/10 [01:30<00:21, 10.97s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Epoche 9 beginnt...


Epochen:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 9/10 [01:40<00:10, 10.74s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Epoche 10 beginnt...


Epochen: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:51<00:00, 11.13s/it]

  Accuracy: 0.7939
  F1-Score: 0.7920

Evaluierung des besten Modells...

Beste Hyperparameter: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
  Final Accuracy: 0.7939
  Final Precision: 0.8030
  Final Recall: 0.7939
  Final F1-Score: 0.7920
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.70      0.77       177
           1       0.75      0.88      0.81       182

    accuracy                           0.79       359
   macro avg       0.80      0.79      0.79       359
weighted avg       0.80      0.79      0.79       359


Gesamtlaufzeit: 498.66 Sekunden



