# Model Comparison: Random Forest vs. Neural Network

Dieses Notebook vergleicht die beiden trainierten Modelle zur Weinqualitäts-Vorhersage:

**Inhalt:**
1. Metriken-Tabelle (R², RMSE, MAE, Train-Test-R²-Gap als Overfitting-Indikator)
2. Side-by-Side-Visualisierungen
3. Ensemble-Vorhersage-Test
4. Fehleranalyse (Error Analysis)
5. Statistische Tests

**Voraussetzungen:**
- Ausgeführtes `Random_Forest_GridSearch.ipynb`
- Ausgeführtes `NN_Model.ipynb`
- Gespeicherte Modell-Summaries (CSV) in `model_history/` und `NN_model_history/`
- Datensatz `Base-Data/winequality-red.csv`


In [None]:
# ============================================================================
# IMPORTS UND KONFIGURATION
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import glob
from scipy import stats
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import joblib
import warnings

# Notebook-wide settings: silence library warnings, use the seaborn grid theme,
# set the default figure size and let pandas print every column.
warnings.filterwarnings(action='ignore')
sns.set_style("whitegrid")
plt.rc('figure', figsize=(14, 6))
pd.options.display.max_columns = None

# Input locations (relative to the notebook's working directory)
DATA_PATH = Path('Base-Data') / 'winequality-red.csv'
RF_HISTORY_DIR = Path('model_history')
NN_HISTORY_DIR = Path('NN_model_history')

# Shared constants: RNG seed, test-set fraction and the |z-score| cut-off
# used for outlier removal
RANDOM_STATE = 42
TEST_SIZE = 0.2
OUTLIER_THRESHOLD = 3.0

# Start-up banner showing where the model summaries are read from
_BANNER_RULE = "=" * 70
print(
    _BANNER_RULE,
    "MODEL COMPARISON NOTEBOOK",
    _BANNER_RULE,
    f"Random Forest History: {RF_HISTORY_DIR}",
    f"Neural Network History: {NN_HISTORY_DIR}",
    _BANNER_RULE,
    sep="\n",
)


In [None]:
# ============================================================================
# MODELL-SUMMARIES LADEN
# ============================================================================

def load_latest_summary(history_dir, pattern='*.csv', exclude_patterns=('combined', 'history', 'analysis')):
    """Load the most recently modified summary CSV from a history directory.

    Args:
        history_dir: Directory to search; a ``str`` or ``Path``.
        pattern: Glob pattern the candidate files must match.
        exclude_patterns: Substrings; a file whose name contains any of them
            is skipped. ``None`` disables the filter and a single string is
            treated as one substring.

    Returns:
        A ``(DataFrame, file name)`` tuple for the newest matching file, or
        ``(None, None)`` when the directory is missing or nothing matches.
    """
    history_dir = Path(history_dir)
    if not history_dir.is_dir():
        return None, None

    # Normalise the filter: iterating a bare string would otherwise test
    # single characters instead of the intended substring.
    if exclude_patterns is None:
        exclude_patterns = ()
    elif isinstance(exclude_patterns, str):
        exclude_patterns = (exclude_patterns,)

    candidates = [
        path for path in history_dir.glob(pattern)
        if path.is_file() and not any(excl in path.name for excl in exclude_patterns)
    ]
    if not candidates:
        return None, None

    # Newest file by modification time
    latest_file = max(candidates, key=lambda path: path.stat().st_mtime)
    return pd.read_csv(latest_file), latest_file.name

def _report_summary(summary, filename, missing_message):
    """Print file name, timestamp and test metrics of a loaded summary.

    Prints ``missing_message`` instead when ``summary`` is ``None``.
    """
    if summary is None:
        print(missing_message)
        return
    print(f"Datei: {filename}")
    print(f"Timestamp: {summary['timestamp'].to_numpy()[0]}")
    print(f"Test R²: {summary['test_r2'].to_numpy()[0]:.4f}")
    print(f"Test RMSE: {summary['test_rmse'].to_numpy()[0]:.4f}")


_SUMMARY_RULE = "=" * 70

# Random forest: newest CSV in its history directory (default file filter)
rf_summary, rf_filename = load_latest_summary(RF_HISTORY_DIR)
print(_SUMMARY_RULE, "RANDOM FOREST SUMMARY", _SUMMARY_RULE, sep="\n")
_report_summary(rf_summary, rf_filename, "Keine Random Forest Summary gefunden!")

# Neural network: only '*_summary.csv' files, skipping names with 'history'
nn_summary, nn_filename = load_latest_summary(
    NN_HISTORY_DIR,
    pattern='*_summary.csv',
    exclude_patterns=['history'],
)
print("\n" + _SUMMARY_RULE, "NEURAL NETWORK SUMMARY", _SUMMARY_RULE, sep="\n")
_report_summary(nn_summary, nn_filename, "Keine Neural Network Summary gefunden!")


In [None]:
# ============================================================================
# METRIKEN-VERGLEICHSTABELLE
# ============================================================================

print("=" * 80)
print("METRIKEN-VERGLEICHSTABELLE")
print("=" * 80)

# Rows where the larger value wins; every other row is "smaller is better"
# (errors, train/test gap, training time).
_HIGHER_IS_BETTER = {'Test R²', 'Train R²', 'Test Accuracy (gerundet)'}


def _summary_value(summary, column):
    """Return the first value of ``column``, or NaN when it cannot be read.

    NaN is returned if the summary is ``None``, has no rows, or lacks the
    column, so a missing metric shows up as a gap instead of a ``KeyError``.
    """
    if summary is None or summary.empty or column not in summary.columns:
        return np.nan
    return summary[column].values[0]


def _r2_gap(summary, stored_column=None):
    """Return the train/test R² gap of a summary.

    A stored gap column is preferred when it holds a value; otherwise the gap
    is derived as ``abs(train_r2 - test_r2)`` (NaN if either is missing).
    """
    if stored_column is not None:
        stored = _summary_value(summary, stored_column)
        if not pd.isna(stored):
            return stored
    return abs(_summary_value(summary, 'train_r2') - _summary_value(summary, 'test_r2'))


def _pick_winner(row):
    """Return 'RF' or 'NN' for a table row, '-' if a value is missing.

    The previous one-line conditional chain parsed as
    ``'RF' if rf > nn else (...)`` and therefore declared RF the winner of
    lower-is-better metrics whenever its value was the larger one.
    """
    rf_value, nn_value = row['Random Forest'], row['Neural Network']
    if pd.isna(rf_value) or pd.isna(nn_value):
        return '-'
    if rf_value == nn_value:
        return 'Gleichstand'
    rf_is_larger = rf_value > nn_value
    higher_wins = row['Metrik'] in _HIGHER_IS_BETTER
    return 'RF' if rf_is_larger == higher_wins else 'NN'


# Build the comparison table: one row per metric, one column per model
# NOTE(review): the accuracy row reads 'test_accuracy_rounded' (RF) and
# 'test_acc' (NN) - confirm both notebooks store it on the same scale.
comparison_data = {
    'Metrik': ['Test R²', 'Test RMSE', 'Test MAE', 'Train R²', 'Train RMSE',
               'Train-Test R² Gap', 'Training Zeit (s)', 'Test Accuracy (gerundet)'],
    'Random Forest': [
        _summary_value(rf_summary, 'test_r2'),
        _summary_value(rf_summary, 'test_rmse'),
        _summary_value(rf_summary, 'test_mae'),
        _summary_value(rf_summary, 'train_r2'),
        _summary_value(rf_summary, 'train_rmse'),
        _r2_gap(rf_summary, 'train_test_r2_diff'),
        _summary_value(rf_summary, 'training_time_seconds'),
        _summary_value(rf_summary, 'test_accuracy_rounded'),
    ],
    'Neural Network': [
        _summary_value(nn_summary, 'test_r2'),
        _summary_value(nn_summary, 'test_rmse'),
        _summary_value(nn_summary, 'test_mae'),
        _summary_value(nn_summary, 'train_r2'),
        _summary_value(nn_summary, 'train_rmse'),
        _r2_gap(nn_summary),
        _summary_value(nn_summary, 'training_time_seconds'),
        _summary_value(nn_summary, 'test_acc'),
    ],
}

comparison_df = pd.DataFrame(comparison_data)
comparison_df['Gewinner'] = comparison_df.apply(_pick_winner, axis=1)

# Show the table; the winner is listed in the 'Gewinner' column
display(comparison_df.round(4))

# Summary: rows with a missing value or a tie count for neither model
rf_wins = (comparison_df['Gewinner'] == 'RF').sum()
nn_wins = (comparison_df['Gewinner'] == 'NN').sum()
print(f"\n{'='*40}")
print(f"ZUSAMMENFASSUNG:")
print(f"  Random Forest gewinnt: {rf_wins} Metriken")
print(f"  Neural Network gewinnt: {nn_wins} Metriken")
print(f"{'='*40}")


In [None]:
# ============================================================================
# DATEN VORBEREITEN (gleiche Vorverarbeitung wie in den Modellen)
# ============================================================================

# Load the raw data and remember the original row count for the report below
# NOTE(review): this cell is meant to mirror the preprocessing of the two
# model notebooks - verify against them.
df = pd.read_csv(DATA_PATH)
initial_shape = len(df)

# Drop exact duplicate rows
df_clean = df.drop_duplicates()

# Outlier removal: keep a row only if every feature has |z-score| below the
# threshold. Despite its name, `outlier_mask` is True for rows that are KEPT.
feature_cols = [col for col in df_clean.columns if col != 'quality']
z_scores = np.abs(stats.zscore(df_clean[feature_cols]))
outlier_mask = (z_scores < OUTLIER_THRESHOLD).all(axis=1)
df_final = df_clean.loc[outlier_mask].copy()

print(f"Datensatz: {initial_shape} → {len(df_final)} Samples")

# Separate the feature matrix from the target column
X = df_final.drop(columns='quality')
y = df_final['quality']

# Train/test split, stratified on the quality label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Standardise the features for the neural network; the scaler is fitted on
# the training part only and then applied to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")


In [None]:
# ============================================================================
# MODELLE TRAINIEREN (für direkten Vergleich)
# ============================================================================

# Random forest with fixed hyper-parameters
# NOTE(review): the original comment calls these the optimised parameters -
# presumably taken from the grid-search notebook; confirm.
_RF_PARAMS = {
    'n_estimators': 200,
    'max_depth': 8,
    'min_samples_split': 12,
    'min_samples_leaf': 5,
    'max_features': 'sqrt',
    'bootstrap': True,
    'oob_score': True,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**_RF_PARAMS)
rf_model.fit(X_train, y_train)

# Predictions on both splits (the random forest uses the unscaled features)
rf_pred_train, rf_pred_test = (rf_model.predict(part) for part in (X_train, X_test))

# Neural network: small fully connected regressor that is trained briefly
# further below for a quick comparison
class SimpleNN(nn.Module):
    """Feed-forward regressor with three hidden blocks and one linear output.

    Every hidden block is ``Linear -> BatchNorm1d -> ReLU -> Dropout(0.25)``;
    the block widths are 128, 64 and 32. The final layer maps to one value.
    """

    def __init__(self, in_features=11):
        super().__init__()
        layers = []
        width_in = in_features
        # Modules are created in the same order as a hand-written Sequential,
        # so the submodule indices stay the same.
        for width_out in (128, 64, 32):
            layers += [
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.ReLU(),
                nn.Dropout(0.25),
            ]
            width_in = width_out
        layers.append(nn.Linear(width_in, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its output."""
        return self.net(x)

# Seed PyTorch before the model is built so that the weight initialisation and
# the dropout masks drawn during training repeat across notebook runs.
# NOTE(review): some CUDA kernels may still be non-deterministic.
torch.manual_seed(RANDOM_STATE)

N_EPOCHS = 100   # full-batch gradient steps
LOG_EVERY = 25   # print the loss every this many epochs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nn_model = SimpleNN(in_features=X_train.shape[1]).to(device)

# Short training run: MSE loss, Adam with a small weight decay
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=1e-3, weight_decay=1e-4)

# The network uses the standardised features; the target becomes a column
# vector so that its shape matches the model output
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32, device=device)

print("Training Neural Network für Vergleich...")
nn_model.train()
for epoch in range(1, N_EPOCHS + 1):
    # One optimisation step on the whole training set
    optimizer.zero_grad()
    outputs = nn_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % LOG_EVERY == 0:
        print(f"  Epoch {epoch}: Loss = {loss.item():.4f}")

# Inference: eval mode switches dropout off and makes batch norm use its
# running statistics
nn_model.eval()
with torch.no_grad():
    nn_pred_train = nn_model(X_train_tensor).cpu().numpy().flatten()
    nn_pred_test = nn_model(X_test_tensor).cpu().numpy().flatten()

print("\nModelle trainiert!")
print(f"RF Test R²: {r2_score(y_test, rf_pred_test):.4f}")
print(f"NN Test R²: {r2_score(y_test, nn_pred_test):.4f}")


In [None]:
# ============================================================================
# ENSEMBLE-VORHERSAGE TESTEN
# ============================================================================

print("=" * 80)
print("ENSEMBLE-VORHERSAGE TESTEN")
print("=" * 80)


def _ensemble_row(label, rf_share, nn_share, prediction):
    """Return one result row with R², RMSE and MAE of ``prediction`` on y_test."""
    return {
        'Gewichtung': label,
        'RF_Weight': rf_share,
        'NN_Weight': nn_share,
        'R²': r2_score(y_test, prediction),
        'RMSE': np.sqrt(mean_squared_error(y_test, prediction)),
        'MAE': mean_absolute_error(y_test, prediction),
    }


# Candidate blends as (RF weight, NN weight, label)
weights = [
    (0.5, 0.5, "50/50"),
    (0.6, 0.4, "60/40 (RF)"),
    (0.4, 0.6, "40/60 (NN)"),
    (0.7, 0.3, "70/30 (RF)"),
    (0.3, 0.7, "30/70 (NN)"),
]

# Score every weighted average of the two test-set predictions
ensemble_results = [
    _ensemble_row(name, rf_weight, nn_weight,
                  rf_weight * rf_pred_test + nn_weight * nn_pred_test)
    for rf_weight, nn_weight, name in weights
]

# Add the two single models as reference rows (scored on their raw predictions)
ensemble_results.append(_ensemble_row('RF Only', 1.0, 0.0, rf_pred_test))
ensemble_results.append(_ensemble_row('NN Only', 0.0, 1.0, nn_pred_test))

ensemble_df = pd.DataFrame(ensemble_results).sort_values('R²', ascending=False)
display(ensemble_df.round(4))

# Best combination = first row after sorting by R² in descending order
# NOTE(review): the ranking uses the test set, so the best score is optimistic.
best_ensemble = ensemble_df.iloc[0]
_best_rule = '=' * 40
print(f"\n{_best_rule}")
print(f"BESTE KOMBINATION: {best_ensemble['Gewichtung']}")
print(f"R²: {best_ensemble['R²']:.4f}")
print(f"RMSE: {best_ensemble['RMSE']:.4f}")
print(_best_rule)


In [None]:
# ============================================================================
# SIDE-BY-SIDE VISUALISIERUNGEN
# ============================================================================

fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Quantities shown in the panels
ensemble_pred_50 = 0.5 * rf_pred_test + 0.5 * nn_pred_test
rf_residuals = y_test - rf_pred_test
nn_residuals = y_test.values - nn_pred_test

# Top row (panels 1-3): predicted vs. actual quality for RF, NN and the
# 50/50 ensemble, each with a dashed identity line from 3 to 8
_scatter_panels = [
    ('Random Forest', rf_pred_test, 'steelblue'),
    ('Neural Network', nn_pred_test, 'crimson'),
    ('Ensemble 50/50', ensemble_pred_50, 'green'),
]
for ax, (model_label, prediction, colour) in zip(axes[0], _scatter_panels):
    ax.scatter(y_test, prediction, alpha=0.5, color=colour, s=40)
    ax.plot([3, 8], [3, 8], 'k--', linewidth=2)
    ax.set_xlabel('Tatsächliche Quality')
    ax.set_ylabel('Vorhergesagte Quality')
    ax.set_title(f'{model_label} (R²={r2_score(y_test, prediction):.4f})', fontweight='bold')
    ax.grid(alpha=0.3)

# Bottom row (panels 4-5): residuals against the predicted value
_residual_panels = [
    ('RF', rf_pred_test, rf_residuals, 'steelblue'),
    ('NN', nn_pred_test, nn_residuals, 'crimson'),
]
for ax, (model_label, prediction, residuals, colour) in zip(axes[1], _residual_panels):
    ax.scatter(prediction, residuals, alpha=0.5, color=colour, s=40)
    ax.axhline(y=0, color='k', linestyle='--', linewidth=2)
    ax.set_xlabel('Vorhergesagte Quality')
    ax.set_ylabel('Residuen')
    ax.set_title(f'{model_label} Residual Plot', fontweight='bold')
    ax.grid(alpha=0.3)

# Panel 6: overlaid residual histograms of both models
hist_ax = axes[1, 2]
for model_label, residuals, colour in (('RF', rf_residuals, 'steelblue'),
                                       ('NN', nn_residuals, 'crimson')):
    hist_ax.hist(residuals, bins=25, alpha=0.5, label=model_label, color=colour, density=True)
hist_ax.axvline(x=0, color='k', linestyle='--', linewidth=2)
hist_ax.set_xlabel('Residuen')
hist_ax.set_ylabel('Dichte')
hist_ax.set_title('Fehlerverteilung Vergleich', fontweight='bold')
hist_ax.legend()
hist_ax.grid(alpha=0.3)

plt.suptitle('Model Comparison: Side-by-Side Visualisierungen', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()


In [None]:
# ============================================================================
# ERROR ANALYSE: Wo liegen beide Modelle falsch?
# ============================================================================

print("=" * 80)
print("ERROR ANALYSE")
print("=" * 80)

# Per-sample absolute errors of both models on the test set
_y_true_test = y_test.values
rf_abs_error = np.abs(_y_true_test - rf_pred_test)
nn_abs_error = np.abs(_y_true_test - nn_pred_test)

# Error categories: "strongly wrong" means an absolute error above 1,
# "good" means an absolute error of at most 0.5 for both models
both_wrong = (rf_abs_error > 1) & (nn_abs_error > 1)
rf_only_wrong = (rf_abs_error > 1) & (nn_abs_error <= 1)
nn_only_wrong = (rf_abs_error <= 1) & (nn_abs_error > 1)
both_correct = (rf_abs_error <= 0.5) & (nn_abs_error <= 0.5)

print(f"\nFehleranalyse (Threshold: Fehler > 1):")
_category_report = [
    ("Beide stark falsch", both_wrong),
    ("Nur RF stark falsch", rf_only_wrong),
    ("Nur NN stark falsch", nn_only_wrong),
    ("Beide gut (Fehler <= 0.5)", both_correct),
]
for category, mask in _category_report:
    print(f"  {category}: {mask.sum()} Samples ({100*mask.mean():.1f}%)")

# Closer look at the samples on which both models are strongly wrong
if both_wrong.any():
    error_analysis = X_test[both_wrong].assign(
        y_true=_y_true_test[both_wrong],
        rf_pred=rf_pred_test[both_wrong],
        nn_pred=nn_pred_test[both_wrong],
        rf_error=rf_abs_error[both_wrong],
        nn_error=nn_abs_error[both_wrong],
    )

    print(f"\n\nMerkmale der schwierigen Samples (beide Modelle > 1 Fehler):")
    print("-" * 50)
    difficult_stats = error_analysis[['y_true', 'rf_error', 'nn_error']].describe()
    display(difficult_stats.round(2))

    # Distribution of the true quality among the difficult samples
    print(f"\nQuality-Verteilung der schwierigen Samples:")
    print(error_analysis['y_true'].value_counts().sort_index())


In [None]:
# ============================================================================
# STATISTISCHE TESTS
# ============================================================================

print("=" * 80)
print("STATISTISCHE TESTS")
print("=" * 80)


def _print_test_report(heading, statistic_label, statistic, p):
    """Print heading, H0, test statistic, p-value and the 5 % verdict."""
    if p < 0.05:
        verdict = 'Signifikanter Unterschied (p < 0.05)'
    else:
        verdict = 'Kein signifikanter Unterschied'
    print(f"\n{heading}")
    print("-" * 50)
    print("   H0: Kein signifikanter Unterschied zwischen RF und NN")
    print(f"   {statistic_label}: {statistic:.4f}")
    print(f"   p-Wert: {p:.6f}")
    print(f"   Interpretation: {verdict}")


# Both tests compare the per-sample squared errors (each sample's share of
# the MSE) of the two models on the same test samples
rf_squared_errors = np.square(y_test.values - rf_pred_test)
nn_squared_errors = np.square(y_test.values - nn_pred_test)

# 1) Paired t-test
t_stat, p_value = stats.ttest_rel(rf_squared_errors, nn_squared_errors)
_print_test_report("1. PAIRED T-TEST (quadrierte Fehler)", "t-Statistik", t_stat, p_value)

# 2) Wilcoxon signed-rank test (non-parametric counterpart)
w_stat, w_p_value = stats.wilcoxon(rf_squared_errors, nn_squared_errors)
_print_test_report("2. WILCOXON SIGNED-RANK TEST", "Statistik", w_stat, w_p_value)

# 3) Heading for the bootstrap confidence intervals of R²
print("\n3. CONFIDENCE INTERVALS (Bootstrap, n=1000)")
print("-" * 50)

def bootstrap_r2(y_true, y_pred, n_bootstrap=1000, confidence=0.95, random_state=None):
    """Estimate a bootstrap confidence interval for R².

    The samples are redrawn with replacement ``n_bootstrap`` times and R² is
    recomputed on every resample.

    Args:
        y_true: True target values (any 1-D array-like, including a Series).
        y_pred: Predictions aligned with ``y_true``.
        n_bootstrap: Number of resamples.
        confidence: Coverage of the percentile interval, e.g. ``0.95``.
        random_state: Optional seed for reproducible resampling. ``None``
            keeps using NumPy's global random state.

    Returns:
        Tuple ``(mean R², lower bound, upper bound)``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Convert to arrays so that indexing below is positional; a pandas Series
    # would be indexed by label.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n = y_true.shape[0]
    if n == 0 or y_pred.shape[0] != n:
        raise ValueError("y_true and y_pred must be non-empty and of equal length")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    r2_scores = []
    for _ in range(n_bootstrap):
        indices = rng.choice(n, size=n, replace=True)
        r2_scores.append(r2_score(y_true[indices], y_pred[indices]))

    lower = np.percentile(r2_scores, (1 - confidence) / 2 * 100)
    upper = np.percentile(r2_scores, (1 + confidence) / 2 * 100)
    return np.mean(r2_scores), lower, upper

# The same seed for both models makes the intervals reproducible and evaluates
# both models on identical resamples.
rf_r2_mean, rf_r2_lower, rf_r2_upper = bootstrap_r2(
    y_test.values, rf_pred_test, random_state=RANDOM_STATE
)
nn_r2_mean, nn_r2_lower, nn_r2_upper = bootstrap_r2(
    y_test.values, nn_pred_test, random_state=RANDOM_STATE
)

print(f"   Random Forest R²: {rf_r2_mean:.4f} (95% CI: [{rf_r2_lower:.4f}, {rf_r2_upper:.4f}])")
print(f"   Neural Network R²: {nn_r2_mean:.4f} (95% CI: [{nn_r2_lower:.4f}, {nn_r2_upper:.4f}])")

# Do the two confidence intervals overlap?
# NOTE(review): interval overlap is only a rough, conservative heuristic;
# overlapping intervals do not by themselves show that there is no difference.
overlap = not (rf_r2_upper < nn_r2_lower or nn_r2_upper < rf_r2_lower)
print(f"   CI überlappen: {'Ja' if overlap else 'Nein'}")
print(f"   Interpretation: {'Kein signifikanter Unterschied in R²' if overlap else 'Signifikanter Unterschied in R²'}")

print("\n" + "=" * 80)


## Zusammenfassung

### Vergleichsergebnisse:

**Random Forest:**
- Vorteile: Schnelles Training, interpretierbar (Feature Importance), robust gegen Overfitting mit richtigen Parametern
- Nachteile: Begrenzte Fähigkeit, komplexe nichtlineare Beziehungen zu erfassen

**Neural Network:**
- Vorteile: Kann komplexe Muster lernen, flexibel
- Nachteile: Längere Trainingszeit, "Black Box", anfälliger für Overfitting

**Ensemble:**
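- Die Kombination beider Modelle kann bessere Ergebnisse liefern als jedes Einzelmodell, insbesondere wenn die Fehler der Modelle nur schwach korreliert sind
- Die optimale Gewichtung hängt von den spezifischen Daten ab und sollte auf einem separaten Validierungsset bestimmt werden, nicht auf dem Testset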

### Empfehlungen:
1. Für Interpretierbarkeit: Random Forest verwenden
2. Für maximale Accuracy: Ensemble-Ansatz testen
3. Für Produktionseinsatz: Das einfachere Modell (RF) bevorzugen, wenn Performance ähnlich ist
