In [21]:
"""
PREDICTION DEMO ‚Äì JUPYTER NOTEBOOK
==================================
Loads the champion classification model and evaluates it on a
small, stratified set of hold-out reviews.

The notebook is organised in clear, numbered sections:
    1) File checks
    2) Load model + artefacts
    3) Prepare test data
    4) Select a stratified sample
    5) Run predictions
    6) Performance metrics
    7) Per-review diagnostics
"""

# ------------------------------------------------------------------
# Standard libraries
# ------------------------------------------------------------------
import pickle
import random
import textwrap
from pathlib import Path
import warnings

# ------------------------------------------------------------------
# Third-party libraries
# ------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score, classification_report,
    mean_absolute_error, confusion_matrix
)

warnings.filterwarnings("ignore", category=UserWarning)

print("üöÄ  PREDICTION DEMO ‚Äì champion model")
print("=" * 60)


üöÄ  PREDICTION DEMO ‚Äì champion model


In [20]:
# =============================================================================
# 1) CHECK REQUIRED FILES & PATHS
# =============================================================================
from pathlib import Path

MODEL_PKL   = Path("../src/models/best_classification_model.pkl")
FEAT_PKL    = Path("../src/models/feature_info.pkl")
SPLIT_PKL   = Path("../src/models/train_test_splits.pkl")
DEFAULT_CSV = Path("../src/data/processed/temu_reviews_preprocessed.csv")

print("üìÇ File availability:")
file_flags = {
    "Model bundle"       : MODEL_PKL.exists(),
    "Feature metadata"   : FEAT_PKL.exists(),
    "Train/Test splits"  : SPLIT_PKL.exists(),
    "Pre-processed CSV"  : DEFAULT_CSV.exists()
}

for label, ok in file_flags.items():
    mark = "‚úÖ" if ok else "‚ùå"
    print(f"   {mark} {label}")

missing_core = [name for name, ok in file_flags.items()
                if not ok and name in {"Model bundle", "Feature metadata"}]

if missing_core:
    msg = ", ".join(missing_core)
    raise FileNotFoundError(
        f"Critical file(s) missing: {msg}. "
        "Please run the training notebook first."
    )


üìÇ File availability:
   ‚úÖ Model bundle
   ‚úÖ Feature metadata
   ‚úÖ Train/Test splits
   ‚úÖ Pre-processed CSV


In [14]:
# =============================================================================
# 2) LOAD MODEL AND SUPPORT FILES
# =============================================================================
import pickle
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

MODEL_PKL = Path("../src/models/best_classification_model.pkl")
FEAT_PKL  = Path("../src/models/feature_info.pkl")

print("\nüì¶ Loading model artefacts ‚Ä¶")

# ------------------------------------------------------------------
# 1) Load the champion model bundle
# ------------------------------------------------------------------
try:
    with MODEL_PKL.open("rb") as fp:
        mdl_dict = pickle.load(fp)

    clf        = mdl_dict["model"]          # fitted estimator
    model_name = mdl_dict["model_name"]
    test_acc   = mdl_dict.get("test_accuracy", "N/A")
    w_f1       = mdl_dict.get("weighted_f1", "N/A")

    print(f"‚úÖ Model loaded : {model_name}")
    if w_f1 != "N/A":
        print(f"   Weighted F1  : {w_f1:.3f}")
    if test_acc != "N/A":
        print(f"   Test accuracy: {test_acc:.3f}")

except Exception as e:
    raise RuntimeError(f"‚ùå Could not load model pickle: {e}")

# ------------------------------------------------------------------
# 2) Load vectoriser & metadata
# ------------------------------------------------------------------
try:
    with FEAT_PKL.open("rb") as fp:
        feat = pickle.load(fp)

    vectorizer         = feat["tfidf_vectorizer"]
    feature_count      = feat.get("feature_count",
                                  len(vectorizer.get_feature_names_out()))
    numerical_features = feat.get("numerical_features", [])
    class_weights      = feat.get("class_weights", {})
    label_encoder      = feat.get("label_encoder")  # may be None

    print(f"‚úÖ Vectoriser loaded : {feature_count:,} features")
    print(f"‚úÖ Numerical columns : {numerical_features}")
    if class_weights:
        print(f"‚úÖ Class weights     : {class_weights}")
    if label_encoder is None:
        # fall-back: identity encoder 1‚òÖ‚Ä¶5‚òÖ ‚Üí 0‚Ä¶4
        label_encoder = LabelEncoder().fit([1, 2, 3, 4, 5])
        print("üîß Label encoder created (identity 1-5 ‚Üí 0-4)")
    else:
        print("‚úÖ Label encoder loaded from feature_info")

except Exception as e:
    raise RuntimeError(f"‚ùå Could not load feature artefacts: {e}")


üì¶ Loading model artefacts ‚Ä¶
‚úÖ Model loaded : Stacking
   Weighted F1  : 0.725
   Test accuracy: 0.733
‚úÖ Vectoriser loaded : 5,000 features
‚úÖ Numerical columns : ['word_count', 'char_count', 'sentence_count', 'avg_word_length', 'exclamation_count', 'question_count', 'capital_ratio', 'sentiment_compound', 'sentiment_pos', 'sentiment_neu', 'sentiment_neg']
‚úÖ Class weights     : {np.int64(1): np.float64(0.38278940027894004), np.int64(2): np.float64(3.1510907003444317), np.int64(3): np.float64(4.235493827160494), np.int64(4): np.float64(2.463734290843806), np.int64(5): np.float64(0.7001530612244898)}
‚úÖ Label encoder loaded from feature_info


In [15]:
# =============================================================================
# 3) PREPARE TEST DATA
# =============================================================================
import pickle, pandas as pd
from pathlib import Path

print("\nüìä Preparing test data ‚Ä¶")

# ------------------------------------------------------------------
# paths
# ------------------------------------------------------------------
SPLIT_PKL   = Path("../src/models/train_test_splits.pkl")
DEFAULT_CSV = Path("../data/preprocessed_reviews.csv")

# make sure numerical_features exists
if "numerical_features" not in globals():
    numerical_features = []      # will be updated later if CSV contains them

# decide which source to use
use_splits = SPLIT_PKL.exists()
use_csv    = DEFAULT_CSV.exists() and not use_splits

# ------------------------------------------------------------------
# 1) load saved train/test splits
# ------------------------------------------------------------------
if use_splits:
    print("üìã Using stored train/test splits ‚Ä¶")
    try:
        with SPLIT_PKL.open("rb") as fp:
            splits = pickle.load(fp)

        X_test  = splits["combined"]["X_test"]
        y_test  = splits["combined"]["y_test"]
        print(f"‚úÖ Test split loaded : {X_test.shape[0]:,} samples "
              f"({X_test.shape[1]:,} features)")
        print(f"   Rating distribution: {dict(pd.Series(y_test).value_counts().sort_index())}")
        data_source = "splits"

    except Exception as e:
        print(f"‚ùå Could not load splits: {e}")
        use_splits = False
        use_csv    = DEFAULT_CSV.exists()

# ------------------------------------------------------------------
# 2) fallback: load the pre-processed CSV
# ------------------------------------------------------------------
if use_csv and not use_splits:
    print("üìã Using pre-processed CSV ‚Ä¶")
    try:
        df = pd.read_csv(DEFAULT_CSV)
        df = df.dropna(subset=["processed_text", "ReviewRating"])
        df["ReviewRating"] = df["ReviewRating"].astype(int)
        df = df[df["ReviewRating"].between(1, 5)]        # keep ratings 1-5

        print(f"‚úÖ CSV loaded : {len(df):,} valid reviews")
        print(f"   Rating distribution: {dict(df['ReviewRating'].value_counts().sort_index())}")

        # check which numeric columns are present
        avail_num   = [c for c in numerical_features if c in df.columns]
        missing_num = [c for c in numerical_features if c not in df.columns]
        if missing_num:
            print(f"‚ö†Ô∏è Missing numeric features: {missing_num}")
            print(f"‚úÖ Available numeric features: {avail_num}")
            numerical_features = avail_num      # update list

        data_source = "csv"

    except Exception as e:
        raise RuntimeError(f"‚ùå Could not load CSV: {e}")

# ------------------------------------------------------------------
# 3) no data found
# ------------------------------------------------------------------
if not use_splits and not use_csv:
    raise FileNotFoundError("No test data available: neither split pickle nor CSV found.")


üìä Preparing test data ‚Ä¶
üìã Using stored train/test splits ‚Ä¶
‚úÖ Test split loaded : 2,745 samples (5,011 features)
   Rating distribution: {1: np.int64(1434), 2: np.int64(174), 3: np.int64(130), 4: np.int64(223), 5: np.int64(784)}


In [16]:
# =============================================================================
# 4) SELECT A STRATIFIED SAMPLE
# =============================================================================
import numpy as np, random
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

print("\nüé≤ Selecting stratified test samples ‚Ä¶")

N_SAMPLES = 15          # total reviews to inspect
rng       = np.random.default_rng(42)

# -----------------------------------------------------------
# Helper to pick k indices per star rating
# -----------------------------------------------------------
def stratified_indices(labels, k_total):
    """Return <=k_total indices, equally split across present ratings."""
    unique = np.unique(labels)
    k_per  = max(1, k_total // len(unique))
    idx = []
    for star in unique:
        star_idx = np.where(labels == star)[0].tolist()
        take = min(k_per, len(star_idx))
        idx.extend(rng.choice(star_idx, size=take, replace=False))
    # top-up if we‚Äôre short
    if len(idx) < k_total:
        remaining = [i for i in range(len(labels)) if i not in idx]
        extra = rng.choice(remaining, size=k_total-len(idx), replace=False)
        idx.extend(extra)
    return idx[:k_total]

# -----------------------------------------------------------
# A) use stored splits
# -----------------------------------------------------------
if data_source == "splits":
    labels = np.array(y_test)
    idx    = stratified_indices(labels, N_SAMPLES)

    X_sample  = X_test[idx]
    y_true    = labels[idx]
    sample_texts = None   # raw text is not part of the split object

# -----------------------------------------------------------
# B) use CSV   (df already loaded earlier)
# -----------------------------------------------------------
elif data_source == "csv":
    idx_df = stratified_indices(df["ReviewRating"].values, N_SAMPLES)
    samples_df = df.iloc[idx_df].reset_index(drop=True)

    # Which feature combo did the model expect?
    combo = mdl_dict.get("feature_combination", "combined")
    print(f"üîß Model expects feature set: {combo}")

    # ---- TF-IDF block
    X_tfidf = vectorizer.transform(samples_df["processed_text"])

    # ---- numerical block (if any)
    if numerical_features:
        X_num = samples_df[numerical_features].values
        if "scaler" in feature_info:
            scaler = feature_info["scaler"]        # trained scaler
            X_num_scaled = scaler.transform(X_num)
        else:                                      # fallback (not ideal)
            print("‚ö†Ô∏è  Using fresh StandardScaler ‚Äì may distort results")
            X_num_scaled = StandardScaler().fit_transform(X_num)
    else:
        X_num_scaled = None

    # ---- build X_sample according to combo
    if combo == "tfidf_only":
        X_sample = X_tfidf
    elif combo == "numerical_only":
        if X_num_scaled is None:
            raise ValueError("Model requires numeric features but none are present.")
        X_sample = X_num_scaled
    else:   # combined
        if X_num_scaled is None:
            print("‚ö†Ô∏è Model expects combined features, but numeric part is missing. Falling back to TF-IDF only.")
            X_sample = X_tfidf
        else:
            X_sample = hstack([X_tfidf, X_num_scaled])

    y_true       = samples_df["ReviewRating"].values
    sample_texts = samples_df["processed_text"].values

# -----------------------------------------------------------
# Summary
# -----------------------------------------------------------
print(f"‚úÖ Sample size: {len(y_true)}")
print(f"   Rating distribution: {dict(pd.Series(y_true).value_counts().sort_index())}")


üé≤ Selecting stratified test samples ‚Ä¶
‚úÖ Sample size: 15
   Rating distribution: {1: np.int64(3), 2: np.int64(3), 3: np.int64(3), 4: np.int64(3), 5: np.int64(3)}


In [17]:
# =============================================================================
# 5) MAKE PREDICTIONS
# =============================================================================
import numpy as np

print("\nüîÆ Generating predictions ‚Ä¶")

try:
    # -----------------------------------------------------------
    # 1) Raw model output (encoded labels)
    # -----------------------------------------------------------
    y_pred_enc = clf.predict(X_sample)
    print(f"üîß Raw model output (unique): {sorted(set(y_pred_enc))}")

    # -----------------------------------------------------------
    # 2) Detect the label scheme and map to 1-5 stars
    # -----------------------------------------------------------
    if np.min(y_pred_enc) >= 1 and np.max(y_pred_enc) <= 5:
        # model is already on 1-5 scale
        y_pred = y_pred_enc
        print("‚úÖ Model uses 1‚Äì5 encoding")
    elif np.min(y_pred_enc) >= 0 and np.max(y_pred_enc) <= 4:
        # model is on 0-4 ‚Üí shift
        y_pred = y_pred_enc + 1
        print("‚úÖ Model uses 0‚Äì4 encoding ‚Üí shifted to 1‚Äì5")
    else:
        # unexpected range ‚Äì clip as last resort
        print(f"‚ö†Ô∏è Unexpected labels {sorted(set(y_pred_enc))} ‚Äì clipping to 1‚Äì5")
        y_pred = np.clip(y_pred_enc, 1, 5).astype(int)

    # final sanity-check
    assert y_pred.min() >= 1 and y_pred.max() <= 5, "label mapping failed"

    # -----------------------------------------------------------
    # 3) Confidence scores if available
    # -----------------------------------------------------------
    if hasattr(clf, "predict_proba"):
        y_proba    = clf.predict_proba(X_sample)
        confidence = y_proba.max(axis=1)
        print(f"‚úÖ Mean confidence: {confidence.mean():.1%}")

        buckets = {
            "<50%" : confidence < 0.50,
            "50‚Äì70%" : (confidence >= .50) & (confidence < .70),
            "70‚Äì85%" : (confidence >= .70) & (confidence < .85),
            "85‚Äì95%" : (confidence >= .85) & (confidence < .95),
            ">95%"   : confidence >= .95,
        }
        print("üìä Confidence distribution:")
        for label, mask in buckets.items():
            count = int(mask.sum())
            if count:
                print(f"   {label:<7}: {count} sample(s)")
    else:
        confidence = None
        print("‚ÑπÔ∏è  Model exposes no predict_proba ‚Äì confidence skipped")

    # -----------------------------------------------------------
    # 4) Quick preview
    # -----------------------------------------------------------
    print("\nüîç First five predictions:")
    for i in range(min(5, len(y_pred))):
        conf_str = f" (conf: {confidence[i]:.1%})" if confidence is not None else ""
        print(f"   Sample {i+1}: true {y_true[i]}‚òÖ ‚Üí pred {y_pred[i]}‚òÖ{conf_str}")

    print("‚úÖ Predictions generated successfully")

except Exception as e:
    print(f"‚ùå Prediction error: {e}")
    print("üîé Debug info:")
    print(f"   X_sample shape : {getattr(X_sample,'shape',None)}")
    print(f"   Estimator type : {type(clf)}")
    print(f"   Model name     : {model_name}")
    raise


üîÆ Generating predictions ‚Ä¶
üîß Raw model output (unique): [np.int64(3)]
‚úÖ Model uses 1‚Äì5 encoding
‚úÖ Mean confidence: 32.5%
üìä Confidence distribution:
   <50%   : 15 sample(s)

üîç First five predictions:
   Sample 1: true 1‚òÖ ‚Üí pred 3‚òÖ (conf: 31.4%)
   Sample 2: true 1‚òÖ ‚Üí pred 3‚òÖ (conf: 35.2%)
   Sample 3: true 1‚òÖ ‚Üí pred 3‚òÖ (conf: 32.0%)
   Sample 4: true 2‚òÖ ‚Üí pred 3‚òÖ (conf: 33.6%)
   Sample 5: true 2‚òÖ ‚Üí pred 3‚òÖ (conf: 35.4%)
‚úÖ Predictions generated successfully


In [18]:
# =============================================================================
# 6) PERFORMANCE METRICS
# =============================================================================

from sklearn.metrics import (
    accuracy_score, mean_absolute_error,
    classification_report, confusion_matrix
)
import numpy as np
import pandas as pd

print("\nüìà  PERFORMANCE ANALYSIS")
print("=" * 60)

# -----------------------------------------------------------
# 1) Quick sanity check
# -----------------------------------------------------------
print("üîç Debug:")
print(f"   y_true unique : {sorted(set(y_true))}")
print(f"   y_pred unique : {sorted(set(y_pred))}")

# -----------------------------------------------------------
# 2) Basic metrics
# -----------------------------------------------------------
acc  = accuracy_score(y_true, y_pred)
mae  = mean_absolute_error(y_true, y_pred)
hits = (y_true == y_pred).sum()

print(f"\nüéØ Model          : {model_name}")
print(f"üìä Accuracy       : {acc:.1%}  ({hits}/{len(y_true)} correct)")
print(f"üìä MAE (in stars) : {mae:.2f}")

# -----------------------------------------------------------
# 3) Classification report
# -----------------------------------------------------------
labels       = [1, 2, 3, 4, 5]
target_names = [f"{i}‚òÖ" for i in labels]
print("\nüìã CLASSIFICATION REPORT")
print(
    classification_report(
        y_true, y_pred,
        labels=labels,
        target_names=target_names,
        digits=3,
        zero_division=0
    )
)

# -----------------------------------------------------------
# 4) Confusion matrix
# -----------------------------------------------------------
print("\nüîç CONFUSION MATRIX")
cm = confusion_matrix(y_true, y_pred, labels=labels)

# header row
print("True\\Pred", *[f"{lbl}‚òÖ" for lbl in labels], sep="  ")
for i, true_lbl in enumerate(labels):
    row_vals = "  ".join(f"{cm[i, j]:2d}" for j in range(len(labels)))
    print(f"   {true_lbl}‚òÖ   {row_vals}")

diag    = np.trace(cm)
tot     = cm.sum()
print(f"\n   Correct predictions : {diag}/{tot} ({diag/tot:.1%})")

# most frequent confusions (off-diagonal)
off_diag = [
    (cm[i, j], labels[i], labels[j])
    for i in range(len(labels)) for j in range(len(labels))
    if i != j and cm[i, j] > 0
]
if off_diag:
    off_diag.sort(reverse=True)
    print("   Top confusions:")
    for count, t, p in off_diag[:3]:
        print(f"     {t}‚òÖ ‚Üí {p}‚òÖ : {count}√ó")

# -----------------------------------------------------------
# 5) Per-class hit rate
# -----------------------------------------------------------
print("\nüé≠ PER-CLASS ACCURACY")
for lbl in labels:
    mask = y_true == lbl
    if mask.any():
        class_acc = (y_pred[mask] == lbl).mean()
        print(f"   {lbl}‚òÖ : {class_acc:.1%} ({mask.sum()} samples)")
    else:
        print(f"   {lbl}‚òÖ : no samples in test set")

print("=" * 60)




üìà  PERFORMANCE ANALYSIS
üîç Debug:
   y_true unique : [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]
   y_pred unique : [np.int64(3)]

üéØ Model          : Stacking
üìä Accuracy       : 20.0%  (3/15 correct)
üìä MAE (in stars) : 1.20

üìã CLASSIFICATION REPORT
              precision    recall  f1-score   support

          1‚òÖ      0.000     0.000     0.000         3
          2‚òÖ      0.000     0.000     0.000         3
          3‚òÖ      0.200     1.000     0.333         3
          4‚òÖ      0.000     0.000     0.000         3
          5‚òÖ      0.000     0.000     0.000         3

    accuracy                          0.200        15
   macro avg      0.040     0.200     0.067        15
weighted avg      0.040     0.200     0.067        15


üîç CONFUSION MATRIX
True\Pred  1‚òÖ  2‚òÖ  3‚òÖ  4‚òÖ  5‚òÖ
   1‚òÖ    0   0   3   0   0
   2‚òÖ    0   0   3   0   0
   3‚òÖ    0   0   3   0   0
   4‚òÖ    0   0   3   0   0
   5‚òÖ    0   0   3   0   0

   

In [19]:
# =============================================================================
# 7) PER-REVIEW DIAGNOSTICS
# =============================================================================
import textwrap

print("\nüîç  DETAILED PREDICTION REVIEW")
print("=" * 80)

for i in range(len(y_true)):
    true_r  = int(y_true[i])
    pred_r  = int(y_pred[i])
    correct = (true_r == pred_r)
    status  = "‚úÖ" if correct else "‚ùå"

    print("\n" + "=" * 80)
    print(f"Sample {i+1:2d} | True: {true_r}‚òÖ ‚Üí Pred: {pred_r}‚òÖ {status}")

    # -----------------------------------------------------------
    # Confidence bar (if we have probabilities)
    # -----------------------------------------------------------
    if confidence is not None:
        conf = float(confidence[i])
        bar  = "‚ñà" * int(conf * 20)
        print(f"Confidence: {conf:.1%} {bar}")

        # print full class probabilities
        if hasattr(clf, "predict_proba"):
            probs = y_proba[i]
            print("All classes:", end=" ")
            for idx, p in enumerate(probs):
                star = idx + 1 if min(y_pred) == 0 else idx + 1  # labels already 1-5 after mapping
                print(f"{star}‚òÖ:{p:.0%}", end=" ")
            print()

    # -----------------------------------------------------------
    # Show the review text when we loaded from CSV
    # -----------------------------------------------------------
    if sample_texts is not None:
        print("-" * 80)
        snippet = sample_texts[i]
        if len(snippet) > 400:
            snippet = snippet[:400] + " ‚Ä¶"
        print(textwrap.fill(snippet, width=76))



üîç  DETAILED PREDICTION REVIEW

Sample  1 | True: 1‚òÖ ‚Üí Pred: 3‚òÖ ‚ùå
Confidence: 31.4% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
All classes: 1‚òÖ:25% 2‚òÖ:26% 3‚òÖ:31% 4‚òÖ:16% 5‚òÖ:2% 

Sample  2 | True: 1‚òÖ ‚Üí Pred: 3‚òÖ ‚ùå
Confidence: 35.2% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
All classes: 1‚òÖ:34% 2‚òÖ:15% 3‚òÖ:35% 4‚òÖ:14% 5‚òÖ:1% 

Sample  3 | True: 1‚òÖ ‚Üí Pred: 3‚òÖ ‚ùå
Confidence: 32.0% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
All classes: 1‚òÖ:26% 2‚òÖ:21% 3‚òÖ:32% 4‚òÖ:16% 5‚òÖ:5% 

Sample  4 | True: 2‚òÖ ‚Üí Pred: 3‚òÖ ‚ùå
Confidence: 33.6% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
All classes: 1‚òÖ:19% 2‚òÖ:21% 3‚òÖ:34% 4‚òÖ:17% 5‚òÖ:10% 

Sample  5 | True: 2‚òÖ ‚Üí Pred: 3‚òÖ ‚ùå
Confidence: 35.4% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
All classes: 1‚òÖ:29% 2‚òÖ:20% 3‚òÖ:35% 4‚òÖ:6% 5‚òÖ:9% 

Sample  6 | True: 2‚òÖ ‚Üí Pred: 3‚òÖ ‚ùå
Confidence: 29.9% ‚ñà‚ñà‚ñà‚ñà‚ñà
All classes: 1‚òÖ:16% 2‚òÖ:25% 3‚òÖ:30% 4‚òÖ:23% 5‚òÖ:6% 

Sample  7 | True: 3‚òÖ ‚Üí Pred: 3‚òÖ ‚úÖ
Confidence: 29.4% ‚ñà‚ñà‚ñà‚ñà‚ñà
All classes: 1‚òÖ:2% 2‚òÖ:17% 3‚òÖ:29% 4‚òÖ:27% 5‚òÖ:24% 


In [26]:
# =============================================================================
# 8) FEHLERANALYSE
# =============================================================================

print(f"\n‚ùå FEHLERANALYSE")
print("=" * 50)

# Finde falsche Vorhersagen
wrong_indices = np.where(y_true != y_pred_orig)[0]
wrong_count = len(wrong_indices)

if wrong_count == 0:
    print("üéâ Alle Vorhersagen waren korrekt!")
else:
    print(f"Falsche Vorhersagen: {wrong_count}/{len(y_true)}")
    
    # Analysiere Fehler-Muster
    error_patterns = {}
    for idx in wrong_indices:
        true_r = y_true[idx]
        pred_r = y_pred_orig[idx]
        pattern = f"{true_r}‚Üí{pred_r}"
        error_patterns[pattern] = error_patterns.get(pattern, 0) + 1
    
    print("\nFehler-Muster:")
    for pattern, count in sorted(error_patterns.items()):
        true_r, pred_r = pattern.split('‚Üí')
        diff = abs(int(true_r) - int(pred_r))
        print(f"   {pattern}: {count}x (Diff: {diff} Sterne)")
    
    # Durchschnittliche Abweichung bei Fehlern
    avg_error = np.mean([abs(y_true[i] - y_pred_orig[i]) for i in wrong_indices])
    print(f"\n√ò Abweichung bei Fehlern: {avg_error:.2f} Sterne")


‚ùå FEHLERANALYSE
Falsche Vorhersagen: 12/15

Fehler-Muster:
   1‚Üí4: 3x (Diff: 3 Sterne)
   2‚Üí4: 3x (Diff: 2 Sterne)
   3‚Üí4: 3x (Diff: 1 Sterne)
   4‚Üí6: 1x (Diff: 2 Sterne)
   5‚Üí4: 2x (Diff: 1 Sterne)

√ò Abweichung bei Fehlern: 1.83 Sterne


In [27]:
# =============================================================================
# 9) CONFIDENCE-ANALYSE
# =============================================================================

if confidence is not None:
    print(f"\nüéØ CONFIDENCE-ANALYSE")
    print("=" * 50)
    
    avg_conf = confidence.mean()
    min_conf = confidence.min()
    max_conf = confidence.max()
    
    print(f"Durchschnittliche Confidence: {avg_conf:.1%}")
    print(f"Niedrigste Confidence: {min_conf:.1%}")
    print(f"H√∂chste Confidence: {max_conf:.1%}")
    
    # Confidence vs. Korrektheit
    correct_mask = (y_true == y_pred_orig)
    if np.any(correct_mask):
        avg_conf_correct = confidence[correct_mask].mean()
        print(f"Confidence bei korrekten Vorhersagen: {avg_conf_correct:.1%}")
    
    if np.any(~correct_mask):
        avg_conf_wrong = confidence[~correct_mask].mean()
        print(f"Confidence bei falschen Vorhersagen: {avg_conf_wrong:.1%}")
    
    # Niedrigste Confidence F√§lle
    low_conf_indices = np.argsort(confidence)[:3]
    print(f"\nNiedrigste Confidence F√§lle:")
    for idx in low_conf_indices:
        true_r, pred_r = y_true[idx], y_pred_orig[idx]
        conf = confidence[idx]
        status = "‚úÖ" if true_r == pred_r else "‚ùå"
        print(f"   Sample {idx+1}: {true_r}‚≠ê‚Üí{pred_r}‚≠ê {status} ({conf:.1%})")


üéØ CONFIDENCE-ANALYSE
Durchschnittliche Confidence: 31.4%
Niedrigste Confidence: 29.3%
H√∂chste Confidence: 33.4%
Confidence bei korrekten Vorhersagen: 31.4%
Confidence bei falschen Vorhersagen: 31.4%

Niedrigste Confidence F√§lle:
   Sample 4: 2‚≠ê‚Üí4‚≠ê ‚ùå (29.3%)
   Sample 7: 3‚≠ê‚Üí4‚≠ê ‚ùå (30.5%)
   Sample 15: 5‚≠ê‚Üí4‚≠ê ‚ùå (30.5%)


In [None]:
# =============================================================================
# 12) INTERAKTIVER TEST
# =============================================================================

def test_custom_text(text_input):
    """Teste einen einzelnen Text - KORRIGIERTE VERSION"""
    try:
        # Feature-Extraktion basierend auf Modell-Typ
        model_features = mdl_dict.get("feature_combination", "combined")
        
        if model_features == "tfidf_only":
            X_custom = vectorizer.transform([text_input])
        elif model_features == "numerical_only":
            print("‚ö†Ô∏è Custom Text Test mit 'numerical_only' nicht m√∂glich (keine Textfeatures)")
            return None
        else:  # combined - hier war der Hauptfehler
            X_tfidf = vectorizer.transform([text_input])
            
            X_custom = X_tfidf
        
        pred_enc = clf.predict(X_custom)[0]
        pred_orig = label_encoder.inverse_transform([pred_enc])[0]
        
        result = {
            'prediction': pred_orig,
            'confidence': None
        }
        
        if hasattr(clf, "predict_proba"):
            print("üîÆ Vorhersage mit Wahrscheinlichkeiten:")
            proba = clf.predict_proba(X_custom)[0]
            result['confidence'] = proba.max()
            result['all_probabilities'] = {
                label_encoder.inverse_transform([i])[0]: prob 
                for i, prob in enumerate(proba)
            }
        
        return result
        
    except Exception as e:
        print(f"‚ùå Fehler beim Custom Test: {e}")
        return None

# Beispiel f√ºr Custom Test (Automatisch)
print(f"\nüéÆ BEISPIEL: CUSTOM TEXT TEST")
print("=" * 50)

test_texts = [
    "This product is amazing! Best purchase ever!",
    "Terrible quality, broke immediately. Don't buy!",
    "It's okay, nothing special but does the job."
]

for i, test_text in enumerate(test_texts):
    result = test_custom_text(test_text)
    if result:
        print(f"\nTest {i+1}: {result['prediction']}‚≠ê", end="")
        if result['confidence']:
            print(f" (Confidence: {result['confidence']:.1%})")
        else:
            print()
        print(f"Text: {test_text}")


üéÆ BEISPIEL: CUSTOM TEXT TEST
testtst <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5 stored elements and shape (1, 5010)>
  Coords	Values
  (0, 98)	0.4261538241554763
  (0, 306)	0.40489382633340476
  (0, 3164)	0.24615886958176456
  (0, 3165)	0.702660449165133
  (0, 3243)	0.31643010548941036
üîÆ Vorhersage mit Wahrscheinlichkeiten:

Test 1: 2‚≠ê (Confidence: 41.8%)
Text: This product is amazing! Best purchase ever!
testtst <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6 stored elements and shape (1, 5010)>
  Coords	Values
  (0, 392)	0.49506702537777625
  (0, 422)	0.2524125041938558
  (0, 1887)	0.3777895866716374
  (0, 3265)	0.2086605463212705
  (0, 4418)	0.38568962989905486
  (0, 4423)	0.5968047269500295
üîÆ Vorhersage mit Wahrscheinlichkeiten:

Test 2: 3‚≠ê (Confidence: 42.7%)
Text: Terrible quality, broke immediately. Don't buy!
testtst <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 5010)>
  Coords	Va

In [71]:
# =============================================================================
# 13) INTERAKTIVER EINGABE-TEST 
# =============================================================================

def interactive_prediction():
    """Interaktive Vorhersage f√ºr eigene Texte - NEUE VERSION"""
    print("\n" + "="*60)
    print("üéÆ INTERACTIVE MODE - Teste eigene Reviews!")
    print("(Gib 'quit' ein zum Beenden)")
    print("="*60)
    
    counter = 1
    while True:
        try:
            user_text = input(f"\n[{counter}] üìù Gib einen Review-Text ein: ").strip()
            
            if user_text.lower() in ['quit', 'exit', 'q', '']:
                print("üëã Interaktiver Test beendet!")
                break
                
            if not user_text:
                print("‚ùå Bitte gib einen Text ein!")
                continue
            
            # Vorhersage mit korrigierter Funktion
            result = test_custom_text(user_text)
            
            if result:
                print(f"\nüîÆ ERGEBNIS F√úR EINGABE #{counter}:")
                print("-" * 50)
                print(f"Eingabe: {user_text}")
                print(f"Vorhersage: {result['prediction']}‚≠ê")
                
                if result['confidence']:
                    conf_bar = "‚ñà" * int(result['confidence'] * 20) + "‚ñë" * (20 - int(result['confidence'] * 20))
                    print(f"Confidence: {result['confidence']:.1%} [{conf_bar}]")
                
                if 'all_probabilities' in result:
                    print(f"\nüìä WAHRSCHEINLICHKEITSVERTEILUNG:")
                    print("-" * 50)
                    print(f"{'Rating':<8} {'Wahrscheinlichkeit':<15} {'Visualisierung'}")
                    print("-" * 50)
                    
                    for rating in sorted(result['all_probabilities'].keys()):
                        prob = result['all_probabilities'][rating]
                        bar = "‚ñà" * int(prob * 25)
                        print(f"{rating}‚≠ê{'':<6} {prob:.1%}{'':<12} {bar}")
                
                print("-" * 50)
            else:
                print("‚ùå Fehler bei der Vorhersage")
            
            counter += 1
            
        except KeyboardInterrupt:
            print("\nüëã Abgebrochen durch Benutzer!")
            break
        except Exception as e:
            print(f"‚ùå Fehler: {e}")

# Starte den interaktiven Modus
interactive_prediction()


üéÆ INTERACTIVE MODE - Teste eigene Reviews!
(Gib 'quit' ein zum Beenden)

üîÆ ERGEBNIS F√úR EINGABE #1:
--------------------------------------------------
Eingabe: geniales product
Vorhersage: 2‚≠ê
Confidence: 45.9% [‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë]

üìä WAHRSCHEINLICHKEITSVERTEILUNG:
--------------------------------------------------
Rating   Wahrscheinlichkeit Visualisierung
--------------------------------------------------
1‚≠ê       14.2%             ‚ñà‚ñà‚ñà
2‚≠ê       45.9%             ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
3‚≠ê       32.5%             ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
4‚≠ê       5.7%             ‚ñà
5‚≠ê       1.7%             
--------------------------------------------------

üîÆ ERGEBNIS F√úR EINGABE #2:
--------------------------------------------------
Eingabe: ich mag das product sehr und w√ºrde es wieder kaufen
Vorhersage: 2‚≠ê
Confidence: 45.9% [‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë‚ñë]

üìä WAHRSCHEINLICHKEIT

In [72]:
# Verbesserte Test-Beispiele f√ºr verschiedene Sterne-Kategorien

# 5 STERNE - Sehr positive Reviews
five_star_examples = [
    "Absolutely amazing product! Exceeded all my expectations. Fast shipping, perfect quality, exactly as described. Will definitely order again from this seller. Highly recommend to everyone!",
    "Outstanding quality for the price! The item arrived quickly and was packaged very well. Works perfectly and looks even better than in the photos. Customer service was excellent too.",
    "Perfect purchase! This product is fantastic, great value for money. The seller was responsive and helpful. Shipping was super fast. Could not be happier with this buy!",
    "Exceptional item! Top quality materials, exactly what I needed. The seller communicated well and shipped immediately. This exceeded my expectations in every way possible."
]

# 4 STERNE - Positive Reviews mit kleinen Einschr√§nkungen
four_star_examples = [
    "Great product overall! Good quality and works as expected. Shipping took a bit longer than anticipated but worth the wait. Would recommend this to others.",
    "Very satisfied with this purchase. The item is well made and functional. Only minor issue was the packaging could have been better, but the product itself is excellent.",
    "Good quality product at a reasonable price. Delivery was on time and the item matches the description. Small scratches on arrival but nothing major.",
    "Happy with this buy! The product works well and looks good. Installation was straightforward. Only wish it came with better instructions."
]

# 3 STERNE - Neutrale/gemischte Reviews
three_star_examples = [
    "Average product. It does what it's supposed to do but nothing special. Quality is okay for the price. Shipping was standard.",
    "Mixed feelings about this purchase. Some aspects are good, others could be improved. It works but feels a bit cheap. Decent value overall.",
    "It's an okay product. Not bad but not great either. Quality is acceptable and it serves its purpose. Would consider other options next time.",
    "Mediocre item. Gets the job done but I expected better quality. Shipping was fine. It's functional but not impressive."
]

# 2 STERNE - Negative Reviews mit einigen positiven Aspekten
two_star_examples = [
    "Disappointed with this purchase. The quality is poor and it feels flimsy. Only positive is that shipping was fast. Would not buy again.",
    "Not what I expected. The item looks different from the photos and feels cheap. It works but barely. Customer service was unhelpful.",
    "Poor quality product. Had issues right after opening. The material feels very cheap. Only reason for 2 stars is that it arrived on time.",
    "Unsatisfied with this buy. The product is smaller than expected and build quality is questionable. Shipping was okay but that's about it."
]

# 1 STERN - Sehr negative Reviews
one_star_examples = [
    "Terrible product! Completely useless and broke immediately after opening. Worst purchase ever. Poor quality, misleading description. Avoid this seller!",
    "Absolute waste of money! The item is nothing like described, extremely poor quality. Took forever to arrive and was damaged. Complete scam!",
    "Horrible experience! Product is defective and unusable. Seller refuses to help or refund. Cheap plastic junk that falls apart instantly.",
    "Do not buy this! Completely broken on arrival, terrible packaging. Item looks nothing like the photos. Seller is unresponsive. Total disaster!"
]

# Alle Beispiele kombinieren f√ºr systematischen Test
all_test_examples = {
    5: five_star_examples,
    4: four_star_examples, 
    3: three_star_examples,
    2: two_star_examples,
    1: one_star_examples
}

print("üß™ SYSTEMATISCHER TEST MIT VERBESSERTEN BEISPIELEN")
print("="*70)

total_correct = 0
total_tests = 0

for expected_stars, examples in all_test_examples.items():
    print(f"\n‚≠ê ERWARTETE {expected_stars} STERNE:")
    print("-" * 50)
    
    for i, text in enumerate(examples, 1):
        result = test_custom_text(text)
        if result:
            predicted = result['prediction']
            confidence = result.get('confidence', 0)
            is_correct = predicted == expected_stars
            status = "‚úÖ" if is_correct else "‚ùå"
            
            total_tests += 1
            if is_correct:
                total_correct += 1
            
            print(f"{status} Beispiel {i}: {predicted}‚≠ê (Conf: {confidence:.1%})")
            print(f"   Text: {text[:80]}{'...' if len(text) > 80 else ''}")
            
            # Zeige Wahrscheinlichkeitsverteilung bei falschen Vorhersagen
            if not is_correct and 'all_probabilities' in result:
                print("   Verteilung:", end=" ")
                for rating in sorted(result['all_probabilities'].keys()):
                    prob = result['all_probabilities'][rating]
                    print(f"{rating}‚≠ê:{prob:.0%}", end=" ")
                print()
        print()

print(f"\nüìä GESAMTERGEBNIS:")
print(f"Korrekte Vorhersagen: {total_correct}/{total_tests} ({total_correct/total_tests*100:.1f}%)")

# Zus√§tzliche Analyse: Welche W√∂rter/Phrasen f√ºhren zu welchen Bewertungen?
print(f"\nüîç WORT-ANALYSE:")
print("Teste einzelne charakteristische W√∂rter/Phrasen...")

word_tests = {
    "amazing": 5,
    "excellent": 5, 
    "perfect": 5,
    "outstanding": 5,
    "terrible": 1,
    "horrible": 1,
    "waste of money": 1,
    "broke immediately": 1,
    "good quality": 4,
    "works well": 4,
    "okay": 3,
    "average": 3,
    "poor quality": 2,
    "disappointed": 2
}

print("\nWort/Phrase ‚Üí Vorhersage:")
for phrase, expected in word_tests.items():
    result = test_custom_text(phrase)
    if result:
        predicted = result['prediction']
        status = "‚úÖ" if predicted == expected else "‚ùå"
        print(f"{status} '{phrase}' ‚Üí {predicted}‚≠ê (erwartet: {expected}‚≠ê)")

üß™ SYSTEMATISCHER TEST MIT VERBESSERTEN BEISPIELEN

‚≠ê ERWARTETE 5 STERNE:
--------------------------------------------------
‚ùå Beispiel 1: 2‚≠ê (Conf: 41.2%)
   Text: Absolutely amazing product! Exceeded all my expectations. Fast shipping, perfect...
   Verteilung: 1‚≠ê:8% 2‚≠ê:41% 3‚≠ê:35% 4‚≠ê:11% 5‚≠ê:4% 

‚ùå Beispiel 2: 3‚≠ê (Conf: 42.1%)
   Text: Outstanding quality for the price! The item arrived quickly and was packaged ver...
   Verteilung: 1‚≠ê:6% 2‚≠ê:41% 3‚≠ê:42% 4‚≠ê:9% 5‚≠ê:2% 

‚ùå Beispiel 3: 2‚≠ê (Conf: 44.0%)
   Text: Perfect purchase! This product is fantastic, great value for money. The seller w...
   Verteilung: 1‚≠ê:12% 2‚≠ê:44% 3‚≠ê:33% 4‚≠ê:8% 5‚≠ê:3% 

‚ùå Beispiel 4: 3‚≠ê (Conf: 48.1%)
   Text: Exceptional item! Top quality materials, exactly what I needed. The seller commu...
   Verteilung: 1‚≠ê:3% 2‚≠ê:34% 3‚≠ê:48% 4‚≠ê:12% 5‚≠ê:2% 


‚≠ê ERWARTETE 4 STERNE:
--------------------------------------------------
‚ùå Beispiel 1: 2‚≠ê (Conf: 48.9%)
   Text: 

In [73]:
# =============================================================================
# MODELL-DIAGNOSE - Was l√§uft schief?
# =============================================================================

print("üîç DETAILLIERTE MODELL-DIAGNOSE")
print("="*60)

# 1. Modell-Informationen pr√ºfen
print("1Ô∏è‚É£ MODELL-INFORMATIONEN:")
print("-" * 30)
print(f"Modell-Typ: {type(clf).__name__}")
print(f"Modell-Name: {model_name}")
print(f"Feature-Kombination: {mdl_dict.get('feature_combination', 'unknown')}")

# 2. Label-Encoder pr√ºfen
print(f"\n2Ô∏è‚É£ LABEL-ENCODING:")
print("-" * 30)
print(f"Original Classes: {label_encoder.classes_}")
print("Mapping:")
for i, orig_label in enumerate(label_encoder.classes_):
    print(f"  {orig_label} (original) ‚Üí {i} (encoded)")

# 3. Vectorizer pr√ºfen
print(f"\n3Ô∏è‚É£ VECTORIZER-INFO:")
print("-" * 30)
print(f"Vectorizer-Typ: {type(vectorizer).__name__}")
print(f"Feature-Count: {len(vectorizer.get_feature_names_out()):,}")
print(f"Max Features: {getattr(vectorizer, 'max_features', 'unlimited')}")

# Test einzelner W√∂rter im Vokabular
test_words = ['amazing', 'terrible', 'good', 'bad', 'excellent']
vocab = vectorizer.get_feature_names_out()
print(f"\nW√∂rter im Vokabular:")
for word in test_words:
    in_vocab = word in vocab
    print(f"  '{word}': {'‚úÖ' if in_vocab else '‚ùå'}")

# 4. Feature-Matrix f√ºr Test-W√∂rter analysieren
print(f"\n4Ô∏è‚É£ FEATURE-MATRIX ANALYSE:")
print("-" * 30)

test_phrases = ['amazing', 'terrible', 'good quality', 'poor quality']
for phrase in test_phrases:
    X_test = vectorizer.transform([phrase])
    print(f"\nPhrase: '{phrase}'")
    print(f"  Sparse Matrix Shape: {X_test.shape}")
    print(f"  Non-zero Features: {X_test.nnz}")
    
    # Zeige die aktivierten Features
    if X_test.nnz > 0:
        feature_indices = X_test.nonzero()[1]
        feature_names = vocab[feature_indices]
        feature_values = X_test.data
        print(f"  Aktivierte Features: {list(zip(feature_names, feature_values))[:5]}")

# 5. Modell-Vorhersage-Verteilung
print(f"\n5Ô∏è‚É£ VORHERSAGE-VERTEILUNG:")
print("-" * 30)

# Teste viele zuf√§llige Texte
random_texts = [
    "good", "bad", "excellent", "terrible", "okay", 
    "amazing product", "poor quality", "works well", "broken",
    "love it", "hate it", "perfect", "awful", "decent"
]

predictions = []
for text in random_texts:
    X = vectorizer.transform([text])
    pred_encoded = clf.predict(X)[0]
    pred_original = label_encoder.inverse_transform([pred_encoded])[0]
    predictions.append(pred_original)

from collections import Counter
pred_counts = Counter(predictions)
print("Vorhersage-Verteilung f√ºr Test-Texte:")
for rating in sorted(pred_counts.keys()):
    count = pred_counts[rating]
    percentage = count / len(predictions) * 100
    bar = "‚ñà" * int(percentage / 5)
    print(f"  {rating}‚≠ê: {count:2d}/{len(predictions)} ({percentage:4.1f}%) {bar}")

# 6. Probability-Analyse
print(f"\n6Ô∏è‚É£ WAHRSCHEINLICHKEITS-ANALYSE:")
print("-" * 30)

if hasattr(clf, 'predict_proba'):
    # Teste extreme F√§lle
    extreme_tests = {
        "amazing excellent perfect outstanding": "5‚≠ê erwartet",
        "terrible horrible awful waste broken": "1‚≠ê erwartet"
    }
    
    for text, expected in extreme_tests.items():
        X = vectorizer.transform([text])
        probabilities = clf.predict_proba(X)[0]
        pred_encoded = clf.predict(X)[0]
        pred_original = label_encoder.inverse_transform([pred_encoded])[0]
        
        print(f"\nText: '{text}' ({expected})")
        print(f"Vorhersage: {pred_original}‚≠ê")
        print("Wahrscheinlichkeiten:")
        for i, prob in enumerate(probabilities):
            original_rating = label_encoder.inverse_transform([i])[0]
            print(f"  {original_rating}‚≠ê: {prob:.3f}")

# 7. Modell-Parameter pr√ºfen
print(f"\n7Ô∏è‚É£ MODELL-PARAMETER:")
print("-" * 30)
if hasattr(clf, 'get_params'):
    params = clf.get_params()
    important_params = ['C', 'gamma', 'kernel', 'class_weight', 'random_state']
    for param in important_params:
        if param in params:
            print(f"  {param}: {params[param]}")

# 8. Training-Metriken nochmal pr√ºfen
print(f"\n8Ô∏è‚É£ TRAINING-METRIKEN:")
print("-" * 30)
for key, value in mdl_dict.items():
    if key not in ['model', 'model_name']:
        print(f"  {key}: {value}")

# 9. DIAGNOSE-SCHLUSSFOLGERUNGEN
print(f"\n9Ô∏è‚É£ DIAGNOSE-SCHLUSSFOLGERUNGEN:")
print("-" * 30)

issues = []

# Check 1: Alle Vorhersagen in 2-3 Bereich?
unique_preds = set(predictions)
if len(unique_preds) <= 2 and all(p in [2, 3] for p in unique_preds):
    issues.append("‚ùå Modell vorhersagt nur 2-3 Sterne (bias Problem)")

# Check 2: Sehr schlechte Accuracy auf einfachen F√§llen
simple_accuracy = sum(1 for p in predictions if p in [1, 4, 5]) / len(predictions)
if simple_accuracy < 0.2:
    issues.append("‚ùå Kann extreme Bewertungen (1‚≠ê, 5‚≠ê) nicht erkennen")

# Check 3: Training-Accuracy vs. Test-Performance
train_acc = mdl_dict.get('train_accuracy', 0)
if train_acc > 0.8:
    issues.append("‚ùå M√∂gliches Overfitting (gute Training-Acc aber schlechte Real-World Performance)")

# Check 4: Label-Encoding Problem?
if not all(isinstance(c, int) and 1 <= c <= 5 for c in label_encoder.classes_):
    issues.append("‚ùå Label-Encoder Problem - falsche Klassen")

print("\nüö® GEFUNDENE PROBLEME:")
if issues:
    for issue in issues:
        print(f"  {issue}")
else:
    print("  ‚úÖ Keine offensichtlichen Probleme gefunden")

print(f"\nüí° L√ñSUNGSVORSCHL√ÑGE:")
print("  1. Modell komplett neu trainieren")
print("  2. Andere Algorithmen testen (Random Forest, XGBoost)")
print("  3. Hyperparameter-Tuning verbessern")
print("  4. Datenqualit√§t pr√ºfen (sind die Labels korrekt?)")
print("  5. Feature-Engineering √ºberdenken")
print("  6. Class Imbalance behandeln (SMOTE, Class Weights)")

üîç DETAILLIERTE MODELL-DIAGNOSE
1Ô∏è‚É£ MODELL-INFORMATIONEN:
------------------------------
Modell-Typ: StackingClassifier
Modell-Name: Optimized Stacking
Feature-Kombination: combined

2Ô∏è‚É£ LABEL-ENCODING:
------------------------------
Original Classes: [1 2 3 4 5]
Mapping:
  1 (original) ‚Üí 0 (encoded)
  2 (original) ‚Üí 1 (encoded)
  3 (original) ‚Üí 2 (encoded)
  4 (original) ‚Üí 3 (encoded)
  5 (original) ‚Üí 4 (encoded)

3Ô∏è‚É£ VECTORIZER-INFO:
------------------------------
Vectorizer-Typ: TfidfVectorizer
Feature-Count: 5,010
Max Features: 5010

W√∂rter im Vokabular:
  'amazing': ‚úÖ
  'terrible': ‚úÖ
  'good': ‚úÖ
  'bad': ‚úÖ
  'excellent': ‚úÖ

4Ô∏è‚É£ FEATURE-MATRIX ANALYSE:
------------------------------

Phrase: 'amazing'
  Sparse Matrix Shape: (1, 5010)
  Non-zero Features: 1
  Aktivierte Features: [('amazing', np.float64(1.0))]

Phrase: 'terrible'
  Sparse Matrix Shape: (1, 5010)
  Non-zero Features: 1
  Aktivierte Features: [('terrible', np.float64(1.0))]

Phra

In [67]:
# =============================================================================
# 13) ZUSAMMENFASSUNG
# =============================================================================

print(f"\n{'='*80}")
print("üìã ZUSAMMENFASSUNG")
print("="*80)

print(f"üéØ Modell: {model_name}")
print(f"üîß Feature-Kombination: {mdl_dict.get('feature_combination', 'unknown')}")
print(f"üìä Test Samples: {len(y_true)}")
print(f"üìä Accuracy: {accuracy:.1%}")
print(f"üìä MAE: {mae:.2f} Sterne")
print(f"‚úÖ Richtige Vorhersagen: {correct_count}")
print(f"‚ùå Falsche Vorhersagen: {len(y_true) - correct_count}")

if confidence is not None:
    print(f"üéØ Durchschnittliche Confidence: {confidence.mean():.1%}")

print(f"\nüìÇ Datenquelle: {'Train/Test Splits' if data_source == 'splits' else 'Preprocessed CSV'}")
print(f"üîß Features: {X_sample.shape[1]:,}")
print(f"üè∑Ô∏è Labels: {sorted(label_encoder.classes_)}")
if numerical_features:
    print(f"üìä Numerical Features: {numerical_features}")

# Training-Informationen - PRIM√ÑR F1 MACRO
macro_f1 = mdl_dict.get('macro_f1', 'N/A')
train_acc = mdl_dict.get('test_accuracy', 'N/A')
best_params = mdl_dict.get('best_params', {})

if macro_f1 != 'N/A':
    print(f"üéØ Training Macro F1-Score: {macro_f1:.3f}")
if train_acc != 'N/A':
    print(f"üéØ Training Test Accuracy: {train_acc:.3f}")
if best_params:
    print(f"‚öôÔ∏è Best Params: {best_params}")

print(f"\nüöÄ INFERENCE DEMO ABGESCHLOSSEN!")

# Optional: Ergebnisse speichern
# results_df = pd.DataFrame({
#     'true_rating': y_true,
#     'predicted_rating': y_pred_orig,
#     'correct': y_true == y_pred_orig,
#     'confidence': confidence if confidence is not None else [None] * len(y_true)
# })
# results_df.to_csv("inference_results.csv", index=False)
# print(f"\nüíæ Ergebnisse gespeichert als 'inference_results.csv'")


üìã ZUSAMMENFASSUNG
üéØ Modell: Optimized Stacking
üîß Feature-Kombination: combined
üìä Test Samples: 15
üìä Accuracy: 20.0%
üìä MAE: 1.00 Sterne
‚úÖ Richtige Vorhersagen: 3
‚ùå Falsche Vorhersagen: 12
üéØ Durchschnittliche Confidence: 44.8%

üìÇ Datenquelle: Train/Test Splits
üîß Features: 5,010
üè∑Ô∏è Labels: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5)]
üìä Numerical Features: ['word_count', 'char_count', 'sentence_count', 'avg_word_length', 'exclamation_count', 'question_count', 'capital_ratio', 'sentiment_compound', 'sentiment_pos', 'sentiment_neg']
üéØ Training Macro F1-Score: 0.443
üéØ Training Test Accuracy: 0.604

üöÄ INFERENCE DEMO ABGESCHLOSSEN!
