# 04 - Model Optimization

Bu notebook'ta farklı modelleri deneyip hyperparameter tuning yapacağız.

## Hedefler:
- Farklı modelleri karşılaştırmak (Random Forest, XGBoost)
- GridSearchCV ile hyperparameter tuning
- En iyi modeli seçmek

In [None]:
# Import libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
# Silence library warnings so the notebook output stays readable.
# NOTE: this filter ignores every warning category, so convergence and
# deprecation warnings raised in this process will not be shown either.
warnings.filterwarnings('ignore')

# Confirmation banner (check-mark emoji restored from mis-decoded UTF-8).
print("✅ Libraries imported successfully")

## 1. Prepare Data

In [None]:
# Read the transactions and separate the 'Class' target from the predictors
df = pd.read_csv('../data/creditcard.csv')
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows as a test set, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize features; the scaler statistics come from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample the training split with SMOTE; the test split is left untouched
smote = SMOTE(random_state=42, sampling_strategy=0.5)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Report the resulting array shapes
for label, features in (("Training set", X_train_smote), ("Test set", X_test_scaled)):
    print(f"{label}: {features.shape}")

## 2. Model Comparison

In [None]:
# Candidate classifiers, each with a fixed seed
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss')
}

# Per-model results: fitted model, test ROC-AUC, and CV mean/std
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Final fit on the SMOTE-resampled training data
    model.fit(X_train_smote, y_train_smote)

    # Score on the scaled test set, which was never resampled
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Cross-validate on the un-resampled training data with SMOTE as a
    # pipeline step. Resampling before splitting would put synthetic points
    # derived from validation rows into the training folds and inflate the
    # CV score. cross_val_score clones the pipeline for every fold, so the
    # fitted `model` and the shared `smote` object are not modified here.
    cv_pipeline = ImbPipeline([('smote', smote), ('model', model)])
    cv_scores = cross_val_score(
        cv_pipeline, X_train_scaled, y_train,
        cv=5, scoring='roc_auc', n_jobs=-1
    )

    results[name] = {
        'model': model,
        'roc_auc': roc_auc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }

    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"CV ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

In [None]:
# Gather the scores per model, in the insertion order of `results`
model_names = list(results)
roc_aucs = [results[name]['roc_auc'] for name in model_names]
cv_means = [results[name]['cv_mean'] for name in model_names]

# One bar series per metric: (legend label, values, bar colour)
bar_series = [
    ('Test ROC-AUC', roc_aucs, 'lightblue'),
    ('CV ROC-AUC', cv_means, 'darkblue'),
]

# Grouped bar chart: test score next to CV score for every model
fig = go.Figure()
for series_name, scores, colour in bar_series:
    fig.add_trace(
        go.Bar(name=series_name, x=model_names, y=scores, marker_color=colour)
    )

fig.update_layout(
    barmode='group',
    height=500,
    title='Model Performance Comparison',
    yaxis_title='ROC-AUC Score',
)
fig.show()

## 3. Hyperparameter Tuning - Random Forest

In [None]:
# Parameter grid for Random Forest.
# Keys carry the 'clf__' prefix because the classifier is tuned as the 'clf'
# step of a pipeline (see below).
rf_param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_split': [5, 10],
    'clf__min_samples_leaf': [2, 4]
}

# SMOTE is a pipeline step so that it is re-fit on the training part of each
# CV fold. Searching on pre-resampled data would leak synthetic samples into
# the validation folds and inflate the CV scores. The imblearn pipeline
# applies the sampler only while fitting, so `best_estimator_` can still be
# used with predict / predict_proba on un-resampled data.
rf_pipeline = ImbPipeline([
    ('smote', smote),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# GridSearchCV
print("Starting GridSearchCV for Random Forest...")
rf_grid = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

# Fit on the scaled but un-resampled training data; with the default refit,
# the best pipeline (SMOTE + classifier) is then fit on the full training set.
rf_grid.fit(X_train_scaled, y_train)

# Strip the pipeline step prefix so the report shows plain parameter names
rf_best_params = {
    key.split('__', 1)[1]: value for key, value in rf_grid.best_params_.items()
}

print(f"\nBest parameters: {rf_best_params}")
print(f"Best CV score: {rf_grid.best_score_:.4f}")

## 4. Hyperparameter Tuning - XGBoost

In [None]:
# Parameter grid for XGBoost.
# Keys carry the 'clf__' prefix because the classifier is tuned as the 'clf'
# step of a pipeline (see below).
xgb_param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 6, 9],
    'clf__learning_rate': [0.01, 0.1],
    'clf__subsample': [0.8, 1.0]
}

# SMOTE is a pipeline step so that it is re-fit on the training part of each
# CV fold. Searching on pre-resampled data would leak synthetic samples into
# the validation folds and inflate the CV scores. The imblearn pipeline
# applies the sampler only while fitting, so `best_estimator_` can still be
# used with predict / predict_proba on un-resampled data.
xgb_pipeline = ImbPipeline([
    ('smote', smote),
    ('clf', xgb.XGBClassifier(random_state=42, eval_metric='logloss')),
])

# GridSearchCV
print("Starting GridSearchCV for XGBoost...")
xgb_grid = GridSearchCV(
    xgb_pipeline,
    xgb_param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

# Fit on the scaled but un-resampled training data; with the default refit,
# the best pipeline (SMOTE + classifier) is then fit on the full training set.
xgb_grid.fit(X_train_scaled, y_train)

# Strip the pipeline step prefix so the report shows plain parameter names
xgb_best_params = {
    key.split('__', 1)[1]: value for key, value in xgb_grid.best_params_.items()
}

print(f"\nBest parameters: {xgb_best_params}")
print(f"Best CV score: {xgb_grid.best_score_:.4f}")

## 5. Select Best Model

In [None]:
# Completed grid searches, keyed by display name
tuned_searches = {
    'Random Forest (Tuned)': rf_grid,
    'XGBoost (Tuned)': xgb_grid
}

# Refit best estimator from each search
tuned_models = {
    name: search.best_estimator_ for name, search in tuned_searches.items()
}

# Test-set ROC-AUC per tuned model (reported, not used for selection)
tuned_results = {}

for name, model in tuned_models.items():
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    tuned_results[name] = roc_auc
    print(f"{name} - ROC-AUC: {roc_auc:.4f}")

# Select the winner by cross-validated score rather than test score.
# Choosing on the test set would make the reported test ROC-AUC of the
# selected model optimistically biased.
best_model_name = max(
    tuned_searches, key=lambda name: tuned_searches[name].best_score_
)
best_model = tuned_models[best_model_name]

print(f"\n🏆 Best Model: {best_model_name}")
print(f"CV ROC-AUC: {tuned_searches[best_model_name].best_score_:.4f}")
print(f"ROC-AUC: {tuned_results[best_model_name]:.4f}")

## 6. Summary

### Model Optimization Results:
- Compared multiple models (Logistic Regression, Random Forest, XGBoost)
- Performed hyperparameter tuning with GridSearchCV
- Selected the best performing model

### Next Steps:
1. Detailed evaluation of the best model
2. Feature importance analysis
3. Final model validation

In [None]:
# Final status banner (check-mark emoji restored from mis-decoded UTF-8)
# followed by a pointer to the next notebook in the series.
print("✅ Model optimization completed!")
print("\nNext: Run 05_model_evaluation.ipynb")