# Model Training và Testing Pipeline

Notebook này thực hiện:
1. Load preprocessed data từ notebook 01
2. Test pipeline với một vài mô hình nhỏ để verify
3. Train các ensemble models (sẽ được hoàn thiện sau)


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
import os
import sys
from pathlib import Path
# Silence every warning category for the rest of the kernel session.
# NOTE(review): with no category/module argument this filter is global, so any
# warning emitted by the training and evaluation cells below is hidden too —
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Resolve the project root. The root is recognised by a `src` directory:
# look in the current working directory first, then one level up. If neither
# location has `src`, fall back to the current working directory.
current_dir = Path.cwd()
_src_only_in_parent = (
    not (current_dir / 'src').exists()
    and (current_dir.parent / 'src').exists()
)
project_root = current_dir.parent if _src_only_in_parent else current_dir

# Put the root at the front of the import path (once) so that the
# `from src...` imports that follow can be resolved.
project_root_str = str(project_root.absolute())
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

# Import modules
from src.data_preprocessing import scale_features, split_data, apply_smote, get_class_weights
from src.evaluate import evaluate_model, get_metrics_dict

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Report which directory was resolved as the project root and confirm that
# every import above succeeded (this line is only reached if none of them raised).
print(
    f"Project root: {project_root.absolute()}",
    "Libraries imported successfully!",
    sep="\n",
)


## 1. Load Preprocessed Data

Load dữ liệu đã được preprocess từ notebook 01.
**Lưu ý**: Cần chạy notebook 01 trước để có dữ liệu đã preprocess.


In [None]:
# Read the dataset from <project_root>/data/creditcard.csv.
# NOTE: the output of notebook 01 is not reused here; for this pipeline test
# the CSV is reloaded and preprocessed again inside this notebook.
data_path = project_root.joinpath('data', 'creditcard.csv')
df = pd.read_csv(data_path)

# Feature columns: V1 .. V28 followed by Amount.
feature_cols = [f'V{i}' for i in range(1, 29)]
feature_cols.append('Amount')

# Feature matrix and label column.
X, y = df[feature_cols], df['Class']

# Quick sanity report: dataset size and per-class row counts.
print(f"Data loaded: {X.shape[0]:,} samples, {X.shape[1]} features")
print(f"Class distribution: {y.value_counts().to_dict()}")


In [None]:
# Fit the scaler and transform the selected feature columns.
# NOTE(review): scaling is fitted on the full dataset *before* the split, so
# validation/test rows presumably contribute to the fitted scaling statistics.
# Consider splitting first and fitting on the training part only — confirm
# against the scale_features API.
X_scaled, scaler = scale_features(X, feature_cols=feature_cols, fit=True)

# Three-way split; 0.15 is requested for validation and for test
# (how the fractions are applied is defined by split_data).
_split_kwargs = dict(test_size=0.15, val_size=0.15, random_state=42)
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    X_scaled, y, **_split_kwargs
)

# Report the size of each partition.
print("Data preprocessing completed!")
print(f"Train: {X_train.shape[0]:,} samples")
print(f"Validation: {X_val.shape[0]:,} samples")
print(f"Test: {X_test.shape[0]:,} samples")


## 2. Test Pipeline với Mô hình Nhỏ

Test pipeline với Logistic Regression và Random Forest nhỏ để verify mọi thứ hoạt động đúng.


In [None]:
# Smoke test 1: Logistic Regression with class weights.
_rule = "=" * 60
print(_rule)
print("Testing Logistic Regression with class weights")
print(_rule)

# Class weights derived from the training labels; the Random Forest cells
# further down reuse this same `class_weights` object.
class_weights = get_class_weights(y_train)
print(f"Class weights: {class_weights}")

# Build and fit in one expression (scikit-learn's fit() returns the estimator).
lr_model = LogisticRegression(
    class_weight=class_weights,
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)

# Validation-set outputs: hard labels and the second predict_proba column
# (index 1).
y_pred_lr = lr_model.predict(X_val)
y_pred_proba_lr = lr_model.predict_proba(X_val)[:, 1]

# Evaluate, requesting both the confusion-matrix and ROC plots;
# evaluate_model hands back a (metrics, figures) pair.
_lr_eval = evaluate_model(
    y_val, y_pred_lr, y_pred_proba_lr,
    model_name='Logistic Regression',
    plot_cm=True,
    plot_roc=True,
)
metrics_lr, figures_lr = _lr_eval


In [None]:
# Smoke test 2: a deliberately small Random Forest.
_rule = "=" * 60
print(_rule)
print("Testing Random Forest (small)")
print(_rule)

# 50 trees only, to keep the test fast; reuses `class_weights` from the
# Logistic Regression cell. fit() returns the estimator, so build and fit in
# one expression.
# NOTE: `rf_model`, `y_pred_rf` and `y_pred_proba_rf` are rebound later by the
# full Random Forest training cell (section 4.1).
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    class_weight=class_weights,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Validation-set outputs: hard labels and the second predict_proba column
# (index 1).
y_pred_rf = rf_model.predict(X_val)
y_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]

# Evaluate with confusion-matrix and ROC plots enabled;
# evaluate_model hands back a (metrics, figures) pair.
_rf_eval = evaluate_model(
    y_val, y_pred_rf, y_pred_proba_rf,
    model_name='Random Forest',
    plot_cm=True,
    plot_roc=True,
)
metrics_rf, figures_rf = _rf_eval


## 3. Verify Pipeline Hoạt động

Kiểm tra xem pipeline có hoạt động đúng không.


In [None]:
# Collect one metrics record per smoke-test model for a side-by-side comparison.
# NOTE: run this cell before section 4 — the Random Forest training cell there
# rebinds `y_pred_rf` / `y_pred_proba_rf` to the full-size model's outputs.
test_metrics = [
    get_metrics_dict(y_val, y_pred_lr, y_pred_proba_lr, 'Logistic Regression'),
    get_metrics_dict(y_val, y_pred_rf, y_pred_proba_rf, 'Random Forest'),
]

# One row per model (assumes each record is a flat mapping — TODO confirm
# against get_metrics_dict).
comparison_df = pd.DataFrame(test_metrics)
_rule = "=" * 60
print("\n" + _rule)
print("Pipeline Test Results - Metrics Comparison")
print(_rule)
print(comparison_df.to_string(index=False))
print(_rule)

# Status lines use the check-mark emoji (repaired from mis-encoded text).
# NOTE: these are printed unconditionally — reaching this point only means no
# earlier statement raised; no metric threshold is checked.
print("\n✅ Pipeline test completed successfully!")
print("✅ All models trained and evaluated correctly")
print("✅ Evaluation functions working properly")
print("\nNext steps: Train full ensemble models (Random Forest, AdaBoost, XGBoost)")


## 4. Train các Ensemble Models đầy đủ

Train 3 ensemble models: Random Forest, AdaBoost, XGBoost với parameters tối ưu.


In [None]:
# Import model training functions
from src.models import (
    train_random_forest, train_adaboost, train_xgboost,
    save_model, evaluate_model_performance
)

# Three independent registries, each keyed by model display name and filled in
# by the training cells: the fitted model, its validation-set label
# predictions, and its validation-set probabilities (predict_proba column 1).
trained_models, model_predictions, model_probabilities = {}, {}, {}


### 4.1 Train Random Forest


In [None]:
# Train the full-size Random Forest (200 trees, depth 15), reusing the class
# weights computed in the Logistic Regression cell.
# NOTE: this rebinds `rf_model`, `y_pred_rf` and `y_pred_proba_rf`, which the
# small smoke-test Random Forest cell also assigns.
_rf_params = dict(
    n_estimators=200,
    max_depth=15,
    class_weight=class_weights,
    random_state=42,
    n_jobs=-1,
)
rf_model = train_random_forest(X_train, y_train, **_rf_params)

# Validation-set outputs: hard labels and the second predict_proba column.
y_pred_rf = rf_model.predict(X_val)
y_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]

# Register the model and its outputs under a single shared key.
_rf_key = 'Random Forest'
trained_models[_rf_key] = rf_model
model_predictions[_rf_key] = y_pred_rf
model_probabilities[_rf_key] = y_pred_proba_rf

# Validation-set evaluation through the project helper.
evaluate_model_performance(rf_model, X_val, y_val, 'Random Forest')


### 4.2 Train AdaBoost


In [None]:
# Train AdaBoost (100 estimators, learning rate 0.5). Unlike the Random Forest
# cell, no class-weight argument is passed here.
_ada_params = dict(n_estimators=100, learning_rate=0.5, random_state=42)
ada_model = train_adaboost(X_train, y_train, **_ada_params)

# Validation-set outputs: hard labels and the second predict_proba column.
y_pred_ada = ada_model.predict(X_val)
y_pred_proba_ada = ada_model.predict_proba(X_val)[:, 1]

# Register the model and its outputs under a single shared key.
_ada_key = 'AdaBoost'
trained_models[_ada_key] = ada_model
model_predictions[_ada_key] = y_pred_ada
model_probabilities[_ada_key] = y_pred_proba_ada

# Validation-set evaluation through the project helper.
evaluate_model_performance(ada_model, X_val, y_val, 'AdaBoost')


### 4.3 Train XGBoost


In [None]:
# Imbalance handling for XGBoost: scale_pos_weight is the count of label-0
# training rows divided by the count of label-1 rows. When the training set
# has no label-1 rows, a neutral 1.0 is used instead of dividing by zero.
n_negative, n_positive = (y_train == 0).sum(), (y_train == 1).sum()
if n_positive > 0:
    scale_pos_weight = n_negative / n_positive
else:
    scale_pos_weight = 1.0

print(f"Scale pos weight for XGBoost: {scale_pos_weight:.2f}")

# Train XGBoost with the computed positive-class weight.
_xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
)
xgb_model = train_xgboost(X_train, y_train, **_xgb_params)

# Validation-set outputs: hard labels and the second predict_proba column.
y_pred_xgb = xgb_model.predict(X_val)
y_pred_proba_xgb = xgb_model.predict_proba(X_val)[:, 1]

# Register the model and its outputs under a single shared key.
_xgb_key = 'XGBoost'
trained_models[_xgb_key] = xgb_model
model_predictions[_xgb_key] = y_pred_xgb
model_probabilities[_xgb_key] = y_pred_proba_xgb

# Validation-set evaluation through the project helper.
evaluate_model_performance(xgb_model, X_val, y_val, 'XGBoost')


## 5. Đánh giá Models với evaluate.py

Sử dụng module evaluate.py để tính metrics chi tiết cho từng model.


In [None]:
# Evaluate every registered ensemble model with the helpers from evaluate.py.

# `print_metrics` is called below but no earlier cell imports it, so the loop
# would stop with a NameError on its first iteration.
# NOTE(review): assumes print_metrics is exported by src.evaluate alongside
# get_metrics_dict — confirm.
from src.evaluate import print_metrics

all_metrics = []

# Iterating the dict yields the model names in insertion (training) order.
for model_name in trained_models:
    y_pred = model_predictions[model_name]
    y_pred_proba = model_probabilities[model_name]

    # Metrics record used for the comparison table and the plot in section 6.
    metrics = get_metrics_dict(y_val, y_pred, y_pred_proba, model_name)
    all_metrics.append(metrics)

    # Detailed per-model report.
    print_metrics(y_val, y_pred, y_pred_proba, model_name)

# One row per model (assumes each record is a flat mapping — TODO confirm
# against get_metrics_dict).
comparison_df = pd.DataFrame(all_metrics)
print("\n" + "="*70)
print("Metrics Comparison - All Ensemble Models")
print("="*70)
print(comparison_df.to_string(index=False))
print("="*70)


## 6. Visualize Metrics Comparison


In [None]:
# Plot the per-model metrics collected in `all_metrics`.

# `plt` is used below but no earlier cell imports matplotlib, so `plt.show()`
# would raise NameError; import it here, next to its only use.
import matplotlib.pyplot as plt

from src.evaluate import plot_metrics_comparison

# The helper returns the figure it drew; keep a handle so it can be reused
# after display.
fig = plot_metrics_comparison(all_metrics, figsize=(14, 7))
plt.show()


## 7. Lưu Models đã Train

Lưu tất cả models vào thư mục models/ để sử dụng sau này.


In [None]:
# Persist every registered model under <project_root>/models.
models_dir = project_root / 'models'
# exist_ok=True makes the cell safe to re-run when the directory already exists.
models_dir.mkdir(exist_ok=True)

# The registry key (display name) is passed to save_model as the model name.
for model_name, model in trained_models.items():
    save_model(model, model_name, save_dir=models_dir)

# Status lines use the check-mark emoji (repaired from mis-encoded text).
print(f"\n✅ All models saved to {models_dir}")
print(f"✅ Total models saved: {len(trained_models)}")


## 8. Tóm tắt

Tất cả models đã được train, đánh giá và lưu. Kết quả sẽ được sử dụng trong notebook 03 để so sánh và đánh giá cuối cùng.


In [None]:
# Final summary of the training run.
_rule = "=" * 70
print(_rule)
print("TRAINING SUMMARY")
print(_rule)

# List the models from the registry itself rather than a hard-coded list, so
# the names always match the count printed on the line above them.
print(f"\n✅ Models trained: {len(trained_models)}")
for _trained_name in trained_models:
    print(f"   - {_trained_name}")

print(f"\n✅ Models evaluated on validation set: {len(y_val):,} samples")
print(f"✅ Models saved to: {models_dir}")

# Plain strings: these lines have no placeholders, so no f-prefix is needed.
print("\n📊 Next steps:")
print("   1. Review metrics comparison above")
print("   2. Run notebook 03 for final evaluation and comparison")
print("   3. Models are ready for use in fraud_detection_app.py")
print(_rule)
