In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [9]:
# Read the V4 dataset from disk; `data` holds the parsed CSV and `df` is the
# DataFrame the rest of this cell works with.
data = pd.read_csv("AmpiricDataset/V4/final.csv")
df = pd.DataFrame(data)

# The 'Label' column is the target; every other column is used as a feature
y = df['Label']
X = df.drop(columns=['Label'])

# Hold out 30% of the rows for testing, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score,
                             roc_auc_score, confusion_matrix, classification_report,
                             make_scorer)
from sklearn.model_selection import cross_validate
import joblib

# Stratified 5-fold splitter, shuffled with a fixed seed so the folds are repeatable
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _scaled(estimator):
    """Return a pipeline that standardizes the features and then applies `estimator`."""
    return make_pipeline(StandardScaler(), estimator)


# Candidate classifiers keyed by the display name used in the printed report.
# Logistic Regression, SVM and KNN are wrapped with a StandardScaler step;
# the tree ensembles and Naive Bayes are used on the raw features.
models = {
    "Logistic Regression": _scaled(
        LogisticRegression(penalty='l2', C=0.01, max_iter=1000, random_state=42)
    ),
    "SVM": _scaled(
        # probability=True so the fitted SVC exposes predict_proba
        SVC(kernel='linear', C=0.1, probability=True, random_state=42)
    ),
    "KNN": _scaled(
        KNeighborsClassifier(n_neighbors=10, weights='uniform')
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=150,
        max_depth=5,
        min_samples_split=10,
        max_features=0.5,
        n_jobs=-1,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=150,
        learning_rate=0.05,
        max_depth=3,
        subsample=0.8,
        random_state=42,
    ),
    "XGBoost": XGBClassifier(
        n_estimators=150,
        learning_rate=0.05,
        max_depth=6,
        gamma=0.1,
        reg_alpha=0.5,
        reg_lambda=1.0,
        scale_pos_weight=1,  # Adjust if class imbalance exists
        eval_metric='logloss',
        n_jobs=-1,
        random_state=42,
    ),
    "Naïve Bayes": GaussianNB(var_smoothing=1e-9),
}

# Metrics computed on each cross-validation fold.
# Left-hand side: the label used in the printed report (and in the
# `test_<label>` keys of the cross_validate result);
# right-hand side: the scikit-learn scorer name.
scoring = dict(
    accuracy='accuracy',
    f1='f1_weighted',
    precision='precision_weighted',
    recall='recall_weighted',
    roc_auc='roc_auc',
)

# Enhanced evaluation function
def evaluate_model(name, model, X_train, X_test, y_train, y_test, cv):
    """Fit `model`, report hold-out metrics, then cross-validate on the training split.

    The function prints a report and also returns the numbers so callers can
    tabulate or compare models without parsing stdout.

    Parameters
    ----------
    name : str
        Display name printed in the report header.
    model : estimator
        Object exposing `fit` and `predict`. It is fitted in place on
        `X_train` / `y_train`, so the caller's object is trained afterwards.
    X_train, y_train
        Training features and labels; also the data passed to `cross_validate`.
    X_test, y_test
        Hold-out features and labels used for the single-split metrics.
    cv
        Cross-validation splitter (or any value accepted by `cross_validate`).

    Returns
    -------
    dict
        ``{'test': {metric: value}, 'cv': {metric: (mean, std)}}``.
        ``'test'`` holds 'Accuracy', 'F1', 'Precision', 'Recall' and, when a
        score could be obtained from the model, 'ROC AUC'. ``'cv'`` has one
        entry per key of the module-level `scoring` dict.

    Notes
    -----
    The CV metrics are taken from the module-level `scoring` mapping.
    ROC AUC uses column 1 of `predict_proba`, i.e. it assumes a binary target
    -- TODO confirm against the dataset's 'Label' values.
    """
    print(f"\n---- {name} ----")

    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Continuous scores for ROC AUC: prefer probabilities, otherwise fall back
    # to decision_function so margin-based models are not silently skipped.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    # Performance metrics on the hold-out split (class-weighted averages)
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred, average='weighted'),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted')
    }

    if y_score is not None:
        metrics['ROC AUC'] = roc_auc_score(y_test, y_score)

    # Print metrics
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    # Cross-validation with multiple metrics, on the training split only
    cv_results = cross_validate(
        model, X_train, y_train, cv=cv,
        scoring=scoring, n_jobs=-1, error_score='raise'
    )

    print("\nCross-validation Results:")
    cv_summary = {}
    for metric in scoring.keys():
        fold_scores = cv_results[f'test_{metric}']
        mean_score = np.mean(fold_scores)
        std_score = np.std(fold_scores)
        cv_summary[metric] = (mean_score, std_score)
        print(f"{metric}: {mean_score:.4f} ± {std_score:.4f}")

    return {'test': metrics, 'cv': cv_summary}

    
# Fit and report on every configured model, all on the same split and CV folds
for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test, skf)


---- Logistic Regression ----
Accuracy: 0.9691
F1: 0.9691
Precision: 0.9691
Recall: 0.9691
ROC AUC: 0.9932

Cross-validation Results:
accuracy: 0.9690 ± 0.0017
f1: 0.9690 ± 0.0017
precision: 0.9690 ± 0.0017
recall: 0.9690 ± 0.0017
roc_auc: 0.9924 ± 0.0006

---- SVM ----
Accuracy: 0.9772
F1: 0.9772
Precision: 0.9775
Recall: 0.9772
ROC AUC: 0.9958

Cross-validation Results:
accuracy: 0.9775 ± 0.0012
f1: 0.9775 ± 0.0012
precision: 0.9778 ± 0.0012
recall: 0.9775 ± 0.0012
roc_auc: 0.9952 ± 0.0003

---- KNN ----
Accuracy: 0.9967
F1: 0.9967
Precision: 0.9967
Recall: 0.9967
ROC AUC: 0.9996

Cross-validation Results:
accuracy: 0.9963 ± 0.0002
f1: 0.9963 ± 0.0002
precision: 0.9963 ± 0.0002
recall: 0.9963 ± 0.0002
roc_auc: 0.9993 ± 0.0004

---- Random Forest ----
Accuracy: 0.9964
F1: 0.9964
Precision: 0.9964
Recall: 0.9964
ROC AUC: 0.9998

Cross-validation Results:
accuracy: 0.9953 ± 0.0008
f1: 0.9953 ± 0.0008
precision: 0.9953 ± 0.0008
recall: 0.9953 ± 0.0008
roc_auc: 0.9998 ± 0.0001

---- Grad

In [14]:
# Save every entry of `models` to disk, one pickle file per model.
# Pipeline entries are dumped whole, so their scaler step is saved with them.
import os

import joblib

# joblib.dump does not create missing directories; make sure the target
# folder exists so this cell also works on a fresh checkout.
os.makedirs('./models', exist_ok=True)

# Save the models using loop (file name = the model's display name)
for name, model in models.items():
    joblib.dump(model, f'./models/{name}.pkl')

print("Saved to disk!")

Saved to disk!
