In [58]:
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import graphviz
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score, f1_score, fbeta_score, brier_score_loss, recall_score
from sklearn.datasets import make_classification
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from xgboost import plot_importance
from sklearn.datasets import make_classification
import lightgbm as lgb
import multiprocessing
import warnings
warnings.filterwarnings('ignore')

# Reading Datasets

In [59]:
# Load the two source cohorts; both paths are relative to the notebook's working directory.
df1 = pd.read_csv('../Data/Edited_bitsM.csv') # Contains 300 negatives and 36 positives from BITSM
df2 = pd.read_csv('../Data/ESIdf.csv') # Contains 75 positives from ESI

# Preview the first rows of the BITSM frame (last expression of the cell is displayed).
df1.head()

Unnamed: 0.1,Unnamed: 0,Gender,Age,Haemoglobin(gms%),Total WBC Count(/Cumm),Neutrophils(%),Lymphocytes(%),Eosinophils(%),Monocytes(%),Basophils(%),Others,Total RBC Count(millions/Cu),HCT(%),MCV(f L),MCH(pg),MCHC(gms%),RDWCV(%),Platelet Count(Lakh / Cumm),Result
0,0,Male,63.0,11.6,10200,72,23,2,3,0,0,4.2,35.0,84.0,28.3,33.7,14.4,2.2,Negative
1,1,Female,27.0,13.7,7100,63,34,1,2,0,0,4.36,40.1,92.0,31.3,34.1,14.1,2.48,Negative
2,2,Male,63.0,11.1,8500,59,35,3,3,0,0,4.0,33.5,84.0,27.9,33.3,14.0,2.68,Negative
3,3,Male,35.0,14.0,5500,45,50,2,3,0,0,4.73,42.0,89.0,29.7,33.4,14.3,2.34,Negative
4,4,Male,19.0,13.2,5100,61,33,3,3,0,0,3.27,42.2,80.0,25.1,31.4,15.4,1.94,Negative


In [60]:
# External validation set for the Indian model: every df1 row labelled 'Positive'.
# This must run before df1 is filtered down to its non-positive rows.
external_validation_df = df1.loc[df1['Result'].eq('Positive')]

In [61]:
# Keep only the non-positive BITSM rows in df1 so the BITSM positives stay out of training.
df1 = df1.loc[df1['Result'].ne('Positive')]
# From here on: df1 holds the 300 BITSM negatives, df2 holds the 75 ESI positives.

In [62]:
# Build the training dataset by stacking df1 (negatives) on df2 (positives).
# join='inner' keeps only the columns present in both frames.
# ignore_index=True gives the merged frame a fresh 0..n-1 index; without it each
# source keeps its own row labels and the result carries duplicate index values,
# which makes any later label-based lookup or alignment ambiguous.
df = pd.concat([df1, df2], join = 'inner', ignore_index = True)

# Preprocessing data

In [63]:
# One encoder per column: refitting a single LabelEncoder on 'Gender' would discard
# the fitted 'Result' classes, so the label codes could no longer be mapped back
# (or applied consistently to another frame such as the external validation set).
result_encoder = LabelEncoder()
gender_encoder = LabelEncoder()

df['Result'] = result_encoder.fit_transform(df['Result'])
df['Gender'] = gender_encoder.fit_transform(df['Gender'])

# Backward compatibility: `encoder` used to end this cell fitted on 'Gender'.
encoder = gender_encoder

In [64]:
# Discard the first column of df together with the 'Others' column.
df = df.drop([df.columns[0], 'Others'], axis=1)

# Feature matrix (everything except the label) and target vector for training/testing.
X = df.drop('Result', axis=1)
Y = df['Result']

## Creation of SMOTE dataset

In [65]:
# SMOTE oversampler, constructed with a fixed random_state of 42.
smote = SMOTE(random_state=42)

In [66]:
# Oversample the full feature matrix X and label vector Y with the SMOTE object.
# NOTE(review): the resampling is fit on all of X/Y, i.e. before any train/test
# split is made; any evaluation that draws its test rows from df_smote will
# therefore contain synthetic samples. Confirm this is intended.
X_resampled, Y_resampled = smote.fit_resample(X, Y)

# Rebuild a single frame: the feature columns in X's order plus the resampled label as 'Result'.
df_smote = pd.DataFrame(X_resampled, columns = X.columns)
df_smote['Result'] = Y_resampled

In [67]:
# First pass: round every column of the resampled frame to 2 decimals, value by value.
for col in df_smote.columns:
    df_smote[col] = df_smote[col].apply(lambda value: round(value, 2))

# Column-specific precision: Age becomes an int (int() truncates), haemoglobin keeps 1 decimal.
df_smote['Age'] = df_smote['Age'].apply(lambda value: int(value))
df_smote['Haemoglobin(gms%)'] = df_smote['Haemoglobin(gms%)'].apply(lambda value: round(value, 1))

In [68]:
# df_smote.to_csv('../Data/smoteesimedc.csv') 

In [69]:
# Gender and Age are not used as model features, so remove them from both datasets.
# errors='ignore' keeps the cell re-runnable: on a second execution the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns = ['Gender', 'Age'], errors = 'ignore')
df_smote = df_smote.drop(columns = ['Gender', 'Age'], errors = 'ignore')

## Training four models on the 375-sample dataset

# End of Model Training and Evaluation

All four models have been trained and evaluated on both the original and SMOTE-balanced datasets. Metrics, ROC curves, and confusion matrices are shown above for each case.

In [None]:
# --- LightGBM on df ---
import lightgbm as lgb

# Hyperparameter search space shared by both LightGBM runs in this cell.
lgb_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7, -1],
    learning_rate=[0.01, 0.1, 0.2],
    num_leaves=[15, 31, 63],
    random_state=[42],
)
lgb_model = lgb.LGBMClassifier()
best_lgb = run_grid_search(
    model=lgb_model,
    param_grid=lgb_param_grid,
    X_train=X_train_rf,
    y_train=y_train_rf,
)
lgb_stats = evaluate_model(
    model=best_lgb,
    X_test=X_test_rf,
    y_test=y_test_rf,
    model_name="LightGBM (df)",
)

# --- LightGBM on df_smote ---
# Same grid, fresh estimator, fitted and scored on the *_rf_sm split.
best_lgb_sm = run_grid_search(
    model=lgb.LGBMClassifier(),
    param_grid=lgb_param_grid,
    X_train=X_train_rf_sm,
    y_train=y_train_rf_sm,
)
lgb_sm_stats = evaluate_model(
    model=best_lgb_sm,
    X_test=X_test_rf_sm,
    y_test=y_test_rf_sm,
    model_name="LightGBM (df_smote)",
)

## LightGBM Classifier

In [None]:
# --- AdaBoost on df ---
from sklearn.ensemble import AdaBoostClassifier

# Hyperparameter search space shared by both AdaBoost runs in this cell.
ada_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1.0],
    random_state=[42],
)
ada_model = AdaBoostClassifier()
best_ada = run_grid_search(
    model=ada_model,
    param_grid=ada_param_grid,
    X_train=X_train_rf,
    y_train=y_train_rf,
)
ada_stats = evaluate_model(
    model=best_ada,
    X_test=X_test_rf,
    y_test=y_test_rf,
    model_name="AdaBoost (df)",
)

# --- AdaBoost on df_smote ---
# Same grid, fresh estimator, fitted and scored on the *_rf_sm split.
best_ada_sm = run_grid_search(
    model=AdaBoostClassifier(),
    param_grid=ada_param_grid,
    X_train=X_train_rf_sm,
    y_train=y_train_rf_sm,
)
ada_sm_stats = evaluate_model(
    model=best_ada_sm,
    X_test=X_test_rf_sm,
    y_test=y_test_rf_sm,
    model_name="AdaBoost (df_smote)",
)

## AdaBoost Classifier

In [None]:
# --- XGBoost on df ---
import xgboost as xgb

# Hyperparameter search space shared by both XGBoost runs in this cell.
xgb_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    random_state=[42],
)
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
best_xgb = run_grid_search(
    model=xgb_model,
    param_grid=xgb_param_grid,
    X_train=X_train_rf,
    y_train=y_train_rf,
)
xgb_stats = evaluate_model(
    model=best_xgb,
    X_test=X_test_rf,
    y_test=y_test_rf,
    model_name="XGBoost (df)",
)

# --- XGBoost on df_smote ---
# Same grid, fresh estimator with identical constructor arguments, on the *_rf_sm split.
best_xgb_sm = run_grid_search(
    model=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid=xgb_param_grid,
    X_train=X_train_rf_sm,
    y_train=y_train_rf_sm,
)
xgb_sm_stats = evaluate_model(
    model=best_xgb_sm,
    X_test=X_test_rf_sm,
    y_test=y_test_rf_sm,
    model_name="XGBoost (df_smote)",
)

## XGBoost Classifier

In [None]:
# --- Random Forest on df ---
from sklearn.ensemble import RandomForestClassifier

# 80:20 stratified hold-out split of the original dataset. The *_rf names are
# reused by the other model cells, so this cell has to run before them.
X_rf = df.drop(columns=['Result'])
y_rf = df['Result']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42, stratify=y_rf)
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'random_state': [42]
}
rf_model = RandomForestClassifier()
best_rf = run_grid_search(rf_model, rf_param_grid, X_train_rf, y_train_rf)
rf_stats = evaluate_model(best_rf, X_test_rf, y_test_rf, model_name="Random Forest (df)")

# --- Random Forest on df_smote ---
# Leakage fix: df_smote was oversampled before any split, so splitting it again puts
# synthetic rows (interpolated from rows that land in the training part) into the test
# set and inflates every test metric. Oversample the training split only and score on
# the untouched hold-out rows of the original data instead.
X_rf_sm = df_smote.drop(columns=['Result'])  # kept so existing references to the full SMOTE frame still resolve
y_rf_sm = df_smote['Result']
X_train_rf_sm, y_train_rf_sm = SMOTE(random_state=42).fit_resample(X_train_rf, y_train_rf)
X_test_rf_sm, y_test_rf_sm = X_test_rf, y_test_rf
# NOTE(review): the cross-validation folds inside run_grid_search still contain
# oversampled rows; wrapping SMOTE and the model in an imblearn Pipeline would remove that too.
best_rf_sm = run_grid_search(RandomForestClassifier(), rf_param_grid, X_train_rf_sm, y_train_rf_sm)
rf_sm_stats = evaluate_model(best_rf_sm, X_test_rf_sm, y_test_rf_sm, model_name="Random Forest (df_smote)")

## Random Forest Classifier

In [None]:
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score, confusion_matrix, roc_curve, auc

def set_seeds(seed=42):
    """Seed the process-wide random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and PyTorch
    as well when it is installed.

    XGBoost and LightGBM are deliberately not touched here: neither library has
    a global seeding function (the former ``xgb.random.seed`` call raised
    AttributeError, which ``except ImportError`` did not catch, and the LightGBM
    call targeted a non-existent private attribute whose failure was silently
    swallowed). Those estimators are seeded through their ``random_state``
    parameter, which the parameter grids in this notebook set to 42.

    Parameters
    ----------
    seed : int, default 42
        Value passed to every seeding call.
    """
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch is optional; skip it quietly when it is not installed.
    try:
        import torch
    except ImportError:
        return
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seeds(42)

def run_grid_search(model, param_grid, X_train, y_train, scoring='f1', cv=5):
    """Tune `model` over `param_grid` with GridSearchCV and return the winning estimator.

    Parameters
    ----------
    model : estimator passed to GridSearchCV.
    param_grid : dict mapping parameter names to lists of candidate values.
    X_train, y_train : training features and labels the search is fitted on.
    scoring : metric name handed to GridSearchCV (default 'f1').
    cv : cross-validation setting handed to GridSearchCV (default 5).

    Returns
    -------
    The search's `best_estimator_`. The best parameter combination is printed.
    """
    search = GridSearchCV(
        model,
        param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,   # use all available cores
        verbose=1,
    )
    search.fit(X_train, y_train)
    winning_params = search.best_params_
    print(f"Best parameters: {winning_params}")
    return search.best_estimator_

def evaluate_model(model, X_test, y_test, model_name="Model", plot_roc=True):
    """Score a fitted classifier on a test set, print the metrics and plot diagnostics.

    Parameters
    ----------
    model : fitted estimator with `predict`; `predict_proba` is used when present.
    X_test, y_test : test features and true labels.
    model_name : label used in the printed header and the figure titles.
    plot_roc : when True and probabilities are available, also draw the ROC curve.

    Returns
    -------
    dict with keys "accuracy", "f1", "roc_auc", "recall", "precision" and
    "confusion_matrix". "roc_auc" is None when the model has no `predict_proba`.
    """
    predictions = model.predict(X_test)

    # Probability of the second class column, only for models that expose it.
    if hasattr(model, 'predict_proba'):
        positive_scores = model.predict_proba(X_test)[:, 1]
    else:
        positive_scores = None

    results = {
        "accuracy": accuracy_score(y_test, predictions),
        "f1": f1_score(y_test, predictions),
        "roc_auc": None if positive_scores is None else roc_auc_score(y_test, positive_scores),
        "recall": recall_score(y_test, predictions),
        "precision": precision_score(y_test, predictions),
        "confusion_matrix": confusion_matrix(y_test, predictions),
    }

    # Text report.
    print(f"{model_name} Test Metrics:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"F1-score: {results['f1']:.4f}")
    print(f"Recall: {results['recall']:.4f}")
    print(f"Precision: {results['precision']:.4f}")
    if results["roc_auc"] is not None:
        print(f"ROC-AUC: {results['roc_auc']:.4f}")
    print("Confusion Matrix:")
    print(results["confusion_matrix"])

    # Confusion-matrix heatmap.
    plt.figure(figsize=(5,4))
    sns.heatmap(results["confusion_matrix"], annot=True, fmt='d', cmap='Blues')
    plt.title(f"{model_name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    # ROC curve with the chance diagonal, when requested and possible.
    if plot_roc and positive_scores is not None:
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
        plt.figure(figsize=(5,4))
        plt.plot(false_pos_rate, true_pos_rate, label=f"ROC curve (area = {results['roc_auc']:.2f})")
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f"{model_name} ROC Curve")
        plt.legend(loc="lower right")
        plt.show()

    return results

# Model Training and Evaluation

In this section, we train and evaluate four machine learning models (Random Forest, XGBoost, AdaBoost, LightGBM) on two datasets: the original (`df`) and the SMOTE-balanced (`df_smote`).

- Hyperparameter tuning is performed using GridSearchCV.
- An 80:20 train-test split is used.
- All metrics are reported for the test set only.
- ROC curves and confusion matrices are displayed for each model.
- Random seeds are set for reproducibility.

In [70]:
# --- Start of Model Training and Evaluation Section ---

# End of Model Training and Evaluation

All four models have been trained and evaluated on both the original and SMOTE-balanced datasets. Metrics, ROC curves, and confusion matrices are shown above for each case.

In [71]:
# --- LightGBM on df_smote ---
# Fresh LightGBM estimator tuned over lgb_param_grid and scored on the *_rf_sm split.
best_lgb_sm = run_grid_search(
    model=lgb.LGBMClassifier(),
    param_grid=lgb_param_grid,
    X_train=X_train_rf_sm,
    y_train=y_train_rf_sm,
)
lgb_sm_stats = evaluate_model(
    model=best_lgb_sm,
    X_test=X_test_rf_sm,
    y_test=y_test_rf_sm,
    model_name="LightGBM (df_smote)",
)

NameError: name 'run_grid_search' is not defined

In [None]:
# --- LightGBM on df ---
import lightgbm as lgb

# Hyperparameter search space for the LightGBM classifier.
lgb_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7, -1],
    learning_rate=[0.01, 0.1, 0.2],
    num_leaves=[15, 31, 63],
    random_state=[42],
)

lgb_model = lgb.LGBMClassifier()
best_lgb = run_grid_search(
    model=lgb_model,
    param_grid=lgb_param_grid,
    X_train=X_train_rf,
    y_train=y_train_rf,
)
lgb_stats = evaluate_model(
    model=best_lgb,
    X_test=X_test_rf,
    y_test=y_test_rf,
    model_name="LightGBM (df)",
)

## LightGBM Classifier

In [None]:
# --- AdaBoost on df_smote ---
# Fresh AdaBoost estimator tuned over ada_param_grid and scored on the *_rf_sm split.
best_ada_sm = run_grid_search(
    model=AdaBoostClassifier(),
    param_grid=ada_param_grid,
    X_train=X_train_rf_sm,
    y_train=y_train_rf_sm,
)
ada_sm_stats = evaluate_model(
    model=best_ada_sm,
    X_test=X_test_rf_sm,
    y_test=y_test_rf_sm,
    model_name="AdaBoost (df_smote)",
)

In [None]:
# --- AdaBoost on df ---
from sklearn.ensemble import AdaBoostClassifier

# Hyperparameter search space for the AdaBoost classifier.
ada_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1.0],
    random_state=[42],
)

ada_model = AdaBoostClassifier()
best_ada = run_grid_search(
    model=ada_model,
    param_grid=ada_param_grid,
    X_train=X_train_rf,
    y_train=y_train_rf,
)
ada_stats = evaluate_model(
    model=best_ada,
    X_test=X_test_rf,
    y_test=y_test_rf,
    model_name="AdaBoost (df)",
)

## AdaBoost Classifier

In [None]:
# --- XGBoost on df_smote ---
# Fresh XGBoost estimator tuned over xgb_param_grid and scored on the *_rf_sm split.
best_xgb_sm = run_grid_search(
    model=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid=xgb_param_grid,
    X_train=X_train_rf_sm,
    y_train=y_train_rf_sm,
)
xgb_sm_stats = evaluate_model(
    model=best_xgb_sm,
    X_test=X_test_rf_sm,
    y_test=y_test_rf_sm,
    model_name="XGBoost (df_smote)",
)

In [None]:
# --- XGBoost on df ---
import xgboost as xgb

# Hyperparameter search space for the XGBoost classifier.
xgb_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    random_state=[42],
)

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
best_xgb = run_grid_search(
    model=xgb_model,
    param_grid=xgb_param_grid,
    X_train=X_train_rf,
    y_train=y_train_rf,
)
xgb_stats = evaluate_model(
    model=best_xgb,
    X_test=X_test_rf,
    y_test=y_test_rf,
    model_name="XGBoost (df)",
)

## XGBoost Classifier

In [None]:
# --- Random Forest on df_smote ---
# Leakage fix: df_smote was oversampled before any split, so splitting it again puts
# synthetic rows (interpolated from rows that land in the training part) into the test
# set and inflates every test metric. Oversample the training split of the original
# data only, and score on its untouched hold-out rows. X_train_rf / X_test_rf come from
# the "Random Forest on df" cell, which also defines rf_param_grid used below.
X_rf_sm = df_smote.drop(columns=['Result'])  # kept so existing references to the full SMOTE frame still resolve
y_rf_sm = df_smote['Result']
X_train_rf_sm, y_train_rf_sm = SMOTE(random_state=42).fit_resample(X_train_rf, y_train_rf)
X_test_rf_sm, y_test_rf_sm = X_test_rf, y_test_rf

# NOTE(review): the cross-validation folds inside run_grid_search still contain
# oversampled rows; wrapping SMOTE and the model in an imblearn Pipeline would remove that too.
best_rf_sm = run_grid_search(RandomForestClassifier(), rf_param_grid, X_train_rf_sm, y_train_rf_sm)
rf_sm_stats = evaluate_model(best_rf_sm, X_test_rf_sm, y_test_rf_sm, model_name="Random Forest (df_smote)")

In [None]:
# --- Random Forest on df ---
from sklearn.ensemble import RandomForestClassifier

# Features / label of the original dataset, then an 80:20 stratified split.
# The *_rf names produced here are reused by the other model cells.
X_rf = df.drop('Result', axis=1)
y_rf = df['Result']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_rf,
    y_rf,
    test_size=0.2,
    random_state=42,
    stratify=y_rf,
)

# Hyperparameter search space for the Random Forest classifier.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    random_state=[42],
)

rf_model = RandomForestClassifier()
best_rf = run_grid_search(
    model=rf_model,
    param_grid=rf_param_grid,
    X_train=X_train_rf,
    y_train=y_train_rf,
)
rf_stats = evaluate_model(
    model=best_rf,
    X_test=X_test_rf,
    y_test=y_test_rf,
    model_name="Random Forest (df)",
)

## Random Forest Classifier

In [None]:
def run_grid_search(model, param_grid, X_train, y_train, scoring='f1', cv=5):
    """Tune `model` over `param_grid` with GridSearchCV and return the winning estimator.

    Parameters
    ----------
    model : estimator passed to GridSearchCV.
    param_grid : dict mapping parameter names to lists of candidate values.
    X_train, y_train : training features and labels the search is fitted on.
    scoring : metric name handed to GridSearchCV (default 'f1').
    cv : cross-validation setting handed to GridSearchCV (default 5).

    Returns
    -------
    The search's `best_estimator_`. The best parameter combination is printed.
    """
    search = GridSearchCV(
        model,
        param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,   # use all available cores
        verbose=1,
    )
    search.fit(X_train, y_train)
    winning_params = search.best_params_
    print(f"Best parameters: {winning_params}")
    return search.best_estimator_

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(model, X_test, y_test, model_name="Model", plot_roc=True):
    """Score a fitted classifier on a test set, print the metrics and plot diagnostics.

    Parameters
    ----------
    model : fitted estimator with `predict`; `predict_proba` is used when present.
    X_test, y_test : test features and true labels.
    model_name : label used in the printed header and the figure titles.
    plot_roc : when True and probabilities are available, also draw the ROC curve.

    Returns
    -------
    dict with keys "accuracy", "f1", "roc_auc", "recall", "precision" and
    "confusion_matrix". "roc_auc" is None when the model has no `predict_proba`.
    """
    predictions = model.predict(X_test)

    # Probability of the second class column, only for models that expose it.
    if hasattr(model, 'predict_proba'):
        positive_scores = model.predict_proba(X_test)[:, 1]
    else:
        positive_scores = None

    results = {
        "accuracy": accuracy_score(y_test, predictions),
        "f1": f1_score(y_test, predictions),
        "roc_auc": None if positive_scores is None else roc_auc_score(y_test, positive_scores),
        "recall": recall_score(y_test, predictions),
        "precision": precision_score(y_test, predictions),
        "confusion_matrix": confusion_matrix(y_test, predictions),
    }

    # Text report.
    print(f"{model_name} Test Metrics:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"F1-score: {results['f1']:.4f}")
    print(f"Recall: {results['recall']:.4f}")
    print(f"Precision: {results['precision']:.4f}")
    if results["roc_auc"] is not None:
        print(f"ROC-AUC: {results['roc_auc']:.4f}")
    print("Confusion Matrix:")
    print(results["confusion_matrix"])

    # Confusion-matrix heatmap.
    plt.figure(figsize=(5,4))
    sns.heatmap(results["confusion_matrix"], annot=True, fmt='d', cmap='Blues')
    plt.title(f"{model_name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    # ROC curve with the chance diagonal, when requested and possible.
    if plot_roc and positive_scores is not None:
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
        plt.figure(figsize=(5,4))
        plt.plot(false_pos_rate, true_pos_rate, label=f"ROC curve (area = {results['roc_auc']:.2f})")
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f"{model_name} ROC Curve")
        plt.legend(loc="lower right")
        plt.show()

    return results

In [None]:
import random

def set_seeds(seed=42):
    """Seed the process-wide random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and PyTorch
    as well when it is installed.

    XGBoost and LightGBM are deliberately not touched here: neither library has
    a global seeding function (the former ``xgb.random.seed`` call raised
    AttributeError, which ``except ImportError`` did not catch, and the LightGBM
    call targeted a non-existent private attribute whose failure was silently
    swallowed). Those estimators are seeded through their ``random_state``
    parameter, which the parameter grids in this notebook set to 42.

    Parameters
    ----------
    seed : int, default 42
        Value passed to every seeding call.
    """
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch is optional; skip it quietly when it is not installed.
    try:
        import torch
    except ImportError:
        return
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seeds(42)

# Model Training and Evaluation

In this section, we train and evaluate four machine learning models (Random Forest, XGBoost, AdaBoost, LightGBM) on two datasets: the original (`df`) and the SMOTE-balanced (`df_smote`).

- Hyperparameter tuning is performed using GridSearchCV.
- An 80:20 train-test split is used.
- All metrics are reported for the test set only.
- ROC curves and confusion matrices are displayed for each model.
- Random seeds are set for reproducibility.