In [72]:
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import graphviz
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score, f1_score, fbeta_score, brier_score_loss, recall_score
from sklearn.datasets import make_classification
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from xgboost import plot_importance
from sklearn.datasets import make_classification
import lightgbm as lgb
import multiprocessing
import warnings
warnings.filterwarnings('ignore')

# Reading Datasets

In [73]:
df1 = pd.read_csv('../Data/Edited_bitsM.csv') # Contains 300 negatives and 36 positives from BITSM
df2 = pd.read_csv('../Data/ESIdf.csv') # Contains 75 positives from ESI

df1.head()

Unnamed: 0.1,Unnamed: 0,Gender,Age,Haemoglobin(gms%),Total WBC Count(/Cumm),Neutrophils(%),Lymphocytes(%),Eosinophils(%),Monocytes(%),Basophils(%),Others,Total RBC Count(millions/Cu),HCT(%),MCV(f L),MCH(pg),MCHC(gms%),RDWCV(%),Platelet Count(Lakh / Cumm),Result
0,0,Male,63.0,11.6,10200,72,23,2,3,0,0,4.2,35.0,84.0,28.3,33.7,14.4,2.2,Negative
1,1,Female,27.0,13.7,7100,63,34,1,2,0,0,4.36,40.1,92.0,31.3,34.1,14.1,2.48,Negative
2,2,Male,63.0,11.1,8500,59,35,3,3,0,0,4.0,33.5,84.0,27.9,33.3,14.0,2.68,Negative
3,3,Male,35.0,14.0,5500,45,50,2,3,0,0,4.73,42.0,89.0,29.7,33.4,14.3,2.34,Negative
4,4,Male,19.0,13.2,5100,61,33,3,3,0,0,3.27,42.2,80.0,25.1,31.4,15.4,1.94,Negative


In [74]:
# Creating the external dataset validation for Indian model
external_validation_df = df1[df1['Result'] == 'Positive']

In [75]:
df1 = df1[df1['Result'] != 'Positive'] # Removing positives from the training set of BITSM
# Now, df1 contains 300 negatives from BITSM and df2 contains 75 positives from ESI

In [76]:
# Creating the training dataset by merging df1 and df2
df = pd.concat([df1, df2], join = 'inner')

# Preprocessing data

In [77]:
encoder = LabelEncoder()

df['Result'] = encoder.fit_transform(df['Result'])  
df['Gender'] = encoder.fit_transform(df['Gender'])

In [78]:
df = df.drop(columns = [df.columns[0], 'Others'])

#Prepare for training and testing
X = df.drop(columns = ['Result'])
Y = df['Result']

## Creation of SMOTE dataset

In [79]:
smote = SMOTE(random_state=42)

In [80]:
X_resampled, Y_resampled = smote.fit_resample(X, Y)

df_smote = pd.DataFrame(X_resampled, columns = X.columns)
df_smote['Result'] = Y_resampled

In [81]:
for x in df_smote.columns:
    df_smote[x] = df_smote[x].apply(lambda x : round(x, 2))       #rounding till 2, but age to int, and haemo to 1

df_smote['Age'] = df_smote['Age'].apply(lambda x : int(x))
df_smote['Haemoglobin(gms%)'] = df_smote['Haemoglobin(gms%)'].apply(lambda x : round(x, 1))

In [82]:
# df_smote.to_csv('../Data/smoteesimedc.csv') 

In [83]:
# dropping gender and age columns from both the datasets because we are not considering them
df = df.drop(columns = ['Gender', 'Age'])
df_smote = df_smote.drop(columns = ['Gender', 'Age'])

## Training 4 models on 375 dataset

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

# 80:20 split on the 375 dataset (df)
X_train, X_test, y_train, y_test = train_test_split(df.drop('Result', axis=1), df['Result'], test_size=0.2, random_state=42, stratify=df['Result'])

# Define models and their parameter grids (with random_state=42 for reproducibility)
models = {
    'XGBoost': (xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1]
    }),
    'AdaBoost': (AdaBoostClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1, 1.0]
    }),
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 7]
    }),
    'DecisionTree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [3, 5, 7, None],
        'criterion': ['gini', 'entropy']
    })
}

best_estimators = {}
results = {}

for name, (model, param_grid) in models.items():
    print(f"\nTraining {name}...")
    grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_estimators[name] = grid.best_estimator_
    results[name] = {
        'best_params': grid.best_params_,
        'train_score': grid.best_score_,
        'test_score': grid.best_estimator_.score(X_test, y_test)
    }
    print(f"Best params: {grid.best_params_}")
    print(f"Train CV accuracy: {grid.best_score_:.4f}")
    print(f"Test accuracy: {grid.best_estimator_.score(X_test, y_test):.4f}")


Training XGBoost...
Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Train CV accuracy: 0.9667
Test accuracy: 0.9733

Training AdaBoost...
Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Train CV accuracy: 0.9667
Test accuracy: 0.9733

Training AdaBoost...
Best params: {'learning_rate': 1.0, 'n_estimators': 50}
Train CV accuracy: 0.9600
Test accuracy: 0.9867

Training RandomForest...
Best params: {'learning_rate': 1.0, 'n_estimators': 50}
Train CV accuracy: 0.9600
Test accuracy: 0.9867

Training RandomForest...
Best params: {'max_depth': 7, 'n_estimators': 50}
Train CV accuracy: 0.9567
Test accuracy: 0.9600

Training DecisionTree...
Best params: {'criterion': 'gini', 'max_depth': None}
Train CV accuracy: 0.9433
Test accuracy: 0.9333
Best params: {'max_depth': 7, 'n_estimators': 50}
Train CV accuracy: 0.9567
Test accuracy: 0.9600

Training DecisionTree...
Best params: {'criterion': 'gini', 'max_depth': None}
Train CV accuracy: 0.9433
Tes

In [86]:
# Display summary of results for all models
display_results = []
for name, res in results.items():
    display_results.append({
        'Model': name,
        'Best Params': res['best_params'],
        'Train CV Accuracy': res['train_score'],
        'Test Accuracy': res['test_score']
    })

import pandas as pd
df_results = pd.DataFrame(display_results)
df_results

Unnamed: 0,Model,Best Params,Train CV Accuracy,Test Accuracy
0,XGBoost,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.966667,0.973333
1,AdaBoost,"{'learning_rate': 1.0, 'n_estimators': 50}",0.96,0.986667
2,RandomForest,"{'max_depth': 7, 'n_estimators': 50}",0.956667,0.96
3,DecisionTree,"{'criterion': 'gini', 'max_depth': None}",0.943333,0.933333


The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


## Training on SMOTE (600)

In [None]:
# Training 4 models on SMOTE dataset (600 samples)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(
    df_smote.drop('Result', axis=1),
    df_smote['Result'],
    test_size=0.2,
    random_state=42,
    stratify=df_smote['Result']
)

# Define models and their parameter grids (with random_state=42 for reproducibility)
models_sm = {
    'XGBoost': (xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1]
    }),
    'AdaBoost': (AdaBoostClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1, 1.0]
    }),
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 7]
    }),
    'DecisionTree': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [3, 5, 7, None],
        'criterion': ['gini', 'entropy']
    })
}

best_estimators_sm = {}
results_sm = {}

for name, (model, param_grid) in models_sm.items():
    print(f"\nTraining {name} on SMOTE data...")
    grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train_sm, y_train_sm)
    best_estimators_sm[name] = grid.best_estimator_
    results_sm[name] = {
        'best_params': grid.best_params_,
        'train_score': grid.best_score_,
        'test_score': grid.best_estimator_.score(X_test_sm, y_test_sm)
    }
    print(f"Best params: {grid.best_params_}")
    print(f"Train CV accuracy: {grid.best_score_:.4f}")
    print(f"Test accuracy: {grid.best_estimator_.score(X_test_sm, y_test_sm):.4f}")

# Display summary of results for all models on SMOTE data
display_results_sm = []
for name, res in results_sm.items():
    display_results_sm.append({
        'Model': name,
        'Best Params': res['best_params'],
        'Train CV Accuracy': res['train_score'],
        'Test Accuracy': res['test_score']
    })

df_results_sm = pd.DataFrame(display_results_sm)
df_results_sm


Training XGBoost on SMOTE data...
Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Train CV accuracy: 0.9625
Test accuracy: 0.9917

Training AdaBoost on SMOTE data...
Best params: {'learning_rate': 1.0, 'n_estimators': 100}
Train CV accuracy: 0.9667
Test accuracy: 0.9500

Training RandomForest on SMOTE data...
Best params: {'max_depth': 7, 'n_estimators': 100}
Train CV accuracy: 0.9583
Test accuracy: 0.9583

Training DecisionTree on SMOTE data...
Best params: {'criterion': 'entropy', 'max_depth': None}
Train CV accuracy: 0.9354
Test accuracy: 0.9500


Unnamed: 0,Model,Best Params,Train CV Accuracy,Test Accuracy
0,XGBoost,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.9625,0.991667
1,AdaBoost,"{'learning_rate': 1.0, 'n_estimators': 100}",0.966667,0.95
2,RandomForest,"{'max_depth': 7, 'n_estimators': 100}",0.958333,0.958333
3,DecisionTree,"{'criterion': 'entropy', 'max_depth': None}",0.935417,0.95
