In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from lightgbm import LGBMClassifier

In [2]:
df = pd.read_csv('datasets/thyroid_cancer_risk_data.csv').drop(columns=['Patient_ID', "Thyroid_Cancer_Risk"])

In [3]:
X = df.drop(columns=['Diagnosis'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Diagnosis'])
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [4]:
preprocessor = ColumnTransformer(
    transformers=[ 
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'LightGBM': LGBMClassifier()
}

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
results = []

for name, model in models.items():
    try:
        print(f"\n🔍 Training: {name}")
        clf = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_proba = clf.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else np.zeros_like(y_pred)

        results.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
            'ROC AUC': roc_auc_score(y_test, y_proba) if y_proba.any() else None
        })
    except Exception as e:
        print(f"❌ {name} failed: {e}")


🔍 Training: Logistic Regression

🔍 Training: Random Forest

🔍 Training: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



🔍 Training: KNN

🔍 Training: Naive Bayes

🔍 Training: Decision Tree

🔍 Training: LDA

🔍 Training: QDA





🔍 Training: Gradient Boosting

🔍 Training: LightGBM
[LightGBM] [Info] Number of positive: 39571, number of negative: 130581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1149
[LightGBM] [Info] Number of data points in the train set: 170152, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.232563 -> initscore=-1.193897
[LightGBM] [Info] Start training from score -1.193897


In [6]:
# 8. Show all results as a DataFrame
results_df = pd.DataFrame(results).sort_values(by="F1", ascending=False)
import tabulate
print(tabulate.tabulate(results_df, headers='keys', tablefmt='grid'))

+----+---------------------+------------+-------------+----------+----------+-----------+
|    | Model               |   Accuracy |   Precision |   Recall |       F1 |   ROC AUC |
|  8 | Gradient Boosting   |   0.825055 |    0.693967 | 0.4474   | 0.544051 |  0.692534 |
+----+---------------------+------------+-------------+----------+----------+-----------+
|  9 | LightGBM            |   0.825008 |    0.693932 | 0.447098 | 0.543817 |  0.694533 |
+----+---------------------+------------+-------------+----------+----------+-----------+
|  2 | XGBoost             |   0.823856 |    0.693151 | 0.439541 | 0.537954 |  0.692542 |
+----+---------------------+------------+-------------+----------+----------+-----------+
|  1 | Random Forest       |   0.822281 |    0.693074 | 0.427549 | 0.528855 |  0.692303 |
+----+---------------------+------------+-------------+----------+----------+-----------+
|  3 | KNN                 |   0.786619 |    0.573333 | 0.333636 | 0.42181  |  0.665259 |
+----+----

In [7]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.metrics import recall_score
top3 = pd.DataFrame({'Model': ['XGBoost', 'QDA']})
# Define search spaces for each model (QDA instead of Random Forest)
search_spaces = {
    'QDA': {
        'classifier__reg_param': hp.uniform('classifier__reg_param', 0.0, 1.0),
        'classifier__tol': hp.uniform('classifier__tol', 1e-5, 1e-1)
    },
    'XGBoost': {
        'classifier__n_estimators': hp.quniform('classifier__n_estimators', 100, 500, 1),
        'classifier__max_depth': hp.quniform('classifier__max_depth', 3, 10, 1),
        'classifier__learning_rate': hp.loguniform('classifier__learning_rate', np.log(0.01), np.log(0.3)),
        'classifier__subsample': hp.uniform('classifier__subsample', 0.5, 1.0),
        'classifier__colsample_bytree': hp.uniform('classifier__colsample_bytree', 0.5, 1.0),
        'classifier__gamma': hp.uniform('classifier__gamma', 0, 5)
    }
}

for idx, row in top3.iterrows():
    model_name = row['Model']
    model = models[model_name]
    space = search_spaces.get(model_name)
    if not space:
        print(f"No search space for {model_name}. Skipping.")
        continue

    def objective(params):
        # Convert float hyperparameters to int where needed
        if model_name in ['LightGBM', 'XGBoost']:
            params['classifier__n_estimators'] = int(params['classifier__n_estimators'])
            params['classifier__max_depth'] = int(params['classifier__max_depth'])
            params['classifier__n_estimators'] = int(params['classifier__n_estimators'])
            params['classifier__max_depth'] = int(params['classifier__max_depth'])
        clf = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model.__class__(**{k.split('__')[1]: v for k, v in params.items()}))
        ])
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        recall = recall_score(y_test, y_pred)
        return {'loss': -recall, 'status': STATUS_OK}

    print(f"\n🔎 Hyperopt tuning for {model_name} (max recall)...")
    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=30,
        trials=trials,
        rstate=np.random.default_rng(42)
    )
    print(f"Best hyperparameters for {model_name}: {best}")
    print(f"Best recall: {-trials.best_trial['result']['loss']:.4f}")


🔎 Hyperopt tuning for XGBoost (max recall)...
100%|██████████| 30/30 [01:14<00:00,  2.49s/trial, best loss: -0.44740024183796856]
Best hyperparameters for XGBoost: {'classifier__colsample_bytree': 0.6746284734176358, 'classifier__gamma': 4.967030128376157, 'classifier__learning_rate': 0.07031878961231573, 'classifier__max_depth': 3.0, 'classifier__n_estimators': 409.0, 'classifier__subsample': 0.8616492018371803}
Best recall: 0.4474

🔎 Hyperopt tuning for QDA (max recall)...
  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]




  7%|▋         | 2/30 [00:01<00:22,  1.23trial/s, best loss: -0.0]





 13%|█▎        | 4/30 [00:03<00:20,  1.28trial/s, best loss: -0.4467956469165659]




 17%|█▋        | 5/30 [00:03<00:19,  1.29trial/s, best loss: -0.4467956469165659]




 20%|██        | 6/30 [00:04<00:18,  1.30trial/s, best loss: -0.4467956469165659]




 23%|██▎       | 7/30 [00:05<00:17,  1.30trial/s, best loss: -0.4467956469165659]




 27%|██▋       | 8/30 [00:06<00:16,  1.31trial/s, best loss: -0.4467956469165659]




 30%|███       | 9/30 [00:06<00:15,  1.32trial/s, best loss: -0.4467956469165659]





 37%|███▋      | 11/30 [00:08<00:14,  1.31trial/s, best loss: -0.4472994760177348]





 40%|████      | 12/30 [00:09<00:14,  1.28trial/s, best loss: -0.4472994760177348]




 47%|████▋     | 14/30 [00:11<00:13,  1.21trial/s, best loss: -0.4472994760177348]




 50%|█████     | 15/30 [00:11<00:12,  1.24trial/s, best loss: -0.4472994760177348]





 53%|█████▎    | 16/30 [00:12<00:11,  1.21trial/s, best loss: -0.4472994760177348]




 57%|█████▋    | 17/30 [00:13<00:10,  1.20trial/s, best loss: -0.4472994760177348]




 60%|██████    | 18/30 [00:14<00:09,  1.22trial/s, best loss: -0.4472994760177348]




 63%|██████▎   | 19/30 [00:15<00:08,  1.24trial/s, best loss: -0.4472994760177348]




 67%|██████▋   | 20/30 [00:15<00:07,  1.25trial/s, best loss: -0.4472994760177348]




 70%|███████   | 21/30 [00:16<00:07,  1.27trial/s, best loss: -0.4472994760177348]




 73%|███████▎  | 22/30 [00:17<00:06,  1.28trial/s, best loss: -0.4493147924224103]




 77%|███████▋  | 23/30 [00:18<00:05,  1.30trial/s, best loss: -0.4493147924224103]




 83%|████████▎ | 25/30 [00:19<00:03,  1.29trial/s, best loss: -0.4493147924224103]




 87%|████████▋ | 26/30 [00:20<00:03,  1.29trial/s, best loss: -0.449919387343813] 





 90%|█████████ | 27/30 [00:21<00:02,  1.29trial/s, best loss: -0.449919387343813]




 97%|█████████▋| 29/30 [00:22<00:00,  1.29trial/s, best loss: -0.449919387343813]




100%|██████████| 30/30 [00:23<00:00,  1.29trial/s, best loss: -0.449919387343813]




100%|██████████| 30/30 [00:23<00:00,  1.27trial/s, best loss: -0.449919387343813]
Best hyperparameters for QDA: {'classifier__reg_param': 0.01023471895223894, 'classifier__tol': 0.01491982802401582}
Best recall: 0.4499
