# Model Selection

## Relevant Modules

In [None]:
import os 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss, cohen_kappa_score, f1_score
from sklearn.model_selection import cross_validate, GridSearchCV, RepeatedStratifiedKFold
from xgboost import XGBClassifier

In [4]:
os.chdir('c:\\Users\\JustinOkeke\\disease_prediction')

In [49]:
# Data has already been processed, so split it into train/test 
np.random.seed(1)
diseases_test = pd.read_csv('data/processed/Training_processed.csv')
diseases_train = pd.read_csv('data/processed/Testing_processed.csv')

predictors = diseases_test.columns.difference(['prognosis'])

x_train, y_train = diseases_train[predictors], diseases_train['prognosis']

x_test, y_test = diseases_test[predictors], diseases_test['prognosis']

### Target Variable Transformation

In [50]:
# Target Transformation Functions

# Step 1: Fit function for target transformation
def fit_label_encoder(y):
    encoder = LabelEncoder()
    encoder.fit(y)
    return encoder

# Step 2: Transform function for target transformation
def transform_target(encoder, y):
    return encoder.transform(y)

# Step 3: Inverse transform function for decoding predictions
def inverse_transform_target(encoder, encoded_y):
    return encoder.inverse_transform(encoded_y)

## Pipeline 1: Decision Tree Classifier

In [51]:
# Decision Tree Classifier  

pipeline1 = Pipeline(
    [
        ('classifier', DecisionTreeClassifier(random_state=40))
    ]
)

pipeline1

In [11]:
# Encode the target variable
target_encoder = fit_label_encoder(y_train)
y_train_encoded = transform_target(target_encoder, y_train)


# Hyperparameter Tuning
param_grid_A = {
    'classifier__max_depth': [10, 20, 30],                                                
    'classifier__max_features': ['sqrt', 'log2'] 
                                                
}
# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'f1_weighted': 'f1_weighted',
    'roc_auc_ovr': 'roc_auc_ovr'
}

# GridSearchCV for Decision Tree (example)
grid_search_A = GridSearchCV(
    pipeline1,               # Replace with your pipeline (e.g., Decision Tree or SVM)
    param_grid_A,             # Replace with the parameter grid for the respective model
    scoring=scoring,          # Use multiple scoring metrics
    refit='accuracy',         # Refit the model using the best accuracy
    cv=2,                     # Cross-validation with 5 folds
    n_jobs=-1                 # Use all available CPU cores
)
#grid_search_A = GridSearchCV(
 #   pipeline_A,          # The pipeline containing the model and any preprocessing steps
  #  param_grid_A,        # Hyperparameters to tune, defined in the parameter grid
   # cv=5,                # Number of cross-validation folds (5-fold cross-validation)
    #scoring='accuracy',  # Metric used to evaluate model performance (accuracy score)
    #refit=True           # Refits the best model on the entire training dataset after tuning
#)


# Perform GridSearchCV
grid_search_A.fit(x_train, y_train_encoded)

# Get Best Results
best_model_A = grid_search_A.best_estimator_
print("Best Parameters:", grid_search_A.best_params_)
print("Best Cross-Validated Accuracy:", grid_search_A.best_score_)

model_A_dt = pd.DataFrame(grid_search_A.cv_results_)
model_A_dt



Best Parameters: {'classifier__max_depth': 20, 'classifier__max_features': 'sqrt'}
Best Cross-Validated Accuracy: 0.023809523809523808


  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,param_classifier__max_features,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1_weighted,split1_test_f1_weighted,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted,split0_test_roc_auc_ovr,split1_test_roc_auc_ovr,mean_test_roc_auc_ovr,std_test_roc_auc_ovr,rank_test_roc_auc_ovr
0,0.005514,0.0005013943,0.023561,0.000501,10,sqrt,"{'classifier__max_depth': 10, 'classifier__max...",0.0,0.0,0.0,0.0,5,0.0,0.0,0.0,0.0,5,0.459524,0.5,0.479762,0.020238,6
1,0.007519,0.001504183,0.025319,0.000253,10,log2,"{'classifier__max_depth': 10, 'classifier__max...",0.0,0.0,0.0,0.0,5,0.0,0.0,0.0,0.0,5,0.467857,0.507143,0.4875,0.019643,5
2,0.01153,0.005514145,0.023315,0.000748,20,sqrt,"{'classifier__max_depth': 20, 'classifier__max...",0.0,0.047619,0.02381,0.02381,1,0.0,0.009524,0.004762,0.004762,1,0.475,0.5,0.4875,0.0125,3
3,0.007519,0.001504183,0.019805,0.001758,20,log2,"{'classifier__max_depth': 20, 'classifier__max...",0.0,0.047619,0.02381,0.02381,1,0.0,0.007937,0.003968,0.003968,3,0.5,0.525,0.5125,0.0125,1
4,0.006015,1.192093e-07,0.025066,0.002006,30,sqrt,"{'classifier__max_depth': 30, 'classifier__max...",0.0,0.047619,0.02381,0.02381,1,0.0,0.009524,0.004762,0.004762,1,0.475,0.5,0.4875,0.0125,3
5,0.005514,0.0005015135,0.022254,0.000806,30,log2,"{'classifier__max_depth': 30, 'classifier__max...",0.0,0.047619,0.02381,0.02381,1,0.0,0.007937,0.003968,0.003968,3,0.5,0.525,0.5125,0.0125,1


In [12]:
grid_search_A.best_estimator_

In [14]:
pipeline1.fit(x_train, y_train)

In [15]:
y_pred_train = pipeline1.predict(x_train)
y_pred_test = pipeline1.predict(x_test)
y_proba_train = pipeline1.predict_proba(x_train)
y_proba_test = pipeline1.predict_proba(x_test)

In [16]:
res = {'accuracy_store_train': accuracy_score(y_train, y_pred_train),
'accuracy_store_test': accuracy_score(y_test, y_pred_test),
'f1_score_train': f1_score(y_train, y_pred_train, average='weighted'),
'f1_score_test': f1_score(y_test, y_pred_test, average='weighted')}

res

{'accuracy_store_train': 1.0,
 'accuracy_store_test': 0.9292682926829269,
 'f1_score_train': np.float64(1.0),
 'f1_score_test': np.float64(0.9352315123936827)}

## Pipeline 2: Logistic Regression Classifier

In [22]:
pipeline2 = Pipeline(
    [
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ('logreg', LogisticRegression(max_iter=500))
    ]
)

pipeline2

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
import pandas as pd

# Define hyperparameter grid
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']  # L2 regularization is compatible with these solvers
c_values = [100, 10, 1.0, 0.1, 0.01]

param_grid_B = {
    'solver': solvers,
    'penalty': penalty,
    'C': c_values
}

# Define cross-validation strategy
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=40)

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'f1_weighted': 'f1_weighted',
    'roc_auc_ovr': 'roc_auc_ovr'
}

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)  # Ensure enough iterations for convergence

# GridSearchCV for Logistic Regression
grid_search_B = GridSearchCV(
    estimator=logistic_model,  # Replace with your pipeline if needed
    param_grid=param_grid_B, 
    scoring=scoring,
    refit='accuracy',  # Refit the model using the best accuracy
    cv=cv,  # Use the defined cross-validation strategy
    n_jobs=-1,  # Use all available CPU cores
    error_score='raise'  # Raise errors instead of using 0 or NaN
)

# Perform GridSearchCV
grid_search_B.fit(x_train, y_train)





In [29]:
# Get Best Results
best_model_B = grid_search_B.best_estimator_
print("Best Parameters:", grid_search_B.best_params_)
print("Best Cross-Validated Accuracy:", grid_search_B.best_score_)

# Convert cross-validation results to DataFrame for inspection
model_B_dt = pd.DataFrame(grid_search_B.cv_results_)
model_B_dt


Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Best Cross-Validated Accuracy: 0.03333333333333333


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,split5_test_accuracy,split6_test_accuracy,split7_test_accuracy,split8_test_accuracy,split9_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_f1_weighted,split1_test_f1_weighted,split2_test_f1_weighted,split3_test_f1_weighted,split4_test_f1_weighted,split5_test_f1_weighted,split6_test_f1_weighted,split7_test_f1_weighted,split8_test_f1_weighted,split9_test_f1_weighted,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted,split0_test_roc_auc_ovr,split1_test_roc_auc_ovr,split2_test_roc_auc_ovr,split3_test_roc_auc_ovr,split4_test_roc_auc_ovr,split5_test_roc_auc_ovr,split6_test_roc_auc_ovr,split7_test_roc_auc_ovr,split8_test_roc_auc_ovr,split9_test_roc_auc_ovr,mean_test_roc_auc_ovr,std_test_roc_auc_ovr,rank_test_roc_auc_ovr
0,0.006833,0.000957,0.015629,0.000733,100.0,l2,newton-cg,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.02381,0.0,0.02381,0.0,0.02381,0.02381,0.02381,0.016667,0.010911,7,0.480952,0.454762,0.47381,0.466667,0.47381,0.466667,0.47381,0.466667,0.480952,0.454762,0.469286,0.00881,4
1,0.010133,0.00561,0.016899,0.00251,100.0,l2,lbfgs,"{'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.02381,0.0,0.02381,0.0,0.02381,0.02381,0.02381,0.016667,0.010911,7,0.478571,0.461905,0.471429,0.469048,0.471429,0.469048,0.471429,0.469048,0.478571,0.461905,0.470238,0.005351,3
2,0.004326,0.000615,0.027892,0.007672,100.0,l2,liblinear,"{'C': 100, 'penalty': 'l2', 'solver': 'libline...",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.02381,0.0,0.02381,0.0,0.02381,0.02381,0.02381,0.016667,0.010911,7,0.47619,0.469048,0.469048,0.47619,0.469048,0.47619,0.469048,0.47619,0.47619,0.469048,0.472619,0.003571,1
3,0.007504,0.000718,0.02076,0.005292,10.0,l2,newton-cg,"{'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.02381,0.0,0.02381,0.0,0.02381,0.02381,0.02381,0.016667,0.010911,7,0.471429,0.457143,0.469048,0.471429,0.469048,0.471429,0.469048,0.471429,0.471429,0.457143,0.467857,0.005455,5
4,0.008638,0.001749,0.019186,0.005932,10.0,l2,lbfgs,"{'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.02381,0.0,0.02381,0.0,0.02381,0.02381,0.02381,0.016667,0.010911,7,0.478571,0.464286,0.469048,0.471429,0.469048,0.471429,0.469048,0.471429,0.478571,0.464286,0.470714,0.004647,2
5,0.004192,0.000745,0.021837,0.009685,10.0,l2,liblinear,"{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.02381,0.0,0.02381,0.0,0.02381,0.02381,0.02381,0.016667,0.010911,7,0.461905,0.454762,0.452381,0.457143,0.452381,0.457143,0.452381,0.457143,0.461905,0.454762,0.45619,0.003401,8
6,0.00753,0.000932,0.018746,0.004669,1.0,l2,newton-cg,"{'C': 1.0, 'penalty': 'l2', 'solver': 'newton-...",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.031746,0.0,0.031746,0.0,0.031746,0.02381,0.02381,0.019048,0.012895,5,0.469048,0.461905,0.464286,0.461905,0.464286,0.461905,0.464286,0.461905,0.469048,0.461905,0.464048,0.002704,6
7,0.008377,0.000818,0.01764,0.003446,1.0,l2,lbfgs,"{'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.031746,0.0,0.031746,0.0,0.031746,0.02381,0.02381,0.019048,0.012895,5,0.469048,0.461905,0.464286,0.461905,0.464286,0.461905,0.464286,0.461905,0.469048,0.461905,0.464048,0.002704,6
8,0.00466,0.001151,0.016693,0.001439,1.0,l2,liblinear,"{'C': 1.0, 'penalty': 'l2', 'solver': 'libline...",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.02381,0.02381,0.0,0.02381,0.0,0.02381,0.0,0.02381,0.02381,0.02381,0.016667,0.010911,7,0.454762,0.45,0.445238,0.457143,0.445238,0.457143,0.445238,0.457143,0.454762,0.45,0.451667,0.004885,9
9,0.006631,0.002519,0.017208,0.00128,0.1,l2,newton-cg,"{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-...",0.047619,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.047619,0.047619,0.033333,0.021822,1,0.031746,0.02381,0.0,0.031746,0.0,0.031746,0.0,0.031746,0.031746,0.02381,0.020635,0.013838,1,0.445238,0.452381,0.440476,0.445238,0.440476,0.445238,0.440476,0.445238,0.445238,0.452381,0.445238,0.004124,13


In [30]:
pipeline2.fit(x_train, y_train)

In [31]:
y_pred_train2 = pipeline2.predict(x_train)
y_pred_test2 = pipeline2.predict(x_test)

In [32]:
res2 = {'accuracy_store_train': accuracy_score(y_train, y_pred_train2),
'accuracy_store_test': accuracy_score(y_test, y_pred_test2),
'f1_score_train': f1_score(y_train, y_pred_train2, average='weighted'),
'f1_score_test': f1_score(y_test, y_pred_test2, average='weighted')}

res2

{'accuracy_store_train': 1.0,
 'accuracy_store_test': 1.0,
 'f1_score_train': np.float64(1.0),
 'f1_score_test': np.float64(1.0)}