In [16]:
# Importing necessary libraries
import pandas as pd 
import seaborn as sns 
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
import missingno as msno
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import accuracy_score, log_loss, cohen_kappa_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.multioutput import MultiOutputClassifier

In [17]:
train_data = pd.read_csv('../data/processed/Training_processed.csv')

test_data = pd.read_csv('../data/processed/Testing_processed.csv')

train_data.head(10) # looking at the first few rows of data 
total_columns = len(train_data.columns)
total_rows = len(train_data['prognosis'])
prognoses = train_data['prognosis'].unique().tolist()

print(f"The training dataset has a total of {total_rows} rows and {total_columns} columns. This means that there are {total_columns - 1} predictor variables. \nwithin the prognosis column, there are {len(prognoses)} diseases included.")

The training dataset has a total of 4920 rows and 132 columns. This means that there are 131 predictor variables. 
within the prognosis column, there are 41 diseases included.


In [18]:
# drop column with no value
#train_data = train_data.drop('Unnamed: 133', axis=1)

In [19]:
# Encode the categorical target for correlation analysis
le = LabelEncoder()
train_data['disease_encoded'] = le.fit_transform(train_data['prognosis'])

In [20]:
train_data['disease_encoded'].unique()

array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
       11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
       31,  5,  0,  2, 38, 35, 27])

In [21]:
train_data['disease_encoded'] = train_data['disease_encoded'].astype(float)

In [22]:
cat_cols = ['prognosis']

In [23]:
# drop column with no value
#train_data = train_data.drop('Unnamed: 133', axis=1)

In [24]:
train_data.shape

(4920, 133)

In [25]:
pipeline_A = Pipeline([
    ('classifier', DecisionTreeClassifier())
])
pipeline_A

In [26]:
pipeline_B = Pipeline([
    ('classifier', RandomForestClassifier())
])
pipeline_B

In [27]:
pipeline_C = Pipeline([
    ('classifier', XGBClassifier())
])
pipeline_C

In [28]:
pipeline_D = Pipeline([
    ('classifier', OneVsRestClassifier(LogisticRegression(max_iter=1000)))
])
pipeline_D

In [29]:
pipeline_E = Pipeline([
    ('classifier', SVC())
])
pipeline_E

In [30]:
X_train = train_data.drop(columns=['prognosis'])

Y_train = train_data['prognosis']

X_test = test_data.drop(columns=['prognosis'])

Y_test = test_data['prognosis']

scoring = ['neg_log_loss', 'roc_auc', 'f1', 'accuracy', 'precision', 'recall']

In [31]:
# Custom target transformer (Label Encoding for classification task)
class CustomTargetTransformer:
    def fit(self, y):
        # Fit the LabelEncoder to the target variable
        self.encoder = LabelEncoder()
        self.encoder.fit(y)
        return self

    def transform(self, y):
        # Transform the target variable to encoded values
        return self.encoder.transform(y)

    def inverse_transform(self, y):
        # Inverse transform to get the original target variable back
        return self.encoder.inverse_transform(y)

Best for: When you want to treat all classes equally, regardless of their size.

Explanation: The macro average computes the metric (precision, recall, or F1) for each class independently, and then averages these scores. This is useful when the classes are of equal importance.

Use Case: If you want to ensure that your model performs equally well across all classes, without regard to class distribution, the macro average may be the best choice.

For now we used f1_weighted as the metric.

In [32]:
param_grid_A = {
    'classifier__max_depth': [10, 20, 30],
    'classifier__max_features': ['sqrt', 'log2']
}

grid_search_A = GridSearchCV(
    pipeline_A,
    param_grid_A, cv=5,
    scoring= 'f1_weighted',
    refit= True
)

# Apply the transformation to the target variable (Y_train) outside of the pipeline
target_transformer = CustomTargetTransformer()
target_transformer.fit(Y_train)

# Fit the target transformer on Y_train and transform it
Y_train_transformed = target_transformer.transform(Y_train)

grid_search_A.fit(X_train, Y_train_transformed)
model_A_df = pd.DataFrame(grid_search_A.cv_results_)
model_A_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,param_classifier__max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01384,0.003768,0.005217,0.00151,10,sqrt,"{'classifier__max_depth': 10, 'classifier__max...",0.242581,0.326952,0.269345,0.416585,0.32878,0.316849,0.05994,5
1,0.008516,0.00094,0.003323,0.000651,10,log2,"{'classifier__max_depth': 10, 'classifier__max...",0.376362,0.306855,0.248659,0.269866,0.245427,0.289434,0.048666,6
2,0.01064,0.00177,0.003886,0.000711,20,sqrt,"{'classifier__max_depth': 20, 'classifier__max...",0.731001,0.586502,0.699246,0.637195,0.703252,0.671439,0.052383,4
3,0.008511,0.000797,0.003081,0.00042,20,log2,"{'classifier__max_depth': 20, 'classifier__max...",0.827294,0.592759,0.722822,0.588076,0.770325,0.700255,0.095601,3
4,0.010672,0.000674,0.003768,0.000637,30,sqrt,"{'classifier__max_depth': 30, 'classifier__max...",0.821868,0.82893,0.922987,1.0,0.906504,0.896058,0.0658,1
5,0.011453,0.003235,0.004087,0.000597,30,log2,"{'classifier__max_depth': 30, 'classifier__max...",0.794625,0.903895,0.869491,0.96748,0.912195,0.889537,0.056929,2


In [33]:
grid_search_A.best_params_

{'classifier__max_depth': 30, 'classifier__max_features': 'sqrt'}

In [34]:
grid_search_A.best_estimator_

In [35]:
param_grid_B = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [10, 20],
    'classifier__max_features': ['sqrt', 'log2']
}

grid_search_B = GridSearchCV(
    pipeline_B,
    param_grid_B, cv=5,
    scoring= 'f1_weighted',
    refit=True
)


# Apply the transformation to the target variable (Y_train) outside of the pipeline
target_transformer = CustomTargetTransformer()
target_transformer.fit(Y_train)

# Fit the target transformer on Y_train and transform it
Y_train_transformed = target_transformer.transform(Y_train)

grid_search_B.fit(X_train, Y_train_transformed)
model_B_df = pd.DataFrame(grid_search_B.cv_results_)
model_B_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,param_classifier__max_features,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.116939,0.014363,0.007604,0.002528,10,sqrt,50,"{'classifier__max_depth': 10, 'classifier__max...",1.0,1.0,0.998983,1.0,1.0,0.999797,0.000407,7
1,0.207218,0.005978,0.009473,0.000752,10,sqrt,100,"{'classifier__max_depth': 10, 'classifier__max...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
2,0.098711,0.013703,0.006296,0.000913,10,log2,50,"{'classifier__max_depth': 10, 'classifier__max...",1.0,1.0,0.998983,1.0,1.0,0.999797,0.000407,7
3,0.179103,0.011959,0.008957,0.000721,10,log2,100,"{'classifier__max_depth': 10, 'classifier__max...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
4,0.1377,0.01664,0.007214,0.000542,20,sqrt,50,"{'classifier__max_depth': 20, 'classifier__max...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
5,0.267336,0.020938,0.011439,0.000725,20,sqrt,100,"{'classifier__max_depth': 20, 'classifier__max...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
6,0.121345,0.008669,0.007097,0.000402,20,log2,50,"{'classifier__max_depth': 20, 'classifier__max...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
7,0.219267,0.013223,0.010584,0.000551,20,log2,100,"{'classifier__max_depth': 20, 'classifier__max...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1


In [36]:
grid_search_B.best_params_

{'classifier__max_depth': 10,
 'classifier__max_features': 'sqrt',
 'classifier__n_estimators': 100}

In [37]:
grid_search_B.best_estimator_

In [38]:
param_grid_C = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 20],
    #'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    #'classifier__subsample': [0.5, 0.7, 1.0],
    #'classifier__colsample_bytree': [0.5, 0.7, 1.0]
}

grid_search_C = GridSearchCV(
    pipeline_C,
    param_grid_C, cv=5,
    scoring= 'f1_weighted',
    refit=True
)

grid_search_C.fit(X_train, Y_train_transformed)
model_C_df = pd.DataFrame(grid_search_C.cv_results_)
model_C_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.508109,0.068685,0.019212,0.002551,10,50,"{'classifier__max_depth': 10, 'classifier__n_e...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
1,0.783681,0.018214,0.018728,0.001209,10,100,"{'classifier__max_depth': 10, 'classifier__n_e...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
2,1.476934,0.075167,0.019885,0.002011,10,200,"{'classifier__max_depth': 10, 'classifier__n_e...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
3,0.470128,0.029791,0.017982,0.001146,20,50,"{'classifier__max_depth': 20, 'classifier__n_e...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
4,0.834051,0.033222,0.020436,0.002445,20,100,"{'classifier__max_depth': 20, 'classifier__n_e...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
5,1.531376,0.146526,0.01963,0.002363,20,200,"{'classifier__max_depth': 20, 'classifier__n_e...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1


In [39]:
grid_search_C.best_params_

{'classifier__max_depth': 10, 'classifier__n_estimators': 50}

In [40]:
grid_search_C.best_estimator_

In [41]:
param_grid_D = {
    #'classifier__n_estimators': [50, 100, 200, 500],
    #'classifier__max_depth': [None, 10, 20, 30],
    #'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    #'classifier__subsample': [0.5, 0.7, 1.0],
    #'classifier__colsample_bytree': [0.5, 0.7, 1.0],
    'classifier__estimator__solver': ['liblinear'],
    'classifier__estimator__penalty': ['l1', 'l2'],
    'classifier__estimator__C': [0.01, 0.1, 1]
}

grid_search_D = GridSearchCV(
    pipeline_D,
    param_grid_D, cv=5,
    scoring= {'f1_macro': 'f1_macro', 'roc_auc_ovr': 'roc_auc_ovr'},
    refit='roc_auc_ovr'
)

grid_search_D.fit(X_train, Y_train_transformed)
model_D_df = pd.DataFrame(grid_search_D.cv_results_)
model_D_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__estimator__C,param_classifier__estimator__penalty,param_classifier__estimator__solver,params,split0_test_f1_macro,split1_test_f1_macro,...,std_test_f1_macro,rank_test_f1_macro,split0_test_roc_auc_ovr,split1_test_roc_auc_ovr,split2_test_roc_auc_ovr,split3_test_roc_auc_ovr,split4_test_roc_auc_ovr,mean_test_roc_auc_ovr,std_test_roc_auc_ovr,rank_test_roc_auc_ovr
0,0.353402,0.01541,0.145722,0.013217,0.01,l1,liblinear,"{'classifier__estimator__C': 0.01, 'classifier...",0.002217,0.002217,...,0.0,6,0.803659,0.803659,0.804268,0.803659,0.803659,0.80378,0.000244,6
1,0.376259,0.008623,0.137794,0.002304,0.01,l2,liblinear,"{'classifier__estimator__C': 0.01, 'classifier...",0.996939,0.997964,...,0.001189,5,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
2,0.398058,0.005435,0.137317,0.005963,0.1,l1,liblinear,"{'classifier__estimator__C': 0.1, 'classifier_...",1.0,0.996939,...,0.001224,4,0.999987,0.999994,0.999999,1.0,1.0,0.999996,5e-06,5
3,0.40146,0.014066,0.148097,0.015758,0.1,l2,liblinear,"{'classifier__estimator__C': 0.1, 'classifier_...",1.0,1.0,...,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
4,0.474206,0.034531,0.137723,0.005144,1.0,l1,liblinear,"{'classifier__estimator__C': 1, 'classifier__e...",1.0,1.0,...,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
5,0.431799,0.017063,0.140299,0.010444,1.0,l2,liblinear,"{'classifier__estimator__C': 1, 'classifier__e...",1.0,1.0,...,0.0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1


In [42]:
grid_search_D.best_params_

{'classifier__estimator__C': 0.01,
 'classifier__estimator__penalty': 'l2',
 'classifier__estimator__solver': 'liblinear'}

In [43]:
grid_search_D.best_estimator_

In [44]:
param_grid_E = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto']
}

grid_search_E = GridSearchCV(
    pipeline_E,
    param_grid_E, cv=5,
    scoring= 'f1_weighted',
    refit=True
)

grid_search_E.fit(X_train, Y_train_transformed)
model_E_df = pd.DataFrame(grid_search_E.cv_results_)
model_E_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__gamma,param_classifier__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.077166,0.003327,0.018201,0.000983,0.1,scale,linear,"{'classifier__C': 0.1, 'classifier__gamma': 's...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
1,0.56451,0.009192,0.443018,0.007228,0.1,scale,rbf,"{'classifier__C': 0.1, 'classifier__gamma': 's...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
2,0.073161,0.002228,0.017744,0.000589,0.1,auto,linear,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
3,0.343557,0.009895,0.425804,0.00637,0.1,auto,rbf,"{'classifier__C': 0.1, 'classifier__gamma': 'a...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
4,0.074322,0.002982,0.016232,0.000926,1.0,scale,linear,"{'classifier__C': 1, 'classifier__gamma': 'sca...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
5,0.179248,0.007736,0.262086,0.007915,1.0,scale,rbf,"{'classifier__C': 1, 'classifier__gamma': 'sca...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
6,0.073584,0.003674,0.017188,0.001502,1.0,auto,linear,"{'classifier__C': 1, 'classifier__gamma': 'aut...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
7,0.115121,0.001208,0.105098,0.008019,1.0,auto,rbf,"{'classifier__C': 1, 'classifier__gamma': 'aut...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
8,0.073486,0.002101,0.016286,0.000813,10.0,scale,linear,"{'classifier__C': 10, 'classifier__gamma': 'sc...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
9,0.10237,0.003945,0.054091,0.001909,10.0,scale,rbf,"{'classifier__C': 10, 'classifier__gamma': 'sc...",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1


In [45]:
grid_search_E.best_params_

{'classifier__C': 0.1,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear'}

In [46]:
grid_search_E.best_estimator_

In [47]:
best_model = grid_search_D.best_estimator_
best_model

In [48]:
import pickle

# Save the best model to a pickle file
with open("best_model.pkl", "wb") as file:
    pickle.dump(best_model, file)

print("Best model saved to best_model.pkl")

Best model saved to best_model.pkl


In [49]:
Y_train = pd.get_dummies(Y_train).values

In [50]:
print(Y_train.shape)

(4920, 41)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import shap

# Assuming your dataset is loaded into a DataFrame `df`
# with 'target' as the column for prognosis and symptoms as feature columns

# Initialize and train the RandomForest model
classifier = MultiOutputClassifier(RandomForestClassifier(random_state=42))
classifier.fit(X_train, Y_train)

# Use SHAP to explain the model's predictions
explainer = shap.TreeExplainer(classifier)
shap_values = explainer.shap_values(X_test)

# Display feature importance for each class label
for i, class_name in enumerate(classifier.classes_):
    print(f"\nFeature importances for label: {class_name}")
    shap.summary_plot(shap_values[i].T, X_test, plot_type="bar", max_display=5)

# Optionally display summary plot for more detailed insights
#shap.summary_plot(shap_values, X_test)