In [14]:
# # preprocess.py
import pandas   as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Reading the dataset
# df = pd.read_csv('/content/data/heart_2020_cleaned_test100.csv')
df = pd.read_csv('https://raw.githubusercontent.com/Zivid99/Alzheimer-s-Disease-Vaster/main/heart_2020_cleaned_test100.csv')
# Checking and Removing Duplicate rows
df = df.drop_duplicates()

#  Data Encoding
encode_AgeCategory = {'55-59':57, '80 or older':80, '65-69':67,
                      '75-79':77,'40-44':42,'70-74':72,'60-64':62,
                      '50-54':52,'45-49':47,'18-24':21,'35-39':37,
                      '30-34':32,'25-29':27}
df['AgeCategory'] = df['AgeCategory'].apply(lambda x: encode_AgeCategory[x])
df['AgeCategory'] = df['AgeCategory'].astype('float')

# Integer encode columns with 2 unique values
for col in ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']:
    if df[col].dtype == 'O':
        le = preprocessing.LabelEncoder()
        df[col] = le.fit_transform(df[col])

# One-hot encode columns with more than 2 unique values
df = pd.get_dummies(df, columns=['Race', 'Diabetic', 'GenHealth', ], prefix = ['Race', 'Diabetic', 'GenHealth'])

# Feature Scaling
standardScaler = preprocessing.StandardScaler()
columns_to_scale = ['BMI', 'PhysicalHealth', 'MentalHealth', 'AgeCategory', 'SleepTime']
df[columns_to_scale] = standardScaler.fit_transform(df[columns_to_scale])

# Undersampling
Y = df['HeartDisease']
X = df.drop(['HeartDisease'], axis = 1)

# Separation of the data set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state = 2)

X_train.to_csv('data/X_train.csv', index = 0)
X_test.to_csv('data/X_test.csv', index = 0)
y_train.to_csv('data/y_train.csv', index = 0)
y_test.to_csv('data/y_test.csv', index = 0)

# XGBoost

In [23]:
from sklearn.model_selection import GridSearchCV
import pandas   as pd
import argparse
import pickle
import time
from sklearn.metrics import accuracy_score,confusion_matrix, precision_score, recall_score, auc,roc_curve,f1_score,roc_auc_score,precision_recall_fscore_support

## for xgboost- new library
from xgboost import XGBClassifier

# Parser for command-line options, arguments and sub-commands
temp =  argparse.ArgumentParser()

# read train set
X_train= pd.read_csv('data/X_train.csv')
y_train= pd.read_csv('data/y_train.csv')

#grid search parameters
xgboost_gs_classifier = XGBClassifier()
gs_params ={'colsample_bytree': [0.4,0.5],
 'gamma': [0.3,0.5],
 'learning_rate': [0.01,0.1],
 'max_depth': [5,10],
 'min_child_weight': [5,9],
 'n_estimators': [200,300]}

# dtree_clf = dtree
# record start time
start_time = time.time()
xgboost_gs_model=GridSearchCV(xgboost_gs_classifier,param_grid=gs_params,scoring='neg_log_loss',n_jobs=-1,cv=3,verbose=3)
xgboost_gs_model.fit(X_train,y_train)

# record end time 
end_time = time.time()
time_cost = end_time - start_time

# save the model
with open('pkl/xgboost_clf.pkl','wb') as f:
    pickle.dump((xgboost_gs_classifier,time_cost), f)

xgb_gs_model_preds=xgboost_gs_model.predict(X_test)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


In [24]:
print('XGBoost-GridSearch')
xgb_gs_model_preds_acc=accuracy_score(y_test,xgb_gs_model_preds)
print('Accuracy score: ',xgb_gs_model_preds_acc)
xgb_gs_model_preds_roc=roc_auc_score(y_test,xgb_gs_model_preds)
print('ROC_AUC_Score: ',xgb_gs_model_preds_roc)

XGBoost-GridSearch
Accuracy score:  0.9090909090909091
ROC_AUC_Score:  0.5
