## 1. Importing Libraries

In [1]:
# Data Loading and Numerical Operations
import pandas as pd
import numpy as np

# visualisasi distribusi untuk fitur kategorikal menggunakan bar chart
import matplotlib.pyplot as plt
import seaborn as sns

# Data Resampling
from sklearn.utils import resample

#Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Data Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, classification_report

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Ensembling
# from mlxtend.classifier import StackingCVClassifier

import warnings
warnings.filterwarnings('always')  # "error", "ignore", "always", "default", "module" or "once"

## 2. Reading Dataset

In [2]:
# Reading and converting the data into a pandas dataframe
df = pd.read_excel("C:/Users/ASUS/JupyterNotebook/Machine Learning/Coronary-Heart-Disease-Prediction-master/CardiacPrediction.xlsx") 

# Calculating the dimensions of the dataset
df.shape 

(37079, 51)

In [3]:
print(df['Age'].max())
print(df['Age'].min())

85
20


In [4]:
df.shape

(37079, 51)

## 3. Exploratory Data Analysis

In [5]:
dataframe = df.drop(columns=['SEQN', 'Gender', 'Annual-Family-Income', 'Ratio-Family-Income-Poverty', 'Height', 'Lymphocyte', 'Monocyte', 'Eosinophils', 'Mean-Cell-Vol', 'Mean-Cell-Hgb-Conc.', 'Segmented-Neutrophils', 'Hematocrit', 'Total-Cholesterol', 'Health-Insurance', 'Vigorous-work', 'Moderate-work', 'Diabetes', 'Blood-Rel-Diabetes', 'Blood-Rel-Stroke'])

In [6]:
dataframe.dtypes

Age                              int64
X60-sec-pulse                    int64
Systolic                         int64
Diastolic                        int64
Weight                         float64
Body-Mass-Index                float64
White-Blood-Cells              float64
Basophils                      float64
Red-Blood-Cells                float64
Hemoglobin                     float64
Mean-cell-Hemoglobin           float64
Platelet-count                 float64
Mean-Platelet-Vol              float64
Red-Cell-Distribution-Width    float64
Albumin                          int64
ALP                              int64
AST                              int64
ALT                              int64
Cholesterol                    float64
Creatinine                     float64
Glucose                        float64
GGT                              int64
Iron                           float64
LDH                              int64
Phosphorus                     float64
Bilirubin                

## 4. Pre-Processing

In [7]:
dataframe.describe()
dataframe.shape

(37079, 32)

## Feature Splitting

In [8]:
# Data Splitting
from sklearn.model_selection import train_test_split

X = dataframe.drop(['CoronaryHeartDisease'], axis=1) #x atribut yang mempengaruhi CoronaryHeartDisease (independent variable)
y = dataframe['CoronaryHeartDisease'] #y atribut label (dependent)
train_x, test_x, train_y, test_y = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=100)

In [9]:
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

(25955, 31)
(11124, 31)
(25955,)
(11124,)


In [10]:
y.value_counts()

0    35571
1     1508
Name: CoronaryHeartDisease, dtype: int64

## Feature Scaling

In [11]:
# Data Scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

## Resampling imbalanced dataset by oversampling positive cases Using SMOTE

In [12]:
#SMOTE
from imblearn.over_sampling import SMOTE

In [13]:
sm = SMOTE(random_state=30)

# sampling SMOTE
x_sampling, y_sampling = sm.fit_resample(train_x, test_x)

# Convert arrays to DataFrame and Series
x_sampling_df = pd.DataFrame(x_sampling, columns=x.columns)
y_sampling_series = pd.Series(y_sampling, name=y.name)

# Combine the resampled minority class with the majority class
resampled_data = pd.concat([x_sampling_df, y_sampling_series], axis=1)

# Print the resampled data
print(resampled_data)

ValueError: Unknown label type: 'continuous-multioutput'

In [None]:
pd.Series(y_sampling_series).value_counts().plot(kind='bar', title='Class distribution after applying SMOTE', xlabel='CoronaryHeartDisease')

In [None]:
are_rows_equal = resampled_data.duplicated().any()

if are_rows_equal:
    print("Ada baris yang sama di antara kedua dataframe.")
else:
    print("Tidak ada baris yang sama di antara kedua dataframe.")

# Mengecek duplikasi setelah resampling
print("Jumlah baris duplikat setelah resampling:", resampled_data.duplicated().sum())

# Menghitung korelasi

In [None]:
#Checking relationship between variables
cor=dataframe.corr()
plt.figure(figsize=(40,30), facecolor='w')
sns.heatmap(cor,xticklabels=cor.columns,yticklabels=cor.columns,annot=True)
plt.title("Correlation among all the Variables of the Dataset", size=20)
cor

In [None]:
# # Deskripsi statistik dasar untuk fitur kategorikal
# print(dataframe.describe())

# # Menghitung jumlah nilai unik untuk setiap fitur kategorikal
# for col in ['Gender', 'Vigorous-work', 'Moderate-work', 'Health-Insurance', 'Diabetes', 'Blood-Rel-Diabetes', 'Blood-Rel-Stroke']:
#     print(f"Jumlah nilai unik di {col}: {dataframe[col].nunique()}")
#     print(dataframe[col].value_counts())
#     print("-----------------")


### a. Univariate Analysis

#### Categorical Features

In [None]:
# categorical_features = ['Gender', 'Vigorous-work', 'Moderate-work', 'Diabetes', 'Blood-Rel-Diabetes', 'Blood-Rel-Stroke']

# for feature in categorical_features:
#     print(feature,':')
#     print(dataframe[feature].value_counts())
#     print("-----------------")

In [None]:
# categorical_features = ['Gender', 'Vigorous-work', 'Moderate-work', 'Diabetes', 'Blood-Rel-Diabetes', 'Blood-Rel-Stroke']
# for feature in categorical_features:
#     plt.figure(figsize=(6, 4))
#     sns.countplot(x=feature, data=dataframe)
#     plt.title(f'Distribusi {feature}')
#     plt.show()

Diantara fitur kategorikal : 
* `Vigorous-work`, `diabetes`, `Blood-Rel-Stroke` sangat tidak seimbang.
* `Moderate-work` untuk kategori 2 memiliki jumlah tertinggi, yang menunjukkan bahwa mayoritas responden tidak melakukan pekerjaan sedang dan dan kategori 3 (kategori Tidak dapat melakukan aktivitas) memiliki jumlah yang sangat sedikit.
* Distribusi `Gender` menunjukkan distribusi yang hampir seimbang antara jenis kelamin dalam sampel.

#### Numerical Features

In [None]:
#visualisasi distribusi untuk fitur numerikal menggunakan histogram.

# numerical_features = ['Age', 'X60-sec-pulse','Systolic', 'Diastolic', 'Weight', 'Body-Mass-Index', 'White-Blood-Cells', 
#                       'Basophils', 'Red-Blood-Cells', 'Hemoglobin', 'Mean-cell-Hemoglobin', 'Platelet-count',
#                       'Mean-Platelet-Vol', 'Red-Cell-Distribution-Width', 'Albumin', 'ALP', 'AST', 'ALT', 
#                       'Cholesterol', 'Creatinine', 'Glucose', 'GGT', 'Iron', 'LDH', 'Phosphorus', 
#                       'Bilirubin', 'Protein', 'Uric.Acid', 'Triglycerides', 'HDL', 'Glycohemoglobin',
#                       'Diabetes', 'Blood-Rel-Diabetes', 'Blood-Rel-Stroke']

# tipe data numerik standar untuk bilangan pecahan dan bilangan bulat.
# numerical_features = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Menghapus fitur kategorikal yang mungkin secara tidak sengaja dianggap sebagai numerik
# numerical_features = [feature for feature in numerical_features if feature not in numerical_features]

# for feature in numerical_features:
#     plt.figure(figsize=(6, 3))
#     sns.histplot(dataframe[feature], kde=True)
#     plt.title(f'Distribusi {feature}')
#     plt.show()


# Feature Selection


In [None]:
#To idenfify the features that have larger contribution towards the outcome variable, CoronaryHeartDisease
X=resampled_data.iloc[:,0:31]
y=resampled_data.iloc[:,-1]
print("X - ", X.shape, "\ny - ", y.shape)
print(X)

In [None]:
#Apply SelectKBest and extract top 10 features
best=SelectKBest(score_func=chi2, k=20)

In [None]:
fit=best.fit(X,y)

In [None]:
data_scores=pd.DataFrame(fit.scores_)
data_columns=pd.DataFrame(X.columns)

In [None]:
#Join the two dataframes
scores=pd.concat([data_columns,data_scores],axis=1)
scores.columns=['Feature','Score']
print(scores.nlargest(20,'Score'))

In [None]:
#To visualize feature selection
scores=scores.sort_values(by="Score", ascending=False)
plt.figure(figsize=(20,7), facecolor='w')
sns.barplot(x='Score',y='Feature',data=scores,palette='BuGn_r')
plt.title("Plot showing the best features in descending order", size=20)
plt.show()

This plot shows the `Features` and their respective `chi-square test` scores

In [None]:
#Select 10 features
features=scores["Feature"].tolist()[:10]
features

In [None]:
data=dataframe[['Age','Uric.Acid','Creatinine','Platelet-count','LDH','X60-sec-pulse','Systolic','Diastolic','ALT','Glucose','CoronaryHeartDisease']]
data.head()

## Target Variable

In [None]:
#Distribution of outcome variable, Heart Disease
plt.figure(figsize=(8, 6), facecolor='w')
plt.subplots_adjust(right=1.5)

# Set the color palette
custom_palette = ["steelBlue", "salmon"]

#first subplot
plt.subplot(121)
sns.countplot(x="CoronaryHeartDisease", data=dataframe, palette=custom_palette)

plt.title("Count distribution of coronaryHeartDisease", size=15)


#first subplot
plt.subplot(122)
labels=[0,1]
plt.pie(data["CoronaryHeartDisease"].value_counts(),autopct="%1.1f%%",labels=labels,colors=["steelBlue","salmon"])

plt.show()


# Predictive Modeling

We use the following different machine learning models for the dataset:

1. Random Forest Classifier
2. Gradient Boosting Classifier

In [None]:
m3 = 'Random Forest Classfier'
rf = RandomForestClassifier(n_estimators=200, random_state=0,max_depth=12)
rf.fit(train_x,train_y)
rf_predicted = rf.predict(test_x)
rf_conf_matrix = confusion_matrix(test_y, rf_predicted)
rf_acc_score = accuracy_score(test_y, rf_predicted)
print("confussion matrix")
print(rf_conf_matrix)
print("\n")
print("Accuracy of Random Forest:",rf_acc_score*100,'\n')
print(classification_report(test_y,rf_predicted))

In [None]:
m5 = 'Gradient Boosting Classifier'
gvc =  GradientBoostingClassifier()
gvc.fit(train_x,train_y)
gvc_predicted = gvc.predict(test_x)
gvc_conf_matrix = confusion_matrix(test_y, gvc_predicted)
gvc_acc_score = accuracy_score(test_y, gvc_predicted)
print("confussion matrix")
print(gvc_conf_matrix)
print("\n")
print("Accuracy of Gradient Boosting Classifier:",gvc_acc_score*100,'\n')
print(classification_report(test_y,gvc_predicted))

## Hyperparameter Tuning for best Classifier
#### Using Randomized Search Cross Validation

### Random Forest Classifier

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 10, 
                               cv = 3, 
                               verbose=2, 
                               random_state=7, 
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(train_x,train_y)

In [None]:
rf_hyper = rf_random.best_estimator_
rf_hyper.fit(train_x,train_y)
print("Accuracy on training set is : {}".format(rf_hyper.score(train_x,train_y)))
print("Accuracy on validation set is : {}".format(rf_hyper.score(test_x, test_y)))
rf_predicted = rf_hyper.predict(test_x)
rf_acc_score = accuracy_score(test_y, rf_predicted)
print("Accuracy of Hyper-tuned Random Forest Classifier:",rf_acc_score*100,'\n')
print(classification_report(test_y, rf_predicted))

### Gradient Boosting Classifier

In [None]:
#Number of trees
n_estimators = [int(i) for i in np.linspace(start=100,stop=1000,num=10)]
#Number of features to consider at every split
max_features = ['auto','sqrt']
#Maximum number of levels in tree
max_depth = [int(i) for i in np.linspace(10, 100, num=10)]
max_depth.append(None)
#Minimum number of samples required to split a node
min_samples_split=[2,5,10]
#Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4]

#Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [None]:
gb=GradientBoostingClassifier(random_state=0)
#Random search of parameters, using 3 fold cross validation, 
#search across 100 different combinations
gb_random = RandomizedSearchCV(estimator=gb, param_distributions=random_grid,
                              n_iter=10, scoring='f1', 
                              cv=3, verbose=2, random_state=0, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
gb_random.fit(train_x,train_y)

In [None]:
gb_hyper = gb_random.best_estimator_
gb_hyper.fit(train_x,train_y)
print("Accuracy on training set is : {}".format(gb_hyper.score(train_x,train_y)))
print("Accuracy on validation set is : {}".format(gb_hyper.score(test_x, test_y)))
gbc_predicted = gb_hyper.predict(test_x)
gbc_acc_score = accuracy_score(test_y, gbc_predicted)
print("Accuracy of Hyper-tuned Gradient Boosting Classifier:",gbc_acc_score*100,'\n')
print(classification_report(test_y, gbc_predicted))

In [None]:
#Number of neighbors
n_neighbors = np.arange(1, 10)
#Number of weights
weights = ['uniform','distance']
#metric
metric = ['euclidean', 'manhattan', 'minkowski']

#Create the random grid
random_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Berbagai nilai untuk n_neighbors
    'weights': ['uniform', 'distance'],  # Pilihan untuk jenis bobot
#     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Pilihan untuk algoritma
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Pilihan metrik jarak
#     'p': [1, 2],  # Nilai p untuk metrik Minkowski
}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
knn = KNeighborsClassifier()

# Inisialisasi RandomizedSearchCV
knn_random = RandomizedSearchCV(estimator=knn, param_distributions=random_grid,
                                   n_iter=100,  # Jumlah kombinasi yang akan diuji
                                   scoring='f1',  # Metrik evaluasi yang digunakan
                                   cv=3,  # Jumlah lipatan validasi silang (cross-validation)
                                   verbose=2,  # Tingkat verbosity
                                   n_jobs=-1,  # Menggunakan semua CPU yang tersedia
                                   random_state=0,  # Seed untuk reproduksi
                                   return_train_score=True)

# Melatih model dengan pencarian acak
knn_random.fit(train_x, train_y)

In [None]:
knn_hyper = knn_random.best_estimator_
knn_hyper.fit(train_x,train_y)
print("Accuracy on training set is : {}".format(knn_hyper.score(train_x,train_y)))
print("Accuracy on validation set is : {}".format(knn_hyper.score(test_x, test_y)))
knnc_predicted = knn_hyper.predict(test_x)
knnc_acc_score = accuracy_score(test_y, knnc_predicted)
print("Accuracy of Hyper-tuned K Nearest Neighbor:",knn_acc_score*100,'\n')
print(classification_report(test_y, knnc_predicted))

In [None]:
lr_false_positive_rate,lr_true_positive_rate,lr_threshold = roc_curve(test_y,lr_predict)
knn_false_positive_rate,knn_true_positive_rate,knn_threshold = roc_curve(test_y,knn_predict)
rf_false_positive_rate,rf_true_positive_rate,rf_threshold = roc_curve(test_y,rf_predicted)                                                             
dt_false_positive_rate,dt_true_positive_rate,dt_threshold = roc_curve(test_y,dt_predicted)
gbc_false_positive_rate,gbc_true_positive_rate,gbc_threshold = roc_curve(test_y,gbc_predicted)


sns.set_style('whitegrid')
plt.figure(figsize=(15,8), facecolor='w')
plt.title('Reciever Operating Characterstic Curve')
plt.plot(lr_false_positive_rate,lr_true_positive_rate,label='Logistic Regression')
plt.plot(knn_false_positive_rate,knn_true_positive_rate,label='K-Nearest Neighbor')
plt.plot(rf_false_positive_rate,rf_true_positive_rate,label='Random Forest')
plt.plot(dt_false_positive_rate,dt_true_positive_rate,label='Desion Tree')
plt.plot(gbc_false_positive_rate,gbc_true_positive_rate,label='Gradient Boosting Classifier')
plt.plot([0,1],ls='--')
plt.plot([0,0],[1,0],c='.5')
plt.plot([1,1],c='.5')
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.legend()
plt.show()

# **Model Evaluation**

In [None]:
model_ev = pd.DataFrame({'Model': ['Logistic Regression','K-Nearest Neighbour','Random Forest',
                                   'Decision Tree','Gradient Boosting'], 'Accuracy': [lr_acc_score*100, knn_acc_score*100, 
                                                                                      rf_acc_score*100, dt_acc_score*100,gbc_acc_score*100]})
model_ev

In [None]:
colors = ['red','green','blue','gold','silver']
plt.figure(figsize=(15,8), facecolor='w')
plt.title("Barplot Representing Accuracy of different models")
plt.ylabel("Accuracy %")
plt.xlabel("Models")
plt.bar(model_ev['Model'],model_ev['Accuracy'],color = colors)
plt.show()

# Ensembling

In order to increase the accuracy of the model we use ensembling. Here we use stacking technique.

In [None]:
scv=StackingCVClassifier(classifiers=[rf_hyper, gb_hyper, knn], meta_classifier= rf)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.4, random_state=1)
scv.fit(train_x.values,train_y.values)
scv_predicted = scv.predict(test_x)
scv_conf_matrix = confusion_matrix(test_y, scv_predicted)
scv_acc_score = accuracy_score(test_y, scv_predicted)
scv_rec_score = recall_score(test_y, scv_predicted)
print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:",scv_acc_score*100,'\n')
print("Recall of StackingCVClassifier:",scv_rec_score*100,'\n')

print(classification_report(test_y,scv_predicted))

In [None]:
scv=StackingCVClassifier(classifiers=[rf_hyper, gb_hyper], meta_classifier= rf)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.4, random_state=1)
scv.fit(train_x.values,train_y.values)
scv_predicted = scv.predict(test_x)
scv_conf_matrix = confusion_matrix(test_y, scv_predicted)
scv_acc_score = accuracy_score(test_y, scv_predicted)
scv_rec_score = recall_score(test_y, scv_predicted)
print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:",scv_acc_score*100,'\n')
print("Recall of StackingCVClassifier:",scv_rec_score*100,'\n')

print(classification_report(test_y,scv_predicted))