### Import Libraries

In [67]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from google.colab import drive

### Data Extraction

In [68]:
# Mount drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [69]:
# Import csv data into dataframe
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Heart Failure Competition/train.csv')

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,PatientId
0,62,M,ATA,131,0,0,Normal,130,N,0.1,Up,0,474
1,60,M,ASY,135,0,0,Normal,63,Y,0.5,Up,1,370
2,50,M,ASY,150,215,0,Normal,140,Y,0.0,Up,0,205
3,61,F,ATA,140,298,1,Normal,120,Y,0.0,Up,0,544
4,53,M,ASY,125,0,1,Normal,120,N,1.5,Up,1,302


### Data Exploration

In [70]:
# Description of Data
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,PatientId
count,688.0,688.0,688.0,688.0,688.0,688.0,688.0,688.0
mean,53.380814,132.90407,200.460756,0.238372,136.594477,0.901017,0.565407,451.77907
std,9.40069,18.025038,110.493566,0.426398,25.67958,1.061356,0.496064,260.959924
min,28.0,92.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,47.0,120.0,177.0,0.0,119.0,0.0,0.0,230.75
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0,446.5
75%,60.0,140.0,267.25,0.0,155.0,1.5,1.0,681.25
max,77.0,200.0,603.0,1.0,202.0,4.4,1.0,916.0


In [71]:
# Frequency Table
print("Frequency of Chest Pain Types")
print(df['ChestPainType'].value_counts())

print("\n\nFrequency of ST Slope Values")
print(df['ST_Slope'].value_counts())

print("\n\nFrequency of Exercise Angina Values")
print(df['ExerciseAngina'].value_counts())

print("\n\nFrequency of Resting ECG Values")
print(df['RestingECG'].value_counts())

Frequency of Chest Pain Types
ASY    375
NAP    149
ATA    129
TA      35
Name: ChestPainType, dtype: int64


Frequency of ST Slope Values
Flat    349
Up      291
Down     48
Name: ST_Slope, dtype: int64


Frequency of Exercise Angina Values
N    407
Y    281
Name: ExerciseAngina, dtype: int64


Frequency of Resting ECG Values
Normal    416
LVH       139
ST        133
Name: RestingECG, dtype: int64


In [72]:
# Correlation Matrix
df.corr()

  df.corr()


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,PatientId
Age,1.0,0.273095,-0.063992,0.201012,-0.397127,0.282825,0.291494,0.292478
RestingBP,0.273095,1.0,0.10272,0.056955,-0.099743,0.188067,0.108307,0.011223
Cholesterol,-0.063992,0.10272,1.0,-0.236551,0.210487,0.100561,-0.211103,0.105995
FastingBS,0.201012,0.056955,-0.236551,1.0,-0.125158,0.063148,0.2565,0.049438
MaxHR,-0.397127,-0.099743,0.210487,-0.125158,1.0,-0.155104,-0.400989,0.158567
Oldpeak,0.282825,0.188067,0.100561,0.063148,-0.155104,1.0,0.392319,0.192431
HeartDisease,0.291494,0.108307,-0.211103,0.2565,-0.400989,0.392319,1.0,0.063664
PatientId,0.292478,0.011223,0.105995,0.049438,0.158567,0.192431,0.063664,1.0


In [73]:
# Checking for null values
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
PatientId         0
dtype: int64

### Data Pre-Processing

In [74]:
# Encoding categorical data
df_encd = pd.get_dummies(df,columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope'])
df_encd

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,PatientId,Sex_F,Sex_M,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,62,131,0,0,130,0.1,0,474,0,1,...,0,0,0,1,0,1,0,0,0,1
1,60,135,0,0,63,0.5,1,370,0,1,...,0,0,0,1,0,0,1,0,0,1
2,50,150,215,0,140,0.0,0,205,0,1,...,0,0,0,1,0,0,1,0,0,1
3,61,140,298,1,120,0.0,0,544,1,0,...,0,0,0,1,0,0,1,0,0,1
4,53,125,0,1,120,1.5,1,302,0,1,...,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,51,131,152,1,130,1.0,1,580,0,1,...,0,0,1,0,0,0,1,0,1,0
684,52,125,212,0,168,1.0,1,709,0,1,...,0,0,0,1,0,1,0,0,0,1
685,41,104,0,0,111,0.0,0,459,0,1,...,0,0,0,0,1,1,0,0,0,1
686,56,140,0,1,121,1.8,1,317,0,1,...,0,0,0,1,0,0,1,0,0,1


### Train/Test Split of Data

In [75]:
# Separate the column we are predictiing
y = df_encd['HeartDisease']
y

0      0
1      1
2      0
3      0
4      1
      ..
683    1
684    1
685    0
686    1
687    1
Name: HeartDisease, Length: 688, dtype: int64

In [76]:
df_encd = df_encd.drop(columns=['HeartDisease','PatientId'])

x = df_encd.values

y = y.values
#print(x.shape)

In [77]:
# Split dataset in 60/40 for training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [78]:
# Scaling data
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
#x_train.shape

x_test = sc.transform(x_test)
#x_test.shape

In [79]:
df_encd

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,62,131,0,0,130,0.1,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,60,135,0,0,63,0.5,0,1,1,0,0,0,0,1,0,0,1,0,0,1
2,50,150,215,0,140,0.0,0,1,1,0,0,0,0,1,0,0,1,0,0,1
3,61,140,298,1,120,0.0,1,0,0,1,0,0,0,1,0,0,1,0,0,1
4,53,125,0,1,120,1.5,0,1,1,0,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,51,131,152,1,130,1.0,0,1,1,0,0,0,1,0,0,0,1,0,1,0
684,52,125,212,0,168,1.0,0,1,1,0,0,0,0,1,0,1,0,0,0,1
685,41,104,0,0,111,0.0,0,1,1,0,0,0,0,0,1,1,0,0,0,1
686,56,140,0,1,121,1.8,0,1,1,0,0,0,0,1,0,0,1,0,0,1


### Creating Predictive Models

#### Logistic Regression Model

In [80]:
# Logistic Regression
lrmodel = LogisticRegression()

##### Training the Model

In [81]:
# Fit data to model logistic regession model
lrmodel.fit(x_train,y_train)


##### Testing the Model

In [82]:
# Make predictions with the test data
y_pred1 = lrmodel.predict(x_test)

# Calculate evaluation metrics
acc1 = accuracy_score(y_test, y_pred1)
preci1 = precision_score(y_test, y_pred1, average='weighted')
conf_matrix1 = confusion_matrix(y_test, y_pred1)

# Print out the various metrics for accuracy
print("Evaluation Metrics for the Logistic Regression Model")
print("Accuracy: ",acc1)
print("Precision: ",preci1)
print("Confusion Matrix: ",conf_matrix1)

Evaluation Metrics for the Logistic Regression Model
Accuracy:  0.8478260869565217
Precision:  0.8482598434745297
Confusion Matrix:  [[48 10]
 [11 69]]


#### Random Forest Model

In [83]:
# Random Forest
rfmodel = RandomForestClassifier(n_estimators=100,random_state=42)

##### Training the Model

In [84]:
# Fit data to random forest model
rfmodel.fit(x_train,y_train)

##### Testing the Model

In [85]:
# Make predictions with the test data
y_pred2 = rfmodel.predict(x_test)

# Calculate evaluation metrics
acc2 = accuracy_score(y_test, y_pred2)
preci2 = precision_score(y_test, y_pred2, average='weighted')
conf_matrix2 = confusion_matrix(y_test, y_pred2)

# Print out the various metrics for accuracy
print("\n\nEvaluation Metrics for the Random Forest Model")
print("Accuracy: ",acc2)
print("Precision: ",preci2)
print("Confusion Matrix: ",conf_matrix2)



Evaluation Metrics for the Random Forest Model
Accuracy:  0.8043478260869565
Precision:  0.804874863576579
Confusion Matrix:  [[45 13]
 [14 66]]


#### Decision Tree Model

In [86]:
# Decision Tree Model
dtmodel = DecisionTreeClassifier()

##### Training the Model

In [87]:
# Fit data to Decision Tree model
dtmodel.fit(x_train,y_train)

##### Testing the Model

In [88]:
# Make predictions with the test data
y_pred3 = dtmodel.predict(x_test)

# Calculate evaluation metrics
acc3 = accuracy_score(y_test, y_pred3)
preci3 = precision_score(y_test, y_pred3, average='weighted')
conf_matrix3 = confusion_matrix(y_test, y_pred3)

# Print out the various metrics for accuracy
print("\n\nEvaluation Metrics for the Random Forest Model")
print("Accuracy: ",acc3)
print("Precision: ",preci3)
print("Confusion Matrix: ",conf_matrix3)



Evaluation Metrics for the Random Forest Model
Accuracy:  0.7536231884057971
Precision:  0.7536231884057971
Confusion Matrix:  [[41 17]
 [17 63]]


#### Support Vector Machines

In [89]:
svmModel = SVC(kernel="rbf", gamma=0.5, C=1.0)

##### Training the Model

In [90]:
svmModel.fit(x_train,y_train)

##### Testing the Model

In [91]:
# Make predictions with the test data
y_pred4 = svmModel.predict(x_test)

# Calculate evaluation metrics
acc4 = accuracy_score(y_test, y_pred4)
preci4 = precision_score(y_test, y_pred4, average='weighted')
conf_matrix4 = confusion_matrix(y_test, y_pred4)

# Print out the various metrics for accuracy
print("\n\nEvaluation Metrics for the Random Forest Model")
print("Accuracy: ",acc4)
print("Confusion Matrix: ",conf_matrix4)



Evaluation Metrics for the Random Forest Model
Accuracy:  0.7898550724637681
Confusion Matrix:  [[32 26]
 [ 3 77]]


### Predictions Using the Models

#### Preprocessing of Test Data

In [92]:
# Import test data
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Heart Failure Competition/test.csv')
test_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,PatientId
0,76,F,NAP,140,197,0,ST,116,N,1.1,Flat,688
1,57,M,ASY,110,0,1,ST,131,Y,1.4,Up,297
2,60,F,TA,150,240,0,Normal,171,N,0.9,Up,678
3,51,F,NAP,140,308,0,LVH,142,N,1.5,Up,859
4,61,M,TA,134,234,0,Normal,145,N,2.6,Flat,628


In [93]:
# Check for null values
test_df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
PatientId         0
dtype: int64

In [94]:
# Encode data to resemble the training data
test_dfEnc = pd.get_dummies(test_df,columns=['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope',])
test_dfEnc

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,PatientId,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,76,140,197,0,116,1.1,688,1,0,0,...,1,0,0,0,1,1,0,0,1,0
1,57,110,0,1,131,1.4,297,0,1,1,...,0,0,0,0,1,0,1,0,0,1
2,60,150,240,0,171,0.9,678,1,0,0,...,0,1,0,1,0,1,0,0,0,1
3,51,140,308,0,142,1.5,859,1,0,0,...,1,0,1,0,0,1,0,0,0,1
4,61,134,234,0,145,2.6,628,0,1,0,...,0,1,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,57,152,274,0,88,1.2,737,0,1,1,...,0,0,0,1,0,0,1,0,1,0
226,70,140,0,1,157,2.0,378,0,1,1,...,0,0,0,1,0,0,1,0,1,0
227,45,132,297,0,144,0.0,66,1,0,1,...,0,0,0,1,0,1,0,0,0,1
228,50,115,0,0,120,0.5,381,0,1,1,...,0,0,0,1,0,0,1,0,1,0


In [95]:
# Extract the patient IDs for the submission file
patientID_test = test_dfEnc['PatientId']

# Drop columns not used in training the models
test_dfEnc = test_dfEnc.drop(columns=['PatientId'])
test_dfEnc

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,76,140,197,0,116,1.1,1,0,0,0,1,0,0,0,1,1,0,0,1,0
1,57,110,0,1,131,1.4,0,1,1,0,0,0,0,0,1,0,1,0,0,1
2,60,150,240,0,171,0.9,1,0,0,0,0,1,0,1,0,1,0,0,0,1
3,51,140,308,0,142,1.5,1,0,0,0,1,0,1,0,0,1,0,0,0,1
4,61,134,234,0,145,2.6,0,1,0,0,0,1,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,57,152,274,0,88,1.2,0,1,1,0,0,0,0,1,0,0,1,0,1,0
226,70,140,0,1,157,2.0,0,1,1,0,0,0,0,1,0,0,1,0,1,0
227,45,132,297,0,144,0.0,1,0,1,0,0,0,0,1,0,1,0,0,0,1
228,50,115,0,0,120,0.5,0,1,1,0,0,0,0,1,0,0,1,0,1,0


In [96]:
test_x = test_dfEnc.values

# Scale data to match the training data
test_x = sc.transform(test_x)
test_x

array([[ 2.42837055,  0.3817614 , -0.00887278, ..., -0.26069362,
         0.98555881, -0.86694426],
       [ 0.38632724, -1.28952093, -1.76639688, ..., -0.26069362,
        -1.0146528 ,  1.15347669],
       [ 0.70875513,  0.93885551,  0.37474923, ..., -0.26069362,
        -1.0146528 ,  1.15347669],
       ...,
       [-0.90338433, -0.06391389,  0.88327143, ..., -0.26069362,
        -1.0146528 ,  1.15347669],
       [-0.36600451, -1.01097388, -1.76639688, ..., -0.26069362,
         0.98555881, -0.86694426],
       [-0.6884324 , -0.17533271,  0.49072798, ..., -0.26069362,
        -1.0146528 ,  1.15347669]])

#### Predictions

Predictions were made using all four models to identify overfitting/underfitting as well as to compare their performance against the test data.

In [97]:
# Predicting the outcomes using the Logistic Regression Model
test_pred_y1 = lrmodel.predict(test_x)        # The first Kaggle submission

In [98]:
# Predicting the outcomes using the Random Forest Model
test_pred_y2 = rfmodel.predict(test_x)        # The second Kaggle submission

In [99]:
# Predicting the outcomes using the Decision Tree Model
test_pred_y3 = dtmodel.predict(test_x)        # The fourth Kaggle submission

In [100]:
# Predicting the outcomes using the Support Vector Machine Model
test_pred_y4 = svmModel.predict(test_x)       # The third Kaggle submission

#### Exporting Predictions

The Random Forest Model performed the best against the test data hence only the corresponding results are showed to be exported to a csv file.

In [109]:
# Adding submission data to a dataframe
df_sub = pd.DataFrame({'PatientId': patientID_test,'HeartDisease': test_pred_y2})
df_sub

Unnamed: 0,PatientId,HeartDisease
0,688,1
1,297,1
2,678,0
3,859,0
4,628,1
...,...,...
225,737,1
226,378,1
227,66,0
228,381,1


In [108]:
# Export submission data to csv
df_sub.to_csv('submission.csv', index=False)