[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dvm14/HeartDiseasePrediction/blob/main/MLPipeline.ipynb)

In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler


Load the dataset

In [3]:
# Read the records from heart.csv (path is relative to the working directory).
data = pd.read_csv("heart.csv")

In [4]:
# Preview the first five rows to check the columns and value formats.
data.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
# Print column dtypes and non-null counts for the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [6]:
# Missing-value count per column (isna is the same check as isnull).
data.isna().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,0
Cholesterol,0
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


In [7]:
# Count the cells that are exactly 0 in every column and print one
# "<column> <count>" line per column, in column order.
zeros_per_column = (data == 0).sum()
for column_name, zero_count in zeros_per_column.items():
    print(column_name, zero_count)

Age 0
Sex 0
ChestPainType 0
RestingBP 1
Cholesterol 172
FastingBS 704
RestingECG 0
MaxHR 0
ExerciseAngina 0
Oldpeak 368
ST_Slope 0
HeartDisease 410


In [8]:
# Columns in which a 0 is treated as an invalid reading rather than a real value.
invalid_zero = ["RestingBP", "Cholesterol"]
# Turn those zeros into NaN so they are counted as missing from here on.
zeros_as_missing = data[invalid_zero].replace(to_replace=0, value=np.nan)
data[invalid_zero] = zeros_as_missing

In [9]:
# Missing-value count per column, now including the zeros converted to NaN.
data.isna().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,1
Cholesterol,172
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


In [13]:
# Re-check dtypes and non-null counts after the zero -> NaN replacement.
data.info()
# Summary statistics of the numeric columns; as the last expression this is the cell's displayed result.
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       917 non-null    float64
 4   Cholesterol     746 non-null    float64
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 86.2+ KB


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,917.0,746.0,918.0,918.0,918.0,918.0
mean,53.510893,132.540894,244.635389,0.233115,136.809368,0.887364,0.553377
std,9.432617,17.999749,59.153524,0.423046,25.460334,1.06657,0.497414
min,28.0,80.0,85.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,207.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,237.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,275.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [15]:
# StandardScaler is imported in the notebook's import cell rather than here,
# so all dependencies are declared in one place.

# Feature matrix: every column except the target.
X = data.drop(columns="HeartDisease")
# One-hot encode the categorical (object) columns; drop_first=True omits the
# first level of each category so the dummy columns are not redundant.
X_encoded = pd.get_dummies(X, drop_first=True)
# Target vector: the HeartDisease column.
Y = data["HeartDisease"]



In [16]:
# Inspect the encoded feature frame: column names, non-null counts and dtypes.
X_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   RestingBP          917 non-null    float64
 2   Cholesterol        746 non-null    float64
 3   FastingBS          918 non-null    int64  
 4   MaxHR              918 non-null    int64  
 5   Oldpeak            918 non-null    float64
 6   Sex_M              918 non-null    bool   
 7   ChestPainType_ATA  918 non-null    bool   
 8   ChestPainType_NAP  918 non-null    bool   
 9   ChestPainType_TA   918 non-null    bool   
 10  RestingECG_Normal  918 non-null    bool   
 11  RestingECG_ST      918 non-null    bool   
 12  ExerciseAngina_Y   918 non-null    bool   
 13  ST_Slope_Flat      918 non-null    bool   
 14  ST_Slope_Up        918 non-null    bool   
dtypes: bool(9), float64(3), int64(3)
memory usage: 51.2 KB


In [17]:
# Fill each remaining NaN with its column mean. Reassigning (instead of
# inplace=True) keeps the step an explicit, re-runnable expression.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed later in the notebook.
X_encoded = X_encoded.fillna(X_encoded.mean())

In [18]:
# Standardise the encoded features with a StandardScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# in the next cell, so the scaling statistics include the test rows.
scaler = StandardScaler()
scaler.fit(X_encoded)
X_scaled = scaler.transform(X_encoded)

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Logistic-regression classifier with the iteration limit set to 1000.
model = LogisticRegression(max_iter=1000)
# Fit on the training split; as the last expression, the fitted estimator is displayed.
model.fit(X=x_train, y=y_train)

In [21]:
# Predicted labels for the held-out test rows.
y_pred = model.predict(X=x_test)

In [22]:
# Share of test rows that the logistic-regression model labelled correctly.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8641304347826086


In [38]:
# Random forest with 100 trees, each limited to depth 5.
# random_state fixes the forest's randomness so the accuracy printed by this
# cell is the same on every run; without it the score changes between runs.
model2 = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model2.fit(x_train, y_train)
y_pred2 = model2.predict(x_test)
print(accuracy_score(y_test, y_pred2))


0.842391304347826


In [39]:
# GridSearchCV is imported in the notebook's import cell rather than here.

# Candidate hyper-parameters: 3 * 2 * 3 * 2 = 36 combinations.
params = {
    'min_samples_leaf': [1, 3, 10],
    'n_estimators': [100, 1000],
    'max_features': [0.1, 0.5, 1.],
    'max_samples': [0.5, None],
}

# Seed the forest so each candidate is scored identically on every run;
# an unseeded forest can make best_params_ change between runs.
model3 = RandomForestClassifier(random_state=0)
# 3-fold cross-validation over the grid; n_jobs=-1 runs the fits in parallel
# on all available cores without affecting the scores.
grid_search = GridSearchCV(model3, params, cv=3, n_jobs=-1)
grid_search.fit(x_train, y_train)

In [40]:
# Show the parameter combination that scored best in the cross-validated search.
grid_search.best_params_

{'max_features': 0.1,
 'max_samples': 0.5,
 'min_samples_leaf': 1,
 'n_estimators': 1000}

In [41]:
# Final random forest, configured with the values shown in the best_params_
# output above (max_features, max_samples, min_samples_leaf, n_estimators)
# and a fixed seed.
final_model = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    max_depth=None,
    min_samples_leaf=1,
    max_features=0.1,
    max_samples=0.5,
    random_state=0,
)

In [43]:
# Train the final forest on the training split.
final_model.fit(x_train, y_train)

# Predict the held-out rows and print the accuracy on them.
test_preds = final_model.predict(x_test)
test_accuracy = accuracy_score(y_test, test_preds)
print(test_accuracy)

0.8804347826086957
