## Load the Data

In [1]:
import os

import pandas as pd

# Location of the heart-failure CSV. Set the HEART_FAILURE_CSV environment variable to run
# the notebook on another machine; the fallback is the path this notebook originally used.
DATA_PATH=os.environ.get("HEART_FAILURE_CSV","c:/users/deepa/downloads/heart_failure_clinical_records_dataset.csv")
df=pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0.0,20.0,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0.0,38.0,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0.0,20.0,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0.0,20.0,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1.0,20.0,0,327000.0,2.7,116,0,0,8,1


In [2]:
# Class balance of the target column: how many rows carry each DEATH_EVENT value.
df['DEATH_EVENT'].value_counts()

0    203
1     96
Name: DEATH_EVENT, dtype: int64

## Clean and Analyze the data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
age                         299 non-null float64
anaemia                     299 non-null int64
creatinine_phosphokinase    299 non-null int64
diabetes                    298 non-null float64
ejection_fraction           297 non-null float64
high_blood_pressure         299 non-null int64
platelets                   299 non-null float64
serum_creatinine            299 non-null float64
serum_sodium                299 non-null int64
sex                         299 non-null int64
smoking                     299 non-null int64
time                        299 non-null int64
DEATH_EVENT                 299 non-null int64
dtypes: float64(5), int64(8)
memory usage: 30.5 KB


In [4]:
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,298.0,297.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.419463,38.013468,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494301,11.791538,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


#From the output above we can see that there are some null values present in the data, so we have to either remove the rows with null values or fill them in with the mean, mode or median.
#Since the dataset we are using is very small, we impute the missing values (here with the median) using the imputer class of sklearn; pandas methods such as fillna would work as well.

In [5]:
df.isnull().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    1
ejection_fraction           2
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [6]:
df['diabetes'].isnull().sum()

1

In [7]:
df['ejection_fraction'].isnull().sum()

2

In [8]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# `sklearn.impute.SimpleImputer` is its column-wise replacement. Fall back to the old class
# only when running on a scikit-learn that predates SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Fill each column's NaNs with that column's median. Only `diabetes` and
# `ejection_fraction` contain missing values; the other columns pass through unchanged.
imput=SimpleImputer(strategy='median')
# fit_transform returns a plain float NumPy array (column names are lost at this point).
imputed_data=imput.fit_transform(df)
imputed_data

array([[7.500e+01, 0.000e+00, 5.820e+02, ..., 0.000e+00, 4.000e+00,
        1.000e+00],
       [5.500e+01, 0.000e+00, 7.861e+03, ..., 0.000e+00, 6.000e+00,
        1.000e+00],
       [6.500e+01, 0.000e+00, 1.460e+02, ..., 1.000e+00, 7.000e+00,
        1.000e+00],
       ...,
       [4.500e+01, 0.000e+00, 2.060e+03, ..., 0.000e+00, 2.780e+02,
        0.000e+00],
       [4.500e+01, 0.000e+00, 2.413e+03, ..., 1.000e+00, 2.800e+02,
        0.000e+00],
       [5.000e+01, 0.000e+00, 1.960e+02, ..., 1.000e+00, 2.850e+02,
        0.000e+00]])

In [9]:
print(imputed_data)

[[7.500e+01 0.000e+00 5.820e+02 ... 0.000e+00 4.000e+00 1.000e+00]
 [5.500e+01 0.000e+00 7.861e+03 ... 0.000e+00 6.000e+00 1.000e+00]
 [6.500e+01 0.000e+00 1.460e+02 ... 1.000e+00 7.000e+00 1.000e+00]
 ...
 [4.500e+01 0.000e+00 2.060e+03 ... 0.000e+00 2.780e+02 0.000e+00]
 [4.500e+01 0.000e+00 2.413e+03 ... 1.000e+00 2.800e+02 0.000e+00]
 [5.000e+01 0.000e+00 1.960e+02 ... 1.000e+00 2.850e+02 0.000e+00]]


In [10]:
# Wrap the imputed NumPy array back into a DataFrame, reusing the original column names.
# The array is float, so every column (including the 0/1 flags) comes back as float64.
df1=pd.DataFrame(imputed_data,columns=df.columns)
df1.head(25)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0
5,90.0,1.0,47.0,0.0,40.0,1.0,204000.0,2.1,132.0,1.0,1.0,8.0,1.0
6,75.0,1.0,246.0,0.0,15.0,0.0,127000.0,1.2,137.0,1.0,0.0,10.0,1.0
7,60.0,1.0,315.0,1.0,60.0,0.0,454000.0,1.1,131.0,1.0,1.0,10.0,1.0
8,65.0,0.0,157.0,0.0,65.0,0.0,263358.03,1.5,138.0,0.0,0.0,10.0,1.0
9,80.0,1.0,123.0,0.0,38.0,1.0,388000.0,9.4,133.0,1.0,1.0,10.0,1.0


In [11]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
age                         299 non-null float64
anaemia                     299 non-null float64
creatinine_phosphokinase    299 non-null float64
diabetes                    299 non-null float64
ejection_fraction           299 non-null float64
high_blood_pressure         299 non-null float64
platelets                   299 non-null float64
serum_creatinine            299 non-null float64
serum_sodium                299 non-null float64
sex                         299 non-null float64
smoking                     299 non-null float64
time                        299 non-null float64
DEATH_EVENT                 299 non-null float64
dtypes: float64(13)
memory usage: 30.5 KB


In [12]:
df1.isnull().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [13]:
df1['diabetes'].value_counts()

0.0    174
1.0    125
Name: diabetes, dtype: int64

In [14]:
df1['DEATH_EVENT'].value_counts()

0.0    203
1.0     96
Name: DEATH_EVENT, dtype: int64

In [15]:
df1['anaemia'].value_counts()

0.0    170
1.0    129
Name: anaemia, dtype: int64

In [16]:
df1['high_blood_pressure'].value_counts()

0.0    194
1.0    105
Name: high_blood_pressure, dtype: int64

The outputs above clearly show that there are now no null values present in the data.

## Analysing the data using plots and finding the correlation of features with other features as well as the target value

In [None]:
import matplotlib.pyplot as plt
# One histogram per column of the imputed frame (50 bins each) on a single 10x15 figure.
df1.hist(bins=50,figsize=(10,15))
plt.show()

<matplotlib.figure.Figure at 0x1547104a748>

In [None]:
import seaborn as sns

# Bar chart of how many rows fall into each `anaemia` value, looked up by column name.
sns.countplot(x='anaemia',data=df1)

In [None]:
# `time` plotted against `DEATH_EVENT`; both columns are resolved by name from df1.
sns.scatterplot(x='DEATH_EVENT',y='time',data=df1)

It indicates that a larger number of people with anaemia have a lower death rate.

In [None]:
sns.countplot(x='smoking',data=df1)

In [None]:
# Scatter of `smoking` against `DEATH_EVENT`.
# NOTE(review): both columns only take the values 0 and 1, so the points overlap on at most
# four positions and counts are not visible; a crosstab or a countplot with hue may read better.
plt.scatter(x=df1['DEATH_EVENT'],y=df1['smoking'])

In [None]:
# Pairwise correlation between all columns (pandas' default method), then the column for
# the target, sorted from the most positively to the most negatively correlated feature.
corr_matrix=df1.corr()
corr_matrix['DEATH_EVENT'].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Columns to compare pairwise: the target plus a hand-picked subset of the features.
attributes=[
    'DEATH_EVENT',
    'serum_creatinine',
    'age',
    'high_blood_pressure',
    'anaemia',
    'time',
    'ejection_fraction',
    'serum_sodium',
]
selected=df1[attributes]
scatter_matrix(selected,figsize=(20,20))

Normal range of serum creatinine in a blood test:
- Adult male: 0.6-1.2 mg/dL
- Adult female: 0.5-1.1 mg/dL

In [None]:
plt.scatter(x=df1['age'],y=df1['serum_creatinine'])

In [None]:
import seaborn as sns

# Annotated heatmap of the full correlation matrix, drawn on an explicitly created wide axes.
fig,ax=plt.subplots(figsize=(20,10))
corr=df1.corr()
sns.heatmap(data=corr,annot=True,cmap="GnBu",ax=ax)

In [None]:
sns.countplot(x='age',data=df1)

## Splitting the data into train and test sets using train_test_split

In [None]:
# Features are every column except the target. Selecting by name instead of the positional
# slice 0:12 keeps this correct even if the column order or count of df1 changes.
TARGET='DEATH_EVENT'
x=df1.drop(columns=TARGET)
y=df1.loc[:,TARGET]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the DEATH_EVENT class ratio (203 vs 96 rows) the same in both parts;
# with only 299 rows an unstratified split can noticeably distort that ratio.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
y_test.shape

## Using StandardScaler to standardise the dataset


In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the per-column scaling statistics from the training features only, so that nothing
# about the test rows influences the preprocessing.
sc=StandardScaler()
sc.fit(x_train)

In [None]:
# Apply the scaler fitted on x_train to both splits (the test split is only transformed).
xtrain_scaled_data=sc.transform(x_train)
xtest_scaled_data=sc.transform(x_test)
# Sanity check on the shape of the scaled test array.
xtest_scaled_data.shape

# Using Principal Component Analysis for Feature Extraction (Dimensionality Reduction)

In [None]:
from sklearn.decomposition import PCA
# Project the scaled features onto 6 principal components. The projection is fitted on the
# scaled training data only and then applied unchanged to the scaled test data.
pca=PCA(n_components=6)
Xtrain=pca.fit_transform(xtrain_scaled_data)
Xtest=pca.transform(xtest_scaled_data)

In [None]:
Xtrain.shape

In [None]:
y_train.shape

In [None]:
Xtest.shape

In [None]:
pca.explained_variance_ratio_

In [None]:
import numpy as np

# Running total of the variance ratio explained by the retained components, in order.
cumsum=pca.explained_variance_ratio_.cumsum()
cumsum

In [None]:
# As you can see, there is no benefit of using PCA here, as after extracting the components we are getting that ...
# TODO: the sentence above is unfinished; complete it with the cumulative explained variance from the previous cell.

## CREATING THE MODEL

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# A decision tree permutes candidate features while searching for splits, so without a
# fixed random_state repeated runs can yield different trees and different scores.
model=DecisionTreeClassifier(random_state=42)
# Alternative estimators; uncomment exactly one to compare it against the tree.
#model=RandomForestClassifier(random_state=42)
#model=LogisticRegression()

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy of the model on the PCA-transformed training data.
# NOTE(review): the scaler and PCA were fitted on the whole training set before this call,
# so each validation fold has already influenced the preprocessing; wrapping scaler, PCA and
# model in a Pipeline and cross-validating on x_train would avoid that.
cv=cross_val_score(model,Xtrain,y_train,cv=5,scoring="accuracy")
cv

In [None]:
mean=cv.mean()
mean

In [None]:
model.fit(Xtrain,y_train)

In [None]:
y_train.shape

In [None]:
y_test.value_counts()

In [None]:
y_train.value_counts()

In [None]:
predict=model.predict(Xtest)
predict

In [None]:
y_test

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Evaluate the held-out predictions: overall accuracy, then the confusion matrix
# (rows are the true labels, columns the predicted labels).
test_accuracy=accuracy_score(y_test,predict)
test_confusion=confusion_matrix(y_test,predict)
print('Accuracy Score :',test_accuracy)
print('Confusion Matrix :',test_confusion)