In [None]:
import os
# Set the KMP_DUPLICATE_LIB_OK flag for this process before the numeric
# libraries are imported (presumably to tolerate duplicate OpenMP runtimes).
os.environ.update(KMP_DUPLICATE_LIB_OK="TRUE")

In [None]:
import pandas as pd
import numpy as np
import random
import math
from sklearn.model_selection import train_test_split
# Read the heart-failure clinical records into a DataFrame and preview the
# first five rows.
heart = pd.read_csv(
    '/Users/yuqinhan1229/Desktop/heart_failure_clinical_records_dataset.csv'
)
heart.head(n=5)

In [None]:
# EDA
## Distribution of continuous variables
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
## Find the continuous variable and set the color.
figure = plt.figure(figsize=(30, 20))
continues = ['age','creatinine_phosphokinase','ejection_fraction',\
 'platelets','serum_creatinine','serum_sodium','time']
colors = ['red', 'orange', 'yellow', 'green', 'cyan', 'blue', 'purple']
for i in range(1, 8):
    plt.subplot(3,3,i)
    sns.histplot(heart[continues[i-1]], color = colors[i-1], kde=True)
    plt.xlabel(continues[i-1])
    plt.plot()

In [None]:
## distribution transformation
from sklearn.preprocessing import PowerTransformer
## Apply a Box-Cox power transform (without standardisation) to each of the
## two columns and overwrite the column in `heart` with the result.
for column in ('creatinine_phosphokinase', 'serum_creatinine'):
    pt = PowerTransformer(method='box-cox', standardize=False)
    # The transformer expects a 2-D (n_samples, 1) array.
    column_values = heart[column].to_numpy().reshape(-1, 1)
    heart[column] = pt.fit_transform(column_values)
heart.head()

In [None]:
# Histograms of the two columns that were Box-Cox transformed.
figure = plt.figure(figsize=(15, 10))
modify = ['creatinine_phosphokinase','serum_creatinine']
colors = ['orange', 'cyan']
# Iterate over `modify` itself: indexing `continues` here would plot
# 'age' and 'creatinine_phosphokinase' instead of the transformed columns.
for i, (column, color) in enumerate(zip(modify, colors), start=1):
    plt.subplot(2, 2, i)
    sns.histplot(heart[column], color=color, kde=True)
    plt.xlabel(column)

In [None]:
# Build an automated profiling report for the current `heart` DataFrame.
import pandas_profiling as pp
report = pp.ProfileReport(heart)
# Bare last expression so the notebook renders the report.
report

In [None]:
# bi-variable analysis
## Pairwise correlation of the continuous variables, drawn as an annotated
## heatmap.
conti_heart = heart.loc[:, continues]
corrDf = conti_heart.corr()
figure, heat_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corrDf, annot=True, ax=heat_ax)

In [None]:
## Analysis between categorical variables and continuous variables
## Subset of `heart` holding only the binary/categorical columns.
categoricals = [
    'anaemia', 'diabetes', 'high_blood_pressure',
    'sex', 'smoking', 'DEATH_EVENT',
]
cate_heart = heart.loc[:, categoricals]
cate_heart.head()

In [None]:
## chi - test for smoking&others
from sklearn.feature_selection import chi2
# Features tested against `smoking`. The names double as column selectors, so
# a misspelt label (previously 'diaetes') can no longer slip into the output.
index = ('anaemia', 'diabetes', 'high_blood_pressure', 'sex')
chi2_val, p_val = chi2(cate_heart[list(index)], cate_heart['smoking'])
# NOTE(review): sklearn's chi2 is a feature-selection score; confirm it is the
# intended test of independence for these binary columns.

# Map "smoking vs <feature>" -> p-value, pairing names and p-values directly.
dicts = {f'smoking vs {name}': p for name, p in zip(index, p_val)}
for key, value in dicts.items():
    print(f'{key}: {value}')

In [None]:
# Continuous variables plus the outcome column, in the order used by the
# test in the following cell.
index = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
    'DEATH_EVENT',
]
ind_heart = heart.loc[:, index]
ind_heart.head()

In [None]:
## z - test for DEATH_EVENT & other continuous variables
import statsmodels.stats.weightstats as sw
index = ['age','creatinine_phosphokinase','ejection_fraction','platelets'\
         ,'serum_creatinine','serum_sodium','time']
# Two-sample z-test per variable: rows with DEATH_EVENT == 1 versus the rest.
# The previous call passed the 0/1 DEATH_EVENT column itself as the second
# sample, which tests "mean(variable) == mean(DEATH_EVENT)" and says nothing
# about association with the outcome.
died = ind_heart['DEATH_EVENT'] == 1
z_val, p_val = sw.ztest(ind_heart.loc[died, index].to_numpy(),
                        ind_heart.loc[~died, index].to_numpy())

# Map "DEATH_EVENT vs <variable>" -> p-value (one p-value per column).
dicts = {f'DEATH_EVENT vs {name}': p for name, p in zip(index, p_val)}
for key, value in dicts.items():
    print(f'{key}: {value}')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [None]:
#Clustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Reload the untransformed data. `heart` keeps every column (including
# DEATH_EVENT) because later cells split it by position; the clustering
# features live under their own name instead of overwriting `heart`.
heart = pd.read_csv('/Users/yuqinhan1229/Desktop/heart_failure_clinical_records_dataset.csv')
heart_features = heart.drop('DEATH_EVENT', axis=1)

# Two-cluster k-means on the feature columns only.
kmeans = KMeans(n_clusters=2, random_state=42).fit(heart_features.values)
kmeans_2 = pd.DataFrame(kmeans.labels_, columns=['cluster'])

# Project the same features onto two principal components for plotting.
pca = PCA(n_components=2).fit(heart_features)
pca_trans = pca.transform(heart_features)
pca_trans_df = pd.DataFrame(pca_trans, columns=['pca1', 'pca2'])
kmeans_2 = pd.concat([kmeans_2, pca_trans_df], axis=1)
kmeans_2

In [None]:
# Scatter of the two principal components, coloured by k-means cluster
# (regression line disabled).
fig = sns.lmplot(
    data=kmeans_2,
    x='pca1',
    y='pca2',
    hue='cluster',
    fit_reg=False,
)

In [None]:
# Full dataset (including the outcome) for comparing clusters with DEATH_EVENT.
heart_all = pd.read_csv('/Users/yuqinhan1229/Desktop/heart_failure_clinical_records_dataset.csv')
# `.assign` overwrites DEATH_EVENT if it is already present, so re-running this
# cell does not append a second DEATH_EVENT column (which breaks the plot).
kmeans_2 = kmeans_2.assign(DEATH_EVENT=heart_all['DEATH_EVENT'])
# Grid of scatter plots: one row per cluster, one column per outcome value.
fig = sns.lmplot(x='pca1',y='pca2',data=kmeans_2,
                 fit_reg=False,row='cluster',col='DEATH_EVENT')

In [None]:
# decision tree
## Splitting data set
import torch  # not needed by this cell; kept so the name stays defined for later cells
from sklearn import tree
# Split the full table `heart_all`, which still contains DEATH_EVENT, and pick
# the target by name rather than by position so a missing column fails loudly.
train, test = train_test_split(heart_all, random_state=0, test_size=0.3)
X_train = train.drop(columns='DEATH_EVENT')
Y_train = train['DEATH_EVENT']
X_test = test.drop(columns='DEATH_EVENT')
Y_test = test['DEATH_EVENT']
# The torch DataLoaders that used to be built here were never consumed by the
# tree (later cells rebuild their own), so they are omitted.
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(X_train, Y_train)
scores = clf.score(X_test, Y_test)
print("Train set score：" + str(clf.score(X_train, Y_train)))
print("Test set score：" + str(scores))

In [None]:
# Reload the raw data and keep everything except the 'sex' and 'diabetes'
# columns.
heart = pd.read_csv(
    '/Users/yuqinhan1229/Desktop/heart_failure_clinical_records_dataset.csv'
).drop(columns=['sex', 'diabetes'])

In [None]:
## Simplified decision tree
from sklearn import tree
# Same split settings as the full tree, on the reduced `heart` table.
train, test = train_test_split(heart, random_state=0, test_size=0.3)
# Select the target by name instead of a hard-coded column position.
X_train = train.drop(columns='DEATH_EVENT')
Y_train = train['DEATH_EVENT']
X_test = test.drop(columns='DEATH_EVENT')
Y_test = test['DEATH_EVENT']
# No torch DataLoaders here: the tree never used them, and building them made
# this cell depend on a `torch` import from another cell.
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(X_train, Y_train)
scores = clf.score(X_test, Y_test)
print("Train set score：" + str(clf.score(X_train, Y_train)))
print("Test set score：" + str(scores))

In [None]:
# ROC curve of decision tree
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

y_pred = clf.predict(X_test)
y_predprob = clf.predict_proba(X_test)
# Print the metrics explicitly: as bare mid-cell expressions their values were
# computed and then silently discarded.
print(confusion_matrix(Y_test, y_pred))
print(metrics.roc_auc_score(Y_test, y_predprob[:, 1]))

# Column 1 of predict_proba is used as the score for label 1.
fpr, tpr, thresholds = roc_curve(Y_test, y_predprob[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, 'k--', label='ROC (area = {0:.2f})'.format(roc_auc), lw=2)
# Small margins so the curve is not drawn on the axes frame.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression

# Fit with default settings on the current train split (`fit` returns the
# estimator itself).
lr = LogisticRegression().fit(X_train, Y_train)

# Report accuracy on the train split, then on the test split.
for split_name, features, target in (("Train", X_train, Y_train),
                                     ("Test", X_test, Y_test)):
    print(f"{split_name} set score：{lr.score(features, target)!s}")

In [None]:
# Class predictions and class probabilities of the logistic regression on the
# test split; the cell displays the ROC AUC computed from column 1.
y_pred = lr.predict(X_test)
y_predprob = lr.predict_proba(X_test)
positive_scores = y_predprob[:, 1]
metrics.roc_auc_score(Y_test, positive_scores)

In [None]:
# ROC curve of logistic regression
y_pred = lr.predict(X_test)
y_predprob = lr.predict_proba(X_test)
# Print the confusion matrix: as a bare mid-cell expression it was computed and
# then discarded. The AUC is shown in the plot legend below.
print(confusion_matrix(Y_test, y_pred))

# Column 1 of predict_proba is used as the score for label 1.
fpr, tpr, thresholds = roc_curve(Y_test, y_predprob[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, 'k--', label='ROC (area = {0:.2f})'.format(roc_auc), lw=2)
# Small margins so the curve is not drawn on the axes frame.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# build the fully connected neural network which contains one hidden layer
import torch
import numpy as np
import torchvision

In [None]:
# set the hyperparameters
# Layer widths: input, hidden, output.
input_size, hidden_size, num_classes = 12, 84, 2
# Training schedule.
num_epochs, batch_size = 10, 5
learning_rate = 1e-3

In [None]:
# import the dataset and normalize
import pandas as pd
# Read from the same location as the rest of the notebook; a second absolute
# path under a different user's home made a full top-to-bottom run impossible
# on a single machine.
df = pd.read_csv('/Users/yuqinhan1229/Desktop/heart_failure_clinical_records_dataset.csv')
# Min-max scale every column to [0, 1] (vectorised form of the per-column lambda).
# NOTE(review): the min/max come from all rows, before the train/test split.
col_min, col_max = df.min(), df.max()
df_change = (df - col_min) / (col_max - col_min)
df_change.head()

In [None]:
# divide the dataset
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_change, random_state=1, test_size=0.3)
# Columns 0-11 are the network inputs; column 12 is the label.
X_train, Y_train = train.iloc[:, 0:12], train.iloc[:, 12]
X_test, Y_test = test.iloc[:, 0:12], test.iloc[:, 12]
# Wrap the splits as tensor datasets. The loaders now honour the `batch_size`
# hyperparameter instead of a hard-coded 5 that silently ignored it.
trainset = torch.utils.data.TensorDataset(torch.from_numpy(X_train.values),
                                          torch.from_numpy(Y_train.values))
train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
testset = torch.utils.data.TensorDataset(torch.from_numpy(X_test.values),
                                         torch.from_numpy(Y_test.values))
test_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                          shuffle=False, num_workers=2)

In [None]:
# build the model
# `nn` must be imported here: no earlier cell defines it, so the class
# statement raised NameError on a fresh kernel.
import torch.nn as nn
import torch.nn.functional as F
class NeuralNet(nn.Module):
    """Fully connected classifier with a single hidden layer.

    Args:
        input_size: width of the input vector.
        hidden_size: width of the hidden layer.
        num_classes: number of output logits.
    """
    def __init__(self,input_size,hidden_size,num_classes):
        super(NeuralNet,self).__init__()
        self.fc1=nn.Linear(input_size,hidden_size)
        self.fc2=nn.Linear(hidden_size,num_classes)
    def forward(self,x):
        """Return raw logits (no softmax is applied here)."""
        x = self.fc1(x)
        x=F.relu(x)
        x = self.fc2(x)
        return x

In [None]:
# Use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Build the one-hidden-layer network and move its parameters to `device`.
model = NeuralNet(input_size, hidden_size, num_classes)
model = model.to(device)

In [None]:
# Cross-entropy loss on the logits, optimised with Adam at `learning_rate`.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # Move the batch to the model's device; the model was sent to `device`
        # but the batch was not, which fails whenever `device` is CUDA.
        inputs = inputs.to(device).float()
        labels = labels.to(device).long()   # CrossEntropyLoss needs integer class ids
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Log every 5th step of every 5th epoch.
        if (i + 1) % 5 == 0 and (epoch + 1) % 5 == 0:
            print('epoch[{}/{}],step[{}/{}],loss:{:.4f}'.format(epoch+1,num_epochs,i+1,total_step,loss.item()))

In [None]:
# test the model and get the accuracy
model.eval()   # inference mode; a no-op for these layers but the safe default
with torch.no_grad():
    correct=0
    total=0
    for inputs,labels in test_loader:
        inputs=inputs.reshape(-1,12).to(device).float()
        labels=labels.to(device)
        outputs=model(inputs)
        # Predicted class = index of the largest logit in each row.
        _,predicted=torch.max(outputs.data,1)
        total+=labels.size(0)
        correct+=(predicted==labels).sum().item()
    # Message typo fixed ('nework' -> 'network') to match the other test cells.
    print('accuracy of the network is {}%'.format(100*correct/total))

In [None]:
# build the fully connected neural network which contains two hidden layers
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# set hyperparameters
# Layer widths: input, first hidden, second hidden, output.
input_size, hidden_size_1, hidden_size_2, num_classes = 12, 84, 84, 2
# Training schedule.
num_epoch, batch_size = 10, 5
learning_rate = 1e-3

In [None]:
# import the dataset and normalize it
import pandas as pd
# Read from the same location as the rest of the notebook; a second absolute
# path under a different user's home made a full top-to-bottom run impossible
# on a single machine.
df = pd.read_csv('/Users/yuqinhan1229/Desktop/heart_failure_clinical_records_dataset.csv')
# Min-max scale every column to [0, 1] (vectorised form of the per-column lambda).
# NOTE(review): the min/max come from all rows, before the train/test split.
col_min, col_max = df.min(), df.max()
df_change1 = (df - col_min) / (col_max - col_min)
df_change1.head()

In [None]:
# divide the dataset
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_change1, random_state=1, test_size=0.3)
# Columns 0-11 are the network inputs; column 12 is the label.
X_train, Y_train = train.iloc[:, 0:12], train.iloc[:, 12]
X_test, Y_test = test.iloc[:, 0:12], test.iloc[:, 12]
# Wrap the splits as tensor datasets. The loaders now honour the `batch_size`
# hyperparameter instead of a hard-coded 5 that silently ignored it.
trainset = torch.utils.data.TensorDataset(torch.from_numpy(X_train.values),
                                          torch.from_numpy(Y_train.values))
train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
testset = torch.utils.data.TensorDataset(torch.from_numpy(X_test.values),
                                         torch.from_numpy(Y_test.values))
test_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                          shuffle=False, num_workers=2)

In [None]:
# build the model
class simpleNet(nn.Module):
    """Fully connected classifier with two hidden layers.

    Args:
        inputs: width of the input vector.
        hidden_size_1: width of the first hidden layer.
        hidden_size_2: width of the second hidden layer.
        outputs: number of output logits.
    """
    def __init__(self,inputs,hidden_size_1,hidden_size_2,outputs):
        super(simpleNet,self).__init__()
        self.fc1=nn.Linear(inputs,hidden_size_1)
        # fc2 must produce hidden_size_2 features, because fc3 consumes
        # hidden_size_2. It previously produced hidden_size_1, which only
        # worked while both sizes happened to be equal.
        self.fc2=nn.Linear(hidden_size_1,hidden_size_2)
        self.fc3=nn.Linear(hidden_size_2,outputs)
    def forward(self,x):
        """Return raw logits (no softmax is applied here)."""
        x=self.fc1(x)
        x=F.relu(x)
        x=self.fc2(x)
        x=F.relu(x)
        x=self.fc3(x)
        return x

In [None]:
# Use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Two-hidden-layer network on `device`, trained with cross-entropy and Adam.
model = simpleNet(input_size, hidden_size_1, hidden_size_2, num_classes)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# train the model
total_step = len(train_loader)
for epoch in range(num_epoch):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.reshape(-1, 12).to(device).float()
        # Labels must be on the same device as the logits; they were left on
        # the CPU, which fails in the loss whenever `device` is CUDA.
        labels = labels.to(device).long()   # CrossEntropyLoss needs integer class ids
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Log every 5th step of every 5th epoch.
        if (i + 1) % 5 == 0 and (epoch + 1) % 5 == 0:
            print('epoch[{}/{}],step[{}/{}],loss:{:.4f}'.format(epoch+1,num_epoch,i+1,total_step,loss.item()))

In [None]:
# test the model and get the accuracy
# Count correct predictions over the whole test loader without tracking
# gradients, then report the percentage.
correct = 0
total = 0
with torch.no_grad():
    for batch_inputs, batch_labels in test_loader:
        batch_inputs = batch_inputs.reshape(-1, 12).to(device).float()
        batch_labels = batch_labels.to(device)
        logits = model(batch_inputs)
        # Index of the largest logit in each row is the predicted class.
        predicted = logits.data.max(dim=1).indices
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
print('accuracy of the network is {}%'.format(100*correct/total))

In [None]:
# build the fully connected neural network which contains three hidden layers
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# set hyperparameters
# Layer widths: input, three hidden layers, output.
input_size = 12
hidden_size_1 = hidden_size_2 = hidden_size_3 = 84
num_classes = 2
# Training schedule.
num_epoch, batch_size = 10, 5
learning_rate = 1e-3

In [None]:
# import dataset and normalize it
import pandas as pd
# Read from the same location as the rest of the notebook; a second absolute
# path under a different user's home made a full top-to-bottom run impossible
# on a single machine.
df = pd.read_csv('/Users/yuqinhan1229/Desktop/heart_failure_clinical_records_dataset.csv')
# Min-max scale every column to [0, 1] (vectorised form of the per-column lambda).
# NOTE(review): the min/max come from all rows, before the train/test split.
col_min, col_max = df.min(), df.max()
df_change2 = (df - col_min) / (col_max - col_min)
df_change2.head()

In [None]:
# divide the dataset
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_change2, random_state=2, test_size=0.3)
# Columns 0-11 are the network inputs; column 12 is the label.
X_train, Y_train = train.iloc[:, 0:12], train.iloc[:, 12]
X_test, Y_test = test.iloc[:, 0:12], test.iloc[:, 12]
# Wrap the splits as tensor datasets. The loaders now honour the `batch_size`
# hyperparameter instead of a hard-coded 5 that silently ignored it.
trainset = torch.utils.data.TensorDataset(torch.from_numpy(X_train.values),
                                          torch.from_numpy(Y_train.values))
train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
testset = torch.utils.data.TensorDataset(torch.from_numpy(X_test.values),
                                         torch.from_numpy(Y_test.values))
test_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                          shuffle=False, num_workers=2)

In [None]:
# build the network
class tripleNet(nn.Module):
    """Fully connected classifier with three hidden layers.

    Layers are exposed as ``fc1`` .. ``fc4``; ``forward`` returns raw logits.
    """

    def __init__(self, inputs, hidden_size_1, hidden_size_2, hidden_size_3, outputs):
        super().__init__()
        # Each layer's output width is the next layer's input width.
        self.fc1 = nn.Linear(in_features=inputs, out_features=hidden_size_1)
        self.fc2 = nn.Linear(in_features=hidden_size_1, out_features=hidden_size_2)
        self.fc3 = nn.Linear(in_features=hidden_size_2, out_features=hidden_size_3)
        self.fc4 = nn.Linear(in_features=hidden_size_3, out_features=outputs)

    def forward(self, x):
        # ReLU follows every hidden layer; the output layer stays linear.
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden_layer(x))
        return self.fc4(x)

In [None]:
# Use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Three-hidden-layer network on `device`, trained with cross-entropy and Adam.
model = tripleNet(input_size, hidden_size_1, hidden_size_2, hidden_size_3, num_classes)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# train the model
total_step = len(train_loader)
for epoch in range(num_epoch):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.reshape(-1, 12).to(device).float()
        # Labels must be on the same device as the logits; they were left on
        # the CPU, which fails in the loss whenever `device` is CUDA.
        labels = labels.to(device).long()   # CrossEntropyLoss needs integer class ids
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Log every 5th step of every 5th epoch. The format string gains the
        # ':' after 'loss' that the other two training loops already print.
        if (i + 1) % 5 == 0 and (epoch + 1) % 5 == 0:
            print('epoch[{}/{}],step[{}/{}],loss:{:.4f}'.format(epoch+1,num_epoch,i+1,total_step,loss.item()))

In [None]:
# test the model and get the accuracy
# Accumulate hit and sample counts batch by batch with gradients disabled.
correct, total = 0, 0
with torch.no_grad():
    for features, targets in test_loader:
        features = features.reshape(-1, 12).to(device).float()
        targets = targets.to(device)
        scores_per_class = model(features)
        # Predicted class = position of the row-wise maximum.
        predicted = scores_per_class.data.max(dim=1).indices
        hits = (predicted == targets).sum().item()
        total += targets.size(0)
        correct += hits
print('accuracy of the network is {}%'.format(100*correct/total))

In [None]:
# plot the ROC curve
from sklearn.metrics import roc_curve,auc,roc_auc_score
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

In [None]:
# Fit scikit-learn's MLP on the current train split and draw its ROC curve.
mlp = MLPClassifier(random_state=1, max_iter=300)
mlp = mlp.fit(X_train, Y_train)
y_pred = mlp.predict(X_test)
y_predprob = mlp.predict_proba(X_test)
# Column 1 of predict_proba is used as the score for label 1.
fpr, tpr, thresholds = roc_curve(Y_test, y_predprob[:, 1], pos_label=1)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, 'k--', label='ROC (area = {0:.2f})'.format(roc_auc), lw=2)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
# Axis labels corrected: the rates are "Positive", not "Position", matching the
# other ROC plots in this notebook.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Preview the current `heart` table before it is used for cross-validation.
heart.head()

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from patsy import dmatrices
# Five consecutive folds (KFold defaults otherwise).
kf = KFold(n_splits=5)
# Build the patsy formula "DEATH_EVENT ~ a + b + ..." from the predictor list.
predictors = [
    'age', 'anaemia', 'creatinine_phosphokinase',
    'ejection_fraction', 'high_blood_pressure',
    'platelets', 'serum_creatinine', 'serum_sodium', 'smoking', 'time',
]
formula = 'DEATH_EVENT ~ ' + ' + '.join(predictors)
# dmatrices returns (response, design matrix) in that order.
y, X = dmatrices(formula, heart)

In [None]:
# 5-fold cross validation of decision tree
# Fit a depth-5 tree on each training fold and record its held-out accuracy.
scores = []
for train, test in kf.split(X):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    clf = tree.DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
    scores += [clf.score(X_test, y_test)]
print(scores)
# Mean accuracy over the five folds.
sums = sum(scores)
D_sc = sums/5
print('Scores of Decision tree is: ' + str(D_sc))

In [None]:
# 5-fold cross validation of logistic regression
scores = []
for train, test in kf.split(X):
    X_train, X_test = X[train], X[test]
    # patsy returns y as an (n, 1) column; flatten it to the 1-D target
    # scikit-learn expects (avoids a DataConversionWarning on every fold).
    y_train, y_test = y[train].ravel(), y[test].ravel()
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    scores.append(lr.score(X_test, y_test))
print(scores)
# Average over however many folds were run instead of a hard-coded 5.
L_sc = sum(scores) / len(scores)
# Label fixed: this is the logistic-regression score, not the decision tree's.
print('Scores of Logistic regression is: ' + str(L_sc))

In [None]:
# Cross-validation setup for the fully connected neural network
# (KFold is configured with 10 splits here).
from sklearn.model_selection import KFold,cross_val_score
from patsy import dmatrices
kf = KFold(n_splits=10)
# Response and design matrix built from `df` with a patsy formula.
formula = ('DEATH_EVENT ~ age + anaemia + creatinine_phosphokinase  + ejection_fraction'
           ' + high_blood_pressure + platelets + serum_creatinine + serum_sodium  + time')
y, X = dmatrices(formula, data=df)

In [None]:
# Cross validation of the MLP over the folds defined by `kf`.
scores = []
# Split the design matrix `X`; the lowercase `x` used before is not defined
# anywhere at notebook level and raised NameError.
for train, test in kf.split(X):
    X_train, X_test = X[train], X[test]
    # patsy returns y as an (n, 1) column; flatten it to the 1-D target
    # scikit-learn expects.
    y_train, y_test = y[train].ravel(), y[test].ravel()
    mlp = MLPClassifier()
    mlp = mlp.fit(X_train, y_train)
    scores.append(mlp.score(X_test, y_test))
print(scores)
# Average over the actual number of folds instead of a hard-coded 10.
# NOTE(review): `D_sc` is also the name of the decision-tree CV score and is
# overwritten here; confirm no later cell expects the tree's value.
D_sc = sum(scores) / len(scores)
print('Scores of MLP is:'+str(D_sc))