In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("heart.csv")

In [2]:
df = df.rename({"output": "target"}, axis = 1)
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
df.nunique()

age          41
sex           2
cp            4
trtbps       49
chol        152
fbs           2
restecg       3
thalachh     91
exng          2
oldpeak      40
slp           3
caa           5
thall         4
target        2
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [5]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, LabelEncoder

#divisao treino e teste
from sklearn.model_selection import train_test_split

#modelos
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB


#metricas
from sklearn.metrics import accuracy_score

#validacao
from sklearn.model_selection import GridSearchCV


In [6]:
def onehot_encode(df, column_dict):
    df = df.copy()
    for column, prefix in column_dict.items():
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [7]:
def preprocess_inputs(df, scaler):
    df = df.copy()
    
    nominal_features = ['cp']
    df = onehot_encode(df, dict(zip(nominal_features, ['CP'])))
    
    y = df['target'].copy()
    X = df.drop(['thall','slp','oldpeak','target'], axis=1).copy()
        
    return X, y

## Preprocessing

In [8]:
scaler = RobustScaler()
X,y = preprocess_inputs(df,scaler)

In [9]:
X.head()

Unnamed: 0,age,sex,trtbps,chol,fbs,restecg,thalachh,exng,caa,CP_0,CP_1,CP_2,CP_3
0,63,1,145,233,1,0,150,0,0,0,0,0,1
1,37,1,130,250,0,1,187,0,0,0,0,1,0
2,41,0,130,204,0,0,172,0,0,0,1,0,0
3,56,1,120,236,0,1,178,0,0,0,1,0,0
4,57,0,120,354,0,1,163,1,0,1,0,0,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)
print("Shape X_train", X_train.shape)
print("Shape X_test",X_test.shape)
print("Shape y_train",y_train.shape)
print("Shape y_test",y_test.shape)

Shape X_train (242, 13)
Shape X_test (61, 13)
Shape y_train (242,)
Shape y_test (61,)


## Buscando o Melhor Modelo

In [11]:
gnb = GaussianNB()

gnb.fit(X_train,y_train)

y_pred = gnb.predict(X_test)

print(f"A precisao foi de {accuracy_score(y_test, y_pred)*100:.2f}%")

A precisao foi de 80.33%


In [12]:
clf = SVC(kernel='linear', C=1, random_state = 0).fit(X_train,y_train)

y_pred = clf.predict(X_test)

print(f"A precisao do teste SVC eh de {(accuracy_score(y_test,y_pred))*100:.2f}%")

A precisao do teste SVC eh de 80.33%


In [13]:
svm = SVC()

parameters = {"C":np.arange(1,10,1),"gamma":[0.00001,0.00005, 0.0001,0.0005,
                                             0.001,0.005,0.01,0.05,0.1,0.5,1,5]
             }

searcher = GridSearchCV(svm, parameters)

searcher.fit(X_train, y_train)

print(f"O melhor parametro eh: {searcher.best_params_}")
print(f"A melhor pontuacao foi de {searcher.best_score_*100:.2f}%:")

y_pred = searcher.predict(X_test)

print(f"A precisao do teste depois de usar GridSearchCV foi de {accuracy_score(y_test, y_pred)*100:.2f}% ")


O melhor parametro eh: {'C': 5, 'gamma': 0.0005}
A melhor pontuacao foi de 67.39%:
A precisao do teste depois de usar GridSearchCV foi de 68.85% 


In [14]:
logreg = LogisticRegression()

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(f"A precisao foi de {accuracy_score(y_test, y_pred)*100:.2f}%")

A precisao foi de 81.97%


In [15]:
dt = DecisionTreeClassifier(random_state = 0)

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

print(f"A precisao foi de {accuracy_score(y_test, y_pred)*100:.2f}%")


A precisao foi de 75.41%


In [16]:
rf = RandomForestClassifier()

rf.fit(X_train, y_train)

y_pred = dt.predict(X_test)

print(f"A precisao foi de {accuracy_score(y_test, y_pred)*100:.2f}%")

A precisao foi de 75.41%


In [17]:
gbt = GradientBoostingClassifier(n_estimators = 300,max_depth=1,subsample=0.8,max_features=0.2,random_state=0)

gbt.fit(X_train,y_train)

y_pred = gbt.predict(X_test)

print(f"A precisao foi de {accuracy_score(y_test, y_pred)*100:.2f}%")

A precisao foi de 80.33%


In [18]:
mlp = MLPClassifier(hidden_layer_sizes = (10,), solver = "lbfgs", random_state=0)

mlp.fit(X_train,y_train)

y_pred = mlp.predict(X_test)

print(f"A precisao foi de {accuracy_score(y_test, y_pred)*100:.2f}%")

A precisao foi de 81.97%


## Testando o Modelo Escolhido

In [19]:
def criaDummies(array):
    newArr = array[:2] + array[3:]
    if array[2]== 0:        
        newArr += [1,0,0,0]
    elif array[2]== 1:        
        newArr += [0,1,0,0]
    elif array[2]== 2:        
        newArr += [0,0,1,0]                            
    else:        
        newArr += [0,0,0,1]                        
    return newArr

In [20]:
input_array = [59,"Man","Typical Angina",110,239,"False","Normal",142, "Yes", 0]
transdict = {
    "Asymptomatic":3,
    "Atypical Angina":1,
    "Non-anginal Pain":2,
    "Typical Angina":0,
    "Woman":0,
    "Man":1,
    "Yes":1,
    "No":0,
    "True":1,
    "False":0,
    "Having ST-T":1,
    "Normal":0,
    "Hypertrophy":2,
}
new = [transdict.get(i, i) for i in input_array]
print("1",new)
new = criaDummies(new)
print("2",new)
new = np.asarray(new)

1 [59, 1, 0, 110, 239, 0, 0, 142, 1, 0]
2 [59, 1, 110, 239, 0, 0, 142, 1, 0, 1, 0, 0, 0]


In [21]:
reshaped_dummies = new.reshape(1,-1)
print(reshaped_dummies)

[[ 59   1 110 239   0   0 142   1   0   1   0   0   0]]


In [22]:
prediction = mlp.predict(reshaped_dummies)
print(prediction)

if (prediction[0] == 0):
    print("The Person does not have a Heart Disease")
else:
    print('The Person has Heart Disease')

[0]
The Person does not have a Heart Disease


In [23]:
import pickle

In [24]:
dados = {"model":mlp, "dict":transdict}
with open("saved_data.pkl", "wb") as file:
    pickle.dump(dados,file)
    
    