In [3]:
import pandas as pd
import opendatasets as od
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib

In [4]:
od.download("https://www.kaggle.com/datasets/alexteboul/heart-disease-health-indicators-dataset")

Downloading heart-disease-health-indicators-dataset.zip to ./heart-disease-health-indicators-dataset


100%|██████████| 2.66M/2.66M [00:00<00:00, 57.5MB/s]







In [5]:
!mv "./heart-disease-health-indicators-dataset/heart_disease_health_indicators_BRFSS2015.csv" "../data/raw/df_raw.csv"

In [6]:
df_raw = pd.read_csv("../data/raw/df_raw.csv")
df_raw.head(5)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [7]:
df_raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HeartDiseaseorAttack,253680.0,0.094186,0.292087,0.0,0.0,0.0,0.0,1.0
HighBP,253680.0,0.429001,0.494934,0.0,0.0,0.0,1.0,1.0
HighChol,253680.0,0.424121,0.49421,0.0,0.0,0.0,1.0,1.0
CholCheck,253680.0,0.96267,0.189571,0.0,1.0,1.0,1.0,1.0
BMI,253680.0,28.382364,6.608694,12.0,24.0,27.0,31.0,98.0
Smoker,253680.0,0.443169,0.496761,0.0,0.0,0.0,1.0,1.0
Stroke,253680.0,0.040571,0.197294,0.0,0.0,0.0,0.0,1.0
Diabetes,253680.0,0.296921,0.69816,0.0,0.0,0.0,0.0,2.0
PhysActivity,253680.0,0.756544,0.429169,0.0,1.0,1.0,1.0,1.0
Fruits,253680.0,0.634256,0.481639,0.0,0.0,1.0,1.0,1.0


In [8]:
df_raw = df_raw.drop(["CholCheck", "Fruits", "Veggies", "AnyHealthcare", "NoDocbcCost", "GenHlth", "MentHlth", "PhysHlth", "DiffWalk", "Age", "Education", "Income"], axis=1).copy()

In [9]:
X = df_raw.drop(['HeartDiseaseorAttack'], axis=1).copy()
y = df_raw["HeartDiseaseorAttack"].copy()

In [10]:
X.head(5)

Unnamed: 0,HighBP,HighChol,BMI,Smoker,Stroke,Diabetes,PhysActivity,HvyAlcoholConsump,Sex
0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,1.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,27.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fa09045c150>

In [15]:
y_pred = model.predict(X_test)
y_pred_binary = np.round(y_pred)



In [16]:
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Precisión del modelo: {accuracy}')

Precisión del modelo: 0.9069497004099654


In [17]:
joblib.dump(model, '../models/relu_adam_42.pkl')

['../models/relu_adam_42.pkl']