In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

# Load the data
CVD_cleaned = pd.read_csv("./Heart_desease.csv")
print(CVD_cleaned)

# Create dummy variables and drop original categorical columns
data = pd.get_dummies(CVD_cleaned, columns=[
    "Exercise", "Heart_Disease", "Skin_Cancer", 
    "Other_Cancer", "Depression", "Diabetes", 
    "Arthritis", "Smoking_History", "Sex"], drop_first=True)

# Convert General Health to numerical factors
general_health_map = {"Excellent": 5, "Very Good": 4, "Good": 3, "Fair": 2, "Poor": 1}
data['General_Health'] = data['General_Health'].map(general_health_map)

# Convert Last Checkup to numerical factors
checkup_map = {
    "Within the past year": 5,
    "Within the past 2 years": 4,
    "Within the past 5 years": 3,
    "5 or more years ago": 2,
    "Never": 1
}
data['Checkup'] = data['Checkup'].map(checkup_map)

# Convert Age Category to numeric
age_category_mapping = {
    "18-24": 21, "25-29": 27, "30-34": 32,
    "35-39": 37, "40-44": 42, "45-49": 47,
    "50-54": 52, "55-59": 57, "60-64": 62,
    "65-69": 67, "70-74": 72, "75-79": 77,
    "80+": 85
}
data['Age_numeric'] = data['Age_Category'].map(age_category_mapping)

# Drop the Age_Category column
data = data.drop(columns=['Age_Category'])

# Rename columns
data.columns = [
    "General_Health", "Last_checkup", "Height_cm",
    "Weight_kg", "BMI", "Alcohol_Consumption",
    "Fruit_Consumption", "Green_Vegetables_Consumption",
    "FriedPotato_Consumption", "Exercise",
    "Heart_Disease", "Skin_Cancer", "Other_Cancer",
    "Depression", "Pre_Diabetes", "Diabetes",
    "Diabetes_Pregnancy", "Arthritis", "Smoking_History",
    "Sex_Male", "Age_numeric"
]

# Check the new column names
print(data.columns)

# Split the data into training and testing sets
np.random.seed(12345)
prop_train = 0.8
train, test = train_test_split(data, train_size=prop_train, random_state=12345)

# Check proportions in the original dataset
print(train['Heart_Disease'].value_counts())
print(train['Heart_Disease'].value_counts(normalize=True))

# Balance the training dataset using under-sampling
ros = RandomOverSampler(sampling_strategy='minority')
X_train = train.drop(columns=['Heart_Disease'])
y_train = train['Heart_Disease']
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Combine the resampled data into a new DataFrame
train_balanced = pd.DataFrame(X_resampled, columns=X_train.columns)
train_balanced['Heart_Disease'] = y_resampled

# Verify balance after sampling
print(train_balanced['Heart_Disease'].value_counts())

# Remove irrelevant variables
train_balanced = train_balanced.drop(columns=['FriedPotato_Consumption'])
test = test.drop(columns=['FriedPotato_Consumption'])


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [2]:
#pip install pandas scikit-learn imbalanced-learn
#!pip install tensorflow

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Collecting joblib>=0.11
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, imbalanced-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.12.4 joblib-1.4.2
Note: you may need to restart the kernel to use updated packages.


In [8]:
train_balanced.head()

Unnamed: 0,General_Health,Last_checkup,Height_cm,Weight_kg,BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,Exercise,Skin_Cancer,Other_Cancer,Depression,Pre_Diabetes,Diabetes,Diabetes_Pregnancy,Arthritis,Smoking_History,Sex_Male,Age_numeric,Heart_Disease
0,3,5,173.0,181.44,60.82,3.0,16.0,8.0,0,0,0,0,0,0,0,1,0,0,57,0
1,4,5,168.0,52.16,18.56,12.0,90.0,30.0,1,0,0,0,0,0,0,0,0,0,52,0
2,2,5,160.0,54.43,21.26,0.0,60.0,0.0,0,1,1,0,0,0,0,0,0,0,85,0
3,4,5,178.0,81.19,25.68,0.0,90.0,30.0,1,0,0,0,0,0,0,0,0,0,62,0
4,2,5,168.0,86.18,30.67,0.0,30.0,12.0,1,0,0,1,0,0,0,1,1,1,72,0


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score

# Assuming `data` is the preprocessed DataFrame with the features and target
# Define feature and target variables
X = data.drop('Heart_Disease', axis=1)
y = data['Heart_Disease']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Scale features for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert target variable to categorical (if needed for binary classification)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Build the neural network model
model = Sequential()
model.add(Dense(units=32, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=2, activation='softmax'))  # Use softmax for binary classification with categorical target

# Compile the model
model.compile(
    loss='categorical_crossentropy',  # For categorical output
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=2
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

# Generate predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert probabilities to class predictions
y_test_classes = np.argmax(y_test, axis=1)

# Print evaluation metrics
print("Classification Report:")
print(classification_report(y_test_classes, y_pred_classes))
print("Accuracy:", accuracy_score(y_test_classes, y_pred_classes))


ModuleNotFoundError: No module named 'tensorflow'