In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the raw data; the file uses ';' as its field separator.
df = pd.read_csv('cardio_train.csv', sep=';')

In [None]:
# Display the loaded frame (a bare last expression is rendered as the cell output).
df

# Information about the data :

Source : https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset

In [None]:
# Count how many records fall into each value of the binary target `cardio`.
num_counts = df['cardio'].value_counts()

# Plot the share of each target value as a pie chart.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    num_counts,
    # Label each slice with the target value it represents instead of a bare 0/1.
    labels=[f'cardio = {value}' for value in num_counts.index],
    autopct='%1.1f%%',
    startangle=90,
)
# The previous title mentioned "(num)" and "different heart diseases", which
# does not describe this dataset's binary `cardio` target.
ax.set_title('Share of Records by Cardiovascular Disease Status (cardio)')
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()

The target variable (`cardio`) is reasonably well distributed across its classes.

# Splitting the dataset

In [None]:
# Target variable: the `cardio` column.
y = df['cardio']

# Features: every column except the target and the `id` column.
data = df.drop(columns='cardio')
X = data.drop(columns='id')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Display the training features produced by the split.
X_train

In [None]:
# Display the training targets produced by the split.
y_train

Normalizing the dataset:

In [None]:
# Fit the min-max scaler on the training split only, then apply the same
# fitted scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the model

In [None]:
# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[100, 300, 500],      # number of trees
    max_depth=[None, 10, 20, 30],      # maximum depth of the tree
    min_samples_split=[2, 5, 10],      # minimum number of samples required to split an internal node
    min_samples_leaf=[1, 2, 4],        # minimum number of samples required to be at a leaf node
    class_weight=['balanced', None],   # try with and without class re-weighting
)

In [None]:
# Randomized hyperparameter search over `param_grid` for a random forest,
# scored by 3-fold cross-validated accuracy on the scaled training data.
# (RandomizedSearchCV is imported in the notebook's import cell.)
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=20,  # Test only 20 random combinations instead of all
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    # Seed the sampling of parameter combinations; without it every run
    # evaluates a different set of 20 candidates and results are not repeatable.
    random_state=42,
)
random_search.fit(X_train_scaled, y_train)
print("Best Parameters:", random_search.best_params_)


In [None]:


# Take the best estimator found by the search (no retraining happens here)
# and predict on the scaled test features.
best_rf_model = random_search.best_estimator_
y_pred_tuned = best_rf_model.predict(X_test_scaled)

# Compute the evaluation metrics for the tuned model, then report them.
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_report = classification_report(y_test, y_pred_tuned)
tuned_confusion = confusion_matrix(y_test, y_pred_tuned)

print("Tuned Accuracy:", tuned_accuracy)
print("Classification Report:\n", tuned_report)
print("Confusion Matrix:\n", tuned_confusion)


# Improving the model

Feature Engineering

In [None]:
# pandas and numpy are already imported in the notebook's first cell.

# Pair each training column with the importance the tuned forest assigned to it.
feature_names = X_train.columns
feature_importances = best_rf_model.feature_importances_

# Build the table and order it from most to least important in one chain.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(feature_importance_df)  # View feature importance


Age | Objective Feature | age | int (days) |

Height | Objective Feature | height | int (cm) |

Weight | Objective Feature | weight | float (kg) |

Gender | Objective Feature | gender | categorical code |

Systolic blood pressure | Examination Feature | ap_hi | int |

Diastolic blood pressure | Examination Feature | ap_lo | int |

Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |

Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |

Smoking | Subjective Feature | smoke | binary |

Alcohol intake | Subjective Feature | alco | binary |

Physical activity | Subjective Feature | active | binary |

Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

Systolic blood pressure has the highest importance. Based on the importances, some features, such as smoking, are candidates for removal. However, this must be examined thoroughly before drawing conclusions. For now, we measure the performance of a machine learning model that uses only the features with higher importance.

In [None]:
# Features to remove from both splits because of their low importance.
low_importance_features = ['active', 'smoke', 'gender', 'alco']

# Apply the same column removal to the training and test features.
X_train_filtered, X_test_filtered = (
    split.drop(columns=low_importance_features) for split in (X_train, X_test)
)


In [None]:
# Re-fit a fresh min-max scaler on the reduced training features, then scale
# both reduced splits with it.
scaler = MinMaxScaler()
scaler.fit(X_train_filtered)
X_train_scaled = scaler.transform(X_train_filtered)
X_test_scaled = scaler.transform(X_test_filtered)

In [None]:
# Repeat the randomized hyperparameter search on the reduced feature set,
# again scored by 3-fold cross-validated accuracy.
# (RandomizedSearchCV is imported in the notebook's import cell.)
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=20,  # Test only 20 random combinations instead of all
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    # Seed the sampling of parameter combinations so this run is repeatable
    # instead of evaluating a different set of 20 candidates each time.
    random_state=42,
)
random_search.fit(X_train_scaled, y_train)
print("Best Parameters:", random_search.best_params_)


In [None]:


# Take the best estimator found by the search (no retraining happens here)
# and predict on the scaled test features.
best_rf_model = random_search.best_estimator_
y_pred_tuned = best_rf_model.predict(X_test_scaled)

# Evaluate the tuned model: print each metric under its heading.
tuned_metrics = (
    ("Tuned Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
)
for heading, metric in tuned_metrics:
    print(heading, metric(y_test, y_pred_tuned))


Dropping those features reduced the accuracy. Instead of removing features, we now try combining existing features into something more useful.
Height and weight can be combined into the body mass index (BMI).

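$$\mathrm{BMI} = \frac{\text{weight (kg)}}{\left(\text{height (m)}\right)^2}$$

The dataset records height in centimetres, so it is divided by 100 before squaring.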

In [None]:
def add_bmi(features):
    """Return `features` with a `BMI` column replacing `height` and `weight`.

    BMI is computed as weight divided by the square of height, where the
    `height` column is first divided by 100 (centimetres to metres).

    A new frame is returned; the input frame is not modified. A frame that
    already has `BMI` and no longer has `height`/`weight` is returned as is,
    so re-running the cell does not fail.
    """
    source_columns = ['height', 'weight']
    already_converted = (
        'BMI' in features.columns
        and not set(source_columns) & set(features.columns)
    )
    if already_converted:
        return features

    height_m = features['height'] / 100
    # `assign` builds a new frame instead of writing a column into a slice
    # produced by train_test_split.
    with_bmi = features.assign(BMI=features['weight'] / height_m ** 2)
    # The result of `drop` must be kept: the earlier version discarded it, so
    # `height` and `weight` were never actually removed.
    return with_bmi.drop(columns=source_columns)


# Apply the identical transformation to both splits so their columns stay aligned.
X_train = add_bmi(X_train)
X_test = add_bmi(X_test)

X_train.head()