In [1]:
import pandas as pd
from ydata_profiling import ProfileReport

# Inputs for the first profiling pass, named so they are easy to spot and tune.
csv_path = 'CVD_dataset.csv'
report_title = "Cardiovascular Dataset Overview Report"

# Load the CSV into a DataFrame and build a profiling report object for it.
# The report is only written to disk when `profile.to_file(...)` is called.
df = pd.read_csv(csv_path)
profile = ProfileReport(df, title=report_title)

In [2]:
# Write the overview profiling report out as an HTML file.
overview_report_path = "Cardio.html"
profile.to_file(overview_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/scikit-learn deprecation notices. Consider narrowing it.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()


# Drop exact duplicate rows (keeping the first occurrence) and take an explicit
# copy so the column assignments below never touch, or appear to touch, `df`.
data_without_duplicates = df.drop_duplicates(keep='first').copy()

# Fill gaps BEFORE label encoding. If encoding ran first, a NaN in a text
# column would be turned into just another category (or make the encoder fail,
# depending on the scikit-learn version), and the null check could no longer
# see it.
if data_without_duplicates.isnull().sum().any():
    print("There are missing values. Handle them before proceeding.")
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    # Forward fill cannot fill a gap in the very first row.
    data_without_duplicates = data_without_duplicates.ffill()

# Keep the target with its original labels; the copy inside the frame is
# label-encoded below together with the other text columns.
y = data_without_duplicates['Cardio_Disease']

# Encode every text (object-dtype) column as integer codes. The encoder is
# re-fitted per column, so remember each column's classes to allow decoding
# later: code `i` in column `col` corresponds to `label_classes[col][i]`.
label_classes = {}
for col in data_without_duplicates.columns:
    if data_without_duplicates[col].dtype == 'object':
        data_without_duplicates[col] = encoder.fit_transform(data_without_duplicates[col])
        label_classes[col] = encoder.classes_

In [4]:
from imblearn.over_sampling import SMOTE
import pandas as pd

target_col = 'Cardio_Disease'

# Feature matrix WITHOUT the target. Passing the whole frame to SMOTE would let
# the encoded target take part in the nearest-neighbour search and in the
# interpolation of synthetic rows, leaking the label into the features.
X = data_without_duplicates.drop(columns=[target_col])

# Apply SMOTE. A float `sampling_strategy` is the desired minority/majority
# ratio after resampling (0.8 -> minority grows to 80% of the majority count).
# NOTE(review): SMOTE interpolates between rows, so label-encoded categorical
# columns can receive non-integer codes; consider SMOTENC if that matters.
smote = SMOTE(sampling_strategy=0.8, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine the resampled features and target back into a DataFrame, restoring
# the column order of the input frame so downstream code sees the same layout.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled[target_col] = y_resampled
df_resampled = df_resampled[data_without_duplicates.columns]

print(pd.Series(y_resampled).value_counts())


Cardio_Disease
No     283103
Yes    226482
Name: count, dtype: int64


In [5]:
# Use the same package as the first profiling cell. `pandas_profiling` is the
# old, deprecated name of `ydata_profiling`; importing it here would rebind
# `ProfileReport` to a different (possibly missing) installation.
from ydata_profiling import ProfileReport

# Generate the profile report for the resampled data.
# NOTE: this rebinds `profile`, replacing the overview report object built earlier.
profile = ProfileReport(df_resampled, title="Balanced Dataset After SMOTE", explorative=True)

# Save the report to an HTML file
profile.to_file("balanced_dataset_smote_report.html")

# To view the report inline in a Jupyter notebook instead, call:
# profile.to_notebook_iframe()


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Feature Selection 

In [None]:
# Candidate model inputs: every column of `data` except the target column.
# NOTE(review): `data` and `TARGET` are not defined in any cell visible in this
# notebook section — confirm where they come from before running this cell.
extended_features = [column for column in data.columns if column != TARGET]

In [None]:
# Columns handled as numeric. Order is significant only in that it is preserved
# for anything that iterates over this list.
numerical_features = [
    'Age_Category',
    'Height', 'Weight', 'BMI',
    'Alcohol_Consumption',
    'Fruit_Consumption', 'Green_Vegetables_Consumption', 'FriedPotato_Consumption',
    'good_habits', 'bad_habits',
    'comorbities', 'health_status',
]

# Everything in `extended_features` that is not listed above is treated as
# categorical (original feature order is kept).
categorical_features = [
    name for name in extended_features if name not in numerical_features
]

## Correlation

In [1]:
import matplotlib.pyplot as plt
import numpy as np  # required by plot_heatmap; was never imported in the notebook
import seaborn as sns

def plot_heatmap(df, size=14):
    """Draw an annotated lower-triangle heatmap of a square matrix.

    The colour scale is fixed to [-1, 1] and centred on 0, so `df` is
    presumably a correlation matrix (e.g. the result of `DataFrame.corr()`)
    — confirm against the caller.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Matrix of values to plot.
    size : int or float, default 14
        Figure width in inches; the height is ``0.75 * size``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(size, size * 0.75))
    # Hide the upper triangle *and* the diagonal: for a symmetric matrix they
    # only repeat the lower triangle (and the diagonal of a correlation matrix
    # is all ones). An explicit boolean mask avoids relying on float->bool
    # coercion inside seaborn.
    upper_triangle = np.triu(np.ones_like(df, dtype=bool))
    sns.heatmap(
        data=df,
        mask=upper_triangle,
        vmin=-1, vmax=1, center=0,
        annot=True, linewidths=0.1,
        fmt='.2f', annot_kws={'fontsize': 8})
    plt.show()