In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                            precision_score, recall_score, f1_score, roc_auc_score)
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.pipeline import Pipeline
import joblib

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook
plt.style.use('ggplot')
# IPython magic selecting the inline matplotlib backend (figures render in cell output)
%matplotlib inline

In [None]:

# Load the raw dataset from the Excel workbook into the working frame `df`.
df = pd.read_excel("cardio_train.csv.xlsx")

# Quick structural overview: size, sample rows, dtypes, summary statistics,
# missing values and duplicated rows.
print("="*80)
print("Initial Data Inspection")
print("="*80)
print(f"Dataset Shape: {df.shape}")
print("\nFirst 5 Rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nDescriptive Statistics:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())
print("\nNumber of Duplicates:", df.duplicated().sum())

In [None]:

# Convert age from days to years
df['age_years'] = df['age'] / 365

# Remove outliers from blood pressure: keep rows whose systolic (ap_hi) and
# diastolic (ap_lo) readings both lie inside the inclusive bounds below.
# Series.between is inclusive on both ends, matching the >= / <= comparisons.
bp_in_range = df['ap_hi'].between(80, 200) & df['ap_lo'].between(50, 120)

# Take an explicit copy: later cells add columns to `df` (bmi, bp_diff, ...),
# and assigning into a boolean-filtered slice raises SettingWithCopyWarning.
df = df[bp_in_range].copy()

print("\nAfter cleaning - Dataset Shape:", df.shape)

In [None]:

# 2x3 grid of univariate distributions, drawn through explicit Axes handles.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
(ax_age, ax_bp, ax_chol), (ax_active, ax_cardio, ax_bmi) = axes

# Age distribution
sns.histplot(df['age_years'], bins=20, kde=True, color='skyblue', ax=ax_age)
ax_age.set_title('Age Distribution', fontsize=14)
ax_age.set_xlabel('Age (years)')

# Blood pressure distribution
sns.histplot(df['ap_hi'], bins=20, kde=True, color='salmon', ax=ax_bp)
ax_bp.set_title('Systolic BP Distribution', fontsize=14)
ax_bp.set_xlabel('Systolic BP')

# Cholesterol distribution (one bar per level, ordered by level)
chol_counts = df['cholesterol'].value_counts().sort_index()
chol_counts.plot(kind='bar', color=['lightblue', 'lightgreen', 'pink'], ax=ax_chol)
ax_chol.set_title('Cholesterol Levels', fontsize=14)
ax_chol.set_xlabel('Cholesterol Level')

# Physical activity
active_counts = df['active'].value_counts().sort_index()
active_counts.plot(kind='bar', color=['lightblue', 'lightgreen'], ax=ax_active)
ax_active.set_title('Physical Activity', fontsize=14)
ax_active.set_xlabel('Active (1 = yes, 0 = no)')

# Cardio disease distribution
cardio_counts = df['cardio'].value_counts().sort_index()
cardio_counts.plot(kind='bar', color=['lightblue', 'lightgreen'], ax=ax_cardio)
ax_cardio.set_title('Cardiovascular Disease', fontsize=14)
ax_cardio.set_xlabel('Cardio (1 = yes, 0 = no)')

# BMI calculation and distribution: weight divided by squared height, with
# height divided by 100 first. The column is stored on `df` because later
# cells read df['bmi'].
height_m = df['height'] / 100
df['bmi'] = df['weight'] / (height_m ** 2)
sns.histplot(df['bmi'], bins=20, kde=True, color='purple', ax=ax_bmi)
ax_bmi.set_title('BMI Distribution', fontsize=14)
ax_bmi.set_xlabel('BMI')

fig.tight_layout()
plt.show()

In [None]:

# 2x3 grid relating each predictor to the `cardio` target. Panels are filled
# row-major: positions 0, 1, 3 are box plots; positions 2, 4, 5 are count plots.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
panels = axes.ravel()
two_tone = ['lightblue', 'lightgreen']

# (axes, y column, title, y label) for the numeric predictors
box_specs = [
    (panels[0], 'age_years', 'Age vs Cardiovascular Disease', 'Age (years)'),
    (panels[1], 'ap_hi', 'Systolic BP vs Cardiovascular Disease', 'Systolic BP'),
    (panels[3], 'bmi', 'BMI vs Cardiovascular Disease', 'BMI'),
]
for ax, column, title, ylabel in box_specs:
    sns.boxplot(x='cardio', y=column, data=df, palette=two_tone, ax=ax)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Cardio (1 = yes, 0 = no)')
    ax.set_ylabel(ylabel)

# (axes, x column, title, x label) for the categorical predictors
count_specs = [
    (panels[2], 'cholesterol', 'Cholesterol vs Cardiovascular Disease',
     'Cholesterol Level'),
    (panels[4], 'active', 'Physical Activity vs Cardiovascular Disease',
     'Active (1 = yes, 0 = no)'),
    (panels[5], 'gender', 'Gender vs Cardiovascular Disease',
     'Gender (1 = female, 2 = male)'),
]
for ax, column, title, xlabel in count_specs:
    sns.countplot(x=column, hue='cardio', data=df, palette=two_tone, ax=ax)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel)
    # Replace the default hue legend entries with readable labels
    ax.legend(title='Cardio', labels=['No', 'Yes'])

fig.tight_layout()
plt.show()

In [None]:

# Pairwise correlations between the selected numeric columns and the target,
# shown as an annotated heatmap with the colour scale centred on zero.
corr_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo',
                'cholesterol', 'gluc', 'bmi', 'active', 'cardio']
corr = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix', fontsize=16)
plt.show()

In [None]:

# Create new features
df['bp_diff'] = df['ap_hi'] - df['ap_lo']      # difference between the two pressure readings
df['age_squared'] = df['age'] ** 2             # quadratic age term (age is in its original unit here)
df['wh_ratio'] = df['weight'] / df['height']   # weight-to-height ratio

# Feature Selection
# Candidate matrix: everything except the target, the row id and the derived
# age_years column.
X = df.drop(['cardio', 'id', 'age_years'], axis=1)
y = df['cardio']

# Method 1: SelectKBest (univariate ANOVA F-test; unaffected by feature scale)
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print("\nTop 10 features using SelectKBest:")
print(selected_features)

# Method 2: RFE with Logistic Regression
# RFE ranks features by the magnitude of the fitted coefficients, which is only
# meaningful when all features share a common scale. The candidates range from
# binary flags to age_squared, so they are standardised first; this also lets
# the solver converge within max_iter. A throwaway scaler is used so the
# `scaler` fitted on the training split below is not affected.
model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(StandardScaler().fit_transform(X), y)
selected_features = X.columns[fit.support_]
print("\nTop 10 features using RFE:")
print(selected_features)

# Final feature selection
# This list is fixed by hand; the two selectors above only print their picks
# and do not feed into it.
final_features = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                 'cholesterol', 'gluc', 'bmi', 'active', 'bp_diff']
X = df[final_features]

# Train-test split (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: fit on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:

print("\nTuning Logistic Regression...")

# Search grid: regularisation strength x penalty type, using the liblinear
# solver for every candidate.
lr_params = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# 5-fold cross-validated grid search on the standardised training data,
# selecting by accuracy and using all available cores.
lr_grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=lr_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
lr_grid.fit(X_train_scaled, y_train)

best_lr = lr_grid.best_estimator_
print("Best Logistic Regression Parameters:", lr_grid.best_params_)

In [None]:

print("\nTuning Random Forest...")

# Search grid over forest size, tree depth and the minimum split size.
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated grid search on the unscaled training data,
# selecting by accuracy; the forest itself is seeded for reproducibility.
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

best_rf = rf_grid.best_estimator_
print("Best Random Forest Parameters:", rf_grid.best_params_)

In [None]:

print("\nTuning KNN...")

# Search grid over neighbourhood size and vote weighting.
knn_params = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
)

# 5-fold cross-validated grid search on the standardised training data,
# selecting by accuracy.
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
knn_grid.fit(X_train_scaled, y_train)

best_knn = knn_grid.best_estimator_
print("Best KNN Parameters:", knn_grid.best_params_)

In [None]:

# Tuned estimators to compare on the held-out test split.
models = {
    'Logistic Regression': best_lr,
    'Random Forest': best_rf,
    'K-Nearest Neighbors': best_knn
}

# Each estimator must be scored on the same representation it was trained on:
# the forest was fit on raw features, the other two on standardised features.
# An explicit mapping avoids dispatching on the display name inside the loop.
test_inputs = {
    'Logistic Regression': X_test_scaled,
    'Random Forest': X_test,
    'K-Nearest Neighbors': X_test_scaled
}

results = {}
for name, model in models.items():
    X_eval = test_inputs[name]
    y_pred = model.predict(X_eval)
    # Probability of the positive class (column 1), needed for ROC AUC
    y_proba = model.predict_proba(X_eval)[:, 1]

    results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba)
    }

# Convert results to DataFrame (one row per model, one column per metric)
results_df = pd.DataFrame(results).T
print("\nModel Performance Comparison:")
print(results_df)

# Plot model comparison
# DataFrame.plot opens its own figure unless given an `ax`, so the size is
# passed through `figsize`; a preceding plt.figure() call would only leave an
# empty extra figure in the output.
ax = results_df.plot(kind='bar', colormap='viridis', figsize=(12, 6))
ax.set_title('Model Performance Comparison', fontsize=16)
ax.set_ylabel('Score')
plt.xticks(rotation=45, ha='right')
ax.legend(loc='lower right')
plt.tight_layout()
plt.show()

In [None]:

# The random forest is taken as the final model (based on results) and
# evaluated on the unscaled test features it was trained against.
best_model = best_rf
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Annotated confusion matrix with integer cell counts
class_names = ['No Disease', 'Disease']
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - Random Forest', fontsize=14)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

print("\nBest Model Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:

# Create pipeline with preprocessing and best model.
# The classifier is an unfitted clone carrying the tuned hyper-parameters, so
# refitting for deployment does not overwrite `best_rf`, which the evaluation
# and feature-importance cells still rely on as the train-split model.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', clone(best_rf))
])

# Train on full dataset.
# The pipeline is fit on the RAW features: it owns the scaling step, so the
# saved artifact can be given unscaled inputs at inference time. Pre-scaling X
# here would make the internal scaler learn from already-standardised data and
# then mis-scale real inputs, and would also refit the shared `scaler`.
pipeline.fit(X, y)

# Save the model and pipeline
joblib.dump(pipeline, 'cardio_model.pkl')
print("\nModel saved as 'cardio_model.pkl'")

In [None]:

# Pair every model input with the forest's importance score and rank them
# from most to least important.
importance_table = pd.DataFrame({
    'Feature': final_features,
    'Importance': best_rf.feature_importances_
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

# Horizontal bar chart, most important feature at the top
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance, palette='viridis', ax=ax)
ax.set_title('Feature Importance - Random Forest', fontsize=14)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

print("\nFeature Importance Table:")
print(feature_importance)