In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

try:
    df = pd.read_csv('framingham.csv')
except FileNotFoundError:
    print("Error: 'framingham.csv' not found. Please ensure the file is in the correct directory.")
    exit()

print("Handling missing values by mean imputation...")
for column in df.columns:
    if df[column].isnull().any():
        df[column] = df[column].fillna(df[column].mean())
print("Missing values handled.\n")

X = df.drop('TenYearCHD', axis=1)
y = df['TenYearCHD']
feature_names = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

print("## Determining Most Important Features (using RandomForest):")
rf_model_fi = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_fi.fit(X_train_scaled_df, y_train)
importances = rf_model_fi.feature_importances_
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
print(feature_importance_df)
print("\n" + "="*60 + "\n")

print("## Logistic Regression without SMOTE:")
log_reg_no_smote = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
log_reg_no_smote.fit(X_train_scaled_df, y_train)
y_pred_lr_no_smote = log_reg_no_smote.predict(X_test_scaled_df)
accuracy_lr_no_smote = accuracy_score(y_test, y_pred_lr_no_smote)
print(f"Accuracy: {accuracy_lr_no_smote:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lr_no_smote, zero_division=0))
print("\n" + "="*60 + "\n")
print("## Ensemble (Logistic Regression + KNN) without SMOTE:")
knn_no_smote = KNeighborsClassifier(n_neighbors=5)
knn_no_smote.fit(X_train_scaled_df, y_train)

ensemble_clf_no_smote = VotingClassifier(
    estimators=[('lr', log_reg_no_smote), ('knn', knn_no_smote)],
    voting='hard'
)
ensemble_clf_no_smote.fit(X_train_scaled_df, y_train)
y_pred_ensemble_no_smote = ensemble_clf_no_smote.predict(X_test_scaled_df)
accuracy_ensemble_no_smote = accuracy_score(y_test, y_pred_ensemble_no_smote)
print(f"Accuracy: {accuracy_ensemble_no_smote:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_ensemble_no_smote, zero_division=0))
print("\n" + "="*60 + "\n")

print("Class distribution in original training data:")
print(y_train.value_counts())

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled_df, y_train)

print("\nClass distribution in SMOTE training data:")
print(y_train_smote.value_counts())
print(f"Shape of training data before SMOTE: {X_train_scaled_df.shape}")
print(f"Shape of training data after SMOTE: {X_train_smote.shape}")
print("\n" + "="*60 + "\n")

print("## Logistic Regression with SMOTE:")
log_reg_smote = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
log_reg_smote.fit(X_train_smote, y_train_smote)
y_pred_lr_smote = log_reg_smote.predict(X_test_scaled_df)
accuracy_lr_smote = accuracy_score(y_test, y_pred_lr_smote)
print(f"Accuracy: {accuracy_lr_smote:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lr_smote, zero_division=0))
print("\n" + "="*60 + "\n")
print("## Ensemble (Logistic Regression + KNN) with SMOTE:")
knn_smote = KNeighborsClassifier(n_neighbors=5)
knn_smote.fit(X_train_smote, y_train_smote)

# log_reg_smote is already trained
ensemble_clf_smote = VotingClassifier(
    estimators=[('lr', log_reg_smote), ('knn', knn_smote)],
    voting='hard'
)

ensemble_clf_smote.fit(X_train_smote, y_train_smote)
y_pred_ensemble_smote = ensemble_clf_smote.predict(X_test_scaled_df)
accuracy_ensemble_smote = accuracy_score(y_test, y_pred_ensemble_smote)
print(f"Accuracy: {accuracy_ensemble_smote:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_ensemble_smote, zero_division=0))
print("\n" + "="*60 + "\n")

X_train_smote_df_save = pd.DataFrame(X_train_smote, columns=feature_names)
y_train_smote_series_save = pd.Series(y_train_smote, name='TenYearCHD')
framingham_extended_df = pd.concat([X_train_smote_df_save, y_train_smote_series_save], axis=1)
try:
    framingham_extended_df.to_csv('framingham_extended.csv', index=False)
    print(f"SMOTE-augmented training dataset saved to 'framingham_extended.csv'")
    print(f"Shape of the extended dataset: {framingham_extended_df.shape}")
except Exception as e:
    print(f"Error saving 'framingham_extended.csv': {e}")

Handling missing values by mean imputation...
Missing values handled.

## Determining Most Important Features (using RandomForest):
            feature  importance
10            sysBP    0.130597
12              BMI    0.127503
1               age    0.126334
9           totChol    0.123492
14          glucose    0.118363
11            diaBP    0.110702
13        heartRate    0.095294
4        cigsPerDay    0.050460
2         education    0.041157
0              male    0.018898
7      prevalentHyp    0.018123
3     currentSmoker    0.012965
5            BPMeds    0.011895
8          diabetes    0.008236
6   prevalentStroke    0.005980


## Logistic Regression without SMOTE:
Accuracy: 0.8443
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.99      0.91       719
           1       0.41      0.05      0.10       129

    accuracy                           0.84       848
   macro avg       0.63      0.52      0.51       848
weig