In [4]:
import pandas as pd
import logging

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from imblearn.over_sampling import SMOTE

logging.basicConfig(level=logging.INFO)

FILEPATH = "https://raw.githubusercontent.com/chandanc5525/heartdisease_predictionmodel/refs/heads/main/data/raw/heart.csv"

TARGET = "target"

df = pd.read_csv(FILEPATH)
logging.info("Data Loaded Successfully")

df = df.drop_duplicates()

X = df.drop(TARGET, axis=1)
y = df[TARGET]

num_cols = X.select_dtypes(exclude="object").columns
cat_cols = X.select_dtypes(include="object").columns

num_imputer = SimpleImputer(strategy="median")
X[num_cols] = num_imputer.fit_transform(X[num_cols])

if len(cat_cols) > 0:
    cat_imputer = SimpleImputer(strategy="most_frequent")
    X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X[num_cols])

X = pd.DataFrame(X_pca)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print(report)


INFO:root:Data Loaded Successfully


Accuracy: 0.7704918032786885
              precision    recall  f1-score   support

           0       0.79      0.68      0.73        28
           1       0.76      0.85      0.80        33

    accuracy                           0.77        61
   macro avg       0.77      0.76      0.77        61
weighted avg       0.77      0.77      0.77        61

