In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to module source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw dataset. pandas infers zip compression from the '.zip'
# extension and reads the CSV stored inside the archive.
df = pd.read_csv(filepath_or_buffer='../data/raw/archive.zip')

In [4]:
# Work on an independent deep copy so the frame loaded from disk stays
# available unchanged under the name `df`.
df_cleaned = df.copy(deep=True)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import SGDClassifier

In [6]:
# Numeric features: a single 'scaler' step that standardises each column.
num_transformer = Pipeline([('scaler', StandardScaler())])

In [7]:
# Nominal features: dense one-hot encoding. drop='first' leaves out the first
# category of every feature from the encoded output.
cat_transformer = Pipeline([
    ('one_hot_encoder', OneHotEncoder(drop='first', sparse_output=False)),
])

In [8]:
# Age bands: OrdinalEncoder with no explicit `categories`, so the category
# order is determined from the data at fit time.
# NOTE(review): presumably the band labels (e.g. '70-74') already sort into
# age order on their own -- confirm, or pass an explicit ordered list the way
# the General_Health and Checkup encoders do.
age_ord_transformer = Pipeline([('ordinal_encoder', OrdinalEncoder())])

In [23]:
# General health: ordinal codes follow the explicit list below, from
# 'Poor' (0) up to 'Excellent' (4).
health_ord_transformer = Pipeline([
    (
        'ordinal_encoder',
        OrdinalEncoder(
            categories=[['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']],
        ),
    ),
])

In [9]:
# Time since last checkup: ordinal codes follow the explicit list below, from
# 'Within the past year' (0) to 'Never' (4).
checkup_ord_transformer = Pipeline([
    (
        'ordinal_encoder',
        OrdinalEncoder(
            categories=[[
                'Within the past year',
                'Within the past 2 years',
                'Within the past 5 years',
                '5 or more years ago',
                'Never',
            ]],
        ),
    ),
])

In [17]:
# Every float64 column of the cleaned frame is treated as a numeric feature.
num_cols = df_cleaned.select_dtypes(include=['float64']).columns.tolist()

In [19]:
# Nominal (unordered) categorical columns that get one-hot encoded.
cat_cols = [
    'Arthritis',
    'Depression',
    'Diabetes',
    'Exercise',
    'Other_Cancer',
    'Sex',
    'Skin_Cancer',
    'Smoking_History',
]

In [22]:
from sklearn.compose import ColumnTransformer

In [24]:
# Route each group of columns to its own transformer. Columns are selected by
# name; any column not named here is discarded (remainder='drop' is the
# scikit-learn default, spelled out for clarity).
preprocessor = ColumnTransformer(
    [
        ('num_cols', num_transformer, num_cols),
        ('cat_cols', cat_transformer, cat_cols),
        ('age_col', age_ord_transformer, ['Age_Category']),
        ('health_col', health_ord_transformer, ['General_Health']),
        ('checkup_col', checkup_ord_transformer, ['Checkup']),
    ],
    remainder='drop',
)

In [25]:
# End-to-end model: preprocessing followed by a linear classifier trained by
# SGD with logistic loss and an elastic-net penalty. random_state is fixed so
# repeated fits give the same result.
sgd_pipe = Pipeline([
    ('preprocessor', preprocessor),
    (
        'sgd',
        SGDClassifier(loss='log_loss', penalty='elasticnet', random_state=42),
    ),
])

In [26]:
# Binary label: 'Yes' -> 1, 'No' -> 0 (any other value would map to NaN).
target = df_cleaned['Heart_Disease'].map({'Yes': 1, 'No': 0})

# The whole frame is passed as X; the preprocessor picks its input columns by
# name, so only the columns listed in its transformers reach the classifier.
sgd_pipe.fit(X=df_cleaned, y=target)

In [27]:
# Sanity check only: these predictions are for the same rows the pipeline was
# fitted on, so they are not a measure of performance on unseen data.
sgd_pipe.predict(X=df_cleaned)

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
# Take the first row as a one-row DataFrame. Selecting with a list ([0])
# keeps every column's original dtype, whereas going through a Series
# (iloc[0] -> DataFrame -> transpose) casts all columns to object.
obs = df_cleaned.iloc[[0]]

In [31]:
# Display the single-row frame that is passed to the pipeline below.
obs

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0


In [32]:
# Smoke test: the fitted pipeline also accepts a single-row DataFrame.
sgd_pipe.predict(X=obs)

array([0])

In [33]:
from joblib import dump

# Persist the whole fitted pipeline (preprocessing + classifier) as a single
# joblib file; the call returns the list of file names written.
dump(value=sgd_pipe, filename='../models/sgd_pipeline.joblib')

['../models/sgd_pipeline.joblib']