In [1]:
from lightgbm import LGBMClassifier
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
 # Load Data
dataset = pd.read_csv('data/HDD.csv')

# Column roles: label column(s), continuous inputs, and discrete inputs.
outputs_names = ["target"]
numerical = ["age", "trestbps", "thalach", "oldpeak"]
categorical = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]

# Pull every label column out of the frame, then drop them from the inputs.
outputs = {name: dataset[name] for name in outputs_names}
dataset = dataset.drop(columns=outputs_names)

# Impute categorical gaps with each column's most frequent value
# (value_counts() orders by descending count, so index[0] is the top value).
for column in categorical:
    most_frequent = dataset[column].value_counts().index[0]
    dataset[column] = dataset[column].fillna(most_frequent)

# Impute numerical gaps with the column mean.
for column in numerical:
    column_mean = dataset[column].mean()
    dataset[column] = dataset[column].fillna(column_mean)

# One-hot encode the categorical columns and standardize the numerical ones.
# Columns that appear in neither list are passed through unchanged
# (remainder='passthrough'); the transformer is configured to emit a DataFrame.
categorical_encoder = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False)
numerical_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', categorical_encoder, categorical),
        ('scaler', numerical_scaler, numerical),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
preprocessor = preprocessor.set_output(transform='pandas')
x_dataset = preprocessor.fit_transform(dataset)

In [3]:
# Fit a LightGBM classifier with default hyperparameters on the full
# preprocessed feature frame.
y_target = outputs["target"]
model = LGBMClassifier()
model.fit(x_dataset, y_target)

[LightGBM] [Info] Number of positive: 526, number of negative: 499
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 422
[LightGBM] [Info] Number of data points in the train set: 1025, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513171 -> initscore=0.052695
[LightGBM] [Info] Start training from score 0.052695


In [4]:
import pickle

lgbm_pickle_path = 'hdd_lgbm.pkl'

# Serialize the fitted model and write the bytes to disk.
with open(lgbm_pickle_path, 'wb') as sink:
    sink.write(pickle.dumps(model))

# Read the bytes back and rebuild the classifier as `clf2`.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
with open(lgbm_pickle_path, 'rb') as source:
    clf2 = pickle.loads(source.read())

In [5]:
# NOTE(review): x_dataset is the same data the model was fitted on, so this is
# training accuracy, not an estimate of performance on unseen data.
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
print(accuracy_score(outputs["target"], clf2.predict(x_dataset)))

1.0


In [6]:
# Second candidate: scikit-learn's histogram-based gradient boosting (HGB).
from sklearn.ensemble import HistGradientBoostingClassifier

# NOTE: rebinding `model` replaces the LightGBM classifier fitted earlier.
hgb_target = outputs["target"]
model = HistGradientBoostingClassifier()
model.fit(x_dataset, hgb_target)

In [7]:
import pickle

# Persist the model currently bound to `model` to disk.
with open('hdd_hgb.pkl', 'wb') as f:
    pickle.dump(model, f)

# Reload it as `clf2` to confirm the pickle round-trips.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
with open('hdd_hgb.pkl', 'rb') as f:
    clf2 = pickle.load(f)

# NOTE(review): this scores the reloaded model on x_dataset, the same frame
# earlier cells pass to model.fit, so it is training accuracy rather than an
# estimate of performance on unseen data.
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
print(accuracy_score(outputs["target"], clf2.predict(x_dataset)))

1.0
