## Packages

In [None]:
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split



## Data Import

In [None]:
# Download the dataset registered under this id in the UCI ML repository.
UCI_DATASET_ID = 891
cdc_diabetes_health_indicators = fetch_ucirepo(id=UCI_DATASET_ID)


In [None]:
# The fetched object exposes features and targets as pandas dataframes.
dataset_tables = cdc_diabetes_health_indicators.data
X = dataset_tables.features  # predictor columns
Y = dataset_tables.targets   # target variable

# Predictors and target joined column-wise into a single frame.
diabetes_data = pd.concat((X, Y), axis="columns")

## Data Preprocessing

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=14,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
# Over-sample the scaled training split with SMOTE to counter class imbalance.
# Only the training data is resampled; the test split is left untouched.
resampler = SMOTE(random_state=45)
X_train_sm, Y_train_sm = resampler.fit_resample(
    X_train_sc,
    Y_train,
)

## Training

In [None]:
# Hyper-parameters for the gradient-boosted classifier, kept together so they
# are easy to spot and tune.
xgb_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}
xg_model = XGBClassifier(**xgb_settings)

# Train on the scaled, SMOTE-resampled training data.
xg_model.fit(X_train_sm, Y_train_sm)

## Model Preparation

In [None]:
# Persist the fitted XGBoost classifier to disk with joblib.
joblib.dump(value=xg_model, filename='diabetes_model.pkl')

In [None]:
# Packages
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split

# --- Data import ---------------------------------------------------------
# Download the dataset registered under id 891 in the UCI ML repository.
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Features and target are exposed as pandas dataframes on the fetched object.
X = cdc_diabetes_health_indicators.data.features  # predictor columns
Y = cdc_diabetes_health_indicators.data.targets   # target variable

# Predictors and target joined column-wise; not used again in this cell.
diabetes_data = pd.concat((X, Y), axis="columns")

# --- Preprocessing -------------------------------------------------------
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=14,
)

# Learn scaling statistics from the training split only and reuse them for
# the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Over-sample the scaled training split with SMOTE to counter class imbalance.
smote_resampler = SMOTE(random_state=45)
X_train_resampled, Y_train_resampled = smote_resampler.fit_resample(
    X_train_scaled,
    Y_train,
)

# --- Training ------------------------------------------------------------
xgb_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}
xg_model = XGBClassifier(**xgb_settings)
xg_model.fit(X_train_resampled, Y_train_resampled)

# --- Persistence ---------------------------------------------------------
# Only the classifier is written here; the scaler is saved in a separate cell.
model_filename = 'diabetes_model_v0.1.pkl'
joblib.dump(xg_model, model_filename)

print(f"Model saved as {model_filename}")


In [None]:


# Relies on X_train from the train/test split still being in the kernel.
# A fresh scaler is fitted on the unscaled training features so that the same
# statistics can be re-applied to new inputs at prediction time.
scaler = StandardScaler().fit(X_train)

# Write the fitted scaler next to the model artifact.
joblib.dump(value=scaler, filename='scaler.pkl')


In [12]:
import joblib
import numpy as np
import pandas as pd

# Load the trained model and the scaler fitted on the training split.
# NOTE: joblib.load unpickles arbitrary objects -- only load artifacts that
# were produced by this notebook, never files from an untrusted source.
model_filename = 'diabetes_model_v0.1.pkl'
xg_model = joblib.load(model_filename)

scaler_filename = 'scaler.pkl'  # written by the scaler-saving cell
scaler = joblib.load(scaler_filename)

# One respondent profile. The order must match the column order of the
# training features, because values are passed to the model positionally.
# Alternative profiles tried during development (same feature order):
#   [0, 0, 1, 31.94, 1, 0, 0, 0, 0, 0, 0, 1, 0, 5, 15, 30, 1, 1, 10, 5, 1]
#   [1, 1, 1, 18,    0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0,  0,  1, 0, 11, 2, 4]
valuesList = [
    0,   # HighBP
    0,   # HighChol
    1,   # CholCheck
    35,  # BMI
    1,   # Smoker
    0,   # Stroke
    0,   # HeartDiseaseorAttack
    0,   # PhysActivity
    0,   # Fruits
    0,   # Veggies
    0,   # HvyAlcoholConsump
    1,   # AnyHealthcare
    0,   # NoDocbcCost
    5,   # GenHlth
    15,  # MentHlth
    30,  # PhysHlth
    1,   # DiffWalk
    1,   # Sex
    10,  # Age
    5,   # Education
    0    # Income  -- NOTE(review): 0 may be outside the dataset's coded range; confirm
]

# Convert to a 2D array with a single row, the shape the estimators expect.
input_data = np.array(valuesList).reshape(1, -1)

# The classifier was trained on standardized features, so the very same
# scaling has to be applied here; feeding raw values would put the input on a
# different scale from everything the model saw during training.
# When the scaler remembers the training column names, attach them so the
# input is matched against those names rather than treated as anonymous columns.
feature_names = getattr(scaler, "feature_names_in_", None)
if feature_names is not None:
    scaler_input = pd.DataFrame(input_data, columns=feature_names)
else:
    scaler_input = input_data
input_data_scaled = scaler.transform(scaler_input)

# Make a prediction on the scaled profile.
prediction = xg_model.predict(input_data_scaled)

# Output the result
print(f"Predicted outcome: {prediction[0]}")


Predicted outcome: 1
