In [None]:
!pip install pandas



In [None]:
!pip install pyspark



In [None]:
!pip install findspark



In [None]:
!pip install xgboost



In [None]:
import findspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the full dataset from a CSV located in the current working directory.
df = pd.read_csv("diabetes_binary_health_indicators_BRFSS2015.csv")

In [None]:
# Preview the first rows of the raw dataset.
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# Count rows per value of the target column to see the class balance.
df["Diabetes_binary"].value_counts()

Unnamed: 0_level_0,count
Diabetes_binary,Unnamed: 1_level_1
0.0,218334
1.0,35346


In [None]:
# Hold out 20% of the rows, stratified on the target so both parts keep the
# same class ratio; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["Diabetes_binary"],
)

In [None]:
# Class counts in the 80% part of the split.
train_df["Diabetes_binary"].value_counts()

Unnamed: 0_level_0,count
Diabetes_binary,Unnamed: 1_level_1
0.0,174667
1.0,28277


In [None]:
# Class counts in the 20% part of the split.
test_df["Diabetes_binary"].value_counts()

Unnamed: 0_level_0,count
Diabetes_binary,Unnamed: 1_level_1
0.0,43667
1.0,7069


In [None]:
# Persist the 80% part as "offline.csv"; the DataFrame index is not written.
train_df.to_csv("offline.csv", index=False)

In [None]:
# Persist the 20% part as "online.csv"; the DataFrame index is not written.
test_df.to_csv("online.csv", index=False)

In [None]:
# Read the file just written back in to check its contents.
online_df = pd.read_csv("online.csv")

In [None]:
# Preview the first rows of the reloaded "online" data.
online_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,5.0,5.0,8.0
1,1.0,1.0,0.0,1.0,36.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,3.0,0.0,0.0,1.0,0.0,10.0,3.0,4.0
2,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,5.0,5.0,8.0
3,0.0,0.0,0.0,1.0,22.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,4.0,7.0
4,1.0,1.0,1.0,1.0,26.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,4.0,0.0,7.0,0.0,0.0,13.0,5.0,2.0


In [None]:
# Initialise findspark before the Spark session is created.
# NOTE(review): pyspark is already imported in the imports cell above, so this
# call is presumably not what makes pyspark importable here — confirm it is
# still needed.
findspark.init()

In [None]:
# Create (or reuse) the Spark session for this notebook and request the Kafka
# SQL connector package for it.
# NOTE(review): the connector is pinned to 3.1.1 — confirm this matches the
# installed Spark version.
spark = (
    SparkSession.builder
    .appName("DiabetesIndicators")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1",
    )
    .getOrCreate()
)

In [None]:
# Load the offline CSV as a Spark DataFrame: comma-delimited, first line is
# the header, column types inferred from the data.
offline_df = (
    spark.read
    .option("delimiter", ",")
    .option("header", True)
    .csv("./offline.csv", inferSchema=True)
)

In [None]:
# Show the column names and inferred types of the Spark DataFrame.
offline_df.printSchema()

root
 |-- Diabetes_binary: double (nullable = true)
 |-- HighBP: double (nullable = true)
 |-- HighChol: double (nullable = true)
 |-- CholCheck: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Smoker: double (nullable = true)
 |-- Stroke: double (nullable = true)
 |-- HeartDiseaseorAttack: double (nullable = true)
 |-- PhysActivity: double (nullable = true)
 |-- Fruits: double (nullable = true)
 |-- Veggies: double (nullable = true)
 |-- HvyAlcoholConsump: double (nullable = true)
 |-- AnyHealthcare: double (nullable = true)
 |-- NoDocbcCost: double (nullable = true)
 |-- GenHlth: double (nullable = true)
 |-- MentHlth: double (nullable = true)
 |-- PhysHlth: double (nullable = true)
 |-- DiffWalk: double (nullable = true)
 |-- Sex: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Education: double (nullable = true)
 |-- Income: double (nullable = true)



In [None]:
# Load the same offline file with pandas for the scikit-learn workflow below.
offline_df_pd = pd.read_csv("offline.csv")

In [None]:
# Preview the first rows of the offline data.
offline_df_pd.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0.0,0.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,2.0,4.0,5.0
1,0.0,1.0,0.0,1.0,23.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,13.0,4.0,7.0
2,0.0,1.0,1.0,1.0,29.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,9.0,6.0,8.0
3,0.0,1.0,1.0,1.0,39.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,0.0,1.0,7.0,4.0,7.0
4,0.0,0.0,1.0,1.0,16.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,7.0,5.0,1.0


In [None]:
# Split the offline data again (80/20, stratified on the target, fixed seed)
# into the frames used for fitting and for evaluation.
offline_df_train, offline_df_test = train_test_split(
    offline_df_pd,
    test_size=0.2,
    random_state=42,
    stratify=offline_df_pd["Diabetes_binary"],
)

In [None]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
# on both frames; the old index is discarded.
offline_df_train, offline_df_test = (
    part.reset_index(drop=True)
    for part in (offline_df_train, offline_df_test)
)

In [None]:
# Class counts in the offline training frame.
offline_df_train["Diabetes_binary"].value_counts()

Unnamed: 0_level_0,count
Diabetes_binary,Unnamed: 1_level_1
0.0,139733
1.0,22622


In [None]:
# Class counts in the offline evaluation frame.
offline_df_test["Diabetes_binary"].value_counts()

Unnamed: 0_level_0,count
Diabetes_binary,Unnamed: 1_level_1
0.0,34934
1.0,5655


In [None]:
# Separate the training frame into the target column (y) and every remaining
# column as features (x).
offline_df_train_y = offline_df_train["Diabetes_binary"]
offline_df_train_x = offline_df_train.drop("Diabetes_binary", axis="columns")

In [None]:
# Preview the training features (target column removed).
offline_df_train_x.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0.0,1.0,27.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,4.0,7.0
1,0.0,1.0,1.0,30.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,2.0,0.0,0.0,1.0,8.0,3.0,6.0
2,0.0,1.0,1.0,27.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,2.0,0.0,7.0,0.0,0.0,3.0,6.0,8.0
3,0.0,0.0,1.0,44.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,4.0,21.0,2.0,0.0,0.0,6.0,5.0,1.0
4,1.0,0.0,1.0,31.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,3.0,2.0,0.0,0.0,0.0,9.0,5.0,6.0


In [None]:
# Preview the training target values.
offline_df_train_y.head()

Unnamed: 0,Diabetes_binary
0,0.0
1,0.0
2,0.0
3,0.0
4,1.0


In [None]:
# Separate the evaluation frame into the target column (y) and every remaining
# column as features (x).
offline_df_test_y = offline_df_test["Diabetes_binary"]
offline_df_test_x = offline_df_test.drop("Diabetes_binary", axis="columns")

In [None]:
# Preview the evaluation features (target column removed).
offline_df_test_x.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0.0,1.0,30.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,13.0,4.0,5.0
1,0.0,0.0,1.0,23.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,6.0,7.0
2,0.0,1.0,1.0,35.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,3.0,0.0,0.0,0.0,1.0,4.0,6.0,8.0
3,1.0,0.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,4.0,0.0,1.0,0.0,1.0,12.0,4.0,3.0
4,0.0,1.0,1.0,30.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,4.0,5.0,30.0,1.0,0.0,7.0,6.0,8.0


In [None]:
# Preview the evaluation target values.
offline_df_test_y.head()

Unnamed: 0,Diabetes_binary
0,0.0
1,0.0
2,0.0
3,1.0
4,0.0


In [None]:
# Number of rows available for training.
len(offline_df_train_x)

162355

In [None]:
# Feature scaler with default settings; it is fitted on the training features
# in the next cell.
scaler = RobustScaler()

In [None]:
# Fit the scaler on the training features only and scale them.
# The features are re-derived from `offline_df_train` rather than read back
# from `offline_df_train_x`, so re-running this cell cannot re-fit the scaler
# on values that were already scaled by a previous run.
offline_df_train_x = scaler.fit_transform(
    offline_df_train.drop(columns=["Diabetes_binary"])
)

In [None]:
# Scale the evaluation features with the already-fitted scaler (transform
# only, no re-fit). The features are re-derived from `offline_df_test` rather
# than read back from `offline_df_test_x`, so re-running this cell cannot
# scale the same values twice.
offline_df_test_x = scaler.transform(
    offline_df_test.drop(columns=["Diabetes_binary"])
)

In [None]:
from joblib import dump

In [None]:
# Save the fitted scaler to "scaler.joblib" in the working directory.
dump(scaler, "scaler.joblib")

['scaler.joblib']

In [None]:
!pip uninstall scikit-learn

Found existing installation: scikit-learn 1.5.2
Uninstalling scikit-learn-1.5.2:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/scikit_learn-1.5.2.dist-info/*
    /usr/local/lib/python3.11/dist-packages/scikit_learn.libs/libgomp-a34b3233.so.1.0.0
    /usr/local/lib/python3.11/dist-packages/sklearn/*
Proceed (Y/n)? y
  Successfully uninstalled scikit-learn-1.5.2


In [None]:
!pip install scikit-learn==1.5.2

Collecting scikit-learn==1.5.2
  Using cached scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Using cached scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.5.2


In [None]:
!pip install xgboost



In [None]:
import sklearn
import xgboost

In [None]:
# Print the scikit-learn and XGBoost versions the kernel actually imported.
print(f"scikit-learn version: {sklearn.__version__}")
print(f"XGBoost version: {xgboost.__version__}")

scikit-learn version: 1.5.2
XGBoost version: 2.1.3


In [None]:
# Print the pandas version the kernel actually imported.
print(f"Pandas version: {pd.__version__}")

Pandas version: 2.2.2


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from joblib import dump
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers, keyed by the short name used in log output and in the
# saved model's file name. All share the same seed.
models = {}
models["XGB"] = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=2,
    random_state=42,
)
models["RandomForest"] = RandomForestClassifier(random_state=42)
models["LogisticRegression"] = LogisticRegression(max_iter=1000, random_state=42)

# Grid-search space for each candidate; keys must match those of `models`.
hyperparameters = {
    "XGB": {"max_depth": [2, 3, 4, 5], "n_estimators": [50, 100, 200]},
    "RandomForest": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]},
    "LogisticRegression": {"C": [0.1, 1, 10], "penalty": ["l2"], "solver": ["lbfgs"]},
}

In [None]:
# Running "best so far" record filled in by the model-selection loop:
# the estimator, its F1 score, and its key in `models`.
best_model, best_f1_score, best_model_name = None, 0, None

In [None]:
# Tune every candidate with a 5-fold cross-validated grid search on the
# training split, choose the winner by its cross-validated F1, and report the
# F1 on the held-out split for information only. Selecting on the held-out
# score would let the test data drive model choice and make the reported
# number optimistic.
#
# The tracking variables are reset here so that re-running this cell starts
# from a clean state.
best_cv_f1 = float("-inf")  # selection criterion: mean cross-validated F1
best_model = None
best_f1_score = 0           # held-out F1 of the selected model
best_model_name = None

for model_name, model in models.items():
    print(f"Training {model_name}...")

    grid = GridSearchCV(
        estimator=model,
        param_grid=hyperparameters[model_name],
        scoring="f1",
        cv=5,
        n_jobs=-1,
    )
    grid.fit(offline_df_train_x, offline_df_train_y)

    # Estimator refitted on the full training split with the best parameters.
    best_estimator = grid.best_estimator_
    y_pred = best_estimator.predict(offline_df_test_x)
    f1 = f1_score(offline_df_test_y, y_pred)

    print(f"Best F1 Score for {model_name}: {f1}")
    print(f"Cross-validated F1 for {model_name}: {grid.best_score_}")
    print(f"Best Hyperparameters: {grid.best_params_}")

    # Compare on the cross-validated score; a NaN score never compares greater,
    # so a candidate whose fits all failed cannot be selected.
    if grid.best_score_ > best_cv_f1:
        best_cv_f1 = grid.best_score_
        best_f1_score = f1
        best_model = best_estimator
        best_model_name = model_name

# Refuse to write a "best_model_None.joblib" file containing None.
if best_model is None:
    raise RuntimeError("No candidate produced a valid cross-validated F1 score; nothing to save.")

print(f"\nBest Model: {best_model_name} with F1 Score: {best_f1_score}")
dump(best_model, f"best_model_{best_model_name}.joblib")
print(f"Best model saved as 'best_model_{best_model_name}.joblib'.")

Training XGB...
Best F1 Score for XGB: 0.2649561107359892
Best Hyperparameters: {'max_depth': 2, 'n_estimators': 100}
Training RandomForest...




Best F1 Score for RandomForest: 0.26142595978062155
Best Hyperparameters: {'max_depth': None, 'n_estimators': 200}
Training LogisticRegression...
Best F1 Score for LogisticRegression: 0.24686989657049538
Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}

Best Model: XGB with F1 Score: 0.2649561107359892
Best model saved as 'best_model_XGB.joblib'.
