In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder.appName("OfflineDiabetes").getOrCreate()

In [8]:
# df = spark.read.csv(path="/home/jovyan/work/data/offline.csv", header=True, inferSchema=True)
df = spark.read.csv(path="/data/offline.csv", header=True, inferSchema=True) # path from the mounted volume

In [9]:
df.show()

+------------+------+--------+---------+----+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+--------+---+----+---------+------+
|Diabetes_012|HighBP|HighChol|CholCheck| BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|MentHlth|PhysHlth|DiffWalk|Sex| Age|Education|Income|
+------------+------+--------+---------+----+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+--------+---+----+---------+------+
|         0.0|   0.0|     1.0|      1.0|20.0|   1.0|   0.0|                 0.0|         1.0|   1.0|    1.0|              0.0|          1.0|        0.0|    2.0|     0.0|     0.0|     0.0|1.0|12.0|      6.0|   8.0|
|         0.0|   0.0|     0.0|      1.0|34.0|   0.0|   0.0|                 0.0|         1.0|   0.0|    1.0|              0.0|          1.0|    

In [11]:
df.printSchema()

root
 |-- Diabetes_012: double (nullable = true)
 |-- HighBP: double (nullable = true)
 |-- HighChol: double (nullable = true)
 |-- CholCheck: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Smoker: double (nullable = true)
 |-- Stroke: double (nullable = true)
 |-- HeartDiseaseorAttack: double (nullable = true)
 |-- PhysActivity: double (nullable = true)
 |-- Fruits: double (nullable = true)
 |-- Veggies: double (nullable = true)
 |-- HvyAlcoholConsump: double (nullable = true)
 |-- AnyHealthcare: double (nullable = true)
 |-- NoDocbcCost: double (nullable = true)
 |-- GenHlth: double (nullable = true)
 |-- MentHlth: double (nullable = true)
 |-- PhysHlth: double (nullable = true)
 |-- DiffWalk: double (nullable = true)
 |-- Sex: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Education: double (nullable = true)
 |-- Income: double (nullable = true)



In [12]:
label_col = "Diabetes_012"
feature_cols = [c for c in df.columns if c != label_col]

## VectorAssembler for merging multiple columns into a vector column.

we use VectorAssembler to combine multiple input feature columns (numeric, boolean, vector) into a single feature vector column

In [13]:
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

## Scaling the features

In [14]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withMean=True, withStd=True)

## Models and their hyperparameters

1. Logistic Regression
2. Random Forest
3. Gradient-Boosted Trees

In [15]:
models = [
    {
        "name": "Logistic Regression",
        "model": LogisticRegression(featuresCol="scaledFeatures", labelCol=label_col),
        "paramGrid": ParamGridBuilder().addGrid(LogisticRegression.regParam, values=[0.1, 0.01]).build()
    },
    {
        "name": "Random Forest",
        "model": RandomForestClassifier(featuresCol="scaledFeatures", labelCol=label_col),
        "paramGrid": ParamGridBuilder().addGrid(RandomForestClassifier.numTrees, values=[20, 50]).build()
    },
    {
        "name": "Decision Tree",
        "model": DecisionTreeClassifier(featuresCol="scaledFeatures", labelCol=label_col),
        "paramGrid": ParamGridBuilder().addGrid(DecisionTreeClassifier.maxDepth, values=[5, 10, 15]).build()
    }
]

# Evaluator (F1)

In [16]:
evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="f1")

## Cross-Validation for every model

In [20]:
best_f1_score = 0.0
best_overrall_model = None

for m in models:
  pipeline = Pipeline(stages=[assembler, scaler, m["model"]]) # take the model from the dict

  cv = CrossValidator(
      estimator=pipeline,
      estimatorParamMaps=m["paramGrid"],
      evaluator=evaluator,
      numFolds=5,
  )

  model_name = m["name"]

  print(f"Training: {model_name}")
  cv_model = cv.fit(df)

  f1 = evaluator.evaluate(cv_model.transform(df))

  print(f"{model_name} F1 Score: {f1}")

  if f1 > best_f1_score:
    best_f1_score = f1
    best_overrall_model = cv_model.bestModel

print(f"\nBest model: {model_name} with F1 score: {best_f1_score}")

Training: Logistic Regression
Logistic Regression F1 Score: 0.8073291652933868
Training: Random Forest
Random Forest F1 Score: 0.7699789551563437
Training: Decision Tree
Decision Tree F1 Score: 0.7942878650519836

Best model: Decision Tree with F1 score: 0.8073291652933868


## Serialization of the model

Saving the best model

In [29]:
# import os
# os.makedirs("/home/jovyan/work/saved_models", exist_ok=True)
# model_path = "/home/jovyan/work/saved_models/best_diabetes_model"

model_path = f"saved_models/best_diabetes_model"

best_overrall_model.write() \
    .overwrite() \
    .save(model_path)