https://medium.com/@aieeshashafique/gradient-boost-model-using-pyspark-mllib-solving-a-chronic-kidney-disease-problem-13039b6dc099

In [29]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [30]:
df = spark.read.csv("data/diabetes.csv",inferSchema=True,header=True)
df.show(10)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [31]:
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [32]:
df.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [33]:
from pyspark.ml.feature import VectorAssembler
assembler=VectorAssembler(inputCols=[
    'Pregnancies','Glucose','BloodPressure',
    'SkinThickness','Insulin','BMI',
    'DiabetesPedigreeFunction','Age'],outputCol='features')
output=assembler.transform(df)
output.select('features','Outcome').show(5)

+--------------------+-------+
|            features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|      1|
|[1.0,85.0,66.0,29...|      0|
|[8.0,183.0,64.0,0...|      1|
|[1.0,89.0,66.0,23...|      0|
|[0.0,137.0,40.0,3...|      1|
+--------------------+-------+
only showing top 5 rows



In [34]:
from pyspark.ml.feature import Normalizer

normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)
l1NormData = normalizer.transform(output)
l1NormData.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|       features_norm|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|[6.0,148.0,72.0,3...|[0.01737986889785...|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|[1.0,85.0,66.0,29...|[0.00418495842243...|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|[8.0,183.0,64.0,0...|[0.02572578881699...|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|[1.0,89.0,66.0,23...|[0.00310301706349...|
|     

In [35]:
(train, test) = l1NormData.randomSplit([0.8, 0.2])
train.describe().show()

+-------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------------+------------------+-------------------+
|summary|       Pregnancies|           Glucose|    BloodPressure|     SkinThickness|           Insulin|              BMI|DiabetesPedigreeFunction|               Age|            Outcome|
+-------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------------+------------------+-------------------+
|  count|               614|               614|              614|               614|               614|              614|                     614|               614|                614|
|   mean|3.8762214983713354|121.23941368078177| 68.8957654723127|20.855048859934854| 79.68892508143323|32.25651465798048|      0.4696791530944625| 33.16123778501629|0.36482084690553745|
| stddev| 3.418735643402662| 32.45691852057783|19.79418278405208|16.21

In [36]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="Outcome", featuresCol="features_norm", maxIter=10)

In [37]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[gbt])

model = pipeline.fit(train)
prediction = model.transform(train)
prediction.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+--------------------+--------------------+----------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|       features_norm|       rawPrediction|         probability|prediction|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+--------------------+--------------------+----------+
|          0|     57|           60|            0|      0|21.7|                   0.735| 67|      0|[0.0,57.0,60.0,0....|[0.0,0.2761159687...|[1.21391391381012...|[0.91892484885309...|       0.0|
|          0|     67|           76|            0|      0|45.3|                   0.194| 46|      0|[0.0,67.0,76.0,0....|[0.0,0.2857215962...|[1.19140789925138...|[0.91550750117612...|       0.0|
|          0|     73|    

# Model Evaluation(explain overfitting)

In [38]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator= BinaryClassificationEvaluator(labelCol="Outcome")
accuracy= evaluator.evaluate(prediction)
print(accuracy)

0.9661458333333334
