In [1]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# Load the diabetes CSV. The first row holds the column names, and Spark
# samples the data to infer column types instead of reading everything as strings.
df = spark.read.csv(
    "data/diabetes.csv",
    header=True,
    inferSchema=True,
)

# Preview the first 10 rows.
df.show(10)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [3]:
# Print each column's name, data type and nullability for the loaded DataFrame.
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [4]:
# List the column names of the loaded DataFrame.
df.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [5]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns: every column of `df` except the label column 'Outcome'.
feature_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Pack the predictors into a single vector column named 'features',
# which is the input column the classifier is configured to read.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
output = assembler.transform(df)

# Preview the assembled feature vector next to the label.
output.select('features', 'Outcome').show(5)

+--------------------+-------+
|            features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|      1|
|[1.0,85.0,66.0,29...|      0|
|[8.0,183.0,64.0,0...|      1|
|[1.0,89.0,66.0,23...|      0|
|[0.0,137.0,40.0,3...|      1|
+--------------------+-------+
only showing top 5 rows



In [6]:
# Split the assembled data into training and test sets with 0.8 / 0.2 weights.
# randomSplit assigns rows randomly, so the resulting sizes are only
# approximately 80% / 20%. A fixed seed keeps the split -- and therefore the
# fitted model and the reported metric -- the same across
# "Restart Kernel -> Run All" executions.
(train, test) = output.randomSplit([0.8, 0.2], seed=42)

# Summary statistics of the training split.
train.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------------+------------------+-------------------+
|summary|       Pregnancies|           Glucose|     BloodPressure|     SkinThickness|           Insulin|               BMI|DiabetesPedigreeFunction|               Age|            Outcome|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------------+------------------+-------------------+
|  count|               617|               617|               617|               617|               617|               617|                     617|               617|                617|
|   mean| 3.764991896272285|121.81199351701783| 69.40680713128039|20.285251215559157| 80.12803889789303|31.861102106969224|      0.4730405186385733|33.175040518638575| 0.3419773095623987|
| stddev|3.3050836052846284| 32.03139164518505|18.5405352429

In [7]:
# Summary statistics of the test split, for comparison with the training split.
test.describe().show()

+-------+-----------------+-----------------+------------------+------------------+------------------+-----------------+------------------------+------------------+-------------------+
|summary|      Pregnancies|          Glucose|     BloodPressure|     SkinThickness|           Insulin|              BMI|DiabetesPedigreeFunction|               Age|            Outcome|
+-------+-----------------+-----------------+------------------+------------------+------------------+-----------------+------------------------+------------------+-------------------+
|  count|              151|              151|               151|               151|               151|              151|                     151|               151|                151|
|   mean|4.172185430463577|117.1456953642384| 67.87417218543047| 21.56291390728477| 78.45695364238411|32.52980132450332|      0.4671192052980132| 33.50993377483444|0.37748344370860926|
| stddev|3.614344734349457| 31.5595516129942|22.408124906161934|15.26546261

In [8]:
# Configure a decision-tree classifier on the assembled 'features' vector with
# 'Outcome' as the label, and fit it on the training split in one step.
# `dtc` is the fitted model.
dtc = DecisionTreeClassifier(
    labelCol="Outcome",
    featuresCol="features",
).fit(train)

# Score the test split and display the first rows, including the prediction
# columns added by the model.
pred = dtc.transform(test)
pred.show()


+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+-------------+--------------------+----------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|rawPrediction|         probability|prediction|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+-------------+--------------------+----------+
|          0|     84|           64|           22|     66|35.8|                   0.545| 21|      0|[0.0,84.0,64.0,22...|   [82.0,0.0]|           [1.0,0.0]|       0.0|
|          0|     86|           68|           32|      0|35.8|                   0.238| 25|      0|[0.0,86.0,68.0,32...|   [82.0,0.0]|           [1.0,0.0]|       0.0|
|          0|     95|           85|           25|     36|37.4|                   0.247| 24|      1|[0.0,95.0,85.0,25...|   [82.0,0.0]|           [1.0,0.0]|       0.0

# Model Evaluation

In [9]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# BinaryClassificationEvaluator reports area under the ROC curve by default,
# not accuracy. The metric is spelled out here and the result is named for
# what it is, so the printed number is not mistaken for the fraction of
# correct predictions.
# For classification accuracy, use MulticlassClassificationEvaluator with
# metricName="accuracy" on the 'prediction' column instead.
evaluator = BinaryClassificationEvaluator(
    labelCol="Outcome",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Area under the ROC curve on the test-set predictions.
auc = evaluator.evaluate(pred)
print(auc)

0.7215378872713698
