In [26]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql import SparkSession

# Attach to the already-running Spark session if there is one; otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [27]:
# Load the diabetes CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    path="data/diabetes.csv",
    header=True,
    inferSchema=True,
)
# Preview the first ten rows.
df.show(n=10)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [28]:
# Print the column names and the types Spark inferred when reading the CSV.
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [29]:
# List the column names; as the cell's last expression, the list is displayed as output.
df.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [30]:
from pyspark.ml.feature import VectorAssembler
# Predictor columns: every column of `df` except the 'Outcome' label.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Pack the predictors into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = assembler.transform(df)

# Preview the assembled vectors next to the label.
output.select('features', 'Outcome').show(5)

+--------------------+-------+
|            features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|      1|
|[1.0,85.0,66.0,29...|      0|
|[8.0,183.0,64.0,0...|      1|
|[1.0,89.0,66.0,23...|      0|
|[0.0,137.0,40.0,3...|      1|
+--------------------+-------+
only showing top 5 rows



In [31]:
# Split the assembled rows roughly 80/20 into a training set and a held-out test set.
# The weights are sampling probabilities, so the resulting sizes are approximate.
# A fixed seed keeps the split -- and therefore the fitted model and every metric
# computed from it -- identical across Restart & Run All.
(train, test) = output.randomSplit([0.8, 0.2], seed=42)

# Summary statistics of the training split.
train.describe().show()

+-------+------------------+------------------+-----------------+------------------+------------------+----------------+------------------------+------------------+-------------------+
|summary|       Pregnancies|           Glucose|    BloodPressure|     SkinThickness|           Insulin|             BMI|DiabetesPedigreeFunction|               Age|            Outcome|
+-------+------------------+------------------+-----------------+------------------+------------------+----------------+------------------------+------------------+-------------------+
|  count|               599|               599|              599|               599|               599|             599|                     599|               599|                599|
|   mean|3.9248747913188646| 120.8848080133556|69.36393989983306|20.402337228714526| 79.06343906510851|            32.1|      0.4707762938230384| 33.40734557595993|0.34557595993322204|
| stddev|3.4354095931817694|31.317850801915505|19.39919752770176|15.7007997

In [32]:
# Summary statistics of the held-out split, for comparison with the training split.
test.describe().show()

+-------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|summary|      Pregnancies|           Glucose|     BloodPressure|     SkinThickness|           Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|  count|              169|               169|               169|               169|               169|               169|                     169|               169|               169|
|   mean|3.562130177514793|120.92899408284023| 68.18934911242603|21.011834319526628| 82.40828402366864|31.611834319526626|      0.4757751479289942|32.650887573964496|0.3609467455621302|
| stddev|3.118300087440895|34.290525896288216|19.230423073807643|16.85

In [33]:
# Configure a decision tree on the assembled 'features' vector with 'Outcome'
# as the label and fit it on the training split in one step; `dtc` is the
# fitted model from here on.
dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Outcome").fit(train)

# Score the held-out rows and display them with the model's output columns.
pred = dtc.transform(test)
pred.show()


+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+-------------+--------------------+----------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|rawPrediction|         probability|prediction|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+-------------+--------------------+----------+
|          0|     73|            0|            0|      0|21.1|                   0.342| 25|      0|(8,[1,5,6,7],[73....|  [110.0,1.0]|[0.99099099099099...|       0.0|
|          0|     78|           88|           29|     40|36.9|                   0.434| 21|      0|[0.0,78.0,88.0,29...|  [74.0,13.0]|[0.85057471264367...|       0.0|
|          0|     93|           60|            0|      0|35.3|                   0.263| 25|      0|[0.0,93.0,60.0,0....|  [74.0,13.0]|[0.85057471264367...|       0.0

# Model Evaluation

In [34]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# BinaryClassificationEvaluator's default metric is the area under the ROC
# curve, not accuracy. Name the metric explicitly and store the score under a
# name that says what it is, so the number is not misread as the fraction of
# correct predictions.
evaluator = BinaryClassificationEvaluator(
    labelCol="Outcome",
    metricName="areaUnderROC",
)
area_under_roc = evaluator.evaluate(pred)
print(area_under_roc)

0.7937917425622344
