In [1]:
# this workbook demonstrates how to use PySparkML to do multi variable binary regression
# the setup is a bit contrived, since this is a how-to demo
# 
# we start by using a fabricated data set (see Generate_Dummy_BP_Data.ipynb). 
# this script generates a random bmi, systolic, and diastolic readings. There are
# no correlations built into the script, random uniform numbers within a range.
#
# we then 
# 1) train a regression model to predict whether tahe patient will be coded
# as hypertensive based on systolic and diastolic blood pressure
# 2) apply this model on a test data set and view the predictions
# 3) view the estimated error on the test set
# 4) view the decision tree to investigate cutoff values for various inputs
# 
# we will do this with a random forest regressor, but most of it should work with other
# various other Spark ML regressors, depending on the algorithm
# 
# docs are here: (https://spark.apache.org/docs/latest/ml-classification-regression.html)
#

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

In [4]:
sqlContext = SQLContext(sc)

In [5]:
df = sqlContext.read.format("csv").option("inferschema","true").option("header", "true").option("delimiter", ",").load("hypertensive1.csv")

In [6]:
df.show()

+---+--------+---------+------------+
|bmi|systolic|diastolic|hypertensive|
+---+--------+---------+------------+
| 22|     129|       80|           1|
| 19|      64|       99|           0|
| 28|      91|       56|           0|
| 29|     130|       49|           0|
| 20|     149|       42|           0|
| 20|      97|       76|           0|
| 27|     100|       40|           0|
| 30|     174|       71|           1|
| 22|      84|       94|           0|
| 25|     157|       85|           1|
| 19|     142|       56|           0|
| 29|      60|       74|           0|
| 25|     145|      105|           1|
| 20|     177|      106|           1|
| 20|     176|       48|           1|
| 19|     149|       84|           1|
| 19|     100|       90|           0|
| 21|      64|      104|           0|
| 25|     153|       48|           1|
| 25|     157|       46|           1|
+---+--------+---------+------------+
only showing top 20 rows



In [7]:
df.printSchema()

root
 |-- bmi: integer (nullable = true)
 |-- systolic: integer (nullable = true)
 |-- diastolic: integer (nullable = true)
 |-- hypertensive: integer (nullable = true)



In [8]:
(trainingData, testData) = df.randomSplit([0.7, 0.3])

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
vectorAssembler = VectorAssembler(inputCols = ['systolic', 'diastolic'], outputCol = 'features')
va_training = vectorAssembler.transform(trainingData)
va_training = va_training.select(['features', 'hypertensive'])
va_training.show(10)

+-----------+------------+
|   features|hypertensive|
+-----------+------------+
|[60.0,49.0]|           0|
|[60.0,57.0]|           0|
|[60.0,67.0]|           0|
|[60.0,75.0]|           0|
|[60.0,76.0]|           0|
|[60.0,77.0]|           0|
|[61.0,50.0]|           0|
|[61.0,89.0]|           0|
|[61.0,90.0]|           0|
|[61.0,99.0]|           0|
+-----------+------------+
only showing top 10 rows



In [11]:
from pyspark.ml.regression import DecisionTreeRegressor

In [12]:
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'hypertensive')

In [13]:
dt_model = dt.fit(va_training)

In [14]:
va_testing = vectorAssembler.transform(testData)
va_testing = va_testing.select(['features', 'hypertensive'])
va_testing.show(10)

+------------+------------+
|    features|hypertensive|
+------------+------------+
| [60.0,62.0]|           0|
| [60.0,73.0]|           0|
| [61.0,63.0]|           0|
| [63.0,50.0]|           0|
|[63.0,107.0]|           0|
|[63.0,110.0]|           0|
| [64.0,45.0]|           0|
| [65.0,85.0]|           0|
|[65.0,105.0]|           0|
| [66.0,59.0]|           0|
+------------+------------+
only showing top 10 rows



In [15]:
dt_predictions = dt_model.transform(va_testing)

In [16]:
dt_predictions.show(100)

+-------------+------------+--------------------+
|     features|hypertensive|          prediction|
+-------------+------------+--------------------+
|  [60.0,62.0]|           0|                 0.0|
|  [60.0,73.0]|           0|                 0.0|
|  [61.0,63.0]|           0|                 0.0|
|  [63.0,50.0]|           0|                 0.0|
| [63.0,107.0]|           0|                 0.0|
| [63.0,110.0]|           0|                 0.0|
|  [64.0,45.0]|           0|                 0.0|
|  [65.0,85.0]|           0|0.023529411764705882|
| [65.0,105.0]|           0|                 0.0|
|  [66.0,59.0]|           0|                 0.0|
|  [66.0,73.0]|           0|                 0.0|
|  [67.0,61.0]|           0|                 0.0|
|  [68.0,79.0]|           0|                 0.0|
|  [68.0,83.0]|           0|                 0.0|
|  [69.0,73.0]|           0|                 0.0|
|  [70.0,68.0]|           0|                 0.0|
|  [70.0,77.0]|           0|                 0.0|


In [17]:
evaluator = RegressionEvaluator(
        labelCol="hypertensive", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.126786


In [18]:
from pyspark.ml.regression import RandomForestRegressor

In [19]:
rf = RandomForestRegressor(featuresCol ='features', labelCol = 'hypertensive', numTrees=20)

In [20]:
rf_model = rf.fit(va_training)

In [21]:
rf_predictions = rf_model.transform(va_testing)

In [22]:
rf_predictions.show(100)

+-------------+------------+--------------------+
|     features|hypertensive|          prediction|
+-------------+------------+--------------------+
|  [60.0,62.0]|           0|0.001231304957787...|
|  [60.0,73.0]|           0|8.032466756494399E-4|
|  [61.0,63.0]|           0|0.001231304957787...|
|  [63.0,50.0]|           0|0.003027060352153...|
| [63.0,107.0]|           0|0.034087049508737324|
| [63.0,110.0]|           0|0.022697104520480516|
|  [64.0,45.0]|           0|3.466958126197276E-4|
|  [65.0,85.0]|           0| 0.02431968696922312|
| [65.0,105.0]|           0| 0.04104630232488212|
|  [66.0,59.0]|           0|                 0.0|
|  [66.0,73.0]|           0|8.032466756494399E-4|
|  [67.0,61.0]|           0|                 0.0|
|  [68.0,79.0]|           0|8.032466756494399E-4|
|  [68.0,83.0]|           0| 0.02478445912517175|
|  [69.0,73.0]|           0|8.032466756494399E-4|
|  [70.0,68.0]|           0|0.001090037495178...|
|  [70.0,77.0]|           0|8.032466756494399E-4|


In [23]:
rmse = evaluator.evaluate(rf_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.120029


In [24]:
# find the cutoff values
#
# this part was a little surprising to me. Scikit-learn has a method for this: clf.tree_.threshold[0]
# 
# unfortunately, looks like there isn't a method for this in SparkML? 
# you can see the tree by printing .toDebugString on your model and 
#
# not sure if this is really my only or best option, I've looked around 
# but haven't found a tidy api call for this...

In [25]:
rf_model.toDebugString

'RandomForestRegressionModel (uid=RandomForestRegressor_a052bd3c025d) with 20 trees\n  Tree 0 (weight 1.0):\n    If (feature 0 <= 120.5)\n     If (feature 0 <= 101.5)\n      If (feature 1 <= 101.5)\n       Predict: 0.0\n      Else (feature 1 > 101.5)\n       If (feature 1 <= 103.5)\n        If (feature 0 <= 97.5)\n         Predict: 0.0\n        Else (feature 0 > 97.5)\n         Predict: 0.875\n       Else (feature 1 > 103.5)\n        If (feature 0 <= 93.5)\n         Predict: 0.005291005291005291\n        Else (feature 0 > 93.5)\n         Predict: 0.6904761904761905\n     Else (feature 0 > 101.5)\n      If (feature 1 <= 88.5)\n       If (feature 0 <= 112.5)\n        Predict: 0.0\n       Else (feature 0 > 112.5)\n        If (feature 0 <= 116.5)\n         Predict: 0.06432748538011696\n        Else (feature 0 > 116.5)\n         Predict: 0.055900621118012424\n      Else (feature 1 > 88.5)\n       If (feature 0 <= 108.5)\n        If (feature 0 <= 105.5)\n         Predict: 0.6086956521739131\