In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

spark = SparkSession.builder.appName('my app').master('local').getOrCreate()
sc = spark.sparkContext

22/10/27 13:09:58 WARN Utils: Your hostname, bagdoyeong-ui-MacBookAir.local resolves to a loopback address: 127.0.0.1; using 192.168.0.13 instead (on interface en0)
22/10/27 13:09:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/27 13:09:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

from pyspark.ml.classification import LogisticRegression

In [3]:
raw_data = spark.read.format('csv').option('header', 'true').load('./input_week08/diabetes.csv')

                                                                                

In [4]:
raw_data.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [35]:
schema = StructType([
    StructField('Pregnancies', FloatType(), True),
    StructField('Glucose', FloatType(), True),
    StructField('BloodPressure', FloatType(), True),
    StructField('SkinThickness', FloatType(), True),
    StructField('Insulin', FloatType(), True),
    StructField('BMI', FloatType(), True),
    StructField('DiabetesPedigreeFunction', FloatType(), True),
    StructField('Age', FloatType(), True),
    StructField('Outcome', FloatType(), True)
])

raw_data = spark.read.format('csv').option('header', 'true').schema(schema).load('diabetes.csv')
raw_data.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



In [36]:
raw_data.describe('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI').show()

+-------+-----------------+------------------+------------------+------------------+-----------------+
|summary|          Glucose|     BloodPressure|     SkinThickness|           Insulin|              BMI|
+-------+-----------------+------------------+------------------+------------------+-----------------+
|  count|              768|               768|               768|               768|              768|
|   mean|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.99257813890775|
| stddev|31.97261819513622|19.355807170644777|15.952217567727642|115.24400235133803|7.884160293010772|
|    min|              0.0|               0.0|               0.0|               0.0|              0.0|
|    max|            199.0|             122.0|              99.0|             846.0|             67.1|
+-------+-----------------+------------------+------------------+------------------+-----------------+



In [37]:
raw_data.select('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI').summary().show()

+-------+-----------------+------------------+------------------+------------------+-----------------+
|summary|          Glucose|     BloodPressure|     SkinThickness|           Insulin|              BMI|
+-------+-----------------+------------------+------------------+------------------+-----------------+
|  count|              768|               768|               768|               768|              768|
|   mean|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.99257813890775|
| stddev|31.97261819513622|19.355807170644777|15.952217567727642|115.24400235133803|7.884160293010772|
|    min|              0.0|               0.0|               0.0|               0.0|              0.0|
|    25%|             99.0|              62.0|               0.0|               0.0|             27.3|
|    50%|            117.0|              72.0|              23.0|              29.0|             32.0|
|    75%|            140.0|              80.0|              32.0|        

In [38]:
raw_data.rdd.map(
    lambda row: 1 if sum([c==None for c in row]) > 0 else 0
).reduce(
    lambda x, y: x+y
)

0

In [39]:
prep_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

raw_data.rdd.map(
    lambda row: 1 if sum([row[c]==0 for c in prep_cols]) > 0 else 0
).reduce(
    lambda x, y : x + y
)

376

In [40]:
for c in prep_cols:
    raw_data = raw_data.withColumn(c, fn.when(fn.col(c)==0, None).otherwise(fn.col(c)))

raw_data.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
|        6.0|  148.0|         72.0|         35.0|   null|33.6|                   0.627|50.0|    1.0|
|        1.0|   85.0|         66.0|         29.0|   null|26.6|                   0.351|31.0|    0.0|
|        8.0|  183.0|         64.0|         null|   null|23.3|                   0.672|32.0|    1.0|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
only showing top 5 rows



In [41]:
raw_data.select(*[
    (fn.count(c) / fn.count('*')).alias(c + '_missing') for c in prep_cols
]).show()

+------------------+---------------------+---------------------+------------------+------------------+
|   Glucose_missing|BloodPressure_missing|SkinThickness_missing|   Insulin_missing|       BMI_missing|
+------------------+---------------------+---------------------+------------------+------------------+
|0.9934895833333334|   0.9544270833333334|   0.7044270833333334|0.5130208333333334|0.9856770833333334|
+------------------+---------------------+---------------------+------------------+------------------+



In [42]:
imputer = Imputer(inputCols=prep_cols, outputCols=prep_cols)
model = imputer.fit(raw_data)
raw_data = model.transform(raw_data)
raw_data.show(5)

+-----------+-------+-------------+-------------+---------+----+------------------------+----+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|  Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|
+-----------+-------+-------------+-------------+---------+----+------------------------+----+-------+
|        6.0|  148.0|         72.0|         35.0|155.54822|33.6|                   0.627|50.0|    1.0|
|        1.0|   85.0|         66.0|         29.0|155.54822|26.6|                   0.351|31.0|    0.0|
|        8.0|  183.0|         64.0|     29.15342|155.54822|23.3|                   0.672|32.0|    1.0|
|        1.0|   89.0|         66.0|         23.0|     94.0|28.1|                   0.167|21.0|    0.0|
|        0.0|  137.0|         40.0|         35.0|    168.0|43.1|                   2.288|33.0|    1.0|
+-----------+-------+-------------+-------------+---------+----+------------------------+----+-------+
only showing top 5 rows



In [43]:
cols = raw_data.columns
cols.remove('Outcome')

In [44]:
assembler = VectorAssembler(inputCols=cols, outputCol='features')
raw_data = assembler.transform(raw_data)
raw_data.select('features').show(5, truncate=False)

+-----------------------------------------------------------------------------------------------+
|features                                                                                       |
+-----------------------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,155.5482177734375,33.599998474121094,0.6269999742507935,50.0]             |
|[1.0,85.0,66.0,29.0,155.5482177734375,26.600000381469727,0.35100001096725464,31.0]             |
|[8.0,183.0,64.0,29.153419494628906,155.5482177734375,23.299999237060547,0.671999990940094,32.0]|
|[1.0,89.0,66.0,23.0,94.0,28.100000381469727,0.16699999570846558,21.0]                          |
|[0.0,137.0,40.0,35.0,168.0,43.099998474121094,2.2880001068115234,33.0]                         |
+-----------------------------------------------------------------------------------------------+
only showing top 5 rows



In [46]:
standardscaler = StandardScaler(
    inputCol='features', outputCol='scaled_features', withStd=True, withMean=False
)

raw_data = standardscaler.fit(raw_data).transform(raw_data)
raw_data.select('features', 'scaled_features').show(5)

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[6.0,148.0,72.0,3...|[1.78063837321943...|
|[1.0,85.0,66.0,29...|[0.29677306220323...|
|[8.0,183.0,64.0,2...|[2.37418449762590...|
|[1.0,89.0,66.0,23...|[0.29677306220323...|
|[0.0,137.0,40.0,3...|[0.0,4.5012560836...|
+--------------------+--------------------+
only showing top 5 rows



In [47]:
train, test = raw_data.randomSplit([0.8, 0.2], seed=37)

print(train.count())
print(test.count())

613
155


In [49]:
lr = LogisticRegression(
    labelCol='Outcome',
    featuresCol='scaled_features',
    maxIter=10
)

model = lr.fit(train)
predict_train = model.transform(train)
predict_test = model.transform(test)
predict_test.select('Outcome', 'prediction').show(10)

22/10/20 15:39:54 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/10/20 15:39:54 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
+-------+----------+
|Outcome|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
+-------+----------+
only showing top 10 rows



In [50]:
predict_test.select('Outcome', 'rawPrediction', 'probability', 'prediction').show(10, truncate=False)

+-------+----------------------------------------+-----------------------------------------+----------+
|Outcome|rawPrediction                           |probability                              |prediction|
+-------+----------------------------------------+-----------------------------------------+----------+
|0.0    |[2.6606675575489014,-2.6606675575489014]|[0.9346654434423237,0.06533455655767628] |0.0       |
|0.0    |[1.9517259190911238,-1.9517259190911238]|[0.8756347138025965,0.1243652861974035]  |0.0       |
|0.0    |[2.576622282928418,-2.576622282928418]  |[0.9293417905959838,0.07065820940401624] |0.0       |
|0.0    |[1.2043488247289247,-1.2043488247289247]|[0.7692975116571915,0.2307024883428085]  |0.0       |
|0.0    |[2.997123226896999,-2.997123226896999]  |[0.9524439944863868,0.047556005513613164]|0.0       |
|0.0    |[1.1803281424381549,-1.1803281424381549]|[0.7650067995177674,0.2349932004822326]  |0.0       |
|0.0    |[3.396095324364234,-3.396095324364234]  |[0.96758228144