In [236]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier

In [237]:
spark = SparkSession.builder.appName("Stroke").config("spark.sql.debug", "true").getOrCreate()

In [238]:
df = spark.read.csv('healthcare-dataset-stroke-data.csv', header = True, inferSchema=True)

In [239]:
df.show(5)

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female|79.0|           1|            0|         

In [240]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [241]:
df_prediction = df.select('gender','age','hypertension','heart_disease','Residence_type','avg_glucose_level','bmi','smoking_status','stroke')

In [242]:
df_prediction.show(5)

+------+----+------------+-------------+--------------+-----------------+----+---------------+------+
|gender| age|hypertension|heart_disease|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+------+----+------------+-------------+--------------+-----------------+----+---------------+------+
|  Male|67.0|           0|            1|         Urban|           228.69|36.6|formerly smoked|     1|
|Female|61.0|           0|            0|         Rural|           202.21| N/A|   never smoked|     1|
|  Male|80.0|           0|            1|         Rural|           105.92|32.5|   never smoked|     1|
|Female|49.0|           0|            0|         Urban|           171.23|34.4|         smokes|     1|
|Female|79.0|           1|            0|         Rural|           174.12|  24|   never smoked|     1|
+------+----+------------+-------------+--------------+-----------------+----+---------------+------+
only showing top 5 rows



In [243]:
df_prediction.dtypes

[('gender', 'string'),
 ('age', 'double'),
 ('hypertension', 'int'),
 ('heart_disease', 'int'),
 ('Residence_type', 'string'),
 ('avg_glucose_level', 'double'),
 ('bmi', 'string'),
 ('smoking_status', 'string'),
 ('stroke', 'int')]

In [244]:
df_prediction.groupBy('smoking_status').count().show()

+---------------+-----+
| smoking_status|count|
+---------------+-----+
|         smokes|  789|
|        Unknown| 1544|
|   never smoked| 1892|
|formerly smoked|  885|
+---------------+-----+



In [245]:
df_prediction.groupBy(['gender','Residence_type','smoking_status']).count().show()

+------+--------------+---------------+-----+
|gender|Residence_type| smoking_status|count|
+------+--------------+---------------+-----+
|Female|         Urban|         smokes|  243|
|Female|         Rural|formerly smoked|  227|
|  Male|         Urban|         smokes|  183|
|Female|         Urban|   never smoked|  618|
|Female|         Rural|   never smoked|  611|
|Female|         Urban|        Unknown|  418|
|  Male|         Rural|formerly smoked|  200|
|Female|         Rural|         smokes|  209|
| Other|         Rural|formerly smoked|    1|
|  Male|         Urban|        Unknown|  364|
|  Male|         Rural|   never smoked|  350|
|Female|         Urban|formerly smoked|  250|
|  Male|         Urban|   never smoked|  313|
|  Male|         Rural|         smokes|  154|
|Female|         Rural|        Unknown|  418|
|  Male|         Rural|        Unknown|  344|
|  Male|         Urban|formerly smoked|  207|
+------+--------------+---------------+-----+



In [246]:
df_prediction.select('smoking_status').distinct().show()

+---------------+
| smoking_status|
+---------------+
|         smokes|
|        Unknown|
|   never smoked|
|formerly smoked|
+---------------+



In [247]:
genderEncoder = StringIndexer(inputCols=['gender','Residence_type','smoking_status'], outputCols=['indexer_gender','indexer_Residence_type','indexer_smoking_status']).fit(df_prediction)

In [248]:
df_prediction_indexer = genderEncoder.transform(df_prediction)

In [249]:
df_prediction_indexer.show(5)

+------+----+------------+-------------+--------------+-----------------+----+---------------+------+--------------+----------------------+----------------------+
|gender| age|hypertension|heart_disease|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|indexer_gender|indexer_Residence_type|indexer_smoking_status|
+------+----+------------+-------------+--------------+-----------------+----+---------------+------+--------------+----------------------+----------------------+
|  Male|67.0|           0|            1|         Urban|           228.69|36.6|formerly smoked|     1|           1.0|                   0.0|                   2.0|
|Female|61.0|           0|            0|         Rural|           202.21| N/A|   never smoked|     1|           0.0|                   1.0|                   0.0|
|  Male|80.0|           0|            1|         Rural|           105.92|32.5|   never smoked|     1|           1.0|                   1.0|                   0.0|
|Female|49.0|         

In [250]:
df = df_prediction_indexer.select('indexer_gender','age','hypertension','heart_disease','indexer_Residence_type','avg_glucose_level','bmi','indexer_smoking_status','stroke')

In [251]:
df.show(5)

+--------------+----+------------+-------------+----------------------+-----------------+----+----------------------+------+
|indexer_gender| age|hypertension|heart_disease|indexer_Residence_type|avg_glucose_level| bmi|indexer_smoking_status|stroke|
+--------------+----+------------+-------------+----------------------+-----------------+----+----------------------+------+
|           1.0|67.0|           0|            1|                   0.0|           228.69|36.6|                   2.0|     1|
|           0.0|61.0|           0|            0|                   1.0|           202.21| N/A|                   0.0|     1|
|           1.0|80.0|           0|            1|                   1.0|           105.92|32.5|                   0.0|     1|
|           0.0|49.0|           0|            0|                   0.0|           171.23|34.4|                   3.0|     1|
|           0.0|79.0|           1|            0|                   1.0|           174.12|  24|                   0.0|     1|


In [252]:
df.dtypes

[('indexer_gender', 'double'),
 ('age', 'double'),
 ('hypertension', 'int'),
 ('heart_disease', 'int'),
 ('indexer_Residence_type', 'double'),
 ('avg_glucose_level', 'double'),
 ('bmi', 'string'),
 ('indexer_smoking_status', 'double'),
 ('stroke', 'int')]

In [253]:
from pyspark.sql.functions import col

df_na = df.filter(col("BMI") == "N/A")
df_na.groupBy('BMI').count().show()

+---+-----+
|BMI|count|
+---+-----+
|N/A|  201|
+---+-----+



In [254]:
df = df.where(col("BMI") != "N/A")
df.show(5)

+--------------+----+------------+-------------+----------------------+-----------------+----+----------------------+------+
|indexer_gender| age|hypertension|heart_disease|indexer_Residence_type|avg_glucose_level| bmi|indexer_smoking_status|stroke|
+--------------+----+------------+-------------+----------------------+-----------------+----+----------------------+------+
|           1.0|67.0|           0|            1|                   0.0|           228.69|36.6|                   2.0|     1|
|           1.0|80.0|           0|            1|                   1.0|           105.92|32.5|                   0.0|     1|
|           0.0|49.0|           0|            0|                   0.0|           171.23|34.4|                   3.0|     1|
|           0.0|79.0|           1|            0|                   1.0|           174.12|  24|                   0.0|     1|
|           1.0|81.0|           0|            0|                   0.0|           186.21|  29|                   2.0|     1|


In [255]:
# df = df.na.replace("N/A", "0", subset=["bmi"]) 

In [256]:
from pyspark.sql.types import DoubleType
df = df.withColumn("bmi", col("bmi").cast(DoubleType()))

In [257]:
df.printSchema()

root
 |-- indexer_gender: double (nullable = false)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- indexer_Residence_type: double (nullable = false)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- indexer_smoking_status: double (nullable = false)
 |-- stroke: integer (nullable = true)



In [258]:
columns = df.columns
columns.remove('stroke')
VectorAss = VectorAssembler(inputCols=columns, outputCol='features')
vec = VectorAss.transform(df)

In [259]:
vec.show(3)

+--------------+----+------------+-------------+----------------------+-----------------+----+----------------------+------+--------------------+
|indexer_gender| age|hypertension|heart_disease|indexer_Residence_type|avg_glucose_level| bmi|indexer_smoking_status|stroke|            features|
+--------------+----+------------+-------------+----------------------+-----------------+----+----------------------+------+--------------------+
|           1.0|67.0|           0|            1|                   0.0|           228.69|36.6|                   2.0|     1|[1.0,67.0,0.0,1.0...|
|           1.0|80.0|           0|            1|                   1.0|           105.92|32.5|                   0.0|     1|[1.0,80.0,0.0,1.0...|
|           0.0|49.0|           0|            0|                   0.0|           171.23|34.4|                   3.0|     1|(8,[1,5,6,7],[49....|
+--------------+----+------------+-------------+----------------------+-----------------+----+----------------------+------+

In [260]:
vec.select('features').take(3)

[Row(features=DenseVector([1.0, 67.0, 0.0, 1.0, 0.0, 228.69, 36.6, 2.0])),
 Row(features=DenseVector([1.0, 80.0, 0.0, 1.0, 1.0, 105.92, 32.5, 0.0])),
 Row(features=SparseVector(8, {1: 49.0, 5: 171.23, 6: 34.4, 7: 3.0}))]

In [261]:
scaler = StandardScaler(inputCol='features', outputCol='scaler_features', withMean=True, withStd=True)
scaler = scaler.fit(vec)
df_final = scaler.transform(vec)

In [262]:
df_final = df_final.select('scaler_features', 'stroke')

In [263]:
df_final.show(5)

+--------------------+------+
|     scaler_features|stroke|
+--------------------+------+
|[1.19830604849670...|     1|
|[1.19830604849670...|     1|
|[-0.8329385620248...|     1|
|[-0.8329385620248...|     1|
|[1.19830604849670...|     1|
+--------------------+------+
only showing top 5 rows



In [264]:
df_final.take(3)

[Row(scaler_features=DenseVector([1.1983, 1.07, -0.318, 4.3815, -0.9855, 2.7774, 0.9812, 0.8491]), stroke=1),
 Row(scaler_features=DenseVector([1.1983, 1.6464, -0.318, 4.3815, 1.0145, 0.0138, 0.4592, -1.0243]), stroke=1),
 Row(scaler_features=DenseVector([-0.8329, 0.272, -0.318, -0.2282, -0.9855, 1.484, 0.7011, 1.7858]), stroke=1)]

In [265]:
train, test = df_final.randomSplit([0.7,0.3])
train.count()

3434

In [266]:
regre = LogisticRegression(featuresCol='scaler_features', labelCol='stroke')
regre_md = regre.fit(train)
y_pred = regre_md.transform(test)


In [267]:
y_pred.show(5)

+--------------------+------+--------------------+--------------------+----------+
|     scaler_features|stroke|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|[-0.8329385620248...|     0|[7.19580050297836...|[0.99925083398471...|       0.0|
|[-0.8329385620248...|     0|[7.29740998159318...|[0.99932316771137...|       0.0|
|[-0.8329385620248...|     0|[7.47248937714034...|[0.99943181172658...|       0.0|
|[-0.8329385620248...|     0|[7.40308972912592...|[0.99939100403178...|       0.0|
|[-0.8329385620248...|     0|[7.44163198130407...|[0.99941401603780...|       0.0|
+--------------------+------+--------------------+--------------------+----------+
only showing top 5 rows



In [268]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

mul_ev = MulticlassClassificationEvaluator(labelCol='stroke', metricName='accuracy')
mul_ev.evaluate(y_pred)

0.9613559322033899