In [2]:
# Load the training CSV from HDFS: the first row is treated as the header
# and column types are inferred from the data.
df = spark.read.csv(
    'hdfs://orion11:25000/train_2v.csv',
    header=True,
    inferSchema=True,
)

In [4]:
%%time
df.createOrReplaceTempView("TEMP_DF")
stroke = spark.sql("SELECT distinct(stroke) FROM TEMP_DF").collect()
print(stroke)

[Row(stroke=1), Row(stroke=0)]
CPU times: user 6.02 ms, sys: 1 ms, total: 7.03 ms
Wall time: 3.88 s


In [5]:
%%time
df.createOrReplaceTempView("TEMP_DF")
spark.sql("SELECT distinct(stroke) FROM TEMP_DF").show()


+------+
|stroke|
+------+
|     1|
|     0|
+------+

CPU times: user 1.36 ms, sys: 1.09 ms, total: 2.45 ms
Wall time: 2.65 s


In [36]:
%%time
#Find the occupation which is most vulernale to the stroke
spark.sql("select work_type, count(*) as stroke_patients from TEMP_DF where stroke = 1 group by work_type order by count(*) desc").show()


+-------------+---------------+
|    work_type|stroke_patients|
+-------------+---------------+
|      Private|            441|
|Self-employed|            251|
|     Govt_job|             89|
|     children|              2|
+-------------+---------------+

None
CPU times: user 2.61 ms, sys: 2.88 ms, total: 5.48 ms
Wall time: 859 ms


In [6]:
# Factors affecting stroke -- gender: row count per gender and its percentage
# share of all rows (the window sum over() totals the per-gender counts).
# Note that this query does not filter on stroke; it describes the whole view.
gender_share_query = (
    "SELECT gender, count(gender) as count, "
    "count(gender)*100/sum(count(gender)) over() as percent "
    "FROM TEMP_DF "
    "GROUP BY gender"
)
spark.sql(gender_share_query).show()

+------+-----+-------------------+
|gender|count|            percent|
+------+-----+-------------------+
|Female|25665|  59.13594470046083|
| Other|   11|0.02534562211981567|
|  Male|17724|  40.83870967741935|
+------+-----+-------------------+



In [7]:
# Age as a factor: bucket every row into '0-25', '26-50' or '>50' and count
# the rows per bucket, largest bucket first.
age_range_query = "select case when age <= 25 then '0-25' else case when age <=50 then '26-50' else '>50' end end as age_range, \
             count(*) as count from TEMP_DF group by 1 order by count desc"
spark.sql(age_range_query).show()

+---------+-----+
|age_range|count|
+---------+-----+
|      >50|17202|
|    26-50|14652|
|     0-25|11546|
+---------+-----+



In [8]:
# Fill in missing values.
# Missing smoking_status entries become the explicit category 'No Info'.
train_f = df.na.fill('No Info', subset=['smoking_status'])
# Missing bmi entries are replaced with the mean of the bmi column.
from pyspark.sql.functions import mean
# Keep the aggregate result under its own name: rebinding `mean` to the
# collected rows would shadow the imported function, so any later call to
# mean(...) would fail with "'list' object is not callable".
bmi_mean_rows = train_f.select(mean(train_f['bmi'])).collect()
# The aggregate yields a single row with a single column: the mean itself.
mean_bmi = bmi_mean_rows[0][0]
train_f = train_f.na.fill(mean_bmi, ['bmi'])

In [9]:
from pyspark.ml.feature import (VectorAssembler,OneHotEncoder,
                                StringIndexer)
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [11]:
# gender: index the string labels, then one-hot encode the index.
gender_indexer = StringIndexer(
    inputCol='gender',
    outputCol='genderIndex',
)
gender_encoder = OneHotEncoder(
    inputCol='genderIndex',
    outputCol='genderVec',
)

In [12]:
# ever_married: index the string labels, then one-hot encode the index.
ever_married_indexer = StringIndexer(
    inputCol='ever_married',
    outputCol='ever_marriedIndex',
)
ever_married_encoder = OneHotEncoder(
    inputCol='ever_marriedIndex',
    outputCol='ever_marriedVec',
)

In [13]:
# work_type: index the string labels, then one-hot encode the index.
work_type_indexer = StringIndexer(
    inputCol='work_type',
    outputCol='work_typeIndex',
)
work_type_encoder = OneHotEncoder(
    inputCol='work_typeIndex',
    outputCol='work_typeVec',
)

In [14]:
# Residence_type: index the string labels, then one-hot encode the index.
Residence_type_indexer = StringIndexer(
    inputCol='Residence_type',
    outputCol='Residence_typeIndex',
)
Residence_type_encoder = OneHotEncoder(
    inputCol='Residence_typeIndex',
    outputCol='Residence_typeVec',
)

In [15]:
# smoking_status: index the string labels, then one-hot encode the index.
smoking_status_indexer = StringIndexer(
    inputCol='smoking_status',
    outputCol='smoking_statusIndex',
)
smoking_status_encoder = OneHotEncoder(
    inputCol='smoking_statusIndex',
    outputCol='smoking_statusVec',
)

In [16]:
# Combine the one-hot encoded categorical vectors and the numeric columns
# into the single 'features' vector column that the classifier reads.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'genderVec',
        'age',
        'hypertension',
        'heart_disease',
        'ever_marriedVec',
        'work_typeVec',
        'Residence_typeVec',
        'avg_glucose_level',
        'bmi',
        'smoking_statusVec',
    ],
)

In [17]:
# Decision tree that predicts the 'stroke' label from the assembled 'features' vector.
dtc = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='stroke',
)

In [18]:
# Stage order matters: all indexers run first, then the encoders that read
# the index columns, then the assembler, and finally the classifier.
pipeline = Pipeline(stages=[
    # string label -> numeric index
    gender_indexer,
    ever_married_indexer,
    work_type_indexer,
    Residence_type_indexer,
    smoking_status_indexer,
    # numeric index -> one-hot vector
    gender_encoder,
    ever_married_encoder,
    work_type_encoder,
    Residence_type_encoder,
    smoking_status_encoder,
    # feature vector assembly and the model itself
    assembler,
    dtc,
])

In [19]:
# 80/20 train/test split. A fixed seed keeps the split, and therefore the
# reported metrics, the same across re-runs of the notebook.
# NOTE(review): the split is only repeatable while the input partitioning and
# row order stay stable -- confirm for this HDFS source.
train_data, test_data = train_f.randomSplit([0.8, 0.2], seed=42)

In [20]:
# Fit every pipeline stage (indexers, encoders, assembler, decision tree) on the training split.
model = pipeline.fit(train_data)

In [21]:
# Run the fitted pipeline over the held-out split; the evaluator cell reads its 'prediction' column.
dtc_predictions = model.transform(test_data)

In [22]:
# Score the held-out predictions by plain accuracy against the 'stroke' label.
# NOTE(review): by the counts shown earlier in this notebook only 783 of
# 43,400 rows have stroke = 1, so roughly 98.2% accuracy is reachable by
# always predicting 0 -- confirm model quality with recall / AUC as well.
acc_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="stroke",
    predictionCol="prediction",
)
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
accuracy_report = 'Accuracy using decision tree is: {0:2.2f}%'.format(dtc_acc*100)
print(accuracy_report)

Accuracy using decision tree is: 98.28%
