# Logistic Regression on 'HEART DISEASE' Dataset  
Elif Cansu YILDIZ

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, countDistinct
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, MinMaxScaler, IndexToString
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [2]:
# Create the Spark session used by every cell below, or reuse one that is
# already running under the same application name.
spark = (
    SparkSession.builder
    .appName("MachineLearningExample")
    .getOrCreate()
)

The dataset used is the 'Heart Disease' dataset from Kaggle. You can download it from this [link](https://www.kaggle.com/ronitf/heart-disease-uci).

In [3]:
# Load the Kaggle heart-disease CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
df = spark.read.csv(
    'datasets/heart.csv',
    header=True,
    inferSchema=True,
)

# Show the inferred schema, then preview the first five rows.
df.printSchema()
df.show(5)

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|

__HOW MANY DISTINCT VALUES DOES EACH COLUMN HAVE?__

In [4]:
# Build one countDistinct aggregate per column, aliased back to the column's
# own name so the single result row lines up under the original headers.
distinct_counts = [countDistinct(col(name)).alias(name) for name in df.columns]
df.agg(*distinct_counts).show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 41|  2|  4|      49| 152|  2|      3|     91|    2|     40|    3|  5|   4|     2|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



__SET the Label Column and Input Columns__

In [5]:
# Column the classifier is trained to predict.
# NOTE(review): the CSV also contains a "target" column; confirm that "thal"
# is the intended label for this experiment.
labelColumn = "thal"

# Every other column of the DataFrame is used as a model input.
input_columns = [name for name, _dtype in df.dtypes if name != labelColumn]

In [6]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed is passed so that re-running the notebook on the same input
# yields the same split, and therefore comparable metrics, instead of a new
# random partition on every execution.
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=42)

# Report the sizes; randomSplit weights are approximate, so the train/test
# counts will be close to, but not exactly, 70% / 30% of the total.
print("total data count: ", df.count())
print("train data count: ", trainingData.count())
print("test data count: ", testData.count())

total data count:  303
train data count:  218
test data count:  85


__TRAINING__

In [7]:
# Pack all input columns into the single 'features' vector column that the
# classifier reads.
assembler = VectorAssembler(inputCols=input_columns, outputCol='features')

# Logistic regression with elastic-net regularisation, capped at 10 iterations.
lr = LogisticRegression(
    featuresCol='features',
    labelCol=labelColumn,
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Chain feature assembly and the classifier, then fit both on the training split.
stages = [assembler, lr]
partialPipeline = Pipeline(stages=stages)
model = partialPipeline.fit(trainingData)

__MAKE PREDICTIONS__

In [8]:
# Score the held-out rows with the fitted pipeline.
predictions = model.transform(testData)

# Keep only the model outputs plus the true label, renamed to "label" so the
# evaluation cells can refer to it by a fixed name.
predictionss = predictions.select(
    "probability",
    "rawPrediction",
    "prediction",
    col(labelColumn).alias("label"),
)

# Preview a few rows without truncating the probability vectors.
predictionss.select("probability", "prediction", "label").show(5, truncate=False)

+--------------------------------------------------------------------------------+----------+-----+
|probability                                                                     |prediction|label|
+--------------------------------------------------------------------------------+----------+-----+
|[0.011082788245690223,0.05729867172540959,0.5740584251416755,0.3575601148872248]|2.0       |2    |
|[0.011082788245690223,0.05729867172540959,0.5740584251416755,0.3575601148872248]|2.0       |3    |
|[0.011082788245690223,0.05729867172540959,0.5740584251416755,0.3575601148872248]|2.0       |2    |
|[0.011082788245690223,0.05729867172540959,0.5740584251416755,0.3575601148872248]|2.0       |2    |
|[0.012875234771605678,0.06656572644096996,0.5051698495258184,0.4153891892616059]|2.0       |3    |
+--------------------------------------------------------------------------------+----------+-----+
only showing top 5 rows



__EVALUATION for Binary Classification__

In [None]:
# Area-under-curve metrics for a two-class problem.
# NOTE(review): these numbers are only meaningful when "label" has exactly two
# classes; confirm the chosen label column before relying on them.
#
# The evaluator is given the model's raw scores ("rawPrediction") rather than
# the hard class in "prediction": scoring the hard class collapses the curve
# to a single operating point and understates what the model can rank.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
areaUnderROC = evaluator.evaluate(predictionss)
print("Area under ROC = %g" % areaUnderROC)

evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderPR")
areaUnderPR = evaluator.evaluate(predictionss)
print("areaUnderPR = %g" % areaUnderPR)

__EVALUATION for Multiclass Classification__

In [10]:
# A single evaluator is reused for all four multiclass metrics; only the
# metric name is switched between evaluations (setMetricName returns the
# evaluator itself, so the calls can be chained).
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Fraction of test rows whose predicted class equals the true label.
accuracy = evaluator.setMetricName("accuracy").evaluate(predictionss)
print("accuracy = %g" % accuracy)

f1 = evaluator.setMetricName("f1").evaluate(predictionss)
print("f1 = %g" % f1)

weightedPrecision = evaluator.setMetricName("weightedPrecision").evaluate(predictionss)
print("weightedPrecision = %g" % weightedPrecision)

weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(predictionss)
print("weightedRecall = %g" % weightedRecall)

accuracy = 0.564706
f1 = 0.407607
weightedPrecision = 0.318893
weightedRecall = 0.564706
