In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification  import NaiveBayes
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, VectorIndexer
from pyspark.sql.functions import *

In [2]:
spark=SparkSession.builder.appName('SparkNB').getOrCreate()

In [3]:
df = spark.read.csv('D:/KULIAH/no_show_appointments.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- NomorPasien: double (nullable = true)
 |-- NomorAppointment: integer (nullable = true)
 |-- JenisKelamin: string (nullable = true)
 |-- TanggalPembuatan: timestamp (nullable = true)
 |-- TanggalAppointment: timestamp (nullable = true)
 |-- Umur: integer (nullable = true)
 |-- Daerah: string (nullable = true)
 |-- Beasiswa: integer (nullable = true)
 |-- DarahTinggi: integer (nullable = true)
 |-- Diabetes: integer (nullable = true)
 |-- Alkohol: integer (nullable = true)
 |-- Disabled: integer (nullable = true)
 |-- SMS_received: integer (nullable = true)
 |-- No-show: string (nullable = true)



In [4]:
stringIndex = [StringIndexer(inputCol='JenisKelamin', outputCol='Gender'), StringIndexer(inputCol='No-show', outputCol='Label')]
pipeline = Pipeline(stages=stringIndex) 

df = pipeline.fit(df).transform(df)
df.show(5)

+-----------+----------------+------------+-------------------+-------------------+----+----------------+--------+-----------+--------+-------+--------+------------+-------+------+-----+
|NomorPasien|NomorAppointment|JenisKelamin|   TanggalPembuatan| TanggalAppointment|Umur|          Daerah|Beasiswa|DarahTinggi|Diabetes|Alkohol|Disabled|SMS_received|No-show|Gender|Label|
+-----------+----------------+------------+-------------------+-------------------+----+----------------+--------+-----------+--------+-------+--------+------------+-------+------+-----+
| 2.14311E13|         5611376|           F|2016-04-25 14:08:41|2016-05-25 07:00:00|  43|     Bidara Cina|       0|          0|       0|      0|       0|           1|     No|   0.0|  0.0|
| 2.83935E13|         5629135|           M|2016-04-27 19:50:39|2016-04-29 07:00:00|  78| Harapan Mulia  |       0|          1|       1|      0|       0|           0|     No|   1.0|  0.0|
| 3.48758E13|         5686183|           M|2016-05-11 19:18:24|20

In [5]:
featureAssembler = VectorAssembler(inputCols=['DarahTinggi','Diabetes','Alkohol','SMS_received','Gender'], outputCol='Features')
df2 = featureAssembler.transform(df)

In [6]:
df3 = df2.select(['Features','Label'])
df3.show(13)

+--------------------+-----+
|            Features|Label|
+--------------------+-----+
|       (5,[3],[1.0])|  0.0|
|[1.0,1.0,0.0,0.0,...|  0.0|
| (5,[0,4],[1.0,1.0])|  0.0|
|           (5,[],[])|  0.0|
|           (5,[],[])|  0.0|
| (5,[3,4],[1.0,1.0])|  1.0|
|       (5,[3],[1.0])|  0.0|
|       (5,[0],[1.0])|  0.0|
|       (5,[2],[1.0])|  0.0|
|           (5,[],[])|  0.0|
|           (5,[],[])|  0.0|
|       (5,[0],[1.0])|  0.0|
|       (5,[4],[1.0])|  1.0|
+--------------------+-----+
only showing top 13 rows



In [7]:
scaler = StandardScaler(inputCol='Features', outputCol='Scaler', withStd=True, withMean=False)
df3 = scaler.fit(df3).transform(df3)
df3.show()

+--------------------+-----+--------------------+
|            Features|Label|              Scaler|
+--------------------+-----+--------------------+
|       (5,[3],[1.0])|  0.0|(5,[3],[2.1419113...|
|[1.0,1.0,0.0,0.0,...|  0.0|[2.51305942778134...|
| (5,[0,4],[1.0,1.0])|  0.0|(5,[0,4],[2.51305...|
|           (5,[],[])|  0.0|           (5,[],[])|
|           (5,[],[])|  0.0|           (5,[],[])|
| (5,[3,4],[1.0,1.0])|  1.0|(5,[3,4],[2.14191...|
|       (5,[3],[1.0])|  0.0|(5,[3],[2.1419113...|
|       (5,[0],[1.0])|  0.0|(5,[0],[2.5130594...|
|       (5,[2],[1.0])|  0.0|(5,[2],[5.8246018...|
|           (5,[],[])|  0.0|           (5,[],[])|
|           (5,[],[])|  0.0|           (5,[],[])|
|       (5,[0],[1.0])|  0.0|(5,[0],[2.5130594...|
|       (5,[4],[1.0])|  1.0|(5,[4],[2.0965282...|
| (5,[3,4],[1.0,1.0])|  0.0|(5,[3,4],[2.14191...|
| (5,[0,4],[1.0,1.0])|  0.0|(5,[0,4],[2.51305...|
|           (5,[],[])|  1.0|           (5,[],[])|
|       (5,[4],[1.0])|  0.0|(5,[4],[2.0965282...|


In [8]:
# Split the data into training and test sets (30% held out for testing)
trainingData, testData = df3.randomSplit([0.7, 0.3], seed=1000)

### Naive Bayesian Classifier

In [9]:
nb = NaiveBayes(featuresCol='Scaler', labelCol='Label', smoothing=1.0)

In [10]:
nbmodel = nb.fit(trainingData)

In [11]:
predictions = nbmodel.transform(testData)
predictions.show(5, True)

+---------+-----+---------+--------------------+--------------------+----------+
| Features|Label|   Scaler|       rawPrediction|         probability|prediction|
+---------+-----+---------+--------------------+--------------------+----------+
|(5,[],[])|  0.0|(5,[],[])|[-0.2250571610265...|[0.79847057605713...|       0.0|
|(5,[],[])|  0.0|(5,[],[])|[-0.2250571610265...|[0.79847057605713...|       0.0|
|(5,[],[])|  0.0|(5,[],[])|[-0.2250571610265...|[0.79847057605713...|       0.0|
|(5,[],[])|  0.0|(5,[],[])|[-0.2250571610265...|[0.79847057605713...|       0.0|
|(5,[],[])|  0.0|(5,[],[])|[-0.2250571610265...|[0.79847057605713...|       0.0|
+---------+-----+---------+--------------------+--------------------+----------+
only showing top 5 rows



In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = BinaryClassificationEvaluator(labelCol="Label", rawPredictionCol='rawPrediction')
accuracy = evaluator.evaluate(predictions)
print('Accuracy:', accuracy)

Accuracy: 0.487549315355889


### Naive Bayes with Tuning Parameters

In [13]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [14]:
nbparamGrid = (ParamGridBuilder()
               .addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
               .build())

In [15]:
nbevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='Label')

In [16]:
# Create 5-fold CrossValidator
nbcv = CrossValidator(estimator = nb,
                      estimatorParamMaps = nbparamGrid,
                      evaluator = nbevaluator,
                      numFolds = 5)

In [17]:
nbcvModel = nbcv.fit(trainingData)
print(nbcvModel)

CrossValidatorModel_3d13b2446767


In [18]:
nbpredictions = nbcvModel.transform(testData)

In [19]:
accuracy = evaluator.evaluate(nbpredictions)
#f1 = evaluator.setMetricName('f1').evaluate(predictions)
print('Accuracy:', accuracy)
#print('F1:', f1)

Accuracy: 0.487549315355889
