In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification  import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, VectorIndexer
from pyspark.sql.functions import *

In [2]:
spark=SparkSession.builder.appName('SparkGBT').getOrCreate()

In [3]:
df = spark.read.csv('D:/KULIAH/no_show_appointments.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- NomorPasien: double (nullable = true)
 |-- NomorAppointment: integer (nullable = true)
 |-- JenisKelamin: string (nullable = true)
 |-- TanggalPembuatan: timestamp (nullable = true)
 |-- TanggalAppointment: timestamp (nullable = true)
 |-- Umur: integer (nullable = true)
 |-- Daerah: string (nullable = true)
 |-- Beasiswa: integer (nullable = true)
 |-- DarahTinggi: integer (nullable = true)
 |-- Diabetes: integer (nullable = true)
 |-- Alkohol: integer (nullable = true)
 |-- Disabled: integer (nullable = true)
 |-- SMS_received: integer (nullable = true)
 |-- No-show: string (nullable = true)



In [4]:
stringIndex = StringIndexer(inputCol='JenisKelamin', outputCol='Gender')
df = stringIndex.fit(df).transform(df)
df.show(5)

+-----------+----------------+------------+-------------------+-------------------+----+----------------+--------+-----------+--------+-------+--------+------------+-------+------+
|NomorPasien|NomorAppointment|JenisKelamin|   TanggalPembuatan| TanggalAppointment|Umur|          Daerah|Beasiswa|DarahTinggi|Diabetes|Alkohol|Disabled|SMS_received|No-show|Gender|
+-----------+----------------+------------+-------------------+-------------------+----+----------------+--------+-----------+--------+-------+--------+------------+-------+------+
| 2.14311E13|         5611376|           F|2016-04-25 14:08:41|2016-05-25 07:00:00|  43|     Bidara Cina|       0|          0|       0|      0|       0|           1|     No|   0.0|
| 2.83935E13|         5629135|           M|2016-04-27 19:50:39|2016-04-29 07:00:00|  78| Harapan Mulia  |       0|          1|       1|      0|       0|           0|     No|   1.0|
| 3.48758E13|         5686183|           M|2016-05-11 19:18:24|2016-05-17 07:00:00|  58|       

In [5]:
stringIndex1 = StringIndexer(inputCol='No-show', outputCol='Label')
df = stringIndex1.fit(df).transform(df)
df.show(5)

+-----------+----------------+------------+-------------------+-------------------+----+----------------+--------+-----------+--------+-------+--------+------------+-------+------+-----+
|NomorPasien|NomorAppointment|JenisKelamin|   TanggalPembuatan| TanggalAppointment|Umur|          Daerah|Beasiswa|DarahTinggi|Diabetes|Alkohol|Disabled|SMS_received|No-show|Gender|Label|
+-----------+----------------+------------+-------------------+-------------------+----+----------------+--------+-----------+--------+-------+--------+------------+-------+------+-----+
| 2.14311E13|         5611376|           F|2016-04-25 14:08:41|2016-05-25 07:00:00|  43|     Bidara Cina|       0|          0|       0|      0|       0|           1|     No|   0.0|  0.0|
| 2.83935E13|         5629135|           M|2016-04-27 19:50:39|2016-04-29 07:00:00|  78| Harapan Mulia  |       0|          1|       1|      0|       0|           0|     No|   1.0|  0.0|
| 3.48758E13|         5686183|           M|2016-05-11 19:18:24|20

In [6]:
featureAssembler = VectorAssembler(inputCols=['Umur','DarahTinggi','Diabetes','SMS_received','Gender'], outputCol='Features')
df2 = featureAssembler.transform(df)

In [7]:
df3 = df2.select(['Features','Label'])
df3.show(13)

+--------------------+-----+
|            Features|Label|
+--------------------+-----+
|(5,[0,3],[43.0,1.0])|  0.0|
|[78.0,1.0,1.0,0.0...|  0.0|
|[58.0,1.0,0.0,0.0...|  0.0|
|      (5,[0],[22.0])|  0.0|
|      (5,[0],[30.0])|  0.0|
|[5.0,0.0,0.0,1.0,...|  1.0|
|(5,[0,3],[31.0,1.0])|  0.0|
|(5,[0,1],[43.0,1.0])|  0.0|
|      (5,[0],[45.0])|  0.0|
|           (5,[],[])|  0.0|
|      (5,[0],[28.0])|  0.0|
|(5,[0,1],[50.0,1.0])|  0.0|
|       (5,[4],[1.0])|  1.0|
+--------------------+-----+
only showing top 13 rows



In [8]:
scaler = StandardScaler(inputCol='Features', outputCol='Scaler')
df3 = scaler.fit(df3).transform(df3)
df3.show()

+--------------------+-----+--------------------+
|            Features|Label|              Scaler|
+--------------------+-----+--------------------+
|(5,[0,3],[43.0,1.0])|  0.0|(5,[0,3],[1.86064...|
|[78.0,1.0,1.0,0.0...|  0.0|[3.37513233320846...|
|[58.0,1.0,0.0,0.0...|  0.0|[2.50971378623193...|
|      (5,[0],[22.0])|  0.0|(5,[0],[0.9519604...|
|      (5,[0],[30.0])|  0.0|(5,[0],[1.2981278...|
|[5.0,0.0,0.0,1.0,...|  1.0|[0.21635463674413...|
|(5,[0,3],[31.0,1.0])|  0.0|(5,[0,3],[1.34139...|
|(5,[0,1],[43.0,1.0])|  0.0|(5,[0,1],[1.86064...|
|      (5,[0],[45.0])|  0.0|(5,[0],[1.9471917...|
|           (5,[],[])|  0.0|           (5,[],[])|
|      (5,[0],[28.0])|  0.0|(5,[0],[1.2115859...|
|(5,[0,1],[50.0,1.0])|  0.0|(5,[0,1],[2.16354...|
|       (5,[4],[1.0])|  1.0|(5,[4],[2.0965282...|
|[39.0,0.0,0.0,1.0...|  0.0|[1.68756616660423...|
|[58.0,1.0,0.0,0.0...|  0.0|[2.50971378623193...|
|       (5,[0],[5.0])|  1.0|(5,[0],[0.2163546...|
|       (5,[4],[1.0])|  0.0|(5,[4],[2.0965282...|


In [9]:
labelIndexer = StringIndexer(inputCol="Label", outputCol="indexedLabel").fit(df3)

In [10]:
featureIndexer =VectorIndexer(inputCol="Scaler", outputCol="indexedFeatures", maxCategories=5).fit(df3)

In [11]:
# Split the data into training and test sets (30% held out for testing)
trainingData, testData = df3.randomSplit([0.7, 0.3])

### GBTClassifier

In [12]:
# Train a GBT model.
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10,maxDepth=4)

In [13]:
 # Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

In [14]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

In [15]:
 # Make predictions.
predictions = model.transform(testData)

In [16]:
# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+---------+
|prediction|indexedLabel| features|
+----------+------------+---------+
|       0.0|         0.0|(5,[],[])|
|       0.0|         0.0|(5,[],[])|
|       0.0|         0.0|(5,[],[])|
|       0.0|         0.0|(5,[],[])|
|       0.0|         0.0|(5,[],[])|
+----------+------------+---------+
only showing top 5 rows



In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol='rawPrediction')
accuracy = evaluator.evaluate(predictions)
# f1 = evaluator.setMetricName('f1').evaluate(predictions)
print('Accuracy:', accuracy)
# print('F1:', f1)

Accuracy: 0.6049113080588552
