In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification  import LinearSVC
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.sql.functions import *

In [2]:
spark=SparkSession.builder.appName('SparkSVM').getOrCreate()

In [3]:
df = spark.read.csv('D:/KULIAH/no_show_appointments.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- NomorPasien: double (nullable = true)
 |-- NomorAppointment: integer (nullable = true)
 |-- JenisKelamin: string (nullable = true)
 |-- TanggalPembuatan: timestamp (nullable = true)
 |-- TanggalAppointment: timestamp (nullable = true)
 |-- Umur: integer (nullable = true)
 |-- Daerah: string (nullable = true)
 |-- Beasiswa: integer (nullable = true)
 |-- DarahTinggi: integer (nullable = true)
 |-- Diabetes: integer (nullable = true)
 |-- Alkohol: integer (nullable = true)
 |-- Disabled: integer (nullable = true)
 |-- SMS_received: integer (nullable = true)
 |-- No-show: string (nullable = true)



In [4]:
stringIndex = [StringIndexer(inputCol='JenisKelamin', outputCol='Gender'), StringIndexer(inputCol='No-show', outputCol='Label')]
pipeline = Pipeline(stages=stringIndex) 

df = pipeline.fit(df).transform(df)
df.show(5)

+-----------+----------------+------------+-------------------+-------------------+----+----------------+--------+-----------+--------+-------+--------+------------+-------+------+-----+
|NomorPasien|NomorAppointment|JenisKelamin|   TanggalPembuatan| TanggalAppointment|Umur|          Daerah|Beasiswa|DarahTinggi|Diabetes|Alkohol|Disabled|SMS_received|No-show|Gender|Label|
+-----------+----------------+------------+-------------------+-------------------+----+----------------+--------+-----------+--------+-------+--------+------------+-------+------+-----+
| 2.14311E13|         5611376|           F|2016-04-25 14:08:41|2016-05-25 07:00:00|  43|     Bidara Cina|       0|          0|       0|      0|       0|           1|     No|   0.0|  0.0|
| 2.83935E13|         5629135|           M|2016-04-27 19:50:39|2016-04-29 07:00:00|  78| Harapan Mulia  |       0|          1|       1|      0|       0|           0|     No|   1.0|  0.0|
| 3.48758E13|         5686183|           M|2016-05-11 19:18:24|20

In [28]:
featureAssembler = VectorAssembler(inputCols=['Umur','Beasiswa','DarahTinggi','Diabetes','Alkohol','SMS_received','Gender'], outputCol='Features')
df2 = featureAssembler.transform(df)

In [29]:
df3 = df2.select(['Features','Label'])
df3.show(13)

+--------------------+-----+
|            Features|Label|
+--------------------+-----+
|(7,[0,5],[43.0,1.0])|  0.0|
|[78.0,0.0,1.0,1.0...|  0.0|
|(7,[0,2,6],[58.0,...|  0.0|
|      (7,[0],[22.0])|  0.0|
|      (7,[0],[30.0])|  0.0|
|[5.0,1.0,0.0,0.0,...|  1.0|
|(7,[0,1,5],[31.0,...|  0.0|
|(7,[0,2],[43.0,1.0])|  0.0|
|(7,[0,4],[45.0,1.0])|  0.0|
|           (7,[],[])|  0.0|
|      (7,[0],[28.0])|  0.0|
|(7,[0,2],[50.0,1.0])|  0.0|
|       (7,[6],[1.0])|  1.0|
+--------------------+-----+
only showing top 13 rows



In [30]:
# Split the data into training and test sets (30% held out for testing)
trainingData, testData = df3.randomSplit([0.7, 0.3], seed=1310)

### SVM Classifier

In [31]:
lsvc = LinearSVC(featuresCol='Features', labelCol='Label', maxIter=150)

In [32]:
lsvc = lsvc.fit(trainingData)

In [33]:
pred = lsvc.transform(testData)
pred.show(3)

+---------+-----+--------------------+----------+
| Features|Label|       rawPrediction|prediction|
+---------+-----+--------------------+----------+
|(7,[],[])|  0.0|[0.99999999890478...|       0.0|
|(7,[],[])|  0.0|[0.99999999890478...|       0.0|
|(7,[],[])|  0.0|[0.99999999890478...|       0.0|
+---------+-----+--------------------+----------+
only showing top 3 rows



In [34]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = BinaryClassificationEvaluator(labelCol="Label", rawPredictionCol='rawPrediction')
accuracy = evaluator.evaluate(pred)
print('Accuracy:', accuracy)

Accuracy: 0.5823432596178479
