In [42]:
from pyspark.sql import SparkSession
from pyspark.ml.classification  import RandomForestClassifier, DecisionTreeClassifier, NaiveBayes, LinearSVC, OneVsRest
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, VectorIndexer
from pyspark.sql.functions import *

In [2]:
spark=SparkSession.builder.appName('SparkMlExample').getOrCreate()

In [3]:
df = spark.read.csv('D:/KULIAH/Machine Learning Full/Random Forest/iris.csv', header = True, inferSchema=True)
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [4]:
stringIndex = StringIndexer(inputCol='Species', outputCol='Label')
df = stringIndex.fit(df).transform(df)
df.show(5)

+---+-------------+------------+-------------+------------+-----------+-----+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|Label|
+---+-------------+------------+-------------+------------+-----------+-----+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|  0.0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|  0.0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|  0.0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|  0.0|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|  0.0|
+---+-------------+------------+-------------+------------+-----------+-----+
only showing top 5 rows



In [5]:
featureAssembler = VectorAssembler(inputCols=['SepalLengthCm', 'SepalWidthCm','PetalLengthCm','PetalWidthCm'], outputCol='Features')
df2 = featureAssembler.transform(df)

In [6]:
df3 = df2.select(['Features','Label'])
df3.show(13)

+-----------------+-----+
|         Features|Label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
|[5.4,3.7,1.5,0.2]|  0.0|
|[4.8,3.4,1.6,0.2]|  0.0|
|[4.8,3.0,1.4,0.1]|  0.0|
+-----------------+-----+
only showing top 13 rows



In [7]:
train_data, test_data = df3.randomSplit([.80, .20], seed=11)

In [8]:
train_data.groupBy('Label').agg(count('Label')).show()

+-----+------------+
|Label|count(Label)|
+-----+------------+
|  0.0|          37|
|  1.0|          42|
|  2.0|          41|
+-----+------------+



### Decision Tree

In [9]:
dt = DecisionTreeClassifier(featuresCol = 'Features', labelCol = 'Label',impurity='Gini')
dtModel = dt.fit(train_data)

In [10]:
predDt = dtModel.transform(test_data)
predDt.printSchema()

root
 |-- Features: vector (nullable = true)
 |-- Label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='Label')
accuracy = evaluator.setMetricName('accuracy').evaluate(predDt)
print('accuracy ',accuracy)

accuracy  0.9666666666666667


In [12]:
from pyspark.mllib.evaluation import MulticlassMetrics
lr_metric = MulticlassMetrics(predDt['Label','prediction'].rdd)
print('presisi :',lr_metric.precision(1.0))
print('F1 :',lr_metric.fMeasure(1.0))



presisi : 1.0
F1 : 0.9411764705882353


### Random Forest Model

In [13]:
rf = RandomForestClassifier(featuresCol = 'Features', labelCol = 'Label', seed=15)
rfModel = rf.fit(train_data)
predRf = rfModel.transform(test_data)
predRf.printSchema()

root
 |-- Features: vector (nullable = true)
 |-- Label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [14]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='Label', probabilityCol='probability')
accuracy = evaluator.setMetricName('accuracy').evaluate(predRf)

rf_metric = MulticlassMetrics(predRf['Label','prediction'].rdd)
print('presisi :',rf_metric.precision(1.0))
print('F1 :',rf_metric.fMeasure(1.0))

print('accuracy ',accuracy)



presisi : 1.0
F1 : 0.9411764705882353
accuracy  0.9666666666666667


### Naive Bayes model

In [28]:
nb = NaiveBayes(featuresCol='Features', labelCol='Label')
nbmodel = nb.fit(train_data)
predictions = nbmodel.transform(test_data)
predictions.select('rawPrediction').show(5, True)

+--------------------+
|       rawPrediction|
+--------------------+
|[-11.436538182314...|
|[-11.351362961240...|
|[-11.640927658369...|
|[-11.771954420718...|
|[-12.097828997859...|
+--------------------+
only showing top 5 rows



In [32]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='Label', probabilityCol='probability')
accuracy = evaluator.setMetricName('accuracy').evaluate(predictions)

rf_metric = MulticlassMetrics(predictions['Label','prediction'].rdd)
print('presisi :',rf_metric.precision(1.0))
print('F1 :',rf_metric.fMeasure(1.0))
print('accuracy ',accuracy)

presisi : 1.0
F1 : 0.9411764705882353
accuracy  0.9666666666666667


### SVM

In [43]:
lsvc = LinearSVC(featuresCol='Features',labelCol='Label',maxIter=150)
ovr= OneVsRest(classifier=lsvc, featuresCol='Features', labelCol='Label')
ovrModel = ovr.fit(train_data)
pred = ovrModel.transform(test_data)
pred.show(3)

+-----------------+-----+--------------------+----------+
|         Features|Label|       rawPrediction|prediction|
+-----------------+-----+--------------------+----------+
|[4.6,3.1,1.5,0.2]|  0.0|[1.97157676876047...|       0.0|
|[4.6,3.2,1.4,0.2]|  0.0|[2.14772373448296...|       0.0|
|[4.8,3.0,1.4,0.3]|  0.0|[1.69375530924379...|       0.0|
+-----------------+-----+--------------------+----------+
only showing top 3 rows



In [44]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='Label', probabilityCol='probability')
accuracy = evaluator.setMetricName('accuracy').evaluate(pred)

rf_metric = MulticlassMetrics(pred['Label','prediction'].rdd)
print('presisi :',rf_metric.precision(1.0))
print('F1 :',rf_metric.fMeasure(1.0))
print('accuracy ',accuracy)



presisi : 0.875
F1 : 0.9333333333333333
accuracy  0.9666666666666667
