In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer,VectorAssembler,StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit,ParamGridBuilder
from pyspark.ml.classification import GBTClassifier
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType,ArrayType
from matplotlib import pyplot as plt
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.mllib.util import MLUtils
from pyspark.ml.classification import GBTParams
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.appName("IRIS Classification App").getOrCreate()
df =spark.read\
        .option("inferSchema","true")\
        .option("delimiter",",")\
        .csv("/home/ercan/Desktop/hadoop/datasets/leaf.csv")
df = df.withColumnRenamed("_c0","label")

df.show(2)
df.printSchema()
df = df.drop("_c1")
df.show(2)

vA = VectorAssembler(inputCols=df.columns[1:],
                     outputCol="features")
df = vA.transform(df)
df = df.select("features","label")
df.show(2)

+-----+---+-------+------+-------+-------+-------+-------+---------+---------+-------+--------+---------+---------+---------+-------+
|label|_c1|    _c2|   _c3|    _c4|    _c5|    _c6|    _c7|      _c8|      _c9|   _c10|    _c11|     _c12|     _c13|     _c14|   _c15|
+-----+---+-------+------+-------+-------+-------+-------+---------+---------+-------+--------+---------+---------+---------+-------+
|    1|  1|0.72694|1.4742|0.32396|0.98535|    1.0|0.83592|0.0046566|0.0039465|0.04779| 0.12795| 0.016108|0.0052323|2.7477E-4| 1.1756|
|    1|  2|0.74173|1.5257|0.36116|0.98152|0.99825|0.79867|0.0052423|0.0050016|0.02416|0.090476|0.0081195| 0.002708|7.4846E-5|0.69659|
+-----+---+-------+------+-------+-------+-------+-------+---------+---------+-------+--------+---------+---------+---------+-------+
only showing top 2 rows

root
 |-- label: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (null

## RandomForestClassifier with Cross Validation

In [2]:
(trainDF, testDF) = df.randomSplit([0.75,0.25],seed=444)
rfClassifier = RandomForestClassifier()
myEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")

myParams = ParamGridBuilder()\
            .addGrid(rfClassifier.numTrees,[20])\
            .addGrid(rfClassifier.maxDepth,[10])\
            .build()

validator = CrossValidator(estimator=rfClassifier,
                                  estimatorParamMaps=myParams,
                                   evaluator = myEvaluator,
                                    numFolds=4
                                    )


bestModelRF = validator.fit(trainDF)
resultDF = bestModelRF.transform(testDF)

result = myEvaluator.evaluate(resultDF)
print("Accuracy = ",result)

print("Num Trees : ",bestModelRF.bestModel._java_obj.getNumTrees())
print("Max Depth : ",bestModelRF.bestModel._java_obj.getMaxDepth())
print("Impurity :  ",bestModelRF.bestModel._java_obj.getImpurity())


Accuracy =  0.7840909090909091
Num Trees :  20
Max Depth :  10
Impurity :   gini


## RandomForestClassifier with TrainValidationSplit

In [3]:
(trainDF, testDF) = df.randomSplit([0.75,0.25],seed=444)
rfClassifier = RandomForestClassifier()
myEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")


myParams = ParamGridBuilder()\
            .addGrid(rfClassifier.numTrees,[20])\
            .addGrid(rfClassifier.maxDepth,[15])\
            .build()

validator = TrainValidationSplit( estimator=rfClassifier,
                                  estimatorParamMaps=myParams,
                                  trainRatio=0.75,
                                   evaluator = myEvaluator
                                    )


bestModelRF = validator.fit(trainDF)
resultDF = bestModelRF.transform(testDF)

result = myEvaluator.evaluate(resultDF)
print("Accuracy = ",result)

print("Num Trees : ",bestModelRF.bestModel._java_obj.getNumTrees())
print("Max Depth : ",bestModelRF.bestModel._java_obj.getMaxDepth())
print("Impurity :  ",bestModelRF.bestModel._java_obj.getImpurity())


Accuracy =  0.75
Num Trees :  20
Max Depth :  15
Impurity :   gini


## DecisionTreeClassifier with TrainValidationSplit

In [24]:
(trainDF, testDF) = df.randomSplit([0.75,0.25],seed=4444)


dtClassifier = DecisionTreeClassifier()
myEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")

myParams = ParamGridBuilder()\
            .addGrid(dtClassifier.maxDepth,[10])\
            .addGrid(dtClassifier.maxBins,[8])\
            .build()

validator = TrainValidationSplit( estimator=dtClassifier,
                                  estimatorParamMaps=myParams,
                                  trainRatio=0.75,
                                   evaluator = myEvaluator,
                                    )

bestModelDT = validator.fit(trainDF)
resultDF = bestModelDT.transform(testDF)

result = myEvaluator.evaluate(resultDF)
print("Accuracy = ",result)

print("Max Depth : ",bestModelDT.bestModel._java_obj.getMaxDepth())
print("Impurity :  ",bestModelDT.bestModel._java_obj.getImpurity())



Accuracy =  0.6206896551724138
Max Depth :  10
Impurity :   gini


## DecisionTreeClassifier with CrossValidator

In [18]:
(trainDF, testDF) = df.randomSplit([0.75,0.25],seed=4444)

dtClassifier = DecisionTreeClassifier()
myEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")

myParams = ParamGridBuilder()\
            .addGrid(dtClassifier.maxDepth,[15])\
            .build()

validator = CrossValidator(estimator=dtClassifier,
                                  estimatorParamMaps=myParams,
                                   evaluator = myEvaluator,
                                    numFolds=4
                                    )

bestModelDT = validator.fit(trainDF)
resultDF = bestModelDT.transform(testDF)

result = myEvaluator.evaluate(resultDF)
print("Accuracy = ",result)

print("Max Depth : ",bestModelDT.bestModel._java_obj.getMaxDepth())
print("Impurity :  ",bestModelDT.bestModel._java_obj.getImpurity())



Accuracy =  0.6436781609195402
Max Depth :  15
Impurity :   gini


## LogisticRegression with CrossValidator

In [9]:
(trainDF, testDF) = df.randomSplit([0.75,0.25],seed=112)
lrClassifier = LogisticRegression()
myEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")

myParams = ParamGridBuilder()\
            .addGrid(lrClassifier.maxIter,[10])\
            .addGrid(lrClassifier.regParam,[0.3])\
            .addGrid(lrClassifier.elasticNetParam,[0.8])\
            .build()

validator = CrossValidator(estimator=lrClassifier,
                                  estimatorParamMaps=myParams,
                                   evaluator = myEvaluator,
                                    numFolds=4
                                    )


bestModelLR = validator.fit(trainDF)
resultDF = bestModelRF.transform(testDF)

result = myEvaluator.evaluate(resultDF)
print("Accuracy = ",result)



Accuracy =  0.967032967032967


## LogisticRegression with TrainValidationSplit

In [10]:
(trainDF, testDF) = df.randomSplit([0.75,0.25],seed=1112)
lrClassifier = LogisticRegression()
myEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")

myParams = ParamGridBuilder()\
            .addGrid(lrClassifier.maxIter,[10])\
            .addGrid(lrClassifier.regParam,[0.3])\
            .addGrid(lrClassifier.elasticNetParam,[0.8])\
            .build()

validator = TrainValidationSplit( estimator=lrClassifier,
                                  estimatorParamMaps=myParams,
                                  trainRatio=0.75,
                                   evaluator = myEvaluator,
                                    )


bestModelLR = validator.fit(trainDF)
resultDF = bestModelRF.transform(testDF)

result = myEvaluator.evaluate(resultDF)
print("Accuracy = ",result)


Accuracy =  0.9176470588235294
