# Hyperparameter Tuning - Tuning Cross with Train Validation Split

In [1]:
# findspark must run *before* PySpark is imported: init() locates the Spark
# installation and adds its Python bindings to sys.path. Importing pyspark
# first would fail on machines where PySpark is not already importable.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName("tunningcrosswithtrainvalidsplit").getOrCreate()

24/04/03 22:33:32 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.108 instead (on interface wlo1)
24/04/03 22:33:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/03 22:33:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [3]:
# Read the iris CSV; the first row holds column names and column types are
# inferred from the data.
iris = spark.read.csv(
    "../0_data/iris.csv",
    sep=",",
    header=True,
    inferSchema=True,
)

# Quick sanity check: row count plus a peek at the first rows.
print(iris.count())
iris.show(n=5)

150
+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [4]:
# 70/30 train/test split. The seed is fixed so the same rows land in each
# split on every run and the reported accuracy can be reproduced.
iris_train, iris_test = iris.randomSplit([0.7, 0.3], seed=42)

In [5]:
# Encode the string species column as a numeric "label" column.
indexer = StringIndexer(
    inputCol="class",
    outputCol="label",
)

# Pack the four measurement columns into one feature vector column.
vector_assembler = VectorAssembler(
    inputCols=[
        "sepallength",
        "sepalwidth",
        "petallength",
        "petalwidth",
    ],
    outputCol="independant",
)

In [6]:
# Multilayer perceptron classifier.
# - layers: layer sizes from input to output; 4 inputs (one per assembled
#   feature column), two hidden layers of 5 and 4 units, 3 output units.
# - maxIter / layers set here are only starting values; the parameter grid
#   used for tuning overrides them per candidate.
# - seed: fixed so weight initialisation, and therefore the trained model,
#   is reproducible across kernel restarts.
mlp = MultilayerPerceptronClassifier(
    featuresCol="independant",
    labelCol="label",
    maxIter=100,
    layers=[4, 5, 4, 3],
    seed=42,
)

In [7]:
# Stages run in order: assemble features, index the label, then classify.
pipeline = Pipeline(
    stages=[
        vector_assembler,
        indexer,
        mlp,
    ]
)

In [8]:
# Performance for train_val
# Accuracy is the metric used to rank the candidate models.
performance = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")

# Search space: 3 iteration budgets x 2 network shapes = 6 candidates.
grid = (
    ParamGridBuilder()
    .addGrid(mlp.maxIter, [10, 100, 1000])
    .addGrid(mlp.layers, [[4, 5, 4, 3], [4, 4, 3]])
    .build()
)

# Single train/validation split: 80% of the input is used to fit each
# candidate and the remaining 20% to score it. The seed pins that internal
# split so the selected model does not change from run to run.
train_val = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=performance,
    trainRatio=0.8,
    seed=42,
)

In [9]:
# Run the hyperparameter search on the training split.
model = train_val.fit(iris_train)

# Score the selected model on the held-out test split.
result = model.transform(iris_test)
test_accuracy = performance.evaluate(result)
print(test_accuracy)

0.9830508474576272


In [10]:
# Apply the tuned model to the test split and inspect its output columns.
prediction = model.transform(iris_test)
(
    prediction
    .select("rawprediction", "prediction", "probability")
    .show(5)
)

+--------------------+----------+--------------------+
|       rawprediction|prediction|         probability|
+--------------------+----------+--------------------+
|[-84.122922506668...|       2.0|[7.92888653362761...|
|[-84.122922506668...|       2.0|[7.92888653362761...|
|[-84.122922506668...|       2.0|[7.92888653362761...|
|[-84.122922506668...|       2.0|[7.92888653362761...|
|[-84.122922506668...|       2.0|[7.92888653362761...|
+--------------------+----------+--------------------+
only showing top 5 rows



In [11]:
# Performance of prediction
# Compare the "prediction" column against the indexed "label" column.
performance = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = performance.evaluate(prediction)
print(f"accuracy: {accuracy}")

accuracy: 0.9830508474576272
