In [None]:
# Install the dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar xf spark-3.0.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
# Set the environment variables PySpark needs in the Colab environment:
# the Java 8 home directory and the unpacked Spark distribution.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop3.2",
})

In [None]:
# Run the local session to test the installation.
import findspark
# With no argument, findspark uses the SPARK_HOME environment variable set in
# the previous cell, so the install location is defined in one place and does
# not depend on the current working directory.
findspark.init()
from pyspark.sql import SparkSession
# local[*] runs Spark in-process using all available cores.
spark = SparkSession.builder.master('local[*]').getOrCreate()

# Iris Dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn import datasets

In [None]:
# Load scikit-learn's bundled Iris dataset (exposes .data, .target,
# .feature_names and .target_names, all used below).
iris = datasets.load_iris()

In [None]:
# Feature matrix: all rows, first four columns of iris.data.
X = iris.data[:,:4]

In [None]:
# Integer class label for each row (used below as an index into iris.target_names).
y = iris.target

In [None]:
# Build the pandas frame: four snake_case measurement columns, the numeric
# class label, and the matching species name looked up from iris.target_names.
measurement_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df = pd.DataFrame(X, columns=measurement_columns)
df['target'] = y
df['name_iris'] = df['target'].apply(lambda class_idx: iris.target_names[class_idx])

In [None]:
# Quick look at the data: first rows, column dtypes, and class balance.
print(df.head())
print('---------------------------------------')
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
df.info()
print('---------------------------------------')
print(df.name_iris.value_counts(normalize=True))

   sepal_length  sepal_width  petal_length  petal_width  target name_iris
0           5.1          3.5           1.4          0.2       0    setosa
1           4.9          3.0           1.4          0.2       0    setosa
2           4.7          3.2           1.3          0.2       0    setosa
3           4.6          3.1           1.5          0.2       0    setosa
4           5.0          3.6           1.4          0.2       0    setosa
---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   target        150 non-null    int64  
 5   name_iris     150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None
-

In [None]:
# Persist the frame as CSV (without the index column) so Spark can read it back below.
df.to_csv('data.csv', index=False)

# Building a Classification Model

In [None]:
# Reload the CSV as a Spark DataFrame, using the header row for column names
# and letting Spark infer column types.
# NOTE: from here on `df` is the Spark DataFrame, no longer the pandas one.
df = spark.read.csv(path='data.csv', header=True, inferSchema=True)

In [None]:
# Preview the first five rows of the Spark DataFrame.
df.show(5)

+------------+-----------+------------+-----------+------+---------+
|sepal_length|sepal_width|petal_length|petal_width|target|name_iris|
+------------+-----------+------------+-----------+------+---------+
|         5.1|        3.5|         1.4|        0.2|     0|   setosa|
|         4.9|        3.0|         1.4|        0.2|     0|   setosa|
|         4.7|        3.2|         1.3|        0.2|     0|   setosa|
|         4.6|        3.1|         1.5|        0.2|     0|   setosa|
|         5.0|        3.6|         1.4|        0.2|     0|   setosa|
+------------+-----------+------------+-----------+------+---------+
only showing top 5 rows



In [None]:
# Check the column types Spark inferred from the CSV.
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- target: integer (nullable = true)
 |-- name_iris: string (nullable = true)



In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Combine the four numeric measurement columns into a single vector column
# named 'features', the input format the classifier below is configured with.
df_assembler = VectorAssembler(
    outputCol='features',
    inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)

In [None]:
# Add the 'features' vector column.
# Because the result is assigned back to `df`, a second run of this cell would
# otherwise fail: VectorAssembler refuses to write to an output column that
# already exists. Dropping 'features' first keeps the cell re-runnable
# (DataFrame.drop is a no-op when the column is absent).
df = df_assembler.transform(df.drop('features'))

In [None]:
# Show the assembled vectors next to their labels, without truncating cell contents.
df.select('features', 'target').show(n=5, truncate=False)

+-----------------+------+
|features         |target|
+-----------------+------+
|[5.1,3.5,1.4,0.2]|0     |
|[4.9,3.0,1.4,0.2]|0     |
|[4.7,3.2,1.3,0.2]|0     |
|[4.6,3.1,1.5,0.2]|0     |
|[5.0,3.6,1.4,0.2]|0     |
+-----------------+------+
only showing top 5 rows



In [None]:
# Keep only the two columns the model needs: the feature vector and the label.
model_df = df.select('features', 'target')

In [120]:
# 75/25 train/test split. A fixed seed makes the split, and therefore every
# metric reported below, reproducible across runs.
# NOTE: outputs recorded before the seed was added came from an unseeded split
# and will differ after a re-run.
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

In [121]:
from pyspark.ml.classification import RandomForestClassifier

In [122]:
# Baseline random forest with trees limited to depth 5.
# The explicit seed fixes the forest's internal randomness so that refitting
# on the same data gives the same model.
rf = RandomForestClassifier(labelCol='target', featuresCol='features', maxDepth=5, seed=42)

In [123]:
# Fit the baseline forest on the training split.
model_rf = rf.fit(training_df)

In [153]:
# Per-feature importance scores; indices follow the VectorAssembler inputCols
# order (0: sepal_length, 1: sepal_width, 2: petal_length, 3: petal_width).
model_rf.featureImportances

SparseVector(4, {0: 0.1297, 1: 0.0088, 2: 0.4721, 3: 0.3894})

In [124]:
# Apply the fitted model to the test split; this adds the rawPrediction,
# probability and prediction columns shown below.
predictions = model_rf.transform(test_df)

In [149]:
# Preview the first five scored test rows.
predictions.show(5)

+-----------------+------+--------------+-------------+----------+
|         features|target| rawPrediction|  probability|prediction|
+-----------------+------+--------------+-------------+----------+
|[4.6,3.2,1.4,0.2]|     0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.4,1.4,0.3]|     0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.0,1.4,0.1]|     0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.9,3.0,1.4,0.2]|     0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.9,3.6,1.4,0.1]|     0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+-----------------+------+--------------+-------------+----------+
only showing top 5 rows



In [125]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy scorer: compares the 'prediction' column against the 'target' labels.
evaluator_accuracy = MulticlassClassificationEvaluator(
    metricName='accuracy', labelCol='target', predictionCol='prediction'
)

In [126]:
# Accuracy of the baseline model on the test-split predictions.
accuracy = evaluator_accuracy.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.967741935483871


In [127]:
# F1 scorer over the same label / prediction columns.
evaluator_f1 = MulticlassClassificationEvaluator(
    metricName='f1', labelCol='target', predictionCol='prediction'
)

In [128]:
# F1 score of the baseline model on the test-split predictions.
f1 = evaluator_f1.evaluate(predictions)
print('Test f1 = ', f1)

Test f1 =  0.9675382003395587


# Hyperparameter Tuning

In [129]:
from pyspark.ml.classification import RandomForestClassifier

In [130]:
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

In [131]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [132]:
# Accuracy is the metric the cross-validator below uses to rank parameter settings.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy', labelCol='target', predictionCol='prediction'
)

In [133]:
# Fresh estimator for tuning; maxDepth, maxBins and numTrees are supplied by
# the parameter grid. The explicit seed keeps each candidate fit repeatable, so
# differences between grid points are not just sampling noise.
rf = RandomForestClassifier(labelCol='target', featuresCol='features', seed=42)

In [134]:
# Search grid: 5 depths x 3 bin counts x 3 forest sizes = 45 parameter settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5, 10, 20, 25, 30])
    .addGrid(rf.maxBins, [20, 30, 40])
    .addGrid(rf.numTrees, [5, 20, 50])
    .build()
)

In [135]:
# 5-fold cross-validation over the parameter grid, scored with `evaluator`.
# The seed fixes how rows are assigned to folds, so the selected model is
# reproducible across runs.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

In [136]:
# Run the search on the training split only: 45 parameter settings, each
# evaluated with 5-fold cross-validation. The test split stays untouched.
cv_model = cv.fit(training_df)

In [137]:
# Model chosen by the cross-validator as best under the accuracy evaluator.
best_model_rf = cv_model.bestModel

In [139]:
# Score the held-out test split with the tuned model.
best_predictions = best_model_rf.transform(test_df)

In [140]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy scorer for the tuned model's predictions.
evaluator_accuracy = MulticlassClassificationEvaluator(
    metricName='accuracy', labelCol='target', predictionCol='prediction'
)

In [141]:
# Accuracy of the tuned model on the test split (overwrites the baseline `accuracy`).
accuracy = evaluator_accuracy.evaluate(best_predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.967741935483871


In [142]:
# F1 scorer for the tuned model's predictions.
evaluator_f1 = MulticlassClassificationEvaluator(
    metricName='f1', labelCol='target', predictionCol='prediction'
)

In [143]:
# F1 score of the tuned model on the test split (overwrites the baseline `f1`).
f1 = evaluator_f1.evaluate(best_predictions)
print('Test f1 = ', f1)

Test f1 =  0.9675382003395587


In [144]:
# Weighted-precision scorer for the tuned model's predictions.
evaluator_weightedPrecision = MulticlassClassificationEvaluator(
    metricName='weightedPrecision', labelCol='target', predictionCol='prediction'
)

In [146]:
# Weighted precision of the tuned model on the test split.
precision = evaluator_weightedPrecision.evaluate(best_predictions)
print('Test precision = ', precision)

Test precision =  0.9702233250620347


In [147]:
# Weighted-recall scorer for the tuned model's predictions.
evaluator_weightedRecall = MulticlassClassificationEvaluator(
    metricName='weightedRecall', labelCol='target', predictionCol='prediction'
)

In [148]:
# Weighted recall of the tuned model on the test split.
recall = evaluator_weightedRecall.evaluate(best_predictions)
print('Test recall = ', recall)

Test recall =  0.967741935483871
