In [None]:
# Install the dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar xf spark-3.0.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
# Set the environment variables for running PySpark in the collaboration environmentimport os
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop3.2"

In [None]:
# Run the local session to test the installation
import findspark
findspark.init('spark-3.0.1-bin-hadoop3.2')
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [None]:
spark

In [6]:
df = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("drive/MyDrive/db/iris_frame.csv"))

In [7]:
df.show(5)

+---+-----------------+----------------+-----------------+----------------+------+
|_c0|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
+---+-----------------+----------------+-----------------+----------------+------+
|  0|              5.1|             3.5|              1.4|             0.2|     0|
|  1|              4.9|             3.0|              1.4|             0.2|     0|
|  2|              4.7|             3.2|              1.3|             0.2|     0|
|  3|              4.6|             3.1|              1.5|             0.2|     0|
|  4|              5.0|             3.6|              1.4|             0.2|     0|
+---+-----------------+----------------+-----------------+----------------+------+
only showing top 5 rows



In [8]:
df.toPandas()

Unnamed: 0,_c0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,0,5.1,3.5,1.4,0.2,0
1,1,4.9,3.0,1.4,0.2,0
2,2,4.7,3.2,1.3,0.2,0
3,3,4.6,3.1,1.5,0.2,0
4,4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...,...
145,145,6.7,3.0,5.2,2.3,2
146,146,6.3,2.5,5.0,1.9,2
147,147,6.5,3.0,5.2,2.0,2
148,148,6.2,3.4,5.4,2.3,2


In [9]:
# How many rows we have
df.count()

150

In [10]:
# The names of our columns
df.columns

['_c0',
 'sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)',
 'target']

In [11]:
# Types of our columns
df.dtypes

[('_c0', 'string'),
 ('sepal length (cm)', 'string'),
 ('sepal width (cm)', 'string'),
 ('petal length (cm)', 'string'),
 ('petal width (cm)', 'string'),
 ('target', 'string')]

In [12]:
# Basics stats from our columns
df.describe().toPandas()

Unnamed: 0,summary,_c0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,count,150.0,150.0,150.0,150.0,150.0,150.0
1,mean,74.5,5.843333333333335,3.057333333333334,3.7580000000000022,1.199333333333334,1.0
2,stddev,43.44536799245692,0.8280661279778637,0.4358662849366979,1.7652982332594662,0.7622376689603467,0.8192319205190406
3,min,0.0,4.3,2.0,1.0,0.1,0.0
4,max,99.0,7.9,4.4,6.9,2.5,2.0


In [13]:
from pyspark.sql.functions import col
dataset = df.select(col('sepal length (cm)').cast('float'),
                         col('sepal width (cm)').cast('float'),
                         col('petal length (cm)').cast('float'),
                         col('petal width (cm)').cast('float'),
                         col('target').cast('integer')            
                        )
dataset.show()

+-----------------+----------------+-----------------+----------------+------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
+-----------------+----------------+-----------------+----------------+------+
|              5.1|             3.5|              1.4|             0.2|     0|
|              4.9|             3.0|              1.4|             0.2|     0|
|              4.7|             3.2|              1.3|             0.2|     0|
|              4.6|             3.1|              1.5|             0.2|     0|
|              5.0|             3.6|              1.4|             0.2|     0|
|              5.4|             3.9|              1.7|             0.4|     0|
|              4.6|             3.4|              1.4|             0.3|     0|
|              5.0|             3.4|              1.5|             0.2|     0|
|              4.4|             2.9|              1.4|             0.2|     0|
|              4.9|             3.1|              1.

In [14]:
dataset.dtypes

[('sepal length (cm)', 'float'),
 ('sepal width (cm)', 'float'),
 ('petal length (cm)', 'float'),
 ('petal width (cm)', 'float'),
 ('target', 'int')]

In [15]:
# Assemble all the features with VectorAssembler
required_features = ['sepal length (cm)',
                    'sepal width (cm)',
                    'petal length (cm)',
                    'petal width (cm)'
                   ]
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(dataset)

In [17]:
transformed_data.show(10)

+-----------------+----------------+-----------------+----------------+------+--------------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|            features|
+-----------------+----------------+-----------------+----------------+------+--------------------+
|              5.1|             3.5|              1.4|             0.2|     0|[5.09999990463256...|
|              4.9|             3.0|              1.4|             0.2|     0|[4.90000009536743...|
|              4.7|             3.2|              1.3|             0.2|     0|[4.69999980926513...|
|              4.6|             3.1|              1.5|             0.2|     0|[4.59999990463256...|
|              5.0|             3.6|              1.4|             0.2|     0|[5.0,3.5999999046...|
|              5.4|             3.9|              1.7|             0.4|     0|[5.40000009536743...|
|              4.6|             3.4|              1.4|             0.3|     0|[4.59999990463256...|


In [46]:
# Modeling
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2])

In [47]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol='target', featuresCol='features', maxDepth=5)

In [48]:
model = rf.fit(training_data)

In [49]:
predictions = model.transform(test_data)

In [50]:
# Evaluate our model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol='target', 
    predictionCol='prediction', 
    metricName='accuracy')

In [51]:
accuracy = evaluator_accuracy.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.9615384615384616


In [52]:
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol='target', 
    predictionCol='prediction', 
    metricName='f1')

In [53]:
f1 = evaluator_f1.evaluate(predictions)
print('Test f1 = ', f1)

Test f1 =  0.9626235833132386
