In [11]:
# Start (or reuse) a Spark session and read the iris CSV with a header row
# and an inferred schema.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('random_forest')
    .getOrCreate()
)
df = spark.read.csv('iris_dataset.csv', header=True, inferSchema=True)

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# The four measurement columns that are packed into the model's input vector.
input_cols = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
# Combine the measurement columns into a single vector column named 'features'.
vec_assembler = VectorAssembler(outputCol='features', inputCols=input_cols)
df = vec_assembler.transform(df)

In [12]:
# Preview the first 3 rows to confirm the assembled `features` column is present.
df.show(3)

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| setosa|[4.7,3.2,1.3,0.2]|
+------------+-----------+------------+-----------+-------+-----------------+
only showing top 3 rows



In [13]:
#import required libraries
from pyspark.ml.feature import StringIndexer

# Encode the string `species` label as a numeric `target` column.
species_indexer = StringIndexer(inputCol="species", outputCol="target").fit(df)
# Drop any `target` left over from a previous run of this cell before transforming:
# transform() fails when the output column already exists, while drop() is a
# no-op when the column is absent, so the cell can be re-run safely.
df = species_indexer.transform(df.drop("target"))
df.show(5)

+------------+-----------+------------+-----------+-------+-----------------+------+
|sepal_length|sepal_width|petal_length|petal_width|species|         features|target|
+------------+-----------+------------+-----------+-------+-----------------+------+
|         5.1|        3.5|         1.4|        0.2| setosa|[5.1,3.5,1.4,0.2]|   0.0|
|         4.9|        3.0|         1.4|        0.2| setosa|[4.9,3.0,1.4,0.2]|   0.0|
|         4.7|        3.2|         1.3|        0.2| setosa|[4.7,3.2,1.3,0.2]|   0.0|
|         4.6|        3.1|         1.5|        0.2| setosa|[4.6,3.1,1.5,0.2]|   0.0|
|         5.0|        3.6|         1.4|        0.2| setosa|[5.0,3.6,1.4,0.2]|   0.0|
+------------+-----------+------------+-----------+-------+-----------------+------+
only showing top 5 rows



### Select the data for building the model

In [16]:
# Keep only the columns the model needs, then preview 10 untruncated rows.
model_df = df.select(['features', 'target'])
model_df.show(10, truncate=False)

+-----------------+------+
|features         |target|
+-----------------+------+
|[5.1,3.5,1.4,0.2]|0.0   |
|[4.9,3.0,1.4,0.2]|0.0   |
|[4.7,3.2,1.3,0.2]|0.0   |
|[4.6,3.1,1.5,0.2]|0.0   |
|[5.0,3.6,1.4,0.2]|0.0   |
|[5.4,3.9,1.7,0.4]|0.0   |
|[4.6,3.4,1.4,0.3]|0.0   |
|[5.0,3.4,1.5,0.2]|0.0   |
|[4.4,2.9,1.4,0.2]|0.0   |
|[4.9,3.1,1.5,0.1]|0.0   |
+-----------------+------+
only showing top 10 rows



In [17]:
# 75/25 train/test split. The seed is fixed so the same rows land in each split
# on every run; without it the split (and every metric below) changes per run.
train_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

### Random forest classifier

In [19]:
from pyspark.ml.classification import RandomForestClassifier

# Fit a 50-tree random forest on the training split, using `target` as the label.
# The seed is pinned so the randomised tree construction yields the same model
# on every run.
rf_classifier = RandomForestClassifier(
    labelCol='target',
    numTrees=50,
    seed=42,
).fit(train_df)
# Score the held-out split with the fitted model.
rf_predictions = rf_classifier.transform(test_df)

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# A single evaluator is reused; only the metric name changes between calls.
rf_evaluator = MulticlassClassificationEvaluator(labelCol='target')

# Accuracy on the test-set predictions.
rf_accuracy = rf_evaluator.setMetricName('accuracy').evaluate(rf_predictions)
print('The accuracy of RF on test data is {0:.0%}'.format(rf_accuracy))

# Weighted precision on the same predictions.
rf_precision = rf_evaluator.setMetricName('weightedPrecision').evaluate(rf_predictions)
print('The precision rate on test data is {0:.0%}'.format(rf_precision))

The accuracy of RF on test data is 97%
The precision rate on test data is 97%


In [22]:
# Display the fitted forest's importance weight for each feature, keyed by the
# feature's index in the `features` vector.
rf_classifier.featureImportances

SparseVector(4, {0: 0.0967, 1: 0.0181, 2: 0.4974, 3: 0.3879})

In [23]:
# Read the ML attribute metadata on the `features` column to see which input
# column sits at each vector index (used to interpret the importances by name).
df.schema["features"].metadata["ml_attr"]["attrs"]

{'numeric': [{'idx': 0, 'name': 'sepal_length'},
  {'idx': 1, 'name': 'sepal_width'},
  {'idx': 2, 'name': 'petal_length'},
  {'idx': 3, 'name': 'petal_width'}]}

## Save and reload the model

In [24]:
# Show the current working directory, i.e. where relative paths resolve.
# NOTE(review): a bare `pwd` presumably relies on IPython automagic — confirm it is enabled.
# NOTE(review): the recorded output exposes a local absolute path; clear it before sharing.
pwd

'/Users/aufhebung/Desktop/Programming/Spark/ML'

In [None]:
# Persist the fitted forest. write().overwrite() replaces an existing model
# directory so this cell can be re-run; a plain save() raises if the path exists.
# NOTE(review): ".../RF_model" looks like a placeholder path — confirm the real
# target directory (it must match the path used when loading the model).
rf_classifier.write().overwrite().save(".../RF_model")

In [None]:
from pyspark.ml.classification import RandomForestClassificationModel

In [None]:
# Reload the persisted forest from the same path literal used by the save cell.
rf=RandomForestClassificationModel.load(".../RF_model")

In [None]:
# Score the held-out split with the reloaded model.
# NOTE(review): `model_preditions` is misspelled ("predictions"); rename it
# together with every later use of the name.
model_preditions=rf.transform(test_df)

In [None]:
# Preview the reloaded model's output rows (show() prints at most 20 by default).
model_preditions.show()