# Random Forest Example

In [1]:
import findspark
findspark.init('/home/fernando/spark-2.4.6-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('rf').getOrCreate()

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [6]:
data = spark.read.format("libsvm").load("Tree_Methods/sample_libsvm_data.txt")

In [7]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



## Split Data

In [12]:
training_data, test_data = data.randomSplit([0.7, 0.3])

# Create Models

In [16]:
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100)
gbt = GBTClassifier()

In [17]:
dtc_model = dtc.fit(training_data)
rfc_model = rfc.fit(training_data)
gbt_model = gbt.fit(training_data)

### Transform test data to get predictions

In [18]:
dtc_preds = dtc_model.transform(training_data)
rfc_preds = rfc_model.transform(training_data)
gbt_preds = gbt_model.transform(training_data)

In [20]:
dtc_preds.show(5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[100,101,102...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[121,122,123...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,124...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,148...|   [30.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows



In [21]:
rfc_preds.show(5)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[100,101,102...|  [90.0,10.0]|  [0.9,0.1]|       0.0|
|  0.0|(692,[121,122,123...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[122,123,124...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,148...|   [99.0,1.0]|[0.99,0.01]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 5 rows



In [22]:
gbt_preds.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[100,101,102...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[121,122,123...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[122,123,124...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[122,123,148...|[1.54350200272498...|[0.95635347857270...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



## Evaluator

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [24]:
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [25]:
print("DTC Accuracy")
acc_eval.evaluate(dtc_preds)

DTC Accuracy


1.0

In [26]:
print("RFC Accuracy")
acc_eval.evaluate(rfc_preds)

RFC Accuracy


1.0

In [27]:
print("GBT Accuracy")
acc_eval.evaluate(gbt_preds)

GBT Accuracy


1.0

## Feature importance

In [28]:
rfc_model.featureImportances

SparseVector(692, {100: 0.0003, 125: 0.0004, 127: 0.0002, 179: 0.0003, 184: 0.0001, 185: 0.0007, 188: 0.0005, 206: 0.0012, 207: 0.0059, 209: 0.0003, 213: 0.0007, 214: 0.0004, 233: 0.0013, 234: 0.0015, 237: 0.0006, 239: 0.0014, 243: 0.0033, 244: 0.0099, 245: 0.0012, 262: 0.0059, 266: 0.0012, 268: 0.0009, 272: 0.0295, 273: 0.015, 274: 0.0003, 289: 0.0068, 290: 0.0099, 291: 0.0018, 295: 0.0058, 296: 0.0016, 298: 0.0003, 299: 0.0009, 300: 0.0094, 301: 0.0061, 302: 0.001, 318: 0.0005, 320: 0.0006, 321: 0.0015, 322: 0.0124, 323: 0.0218, 325: 0.0018, 330: 0.0068, 343: 0.0005, 345: 0.0005, 346: 0.0005, 350: 0.0264, 351: 0.0417, 352: 0.0038, 354: 0.0017, 356: 0.001, 357: 0.0073, 359: 0.0029, 360: 0.0009, 371: 0.0046, 373: 0.0094, 377: 0.0022, 378: 0.0277, 379: 0.0123, 380: 0.0023, 384: 0.0087, 386: 0.0006, 387: 0.0007, 402: 0.0026, 405: 0.0221, 406: 0.0929, 407: 0.0226, 408: 0.0025, 410: 0.001, 412: 0.0024, 413: 0.0143, 414: 0.0006, 427: 0.0006, 428: 0.0077, 429: 0.0029, 433: 0.018, 434: 0.0284