In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    GBTClassifier,
    RandomForestClassifier,
)
from pyspark.sql import SparkSession

# The tree-based estimators imported above also have regression
# counterparts (see pyspark.ml.regression).

# Reuse the active Spark session if one exists, otherwise start one named 'mytree'.
spark = SparkSession.builder.appName('mytree').getOrCreate()

In [0]:
# Read the LIBSVM-formatted sample file from DBFS into a DataFrame.
data = spark.read.load(
    'dbfs:/FileStore/shared_uploads/gkantirisrafael@gmail.com/sample_libsvm_data-1.txt',
    format='libsvm',
)

In [0]:
# Preview the first three rows of the loaded DataFrame.
data.show(n=3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 3 rows



In [0]:
# Split the data into training (~70%) and test (~30%) sets.
# A fixed seed keeps the split -- and therefore the accuracy figures reported
# further down -- stable across "Restart & Run All"; without it every run
# draws a different random split.
# NOTE(review): reproducibility also assumes the input file is read with the
# same partitioning on each run -- confirm on the target cluster.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Instantiate the three tree-based classifiers to compare:
#   dtc - a single decision tree (default hyperparameters)
#   rfc - a random forest of 100 trees
#   gbt - gradient-boosted trees (default hyperparameters)
dtc, rfc, gbt = (
    DecisionTreeClassifier(),
    RandomForestClassifier(numTrees=100),
    GBTClassifier(),
)

In [0]:
# Fit each estimator on the training split, in the order dtc -> rfc -> gbt,
# producing one fitted model per classifier.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [0]:
# Apply every fitted model to the held-out test split to obtain predictions.
dtc_predictions, rfc_predictions, gbt_predictions = (
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [0]:
# Inspect the first three rows of one of the prediction DataFrames (GBT here).
gbt_predictions.show(n=3)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[122,123,124...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [0]:
#Let's see how we can use an evaluator here
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Build a multiclass classification evaluator and configure it to report accuracy.
acc_eval = MulticlassClassificationEvaluator().setMetricName('accuracy')

In [0]:
# Report the test-set accuracy of each model, one line per classifier.
for tag, predicted in (
    ('DTC ACCURACY:', dtc_predictions),
    ('RFC ACCURACY:', rfc_predictions),
    ('GBT ACCURACY:', gbt_predictions),
):
    print(tag, acc_eval.evaluate(predicted))

DTC ACCURACY: 0.9285714285714286
RFC ACCURACY: 1.0
GBT ACCURACY: 0.9285714285714286


In [0]:
# A fitted model exposes per-feature importances; display those of the
# random forest. The result is a sparse vector keyed by feature index.
rfc_model.featureImportances
# Example: feature index 100 has importance 0.0011 -- the higher the value,
# the more important that feature was to the model.

Out[32]: SparseVector(692, {100: 0.0011, 127: 0.0003, 131: 0.0017, 154: 0.0001, 178: 0.005, 182: 0.0021, 189: 0.001, 206: 0.0043, 208: 0.0046, 209: 0.0068, 214: 0.0026, 216: 0.0008, 232: 0.0003, 235: 0.001, 238: 0.0005, 243: 0.0092, 244: 0.008, 264: 0.0005, 272: 0.0057, 273: 0.0077, 274: 0.0005, 275: 0.0008, 287: 0.0026, 289: 0.0136, 296: 0.0004, 298: 0.0011, 299: 0.0041, 300: 0.0005, 317: 0.0141, 322: 0.0067, 323: 0.0254, 324: 0.0079, 326: 0.0009, 328: 0.0151, 329: 0.0063, 331: 0.0007, 342: 0.0041, 343: 0.0005, 345: 0.0023, 349: 0.0045, 350: 0.0045, 351: 0.02, 352: 0.0013, 353: 0.0006, 356: 0.0083, 357: 0.008, 359: 0.0025, 370: 0.0005, 373: 0.0086, 374: 0.0015, 377: 0.0205, 378: 0.02, 379: 0.0451, 382: 0.0015, 385: 0.0011, 386: 0.0062, 397: 0.0012, 398: 0.0029, 399: 0.0209, 400: 0.0159, 401: 0.0053, 402: 0.0058, 403: 0.0006, 404: 0.0003, 405: 0.071, 407: 0.0223, 408: 0.0006, 409: 0.002, 412: 0.0022, 414: 0.0072, 415: 0.0022, 425: 0.0013, 427: 0.0079, 428: 0.0027, 429: 0.0064, 432: 0.0