Video 46: Tree Methods Documentation Examples
===============================================================

In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('myrandomforest').getOrCreate()
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
data = spark.read.format('libsvm').load('sample_libsvm_data.txt')
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [3]:
train_data, test_data = data.randomSplit([0.7, 0.3])
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100)
gbt = GBTClassifier()
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
dtc_preds.show()
rfc_preds.show()
gbt_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[154,155,156...|   [0.0,35.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[154,155,156...|   [0.0,35.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[155,156,180...|   [32.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[123,124,125...|   [0.0,35.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

In [5]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
print('DTC ACCURACY:')
acc_eval.evaluate(dtc_preds)

DTC ACCURACY:


0.9393939393939394

In [6]:
print('RFC ACCURACY:')
acc_eval.evaluate(rfc_preds)

RFC ACCURACY:


1.0

In [11]:
print('GBT ACCURACY:')
acc_eval.evaluate(gbt_preds)

GBT ACCURACY:


0.9393939393939394

In [12]:
rfc_model.featureImportances

SparseVector(692, {99: 0.0018, 101: 0.0009, 120: 0.0012, 127: 0.0004, 130: 0.0002, 154: 0.0008, 158: 0.0001, 180: 0.0003, 183: 0.0007, 186: 0.0009, 207: 0.0084, 208: 0.006, 213: 0.0002, 216: 0.0164, 235: 0.0066, 236: 0.0058, 242: 0.007, 243: 0.0098, 259: 0.0004, 262: 0.0144, 263: 0.0103, 270: 0.001, 271: 0.001, 272: 0.016, 290: 0.0062, 295: 0.0021, 296: 0.0015, 297: 0.0003, 301: 0.003, 302: 0.0019, 303: 0.001, 318: 0.0065, 319: 0.0009, 322: 0.0033, 323: 0.0069, 325: 0.0005, 328: 0.0019, 330: 0.0066, 331: 0.001, 343: 0.0004, 345: 0.0009, 350: 0.0517, 351: 0.0411, 355: 0.0005, 356: 0.0076, 358: 0.0064, 359: 0.0049, 372: 0.0051, 373: 0.0064, 374: 0.0007, 376: 0.0007, 377: 0.0284, 378: 0.076, 379: 0.028, 382: 0.0013, 385: 0.0005, 386: 0.003, 398: 0.0003, 400: 0.012, 401: 0.0032, 402: 0.0006, 404: 0.0014, 405: 0.0106, 406: 0.0331, 407: 0.027, 408: 0.0026, 410: 0.0005, 412: 0.0034, 415: 0.0055, 416: 0.0016, 427: 0.0033, 429: 0.0035, 430: 0.0011, 432: 0.0011, 433: 0.022, 434: 0.0393, 435: 0.0

Video 47: Decision Trees and Random Forest Code Along Examples
===============================================================

In [13]:
data = spark.read.csv('College.csv', inferSchema=True, header=True)
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [14]:
data.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [15]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [20]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Apps',
                                       'Accept',
                                       'Enroll',
                                       'Top10perc',
                                       'Top25perc',
                                       'F_Undergrad',
                                       'P_Undergrad',
                                       'Outstate',
                                       'Room_Board',
                                       'Books',
                                       'Personal',
                                       'PhD',
                                       'Terminal',
                                       'S_F_Ratio',
                                       'perc_alumni',
                                       'Expend',
                                       'Grad_Rate'], 
                            outputCol='features')
output = assembler.transform(data)
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='Private', outputCol='Private_index')
output_fixed = indexer.fit(output).transform(output)
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- Private_index: double (nullable = true)



In [29]:
final_data = output_fixed.select('features', 'Private_index')
train_data, test_data = final_data.randomSplit([0.7, 0.3])
dtc = DecisionTreeClassifier(labelCol='Private_index', featuresCol='features')
rfc = RandomForestClassifier(numTrees=150, labelCol='Private_index', featuresCol='features')
gbt = GBTClassifier(labelCol='Private_index', featuresCol='features')
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
dtc_preds.show()
rfc_preds.show()
gbt_preds.show()

+--------------------+-------------+-------------+--------------------+----------+
|            features|Private_index|rawPrediction|         probability|prediction|
+--------------------+-------------+-------------+--------------------+----------+
|[81.0,72.0,51.0,3...|          0.0|  [300.0,0.0]|           [1.0,0.0]|       0.0|
|[100.0,90.0,35.0,...|          0.0|  [300.0,0.0]|           [1.0,0.0]|       0.0|
|[141.0,118.0,55.0...|          0.0|  [300.0,0.0]|           [1.0,0.0]|       0.0|
|[167.0,130.0,46.0...|          0.0|  [300.0,0.0]|           [1.0,0.0]|       0.0|
|[174.0,146.0,88.0...|          0.0|  [300.0,0.0]|           [1.0,0.0]|       0.0|
|[232.0,216.0,106....|          0.0|   [13.0,1.0]|[0.92857142857142...|       0.0|
|[235.0,217.0,121....|          0.0|  [300.0,0.0]|           [1.0,0.0]|       0.0|
|[247.0,189.0,100....|          0.0|  [300.0,0.0]|           [1.0,0.0]|       0.0|
|[313.0,228.0,137....|          0.0|  [300.0,0.0]|           [1.0,0.0]|       0.0|
|[32

In [30]:
acc_eval = MulticlassClassificationEvaluator(labelCol='Private_index')
print('DTC ACCURACY:')
acc_eval.evaluate(dtc_preds)

DTC ACCURACY:


0.9240531464705077

In [39]:
print('RFC ACCURACY:')
acc_eval.evaluate(rfc_preds)

RFC ACCURACY:


0.9346514922092513

In [32]:
print('GBT ACCURACY:')
acc_eval.evaluate(gbt_preds)

GBT ACCURACY:


0.9304927679572532

In [33]:
dtc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Private_index: double (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [34]:
rfc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Private_index: double (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [35]:
gbt_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Private_index: double (nullable = true)
 |-- prediction: double (nullable = true)



In [42]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
acc_eval = BinaryClassificationEvaluator(labelCol='Private_index', rawPredictionCol='prediction')
print('GBT: ')
print(acc_eval.evaluate(gbt_preds))

GBT: 
0.895483193277311


In [44]:
acc_eval = MulticlassClassificationEvaluator(labelCol='Private_index', metricName='accuracy')
rfc_acc = acc_eval.evaluate(rfc_preds)
rfc_acc

0.9364406779661016