# Trees

In [1]:
# Standard library
import os

# Spark ML pieces used throughout the notebook
from pyspark.ml import Pipeline
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    GBTClassifier,
    RandomForestClassifier,
)
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

# Reuse a running session if there is one, otherwise start one named "mytree".
spark = SparkSession.builder.appName("mytree").getOrCreate()

# Directory holding the data files, resolved relative to the current working directory.
PATH = os.path.join(os.getcwd(),"Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods")

In [2]:
# Load the three datasets used in this notebook from PATH.
# The libsvm file is read as (label, features); both CSVs have a header row
# and let Spark infer the column types.
data = spark.read.load(os.path.join(PATH, 'sample_libsvm_data.txt'), format='libsvm')
college = spark.read.csv(os.path.join(PATH, 'College.csv'), header=True, inferSchema=True)
dogfood = spark.read.csv(os.path.join(PATH, 'dog_food.csv'), header=True, inferSchema=True)

In [3]:
# Preview the libsvm sample data (show() prints the first 20 rows).
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [4]:
# Hold out roughly 30% of the rows for testing. The fixed seed pins the random
# split so that Restart & Run All reproduces the same train/test partition.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [5]:
# Three tree-based classifiers, all using the default 'label' / 'features'
# column names that the libsvm data already has.
# An explicit seed keeps the fitted models repeatable between runs instead of
# depending on whatever default seed the library picks.
dtc = DecisionTreeClassifier(seed=42)
rfc = RandomForestClassifier(numTrees=100, seed=42)
gbt = GBTClassifier(seed=42)

In [6]:
# Fit each estimator on the training split; the order (DTC, RFC, GBT) lines up
# with the three names on the left-hand side.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train) for estimator in (dtc, rfc, gbt)
)

In [7]:
# Score the held-out split with every fitted model (same DTC, RFC, GBT order).
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [8]:
# Inspect the decision tree's output: transform() added the rawPrediction,
# probability and prediction columns next to label/features.
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[100,101,102...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,124...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(69

In [9]:
# Inspect the random forest's output on the test split.
# NOTE(review): in the recorded output each rawPrediction pair sums to 100,
# matching numTrees=100 - presumably per-class tree votes; confirm in the docs.
rfc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[95,96,97,12...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[100,101,102...|  [55.0,45.0]|[0.55,0.45]|       0.0|
|  0.0|(692,[122,123,124...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[123,124,125...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[124,125,126...|  [79.0,21.0]|[0.79,0.21]|       0.0|
|  0.0|(692,[126,127,128...|  [86.0,14.0]|[0.86,0.14]|       0.0|
|  0.0|(692,[126,127,128...|  [86.0,14.0]|[0.86,0.14]|       0.0|
|  0.0|(692,[126,127,128...|  [89.0,11.0]|[0.89,0.11]|       0.0|
|  0.0|(692,[126,127,128...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[126,127,128...|  [87.0,13.0]|[0.87,0.13]|       0.0|
|  0.0|(692,[127,128,129...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[127,128,129...|   [92.0,8.0]|[0.92,0.08]|       0.0|
|  0.0|(69

In [10]:
# Inspect the gradient-boosted trees' output on the test split.
gbt_preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[100,101,102...|[1.29942254463997...|[0.93078721457071...|       0.0|
|  0.0|(692,[122,123,124...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.20406020702228...|[0.91744443341250...|       0.0|
|  0.0|(692,[126,127,128...|[1.14332606188163...|[0.90776552390848...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.34999649195960...|[0.93702622993885...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126

In [11]:
# Evaluator that scores a predictions DataFrame by accuracy. No column names are
# passed, so it relies on the evaluator's default label/prediction columns.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [12]:
# Report test-set accuracy for each model, formatted to two decimals,
# one model per line.
print("\n".join(
    f"{tag} Accuracy: {acc_eval.evaluate(scored):.2f}"
    for tag, scored in (("DTC", dtc_preds), ("RFC", rfc_preds), ("GBT", gbt_preds))
))

DTC Accuracy: 1.00
RFC Accuracy: 1.00
GBT Accuracy: 1.00


In [13]:
# Importance score per input feature from the fitted random forest; displayed
# as a sparse vector over the 692 features, so features not listed have no entry.
rfc_model.featureImportances

SparseVector(692, {99: 0.0012, 155: 0.0058, 158: 0.0007, 203: 0.0006, 212: 0.0022, 215: 0.0066, 216: 0.0094, 235: 0.0006, 237: 0.0003, 242: 0.0012, 244: 0.0086, 245: 0.0016, 263: 0.0005, 266: 0.0008, 270: 0.0018, 273: 0.0014, 287: 0.0029, 291: 0.0065, 300: 0.0093, 301: 0.0098, 302: 0.0054, 314: 0.0015, 315: 0.003, 317: 0.005, 320: 0.0008, 323: 0.0143, 328: 0.0009, 329: 0.0076, 330: 0.017, 349: 0.0017, 350: 0.0006, 351: 0.0123, 352: 0.0022, 354: 0.0004, 355: 0.0009, 356: 0.0168, 357: 0.0258, 358: 0.0204, 359: 0.0063, 369: 0.0007, 375: 0.0017, 377: 0.0066, 378: 0.0288, 379: 0.0261, 381: 0.0005, 385: 0.0318, 398: 0.0056, 399: 0.0158, 400: 0.0088, 401: 0.0037, 405: 0.0013, 406: 0.0187, 407: 0.0259, 411: 0.0007, 413: 0.0179, 414: 0.0029, 427: 0.0009, 430: 0.0006, 432: 0.0007, 433: 0.0633, 434: 0.0364, 436: 0.0005, 439: 0.0034, 440: 0.0157, 441: 0.0021, 442: 0.0014, 454: 0.0028, 455: 0.0426, 456: 0.0017, 457: 0.0062, 461: 0.0182, 462: 0.0238, 463: 0.0119, 468: 0.0107, 470: 0.0038, 481: 0.000

### College

In [14]:
# Check the column names and the types Spark inferred for the college CSV.
college.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [15]:
# Preview the first rows of the college data.
college.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [16]:
# List the column names; the feature columns for the assembler are picked from these.
college.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [17]:
# Combine every numeric column of the college data into a single 'features'
# vector. 'School' (a name) and 'Private' (used as the label later) are left out.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
        'F_Undergrad', 'P_Undergrad', 'Outstate', 'Room_Board',
        'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio',
        'perc_alumni', 'Expend', 'Grad_Rate',
    ],
)

In [18]:
# Append the assembled 'features' vector column to the college data.
output = assembler.transform(college)

In [19]:
# Encode the string 'Private' column as a numeric 'PrivateIndex' column, which
# the classifiers below use as their label.
indexer = StringIndexer(inputCol='Private',outputCol='PrivateIndex')

In [20]:
# Fit the indexer on the data, then use the fitted model to add 'PrivateIndex'.
output_fixed = indexer.fit(output).transform(output)

In [21]:
# Confirm that the 'features' and 'PrivateIndex' columns were added.
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [22]:
# Keep only what the models need: the feature vector and the numeric label.
final_data = output_fixed.select('features','PrivateIndex')

In [23]:
# Hold out roughly 30% of the college rows for testing. The fixed seed makes the
# split (and therefore the metrics below) reproducible across reruns.
# Note: this rebinds the train/test names used for the libsvm data earlier.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [24]:
from pyspark.ml import Pipeline

In [25]:
# Same three tree models as before, now trained to predict 'PrivateIndex' from
# the assembled 'features' column.
# An explicit seed keeps the fitted models repeatable between runs instead of
# depending on whatever default seed the library picks.
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features', seed=42)

In [26]:
# Fit each estimator on the college training split (DTC, RFC, GBT order).
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train) for estimator in (dtc, rfc, gbt)
)

In [27]:
# Score the held-out college rows with every fitted model (DTC, RFC, GBT order).
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [28]:
# Binary evaluator scored against the 'PrivateIndex' label. No metricName is
# given, so the evaluator's default metric applies (areaUnderROC per the PySpark
# docs) - the numbers it reports are therefore not accuracies.
my_binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [29]:
# Print the binary evaluator's metric for each model, one model per line.
print("\n".join(
    f"{tag}: {my_binary_eval.evaluate(scored)}"
    for tag, scored in (("DTC", dtc_preds), ("RFC", rfc_preds), ("GBT", gbt_preds))
))

DTC: 0.9690251572327045
RFC: 0.9760482180293499
GBT: 0.9827044025157233


In [30]:
# Check which columns the GBT predictions carry (rawPrediction, probability, prediction).
gbt_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [31]:
# Same check for the random forest predictions; the schema matches the GBT one.
rfc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [32]:
# Accuracy evaluator again, this time pointed at the 'PrivateIndex' label column
# (this rebinds the acc_eval name used for the libsvm data earlier).
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',metricName='accuracy')

In [33]:
# Test-set accuracy of the random forest on the college data.
rfc_acc = acc_eval.evaluate(rfc_preds)

In [34]:
# Display the random forest accuracy computed in the previous cell.
rfc_acc

0.9269406392694064

### Dog Food Consulting Project

In [36]:
# Peek at the first ten rows: four measurement columns A-D plus the 'Spoiled' label.
dogfood.head(10)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0),
 Row(A=5, B=6, C=12.0, D=7, Spoiled=1.0),
 Row(A=6, B=2, C=13.0, D=6, Spoiled=1.0),
 Row(A=4, B=2, C=12.0, D=1, Spoiled=1.0),
 Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0),
 Row(A=10, B=3, C=13.0, D=9, Spoiled=1.0),
 Row(A=8, B=5, C=14.0, D=5, Spoiled=1.0),
 Row(A=5, B=8, C=12.0, D=8, Spoiled=1.0),
 Row(A=6, B=5, C=12.0, D=9, Spoiled=1.0),
 Row(A=3, B=3, C=12.0, D=1, Spoiled=1.0)]

In [37]:
# List the column names of the dog food data.
dogfood.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [39]:
# Pack the four columns A-D into one 'features' vector column and append it to
# the dog food data. Vector positions 0-3 follow the A, B, C, D order given here.
assembler = VectorAssembler(outputCol='features', inputCols=['A', 'B', 'C', 'D'])
output = assembler.transform(dogfood)

In [40]:
# Random forest that predicts 'Spoiled' from the assembled 'features' column.
# An explicit seed keeps the fitted forest - and the feature importances read
# from it below - repeatable between runs.
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features', seed=42)

In [41]:
# Confirm the 'features' vector column was added next to A-D and 'Spoiled'.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [42]:
# Keep only the feature vector and the label for model fitting.
final_data = output.select('features','Spoiled')

In [43]:
# Preview the assembled feature vectors alongside the 'Spoiled' label.
final_data.show()

+-------------------+-------+
|           features|Spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



In [44]:
# Fit on the full dataset (no train/test split is made here): the model is only
# used below to read feature importances, not to score held-out predictions.
rfc_model = rfc.fit(final_data)

In [45]:
# Show a single row as a reminder of the order of values inside the feature vector.
final_data.head(1)

[Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)]

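The feature at vector index 2 (column `C`, the third assembler input) is by far the most important feature in determining spoilage: in the recorded output below its importance is about 0.94, while no other feature exceeds 0.03.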

In [46]:
# Importance score per feature; indices 0-3 correspond to columns A, B, C, D
# in the order they were passed to the assembler.
rfc_model.featureImportances

SparseVector(4, {0: 0.0176, 1: 0.0197, 2: 0.937, 3: 0.0257})