# Trees

In [1]:
import os
from pyspark.sql import SparkSession
# Attach to an existing Spark session if one is available, otherwise start
# one, using "mytree" as the application name.
spark = (
    SparkSession.builder
    .appName("mytree")
    .getOrCreate()
)
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Folder holding the input files, resolved against the directory the kernel
# was started from (so the notebook must be launched from the repo's parent).
_TREE_METHODS_SUBDIR = "Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods"
PATH = os.path.join(os.getcwd(), _TREE_METHODS_SUBDIR)

In [2]:
# Both input files sit directly under PATH.
libsvm_file = os.path.join(PATH, 'sample_libsvm_data.txt')
college_file = os.path.join(PATH, 'College.csv')

# LIBSVM-format sample: loads as a (label, features) DataFrame.
data = spark.read.format('libsvm').load(libsvm_file)
# The CSV has a header row; column types are inferred from the values.
college = spark.read.csv(college_file, header=True, inferSchema=True)

In [3]:
# Preview the LIBSVM sample (show() prints the first 20 rows by default).
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [4]:
# Hold out roughly 30% of the rows for evaluation. The explicit seed pins the
# random assignment so the split -- and every metric computed from it -- is
# the same on each Restart & Run All instead of changing from run to run.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [5]:
# One estimator per tree method, all left on their default input columns;
# only the forest size is set explicitly.
dtc, rfc, gbt = (
    DecisionTreeClassifier(),
    RandomForestClassifier(numTrees=100),
    GBTClassifier(),
)

In [6]:
# Fit each estimator on the training split, in the order tree -> forest -> GBT.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train) for estimator in (dtc, rfc, gbt)
)

In [7]:
# Score the held-out split with each fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [8]:
# Decision-tree predictions: transform() appended the rawPrediction,
# probability and prediction columns to the test rows.
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[100,101,102...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[234,235,237...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[97,98,99,12...|   [0.0,45.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[99,100,101,...|   [33.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(69

In [9]:
# Random-forest predictions for the same test rows.
rfc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[100,101,102...|  [58.0,42.0]|[0.58,0.42]|       0.0|
|  0.0|(692,[125,126,127...|   [91.0,9.0]|[0.91,0.09]|       0.0|
|  0.0|(692,[126,127,128...|   [91.0,9.0]|[0.91,0.09]|       0.0|
|  0.0|(692,[126,127,128...|   [92.0,8.0]|[0.92,0.08]|       0.0|
|  0.0|(692,[126,127,128...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[126,127,128...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[126,127,128...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[127,128,129...|   [95.0,5.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[153,154,155...|   [95.0,5.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[234,235,237...|  [77.0,23.0]|[0.77,0.23]|       0.0|
|  1.0|(692,[97,98,99,12...|  [20.0,80.0]|  [0.2,0.8]|       1.0|
|  1.0|(692,[99,100,101,...|  [52.0,48.0]|[0.52,0.48]|       0.0|
|  1.0|(69

In [10]:
# Gradient-boosted-tree predictions for the same test rows.
gbt_preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[100,101,102...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[125,126,127...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.20519826603301...|[0.91761666320254...|       0.0|
|  0.0|(692,[126,127,128...|[1.50310863336919...|[0.95285421316443...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[126,127,128...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[153,154,155...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[234

In [11]:
# Accuracy evaluator. The two column names are the evaluator's defaults,
# spelled out so it is clear which columns of the prediction frames it reads.
acc_eval = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [12]:
# Report test-set accuracy for each model, rounded to two decimals.
for tag, preds in (("DTC", dtc_preds), ("RFC", rfc_preds), ("GBT", gbt_preds)):
    print(f"{tag} Accuracy: {acc_eval.evaluate(preds):.2f}")

DTC Accuracy: 0.95
RFC Accuracy: 0.95
GBT Accuracy: 0.95


In [13]:
# Per-feature importances of the fitted random forest, returned as a sparse
# vector indexed by feature position (692 features in this dataset).
rfc_model.featureImportances

SparseVector(692, {129: 0.0006, 155: 0.0018, 158: 0.0005, 182: 0.0006, 183: 0.0019, 184: 0.0019, 203: 0.0029, 210: 0.0029, 235: 0.0061, 242: 0.0005, 243: 0.0209, 244: 0.0214, 260: 0.0019, 262: 0.0104, 263: 0.0066, 271: 0.0069, 272: 0.0177, 273: 0.0027, 298: 0.0029, 299: 0.0028, 315: 0.0013, 316: 0.0003, 317: 0.0078, 322: 0.0058, 323: 0.0068, 324: 0.0055, 342: 0.001, 343: 0.0005, 344: 0.007, 345: 0.0086, 346: 0.0014, 349: 0.0058, 350: 0.0106, 351: 0.011, 352: 0.0015, 354: 0.0009, 356: 0.0025, 357: 0.0216, 358: 0.0014, 369: 0.0007, 370: 0.0029, 373: 0.0078, 375: 0.0049, 377: 0.0089, 378: 0.0117, 379: 0.0067, 382: 0.0023, 385: 0.0163, 399: 0.0044, 400: 0.0084, 402: 0.0005, 405: 0.034, 406: 0.018, 407: 0.0111, 411: 0.0005, 413: 0.0017, 415: 0.001, 426: 0.003, 428: 0.0083, 433: 0.0525, 434: 0.0436, 435: 0.0353, 438: 0.0005, 440: 0.0078, 442: 0.0005, 443: 0.0018, 454: 0.0075, 455: 0.0467, 456: 0.017, 460: 0.0005, 461: 0.0105, 462: 0.0229, 463: 0.0061, 468: 0.0038, 480: 0.0005, 482: 0.0005, 4

## College

In [14]:
# Column names and the types inferred while reading the CSV.
college.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [15]:
# Preview the first rows of the College table.
college.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [16]:
# Column names as a plain Python list (handy for building the feature list).
college.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [17]:
# Every numeric column of the College table is a predictor; the two string
# columns (`School`, and `Private`, which becomes the label) are left out.
feature_columns = [
    'Apps', 'Accept', 'Enroll', 'Top10perc',
    'Top25perc', 'F_Undergrad', 'P_Undergrad', 'Outstate',
    'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal',
    'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate',
]
# Packs the predictor columns into a single vector column named `features`.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [18]:
# Append the assembled `features` vector column to the College table.
output = assembler.transform(college)

In [19]:
# Encodes the string column `Private` as the numeric column `PrivateIndex`.
# NOTE(review): which label gets index 0.0 is decided when the indexer is
# fitted (presumably by label frequency) -- confirm before interpreting the
# class indices.
indexer = StringIndexer(inputCol='Private',outputCol='PrivateIndex')

In [20]:
# Learn the Private -> index mapping from the assembled table, then apply it
# to that same table.
indexer_model = indexer.fit(output)
output_fixed = indexer_model.transform(output)

In [21]:
# Check that the `features` and `PrivateIndex` columns were added.
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [22]:
# Keep only what the classifiers need: the feature vector and the indexed label.
model_columns = ('features', 'PrivateIndex')
final_data = output_fixed.select(*model_columns)

In [23]:
# 70/30 split of the College data. This rebinds `train` / `test`, which held
# the LIBSVM split earlier in the notebook. The explicit seed keeps the split,
# and therefore the reported metrics, reproducible across Restart & Run All.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [24]:
from pyspark.ml import Pipeline

In [25]:
# All three estimators read the same feature vector and the indexed `Private`
# label, so the column arguments are shared.
shared_cols = dict(labelCol='PrivateIndex', featuresCol='features')
dtc = DecisionTreeClassifier(**shared_cols)
rfc = RandomForestClassifier(**shared_cols)
gbt = GBTClassifier(**shared_cols)

In [26]:
# Fit each College estimator on the training split (tree, forest, then GBT).
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train) for estimator in (dtc, rfc, gbt)
)

In [27]:
# Score the held-out College rows with each fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [28]:
# Binary evaluator scored against the indexed label. The raw-prediction
# column and the metric are the evaluator's defaults, written out so it is
# clear that the numbers reported from it are areas under the ROC curve.
my_binary_eval = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="PrivateIndex",
    metricName="areaUnderROC",
)

In [29]:
# Report the binary evaluator's metric for each model at full precision.
for tag, preds in (("DTC", dtc_preds), ("RFC", rfc_preds), ("GBT", gbt_preds)):
    print(f"{tag}: {my_binary_eval.evaluate(preds)}")

DTC: 0.943044230294045
RFC: 0.9828679680421711
GBT: 0.9623589490157315


In [30]:
# Confirm which prediction columns the GBT model produced.
gbt_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [31]:
# Same check for the random-forest predictions.
rfc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [33]:
# Accuracy evaluator for the College models. This rebinds `acc_eval`, which
# earlier in the notebook was built without a labelCol argument. The
# prediction column is the evaluator's default, written out for clarity.
acc_eval = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="PrivateIndex",
    metricName="accuracy",
)

In [34]:
# Accuracy of the random-forest predictions on the College test split.
rfc_acc = acc_eval.evaluate(rfc_preds)

In [35]:
# Display the accuracy as the cell's output.
rfc_acc

0.9586776859504132