# Spark Tree Methods Code 

Let's test out 3 different tree methods:

* A single decision tree
* A random forest
* A gradient boosted tree classifier
    
We will be using a college dataset to try to classify colleges as Private or Public based off these features:

    Private A factor with levels No and Yes indicating private or public university
    Apps Number of applications received
    Accept Number of applications accepted
    Enroll Number of new students enrolled
    Top10perc Pct. new students from top 10% of H.S. class
    Top25perc Pct. new students from top 25% of H.S. class
    F.Undergrad Number of fulltime undergraduates
    P.Undergrad Number of parttime undergraduates
    Outstate Out-of-state tuition
    Room.Board Room and board costs
    Books Estimated book costs
    Personal Estimated personal spending
    PhD Pct. of faculty with Ph.D.’s
    Terminal Pct. of faculty with terminal degree
    S.F.Ratio Student/faculty ratio
    perc.alumni Pct. alumni who donate
    Expend Instructional expenditure per student
    Grad.Rate Graduation rate

In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.4.4-bin-hadoop2.7/')

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('university').getOrCreate()

In [3]:
data = spark.read.csv('College.csv', inferSchema=True, header=True)
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
data.head()

Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)

In [5]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [8]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Apps','Accept','Enroll','Top10perc','Top25perc','F_Undergrad',
                                         'P_Undergrad','Outstate','Room_Board','Books','Personal','PhD','Terminal',
                                         'S_F_Ratio','perc_alumni','Expend','Grad_Rate'],
                           outputCol='features')

In [9]:
df = assembler.transform(data)

In [10]:
from pyspark.ml.feature import StringIndexer

In [11]:
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [12]:
df = indexer.fit(df).transform(df)

In [13]:
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [14]:
input_data = df.select(['features', 'PrivateIndex'])

In [16]:
train_data, test_data = input_data.randomSplit([0.7, 0.3])

In [17]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier

In [18]:
from pyspark.ml import Pipeline

In [19]:
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='PrivateIndex')
gbc = GBTClassifier(featuresCol='features', labelCol='PrivateIndex')
rfc = RandomForestClassifier(featuresCol='features', labelCol='PrivateIndex') 

In [20]:
dtc_model = dtc.fit(train_data)
gbc_model = gbc.fit(train_data)
rfc_model = rfc.fit(train_data)

In [21]:
dtc_preds = dtc_model.transform(test_data)
gbc_preds = gbc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [24]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='PrivateIndex')

In [25]:
print('DTC')
evaluator.evaluate(dtc_preds)

DTC


0.872823541390897

In [26]:
print('RFC')
evaluator.evaluate(rfc_preds)

RFC


0.928215049383973

In [27]:
print('GBC')
evaluator.evaluate(gbc_preds)

GBC


0.8974137053253233

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [30]:
acc_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='PrivateIndex', metricName='accuracy')

In [31]:
print('DTC')
acc_eval.evaluate(dtc_preds)

DTC


0.918918918918919

In [32]:
print('RFC')
acc_eval.evaluate(rfc_preds)

RFC


0.954954954954955

In [33]:
print('GBC')
acc_eval.evaluate(gbc_preds)

GBC


0.9324324324324325