# Decision Trees in Spark

In [1]:
from pyspark.mllib.regression import LabeledPoint #this is a datatype required by decision trees
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext #this is needed to create spark context object which is kind of root of everything you do in spark.
import numpy as np

In [2]:
# Reuse the already-running SparkContext if there is one, otherwise start a new one.
# Using getOrCreate() keeps this cell safe to re-run in the same kernel.
sc = SparkContext.getOrCreate()

In [3]:
def binary(data):
    """Encode a yes/no flag as an integer.

    Returns 1 when ``data`` equals the string "Y" and 0 for every other value.
    """
    return 1 if data == "Y" else 0

def mapEducation(degree):
    """Encode an education level as an ordinal integer.

    "BS" -> 1, "MS" -> 2, "PHD" -> 3; any other value maps to 0.
    """
    # Scan the known degrees in order; the comparison is an exact match.
    for known_degree, code in (("BS", 1), ("MS", 2), ("PHD", 3)):
        if degree == known_degree:
            return code
    return 0

def createLabelPoints(fields):
    """Turn one parsed CSV row into a LabeledPoint for training.

    ``fields`` is indexed positionally:
        0 years of experience (int)   1 employed? (Y/N)
        2 previous employers (int)    3 education (BS/MS/PHD)
        4 top-tier school? (Y/N)      5 internship? (Y/N)
        6 hired (Y/N) -- used as the label

    Returns a LabeledPoint whose label is the encoded "hired" flag and whose
    features are the six encoded columns 0-5, in that order.
    """
    # Feature vector, in column order; Y/N flags and the degree are encoded
    # as integers by binary() and mapEducation().
    features = np.array([
        int(fields[0]),
        binary(fields[1]),
        int(fields[2]),
        mapEducation(fields[3]),
        binary(fields[4]),
        binary(fields[5]),
    ])
    label = binary(fields[6])
    return LabeledPoint(label, features)

In [4]:
# Load the CSV as an RDD of raw text lines (one string per line; the header
# line is still included at this point).
rawData = sc.textFile("Hire.csv")
rawData.take(5)

['YearsExp,Employed?,PreviousEmployers,Education,TopTierSchool,Internship?,Hired',
 '10,Y,4,BS,N,N,Y',
 '0,N,0,BS,Y,Y,Y',
 '7,N,6,BS,N,N,N',
 '2,Y,1,MS,Y,N,Y']

In [5]:
# Capture the first line of the file, i.e. the header row with the column names.
# NOTE: this must run before rawData is reassigned by the header filter; if it
# is re-run afterwards, first() returns a data row instead of the header.
column_row = rawData.first()
column_row

'YearsExp,Employed?,PreviousEmployers,Education,TopTierSchool,Internship?,Hired'

In [6]:
# Drop the header: keep only the lines that are not equal to column_row.
rawData = rawData.filter(lambda x: x != column_row)
rawData.take(5)

['10,Y,4,BS,N,N,Y',
 '0,N,0,BS,Y,Y,Y',
 '7,N,6,BS,N,N,N',
 '2,Y,1,MS,Y,N,Y',
 '20,N,2,PHD,Y,N,N']

In [7]:
# Split each text line on commas, producing a list of field strings per row.
csvData = rawData.map(lambda x: x.split(","))
csvData.take(5)

[['10', 'Y', '4', 'BS', 'N', 'N', 'Y'],
 ['0', 'N', '0', 'BS', 'Y', 'Y', 'Y'],
 ['7', 'N', '6', 'BS', 'N', 'N', 'N'],
 ['2', 'Y', '1', 'MS', 'Y', 'N', 'Y'],
 ['20', 'N', '2', 'PHD', 'Y', 'N', 'N']]

In [8]:
# Convert every row of fields into a LabeledPoint
# (label = hired flag, features = the six remaining columns, numerically encoded).
train_data = csvData.map(createLabelPoints)
train_data.take(5)

[LabeledPoint(1.0, [10.0,1.0,4.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [0.0,0.0,0.0,1.0,1.0,1.0]),
 LabeledPoint(0.0, [7.0,0.0,6.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [2.0,1.0,1.0,2.0,1.0,0.0]),
 LabeledPoint(0.0, [20.0,0.0,2.0,3.0,1.0,0.0])]

In [9]:
# One candidate to score. Feature order matches createLabelPoints:
# [yearsExp, employed, previousExp, education, topTier, internship].
# Here: 10 years of experience, currently employed (1), 3 previous employers,
# BS degree (1), not a top-tier school (0), no internship (0).
#
# The vector is wrapped in a list so the RDD contains ONE element: the whole
# feature vector. list(np.array([...])) would instead unpack the array into six
# scalars, and parallelize() would then build six single-number records that
# predict() cannot interpret as feature vectors.
testCandidate = [np.array([10, 1, 3, 1, 0, 0])]
testData = sc.parallelize(testCandidate)   # RDD holding a single candidate vector

In [15]:
# Train a two-class (hired / not hired) decision-tree classifier.
# categoricalFeaturesInfo maps feature index -> number of categories; the
# indices follow the feature order built in createLabelPoints:
#   1: employed   (2 values)    3: education  (4 values, 0-3)
#   4: topTier    (2 values)    5: internship (2 values)
# Features 0 (yearsExp) and 2 (previousExp) are not listed, so Spark treats
# them as continuous.
model = DecisionTree.trainClassifier(train_data, numClasses=2,
                                     categoricalFeaturesInfo={1:2, 3:4, 4:2, 5:2},
                                     impurity='gini', maxDepth=5, maxBins=32)
# Reference: https://spark.apache.org/docs/2.2.0/mllib-decision-tree.html
# Score the test RDD; the result is an RDD with one prediction per input vector.
predictions = model.predict(testData)

In [20]:
print("Hire Prediction:")
# collect() pulls the predictions from the RDD back to the driver as a Python list.
results = predictions.collect()
# Print one predicted label per line (1.0 = hired, 0.0 = not hired, per binary()).
for result in results:
    print(result)

Hire Prediction:
1.0
