Same as two prior labs, except notice the RandomForest library instead of Decision Tree

In [1]:
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

sc = SparkContext(appName="TitanicLR")
fileNameTrain = 'wasb://kaggle@criteo.blob.core.windows.net/train.csv'
points = sc.textFile(fileNameTrain)

In [2]:
def parsePoint(line):
    """
    Parse a line of text into an MLlib LabeledPoint object.
    """
    values = line.split(',')
    values = [0 if e == '' else e for e in values]
    gender = abs(hash(values[5]))
    embarked = abs(hash(values[12]))
    return LabeledPoint(float(values[1]), [float(values[2]),gender,float(values[6]),float(values[7]),float(values[8]),float(values[10]),embarked])


In [3]:
#skip header
header = points.first() #extract header
points = points.filter(lambda x:x !=header) #filter out header

print points.first()


1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S


In [4]:
labeledPoints = points.map(parsePoint)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = labeledPoints.randomSplit([0.7, 0.3])

#initialize variables for grid search
bestModel = None
bestTestErr = 100

This is where we start to differ: create a parameter search over a certain (specified) grid

In [6]:
maxDepths = range(4,10)
maxTrees = range(3,10)
for depthLevel in maxDepths:
    for treeLevel in maxTrees:
        model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                             numTrees=treeLevel, featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=depthLevel, maxBins=32)        

        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
        
        print ('\maxDepth = {0:.1f}, trees = {1:.1f}: trainErr = {2:.5f}'
               .format(depthLevel, treeLevel, testErr))
        if (testErr < bestTestErr):
            bestModel = model
            bestTestErr = testErr
            
print ('Best Test Error: = {0:.3f}\n'
       .format(bestTestErr))

\maxDepth = 4.0, trees = 3.0: trainErr = 0.19667
\maxDepth = 4.0, trees = 4.0: trainErr = 0.18333
\maxDepth = 4.0, trees = 5.0: trainErr = 0.21000
\maxDepth = 4.0, trees = 6.0: trainErr = 0.19000
\maxDepth = 4.0, trees = 7.0: trainErr = 0.16333
\maxDepth = 4.0, trees = 8.0: trainErr = 0.19333
\maxDepth = 4.0, trees = 9.0: trainErr = 0.18000
\maxDepth = 5.0, trees = 3.0: trainErr = 0.16000
\maxDepth = 5.0, trees = 4.0: trainErr = 0.17000
\maxDepth = 5.0, trees = 5.0: trainErr = 0.19000
\maxDepth = 5.0, trees = 6.0: trainErr = 0.18000
\maxDepth = 5.0, trees = 7.0: trainErr = 0.16667
\maxDepth = 5.0, trees = 8.0: trainErr = 0.18333
\maxDepth = 5.0, trees = 9.0: trainErr = 0.17333
\maxDepth = 6.0, trees = 3.0: trainErr = 0.20333
\maxDepth = 6.0, trees = 4.0: trainErr = 0.17667
\maxDepth = 6.0, trees = 5.0: trainErr = 0.17667
\maxDepth = 6.0, trees = 6.0: trainErr = 0.19000
\maxDepth = 6.0, trees = 7.0: trainErr = 0.16333
\maxDepth = 6.0, trees = 8.0: trainErr = 0.16667
\maxDepth = 6.0, tre

In [7]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())

print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

Test Error = 0.176666666667
Learned classification tree model:
TreeEnsembleModel classifier with 9 trees

  Tree 0:
    If (feature 1 <= 1.067159931E9)
     If (feature 4 <= 0.0)
      If (feature 6 <= 1.3908798E7)
       If (feature 5 <= 50.4958)
        If (feature 3 <= 0.0)
         If (feature 5 <= 7.4958)
          If (feature 5 <= 7.225)
           Predict: 0.0
          Else (feature 5 > 7.225)
           Predict: 0.0
         Else (feature 5 > 7.4958)
          If (feature 5 <= 18.7875)
           Predict: 1.0
          Else (feature 5 > 18.7875)
           If (feature 2 <= 36.0)
            Predict: 0.0
           Else (feature 2 > 36.0)
            If (feature 2 <= 41.0)
             Predict: 1.0
            Else (feature 2 > 41.0)
             Predict: 0.0
        Else (feature 3 > 0.0)
         Predict: 0.0
       Else (feature 5 > 50.4958)
        If (feature 2 <= 0.0)
         Predict: 0.0
        Else (feature 2 > 0.0)
         If (feature 3 <= 0.0)
          Predict: 1.

In [8]:
from pyspark.mllib.linalg import Vectors

def parseTestPoint(line):
    """
    Parse a line of text into an MLlib LabeledPoint object.
    """
    values = line.split(',')
    values = [0 if e == '' else e for e in values]
    gender = abs(hash(values[4]))
    embarked = abs(hash(values[11]))
    return Vectors.dense([float(values[1]),gender,float(values[5]),float(values[6]),float(values[7]),float(values[9]),embarked])

fileNameTest = 'wasb://kaggle@criteo.blob.core.windows.net/test.csv'
testPoints = sc.textFile(fileNameTest)

#skip header
headerTest = testPoints.first() #extract header
testPoints = testPoints.filter(lambda x:x !=headerTest) #filter out header
print testPoints.take(2)

testPoints = testPoints.map(parseTestPoint)

predictions = bestModel.predict(testPoints)
print predictions.take(125)

[u'892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q', u'893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S']
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0]


In [9]:
#write out predictions to .CSV file
submissionIds = sc.textFile(fileNameTest).map(lambda x: x.split(',')[0])

#skip header
headerSubmission = submissionIds.first() #extract header
submissionIds = submissionIds.filter(lambda x:x !=headerSubmission) #filter out header
print submissionIds.take(3)

[u'892', u'893', u'894']


In [10]:
submission = submissionIds.zip(predictions.map(lambda x: int(x)))

In [None]:
submission.map(lambda a: str(a[0]) + "," + str(a[1])).coalesce(1,True).saveAsTextFile('wasb://criteo@criteo.blob.core.windows.net/kaggle/test_svcc3.csv')