## Preliminaries
Import the necessary libraries, create the Spark context, and load the training data into an RDD.

In [1]:
import csv

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

# Location of the training CSV in blob storage.
fileNameTrain = 'wasb://kaggle@criteo.blob.core.windows.net/train.csv'

# Single Spark context shared by every cell below.
sc = SparkContext(appName="TitanicLR")

# RDD of raw text lines (the header row is still included at this point).
points = sc.textFile(fileNameTrain)

Create a function to parse CSV lines and convert them to LabeledPoint objects. LabeledPoint is the class that our machine learning algorithm (further below) expects. Note the additional values we are extracting from the data: 'gender' and 'embarked'.

In [2]:
def parsePoint(line):
    """
    Parse one line of the training CSV into an MLlib LabeledPoint object.

    The line is decoded with the csv module, so a quoted field such as
    "Braund, Mr. Owen Harris" stays a single column no matter how many
    commas it contains. Empty fields are replaced with 0 before conversion.

    Columns used, by position after CSV decoding:
      1            -> label
      2, 5, 6, 7, 9 -> numeric features (converted with float())
      4  (gender)   -> abs(hash(value))
      11 (embarked) -> abs(hash(value))

    :param line: one data row of the CSV file (not the header row).
    :return: LabeledPoint(label, [7 feature values]).
    :raises IndexError: if the row has fewer than 12 columns.
    :raises ValueError: if a numeric column cannot be converted by float().
    """
    # Python 2's csv module operates on byte strings; only encode when the
    # line is not already a native str (e.g. a Python 2 unicode object).
    if not isinstance(line, str):
        line = line.encode('utf-8')

    values = next(csv.reader([line]))
    values = [0 if e == '' else e for e in values]

    # NOTE(review): hash() of a string is only stable across interpreter
    # processes when hash randomization is disabled or PYTHONHASHSEED is
    # fixed -- confirm this holds on every Spark worker.
    gender = abs(hash(values[4]))
    embarked = abs(hash(values[11]))

    features = [
        float(values[2]),
        gender,
        float(values[5]),
        float(values[6]),
        float(values[7]),
        float(values[9]),
        embarked,
    ]
    return LabeledPoint(float(values[1]), features)


In [3]:
# Skip the header row.
# The header is read from a fresh handle on the file rather than from
# `points`, so re-running this cell does not mistake the first data row for
# the header and silently drop it.
header = sc.textFile(fileNameTrain).first()
points = points.filter(lambda row: row != header)

# Show the first remaining data row as a sanity check.
print(points.first())


1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S


Split the data into training and test sets. It is much better to evaluate the model on data it has not seen during training.

In [9]:
# Convert every raw CSV line into a LabeledPoint.
labeledPoints = points.map(parsePoint)

# Split the data into training and test sets (30% held out for testing).
# A fixed seed makes the split, and therefore the reported test error,
# reproducible from one run to the next.
(trainingData, testData) = labeledPoints.randomSplit([0.7, 0.3], seed=42)
print(trainingData.take(3))

[LabeledPoint(1.0, [1.0,1620467478.0,35.0,1.0,0.0,53.1,2034097362.0]), LabeledPoint(0.0, [3.0,1067159931.0,35.0,0.0,0.0,8.05,2034097362.0]), LabeledPoint(0.0, [3.0,1067159931.0,0.0,0.0,0.0,8.4583,1778096592.0])]


Train a decision tree classifier

In [8]:
# Binary classifier trained on the training split only.
# An empty categoricalFeaturesInfo means no feature is declared categorical.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)


Make predictions on the held-out test set and compute the test error

In [6]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

# Each element is a (label, prediction) pair. The pair is indexed rather than
# unpacked in the lambda signature, because tuple parameters are
# Python 2-only syntax.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())

print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

Test Error = 0.185606060606
Learned classification tree model:
DecisionTreeModel classifier of depth 5 with 55 nodes
  If (feature 1 <= 1.067159931E9)
   If (feature 5 <= 15.55)
    If (feature 3 <= 0.0)
     If (feature 6 <= 1.3908798E7)
      If (feature 2 <= 29.0)
       Predict: 0.0
      Else (feature 2 > 29.0)
       Predict: 0.0
     Else (feature 6 > 1.3908798E7)
      If (feature 5 <= 7.7417)
       Predict: 0.0
      Else (feature 5 > 7.7417)
       Predict: 0.0
    Else (feature 3 > 0.0)
     If (feature 2 <= 13.0)
      If (feature 5 <= 7.75)
       Predict: 0.0
      Else (feature 5 > 7.75)
       Predict: 1.0
     Else (feature 2 > 13.0)
      If (feature 2 <= 25.0)
       Predict: 0.0
      Else (feature 2 > 25.0)
       Predict: 0.0
   Else (feature 5 > 15.55)
    If (feature 2 <= 13.0)
     If (feature 2 <= 0.0)
      If (feature 5 <= 69.3)
       Predict: 0.0
      Else (feature 5 > 69.3)
       Predict: 0.0
     Else (feature 2 > 0.0)
      If (feature 3 <= 2.0)
    

Read test data and make predictions

In [7]:
from pyspark.mllib.linalg import Vectors

def parseTestPoint(line):
    """
    Parse one line of the test CSV into an MLlib dense feature vector.

    The line is decoded with the csv module, so a quoted field such as
    "Kelly, Mr. James" stays a single column no matter how many commas it
    contains. Empty fields are replaced with 0 before conversion.

    Columns used, by position after CSV decoding:
      1, 4, 5, 6, 8 -> numeric features (converted with float())
      3  (gender)   -> abs(hash(value))
      10 (embarked) -> abs(hash(value))

    :param line: one data row of the CSV file (not the header row).
    :return: Vectors.dense of 7 feature values (there is no label).
    :raises IndexError: if the row has fewer than 11 columns.
    :raises ValueError: if a numeric column cannot be converted by float().
    """
    # Python 2's csv module operates on byte strings; only encode when the
    # line is not already a native str (e.g. a Python 2 unicode object).
    if not isinstance(line, str):
        line = line.encode('utf-8')

    values = next(csv.reader([line]))
    values = [0 if e == '' else e for e in values]

    # NOTE(review): hash() of a string is only stable across interpreter
    # processes when hash randomization is disabled or PYTHONHASHSEED is
    # fixed -- confirm this holds on every Spark worker.
    gender = abs(hash(values[3]))
    embarked = abs(hash(values[10]))

    return Vectors.dense([
        float(values[1]),
        gender,
        float(values[4]),
        float(values[5]),
        float(values[6]),
        float(values[8]),
        embarked,
    ])

fileNameTest = 'wasb://kaggle@criteo.blob.core.windows.net/test.csv'
testPoints = sc.textFile(fileNameTest)

# Skip the header row: take the first line and keep every line that differs.
headerTest = testPoints.first()
testPoints = testPoints.filter(lambda x: x != headerTest)
print(testPoints.take(2))

# From here on `testPoints` holds feature vectors instead of raw text lines.
testPoints = testPoints.map(parseTestPoint)

# Note: this rebinds `predictions`, which until now held the predictions for
# the held-out split.
predictions = model.predict(testPoints)
print(predictions.take(125))

[u'892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q', u'893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S']
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0]


Get the passenger IDs, pair them with the predictions, and create a submission file

In [8]:
# Collect the first column of every line of the test file; these ids are
# paired with the predictions and written out in the cells below.
submissionIds = sc.textFile(fileNameTest).map(lambda x: x.split(',')[0])

# Skip the header: after the map above, the first element is the first column
# of the header row, so drop every element equal to it.
headerSubmission = submissionIds.first()
submissionIds = submissionIds.filter(lambda x: x != headerSubmission)
print(submissionIds.take(3))

[u'892', u'893', u'894']


In [14]:
# Cast each prediction to int, then pair it with the passenger id at the same
# position to get (id, prediction) tuples.
intPredictions = predictions.map(int)
submission = submissionIds.zip(intPredictions)

In [16]:
# Render every (id, prediction) pair as one "id,prediction" text line.
submissionLines = submission.map(lambda pair: ",".join([str(pair[0]), str(pair[1])]))

# Collapse to a single partition (with a shuffle) before writing.
# NOTE(review): no header row is written, and saveAsTextFile is expected to
# fail if the target path already exists -- confirm both before re-running.
submissionLines.coalesce(1, shuffle=True).saveAsTextFile('wasb://criteo@criteo.blob.core.windows.net/kaggle/svcc_test2.csv')