# Predict survival on the Titanic

In [1]:
# Show the Spark version of the running context. `sc` is not created in this
# notebook; presumably it is the SparkContext supplied by the PySpark kernel.
sc.version

u'1.3.0'

In [2]:
# Base URI of the data directory; dataset-relative paths are appended to it.
FILE_PATH = "file:///notebooks/cineca/data/"

In [3]:
# Read the CSV as an RDD of raw text lines.
inp_file = sc.textFile(FILE_PATH + "titanic/titanic3_01.csv")
# Naive split on every comma: quoted fields that contain a comma are broken
# into several columns (the quoted "Last, First" name becomes l.name and
# f.name, with the quote characters left in place -- see the preview output).
pass_rdd = inp_file.map(lambda line: line.split(','))
# Column positions after the split:
# 0 pclass, 1 survived, 2 l.name, 3 f.name, 4 sex, 5 age, 6 sibsp, 7 parch, 8 ticket, 9 fare, 10 cabin,
# 11 embarked, 12 boat, 13 body, 14 home.dest

In [4]:
import pandas as pd
# Column names matching the positions produced by the naive comma split
# (the passenger name occupies two columns: l.name and f.name).
cols = ["pclass","survived","l.name","f.name", "sex", "age", "sibsp","parch","ticket", "fare","cabin","embarked","boat","body","home.dest"]
# Pull only three rows to the driver for a quick look at the data.
data = pd.DataFrame(pass_rdd.take(3))
# Keep the first len(cols) columns by position; any extra columns created by
# commas inside the last field are dropped. `.iloc` replaces the deprecated
# `.ix` indexer (removed in pandas 1.0) and selects the same 15 columns.
data = data.iloc[:, :len(cols)]
data.columns = cols
data

Unnamed: 0,pclass,survived,l.name,f.name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"""Allen","Miss. Elisabeth Walton""",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"""St Louis"
1,1,1,"""Allison","Master. Hudson Trevor""",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"""Montreal"
2,1,0,"""Allison","Miss. Helen Loraine""",female,2.0,1,2,113781,151.55,C22 C26,S,,,"""Montreal"


### LabeledPoint
LabeledPoint: The features and labels of a data point

In [5]:
from pyspark.mllib.regression import LabeledPoint

def num(s):
    """Convert a raw CSV field to a number, falling back to 0.

    ``int`` is tried first, then ``float``. A value that neither can parse
    (empty string, free text, ``None``) yields ``0`` so that a missing field
    does not abort the whole RDD transformation.

    :param s: raw field value, normally a string.
    :return: the parsed ``int`` or ``float``, or ``0`` when unparseable.
    """
    for convert in (int, float):
        try:
            return convert(s)
        except (TypeError, ValueError):
            # ValueError: text that is not a number; TypeError: None or
            # another non-string, non-numeric object.
            continue
    return 0

def parse_passenger_list(x):
    """Build an MLlib ``LabeledPoint`` from one comma-split passenger row.

    Expected positions in ``x`` (see the column comment next to the split):
    0 pclass, 1 survived, 4 sex, 5 age, 6 sibsp, 7 parch, 9 fare.
    Numeric fields go through ``num``, so unparseable values become 0.

    :param x: list of string fields for one passenger.
    :return: ``LabeledPoint(survived, [pclass, sex, age, sibsp, parch, fare])``
        where ``sex`` is 1 for ``'male'`` and 0 for anything else.
    :raises IndexError: if the row has fewer than 10 fields.
    """
    survived = num(x[1])
    pclass = num(x[0])
    # Binary encoding of sex: 'male' -> 1, every other value -> 0.
    sex = 1 if x[4] == 'male' else 0
    age = num(x[5])
    sibsp = num(x[6])
    parch = num(x[7])
    fare = num(x[9])
    # cabin (x[10]) is categorical and deliberately left out of the features.
    return LabeledPoint(survived, [pclass, sex, age, sibsp, parch, fare])

In [6]:
# Turn every split row into a LabeledPoint.
pass_rdd_01 = pass_rdd.map(parse_passenger_list)

In [7]:
# Number of parsed records.
pass_rdd_01.count()

1310

In [8]:
# Inspect the first LabeledPoint: label, then [pclass, sex, age, sibsp, parch, fare].
pass_rdd_01.first()

LabeledPoint(1.0, [1.0,0.0,29.0,0.0,0.0,211.3375])

In [9]:
# Print a header naming the fields, followed by the first three LabeledPoints.
# The parenthesised print form matches `print(model)` used later in this
# notebook and gives the same output under Python 2.
print("survived,[pclass,sex,age,sibsp,parch,fare]")
for x in pass_rdd_01.take(3):
    print(x)
# survived,[pclass,sex,age,sibsp,parch,fare]

survived,[pclass,sex,age,sibsp,parch,fare]
(1.0,[1.0,0.0,29.0,0.0,0.0,211.3375])
(1.0,[1.0,1.0,0.9167,1.0,2.0,151.55])
(0.0,[1.0,0.0,2.0,1.0,2.0,151.55])


### Decision trees
Decision trees and their ensembles are popular methods for the machine learning tasks of classification and regression. Decision trees are widely used since they are easy to interpret, handle categorical features, extend to the multiclass classification setting, do not require feature scaling, and are able to capture non-linearities and feature interactions. Tree ensemble algorithms such as random forests and boosting are among the top performers for classification and regression tasks.

MLlib supports decision trees for binary and multiclass classification and for regression, using both continuous and categorical features. The implementation partitions data by rows, allowing distributed training with millions of instances.

[Documentation mllib#decisiontree](https://spark.apache.org/docs/latest/mllib-decision-tree.html)

In [10]:
from pyspark.mllib.tree import DecisionTree

# numClasses: Number of classes (for Classification only); 2 here because the label is survived = 0/1.
# categoricalFeaturesInfo={} declares no categorical features, so all six
# features (including pclass and sex) are treated as continuous.
model = DecisionTree.trainClassifier(pass_rdd_01, numClasses=2,categoricalFeaturesInfo={})

In [11]:
# Short summary of the trained tree; uncomment the next line for the full set of split rules.
print(model)
#print(model.toDebugString())

DecisionTreeModel classifier of depth 5 with 61 nodes


In [12]:
# Split the LabeledPoints into parallel RDDs: labels and feature vectors.
pass_labels = pass_rdd_01.map(lambda lp: lp.label)
pass_features = pass_rdd_01.map(lambda lp: lp.features)

# Same feature vectors, each wrapped in a one-element list.
pass_features_array = pass_rdd_01.map(lambda lp: [lp.features])

In [13]:
# Sanity check: one label per parsed record.
pass_labels.count()

1310

In [14]:
# Show the first three true labels (parenthesised print, as in `print(model)`).
for x in pass_labels.take(3):
    print(x)

1.0
1.0
0.0


In [15]:
# Predict with the tree on the same feature vectors it was trained on
# (these are training-set predictions, not a held-out evaluation).
predictions = model.predict(pass_features)

In [16]:
# Show the first three predicted labels (parenthesised print, as in `print(model)`).
for x in predictions.take(3):
    print(x)

1.0
0.0
1.0


In [17]:
# Pair each true label with the tree's prediction: (label, prediction).
labelsAndPredictions = pass_labels.zip(predictions)

In [18]:
# Look at the first ten (label, prediction) pairs.
#labelsAndPredictions.first()
labelsAndPredictions.take(10)

[(1.0, 1.0),
 (1.0, 0.0),
 (0.0, 1.0),
 (0.0, 0.0),
 (0.0, 1.0),
 (1.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0)]

In [19]:
# Mean squared error on the training data. With 0/1 labels and predictions
# this equals the fraction of misclassified records.
# The pair is indexed instead of tuple-unpacked in the lambda signature:
# tuple parameters are Python 2-only syntax (removed by PEP 3113).
trainMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(pass_rdd_01.count())

In [20]:
# Parenthesised print, consistent with `print(model)` earlier in the notebook.
print(trainMSE)

0.173282442748


In [21]:
# `add` merges the per-partition tallies; the standard-library operator
# replaces a hand-written `lambda x, y: x + y` bound to the same name.
from operator import add

# seqOp folds one (label, prediction) pair into a running count of matches.
seqOp = (lambda acc, x: acc + int(x[0] == x[1]))
train_correct = labelsAndPredictions.aggregate(0, seqOp, add)
# Accuracy on the training data: matching pairs / total records.
accuracy = train_correct / float(pass_rdd_01.count())
print('accuracy: %2.2f%s' % (accuracy*100,'%'))

accuracy: 82.67%


### Now let us try Naive Bayes and see if it improves the accuracy
#### Naive Bayes
Naive Bayes is a simple multiclass classification algorithm with the assumption of independence between every pair of features. Naive Bayes can be trained very efficiently. Within a single pass to the training data, it computes the conditional probability distribution of each feature given label, and then it applies Bayes’ theorem to compute the conditional probability distribution of label given an observation and use it for prediction.

MLlib supports multinomial naive Bayes, which is typically used for document classification. Within that context, each observation is a document and each feature represents a term whose value is the frequency of the term. Feature values must be nonnegative to represent term frequencies. Additive smoothing can be used by setting the parameter λ (defaults to 1.0). For document classification, the input feature vectors are usually sparse, and sparse vectors should be supplied as input to take advantage of sparsity. Since the training data is only used once, it is not necessary to cache it.

In [22]:
from pyspark.mllib.classification import NaiveBayes
# Train on the same LabeledPoints as the decision tree; the second argument
# is the additive-smoothing parameter lambda (1.0).
nb_model = NaiveBayes.train(pass_rdd_01, 1.0)
# Parenthesised print, consistent with `print(model)` earlier in the notebook.
print(nb_model)

<pyspark.mllib.classification.NaiveBayesModel object at 0x7fc2ac0c0ad0>


In [23]:
# Predict one record at a time inside the map; x[0] unwraps the feature
# vector from the one-element list built in pass_features_array.
nb_predictions = pass_features_array.map(lambda x: nb_model.predict(x[0]))

In [24]:
#pass_features_array
#aa = pass_features_array.take(1)
#aa[0][0]
#a = aa[0]
#a = aa[0].tolist()
#nb_model.predict(a)

In [25]:
# Look at the first three Naive Bayes predictions.
#nb_predictions.first()
nb_predictions.take(3)

[1.0, 1.0, 1.0]

In [26]:
# Pair each true label with the Naive Bayes prediction: (label, prediction).
labelsAndPredictions_nb = pass_labels.zip(nb_predictions)

In [27]:
# Inspect the first (label, prediction) pair.
labelsAndPredictions_nb.first()

(1.0, 1.0)

In [28]:
import numpy
# Mean squared error of Naive Bayes on the training data.
# The pair is indexed instead of tuple-unpacked in the lambda signature:
# tuple parameters are Python 2-only syntax (removed by PEP 3113).
trainMSE_nb = labelsAndPredictions_nb.map(lambda lp: numpy.square(lp[0] - lp[1])).sum() / float(pass_rdd_01.count())
print(trainMSE_nb)

0.331297709924


In [29]:
# Imported here so this cell does not depend on `add` having been defined by
# an earlier cell.
from operator import add

# Count (label, prediction) pairs that agree, then divide by the record count
# to get the Naive Bayes accuracy on the training data.
seqOp = (lambda acc, x: acc + int(x[0] == x[1]))
train_correct = labelsAndPredictions_nb.aggregate(0, seqOp, add)
accuracy = train_correct / float(pass_rdd_01.count())
print('accuracy: %2.2f%s' % (accuracy*100,'%'))

accuracy: 66.87%


In [30]:
# Homework
# https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/decision_tree_runner.py
# has some interesting (and elegant) routines we can use