##Preliminaries

In [1]:
from pyspark import SparkContext

sc = SparkContext(appName="TitanicLR")
fileNameTrain = 'wasb://kaggle@criteo.blob.core.windows.net/train.csv'
points = sc.textFile(fileNameTrain)

Import Spark SQL libraries

In [2]:
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)

def parsePoint(line):
    """
    Parse a line of text into an MLlib LabeledPoint object.
    """
    values = line.split(',')
    values = [0 if e == '' else e for e in values]
    gender = abs(hash(values[5]))
    embarked = abs(hash(values[12]))
    return Row(label=float(values[1]), pclass=float(values[2]), plastname=values[3], pgender=gender,page=float(values[6]), psibsp=float(values[7]), pparch=float(values[8]), pfare=float(values[10]), pembaked=embarked)


In [3]:
#skip header
header = points.first() #extract header
points = points.filter(lambda x:x !=header) #filter out header

print points.first()


1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S


Create DataFrame, infer schema and do some basic manipulation of the data

In [4]:
dfPoints = points.map(parsePoint)

# Infer the schema, and register the DataFrame as a table.
schemaPeople = sqlContext.createDataFrame(dfPoints)
schemaPeople.registerTempTable("titanic")

# SQL can be run over DataFrames that have been registered as a table.
teenagers = sqlContext.sql("SELECT plastname, page FROM titanic WHERE page >= 13 AND page <= 19")

teenagers = teenagers.map(lambda p: "Survived: " + p.plastname + ", age: " + str(p.page))
for teen in teenagers.collect():
    print(teen)

Survived: "Nasser, age: 14.0
Survived: "Vestrom, age: 14.0
Survived: "McGowan, age: 15.0
Survived: "Fortune, age: 19.0
Survived: "Vander Planke, age: 18.0
Survived: "Nicola-Yarred, age: 14.0
Survived: "Devaney, age: 19.0
Survived: "Arnold-Franchi, age: 18.0
Survived: "Crease, age: 19.0
Survived: "Andersson, age: 17.0
Survived: "Goodwin, age: 16.0
Survived: "Ilett, age: 17.0
Survived: "Ford, age: 16.0
Survived: "Zabour, age: 14.5
Survived: "Attalah, age: 17.0
Survived: "Newsom, age: 19.0
Survived: "Osen, age: 16.0
Survived: "Burke, age: 19.0
Survived: "Andrew, age: 18.0
Survived: "Nicholls, age: 19.0
Survived: "Gilnagh, age: 16.0
Survived: "Calic, age: 17.0
Survived: "Klasen, age: 18.0
Survived: "Carbines, age: 19.0
Survived: "Andersen-Jensen, age: 19.0
Survived: "Cohen, age: 18.0
Survived: "Carr, age: 16.0
Survived: "Sunderland, age: 16.0
Survived: "Mellors, age: 19.0
Survived: "Fahlstrom, age: 18.0
Survived: "Pengelly, age: 19.0
Survived: "Panula, age: 16.0
Survived: "de Pelsmaeker, a

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Prepare training documents from a list of (id, text, label) tuples.

# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

lr = LogisticRegression(maxIter=10, regParam=0.01)
    .setLabelCol("label")
    .setFeaturesCol("features")
    .setPredictionCol("predictions")

pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)