In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

### Step 1
- Load the train and test sets
- Check the schema, the variables have their right types?
- If not, how to correctly load the datasets?

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

titanicSchemaTrain = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Survived", IntegerType(), True),
                           StructField("Pclass",  IntegerType(), True), 
                           StructField("Name",  StringType(), True), 
                           StructField("Sex",  StringType(), True), 
                           StructField("Age",  FloatType(), True), 
                           StructField("SibSp",  IntegerType(), True), 
                           StructField("Parch",  IntegerType(), True), 
                           StructField("Ticket",  StringType(), True), 
                            StructField("Fare",  FloatType(), True), 
                            StructField("Cabin",  StringType(), True),
                           StructField("Embarked",  StringType(), True)]     
                          )



titanicSchemaTest = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Pclass",  IntegerType(), True), 
                           StructField("Name",  StringType(), True), 
                           StructField("Sex",  StringType(), True), 
                           StructField("Age",  FloatType(), True), 
                           StructField("SibSp",  IntegerType(), True), 
                           StructField("Parch",  IntegerType(), True), 
                           StructField("Ticket",  StringType(), True), 
                            StructField("Fare",  FloatType(), True), 
                            StructField("Cabin",  StringType(), True),
                               StructField("Embarked",  StringType(), True)]
                          )
df_train = sqlc.read.load(path="data/train.csv", 
                          format="com.databricks.spark.csv", 
                          schema=titanicSchemaTrain,
                          header=True)

df_test = sqlc.read.load(path="data/test.csv", 
                          format="com.databricks.spark.csv", 
                          schema=titanicSchemaTest, header=True)

In [49]:

answerSchemaTrain = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Pclass",  IntegerType(), True), 
                           StructField("Name",  StringType(), True), 
                           StructField("Sex",  StringType(), True), 
                           StructField("Age",  FloatType(), True), 
                           StructField("SibSp",  IntegerType(), True), 
                           StructField("Parch",  IntegerType(), True), 
                           StructField("Ticket",  StringType(), True), 
                            StructField("Fare",  FloatType(), True), 
                            StructField("Cabin",  StringType(), True),
                           StructField("Embarked",  StringType(), True),
                               
                           StructField("Survived", IntegerType(), True)
                               ]     
                          )


df_answer = sqlc.read.load(path="titanic_answers.csv", 
                          format="com.databricks.spark.csv", 
                          schema=answerSchemaTrain, header=True)

In [50]:
df_answer.toPandas()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.829200,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.000000,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.687500,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.662500,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.287500,,S,1
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225000,,S,1
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.629200,,Q,0
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.000000,,S,1
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.229200,,C,1
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.150000,,S,0


### Step 2
- Explore the features of your dataset
- You can use DataFrame's ***describe*** method to get summary statistics
    - hint: ***toPandas*** may be useful to ease the manipulation of small dataframes
- Are there any ***NaN*** values in your dataset?
- If so, define value/values to fill these ***NaN*** values
    - hint: ***na*** property of DataFrames provide several methods of handling NA values

### Step 3
- How to handle categorical features?
    - hint: check the Estimators and Transformers
- Assemble all desired features into a Vector using the VectorAssembler Transformer
- Make sure to end up with a DataFrame with two columns: ***Survived*** and ***vFeatures***

In [3]:
age_mean = df_train.describe().toPandas().set_index("summary").loc['mean','Age']
def remove_useless_features(df):
    return df.drop("Cabin")
df_train = remove_useless_features(df_train)
df_test = remove_useless_features(df_test)
from numpy import NaN
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import isnull, isnan, when, count, col

def average_missing_features(df):
    df = df.withColumn("age", when(col('age').isNull(), age_mean).otherwise(col('age'))) 
    df = df.withColumn("Embarked", when(col('Embarked').isNull(), 'C').otherwise(col('Embarked'))) 
    df = df.withColumn("Fare", when(col('Fare').isNull(), 0).otherwise(col('Fare'))) 
    
    
    return df


df_train =     average_missing_features(df_train)
df_test  =     average_missing_features(df_test)


In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
def convert_age(df):
    df = df.withColumn("age", col("age").cast(FloatType()))
    return df
df_train = convert_age(df_train)
df_test = convert_age(df_test)


In [71]:
df_train

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, age: string, SibSp: int, Parch: int, Ticket: string, Fare: float, Embarked: string]

In [5]:
def retrieve_title(name):
    index_c = name.index(',')
    index_point = name.index('.')
    if index_c >= 0 and index_point > 0:
        title = name[index_c+1:index_point].strip()
        return title
    return 'No'


#udf_retr_title = udf(retrieve_title, StringType())
def extract_title(df):
    df = df.withColumn("Name", udf_retr_title('Name') )     
    return df


#extract_title(df_train)    

In [73]:
df_test.describe().toPandas()

Unnamed: 0,summary,PassengerId,Pclass,Name,Sex,age,SibSp,Parch,Ticket,Fare,Embarked
0,count,418.0,418.0,418,418,418.0,418.0,418.0,418,418.0,418
1,mean,1100.5,2.2655502392344498,,,30.154603152261867,0.4473684210526316,0.3923444976076555,223850.98986486485,35.541956141234586,
2,stddev,120.81045760473994,0.8418375519640503,,,12.636665857360075,0.8967595611217135,0.9814288785371694,369523.7764694362,55.86768433838268,
3,min,892.0,1.0,"""Assaf Khalil, Mrs. Mariana (Miriam"""")""""""",female,0.17,0.0,0.0,110469,0.0,C
4,max,1309.0,3.0,"van Billiard, Master. Walter John",male,9.0,8.0,9.0,W.E.P. 5734,512.3292,S


Step 3
How to handle categorical features?
hint: check the Estimators and Transformers
Assemble all desired features into a Vector using the VectorAssembler Transformer
Make sure to end up with a DataFrame with two columns: Survived and vFeatures


In [6]:
from pyspark.ml.feature import StringIndexer
def categorize_df(df):
    indexerS = StringIndexer(inputCol="Sex", outputCol="SexC")
    indexerE = StringIndexer(inputCol="Embarked", outputCol="EmbarkedC")
    
    df = indexerS.fit(df).transform(df)
    df = indexerE.fit(df).transform(df)
    
    df = df.drop("Sex", "Embarked")
    return df

df_train = categorize_df(df_train)
df_test = categorize_df(df_test)

In [7]:
from pyspark.ml.feature import OneHotEncoder
def onehot_df(df):
    oneHotS = OneHotEncoder(inputCol="SexC", outputCol="SexV")
    oneHotE = OneHotEncoder(inputCol="EmbarkedC", outputCol="EmbarkedV")
    
    df = oneHotS.transform(df)
    df = oneHotE.transform(df)
    
    df = df.drop("SexC", "EmbarkedC")
    return df

df_train = onehot_df(df_train)
df_test = onehot_df(df_test)

In [8]:
from pyspark.ml.feature import VectorAssembler

def vectorize(df):

    assembler = VectorAssembler(inputCols = \
                ["Pclass","age", "SibSp", "Parch", "Fare", "SexV", "EmbarkedV"], outputCol = "vFeatures")

    df = assembler.transform(df)
    
    return df

df_train = vectorize(df_train)
df_test = vectorize(df_test)

df_train = df_train['PassengerId','vFeatures','Survived']
df_test = df_test['PassengerId','vFeatures']




In [9]:
df_train.toPandas()

Unnamed: 0,PassengerId,vFeatures,Survived
0,1,"[3.0, 22.0, 1.0, 0.0, 7.25, 1.0, 1.0, 0.0]",0
1,2,"[1.0, 38.0, 1.0, 0.0, 71.2833023071, 0.0, 0.0,...",1
2,3,"(3.0, 26.0, 0.0, 0.0, 7.92500019073, 0.0, 1.0,...",1
3,4,"[1.0, 35.0, 1.0, 0.0, 53.0999984741, 0.0, 1.0,...",1
4,5,"[3.0, 35.0, 0.0, 0.0, 8.05000019073, 1.0, 1.0,...",0
5,6,"(3.0, 29.6991176605, 0.0, 0.0, 8.45829963684, ...",0
6,7,"[1.0, 54.0, 0.0, 0.0, 51.8624992371, 1.0, 1.0,...",0
7,8,"[3.0, 2.0, 3.0, 1.0, 21.0750007629, 1.0, 1.0, ...",0
8,9,"[3.0, 27.0, 0.0, 2.0, 11.1332998276, 0.0, 1.0,...",1
9,10,"[2.0, 14.0, 1.0, 0.0, 30.0708007812, 0.0, 0.0,...",1


In [58]:
df_test.toPandas()

Py4JJavaError: An error occurred while calling o843.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 59.0 failed 1 times, most recent failure: Lost task 0.0 in stage 59.0 (TID 59, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$3: (struct<Pclass_double_VectorAssembler_4ca9b039333100e7d833:double,age_double_VectorAssembler_4ca9b039333100e7d833:double,SibSp_double_VectorAssembler_4ca9b039333100e7d833:double,Parch_double_VectorAssembler_4ca9b039333100e7d833:double,Fare_double_VectorAssembler_4ca9b039333100e7d833:double,SexV:vector,EmbarkedV:vector>) => vector)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:234)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:228)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Values to assemble cannot be null.
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:160)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:143)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:143)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$3.apply(VectorAssembler.scala:99)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$3.apply(VectorAssembler.scala:98)
	... 16 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply$mcI$sp(Dataset.scala:2803)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2800)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2800)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2823)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:2800)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$3: (struct<Pclass_double_VectorAssembler_4ca9b039333100e7d833:double,age_double_VectorAssembler_4ca9b039333100e7d833:double,SibSp_double_VectorAssembler_4ca9b039333100e7d833:double,Parch_double_VectorAssembler_4ca9b039333100e7d833:double,Fare_double_VectorAssembler_4ca9b039333100e7d833:double,SexV:vector,EmbarkedV:vector>) => vector)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:234)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:228)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.SparkException: Values to assemble cannot be null.
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:160)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$assemble$1.apply(VectorAssembler.scala:143)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:143)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$3.apply(VectorAssembler.scala:99)
	at org.apache.spark.ml.feature.VectorAssembler$$anonfun$3.apply(VectorAssembler.scala:98)
	... 16 more


In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils

### INSERT YOUR CODE HERE

### Step 4
- In Step 5, you will apply a normalization Estimator
- BUT, it does not accept feature vectors of the Sparse type
- So, it is neccessary to apply an User Defined Function to make all features vectors of type VectorUDT
- In this step, you only have to replace ***YOUR DATAFRAME*** and ***NEW DATAFRAME*** with your variables

In [10]:
from pyspark.sql.functions import UserDefinedFunction
from pyspark.ml.linalg import VectorUDT, Vectors

to_vec = UserDefinedFunction(lambda x: Vectors.dense(x.toArray()), VectorUDT())

df_train = df_train.select("PassengerId", "Survived", to_vec("vFeatures").alias("features"))
df_test = df_test.select("PassengerId", to_vec("vFeatures").alias("features"))


### Step 5
- Apply a normalization Estimator of your choice to the ***features*** vector obtained in Step 4

In [11]:
df_train.toPandas()

Unnamed: 0,PassengerId,Survived,features
0,1,0,"[3.0, 22.0, 1.0, 0.0, 7.25, 1.0, 1.0, 0.0]"
1,2,1,"[1.0, 38.0, 1.0, 0.0, 71.2833023071, 0.0, 0.0,..."
2,3,1,"[3.0, 26.0, 0.0, 0.0, 7.92500019073, 0.0, 1.0,..."
3,4,1,"[1.0, 35.0, 1.0, 0.0, 53.0999984741, 0.0, 1.0,..."
4,5,0,"[3.0, 35.0, 0.0, 0.0, 8.05000019073, 1.0, 1.0,..."
5,6,0,"[3.0, 29.6991176605, 0.0, 0.0, 8.45829963684, ..."
6,7,0,"[1.0, 54.0, 0.0, 0.0, 51.8624992371, 1.0, 1.0,..."
7,8,0,"[3.0, 2.0, 3.0, 1.0, 21.0750007629, 1.0, 1.0, ..."
8,9,1,"[3.0, 27.0, 0.0, 2.0, 11.1332998276, 0.0, 1.0,..."
9,10,1,"[2.0, 14.0, 1.0, 0.0, 30.0708007812, 0.0, 0.0,..."


In [12]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
df_train_scalerModel = scaler.fit(df_train)

# Normalize each feature to have unit standard deviation.
df_train = df_train_scalerModel.transform(df_train)

# Compute summary statistics by fitting the StandardScaler
df_test_scalerModel = scaler.fit(df_test)

# Normalize each feature to have unit standard deviation.
df_test = df_test_scalerModel.transform(df_test)



In [82]:
df_train.toPandas()

Unnamed: 0,PassengerId,Survived,features,scaledFeatures
0,1,0,"[3.0, 22.0, 1.0, 0.0, 7.25, 1.0, 1.0, 0.0]","[3.5882109717, 1.69204539585, 0.906829250318, ..."
1,2,1,"[1.0, 38.0, 1.0, 0.0, 71.2833023071, 0.0, 0.0,...","[1.1960703239, 2.92262386556, 0.906829250318, ..."
2,3,1,"[3.0, 26.0, 0.0, 0.0, 7.92500019073, 0.0, 1.0,...","[3.5882109717, 1.99969001328, 0.0, 0.0, 0.1594..."
3,4,1,"[1.0, 35.0, 1.0, 0.0, 53.0999984741, 0.0, 1.0,...","[1.1960703239, 2.69189040249, 0.906829250318, ..."
4,5,0,"[3.0, 35.0, 0.0, 0.0, 8.05000019073, 1.0, 1.0,...","[3.5882109717, 2.69189040249, 0.0, 0.0, 0.1619..."
5,6,0,"[3.0, 29.6991176605, 0.0, 0.0, 8.45829963684, ...","[3.5882109717, 2.28419342265, 0.0, 0.0, 0.1702..."
6,7,0,"[1.0, 54.0, 0.0, 0.0, 51.8624992371, 1.0, 1.0,...","[1.1960703239, 4.15320233527, 0.0, 0.0, 1.0436..."
7,8,0,"[3.0, 2.0, 3.0, 1.0, 21.0750007629, 1.0, 1.0, ...","[3.5882109717, 0.153822308714, 2.72048775095, ..."
8,9,1,"[3.0, 27.0, 0.0, 2.0, 11.1332998276, 0.0, 1.0,...","[3.5882109717, 2.07660116764, 0.0, 2.481213427..."
9,10,1,"[2.0, 14.0, 1.0, 0.0, 30.0708007812, 0.0, 0.0,...","[2.3921406478, 1.076756161, 0.906829250318, 0...."


### Step 6
- Train a classifier of your choice (for instance, Random Forest) using your dataset of LabeledPoints
- Make predictions for the training data
- Use the Binary Classification Evaluator to evaluate your model on the training data
- How is your model performing? Try to tune its parameters

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import SQLTransformer

pipeline = Pipeline(stages=[
    StringIndexer(inputCol="Sex", outputCol="SexC"),
    StringIndexer(inputCol="Embarked", outputCol="EmbarkedC"),
    OneHotEncoder(inputCol="SexC", outputCol="SexV"),
    OneHotEncoder(inputCol="EmbarkedC", outputCol="EmbarkedV"),
    VectorAssembler(inputCols = \
                ["Pclass","age", "SibSp", "Parch", "Fare", "SexV", "EmbarkedV"], outputCol = "features"),
    StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)
    SQLTransformer(statement="SELECT PassengerId, Survived, scaledFeatures FROM __THIS__")
])

# Fit the pipeline to training documents.
model = pipeline.fit(df_train)
df_train_2 =  model.transform(df_train)
df_train_3 = df_train_2    

In [15]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

rfR = RandomForestClassifier().setLabelCol("Survived").setFeaturesCol("scaledFeatures")
modelRFR = rfR.fit(df_train)

predictionsRFR = modelRFR.transform(df_test)
predictionsKG = predictionsRFR.select('PassengerId', 'prediction').withColumnRenamed('prediction', 'Survived').withColumn("Survived", col("Survived").cast(IntegerType()))
predictionsKG.write.save(path="data/preds2", 
                          format="com.databricks.spark.csv", header=True)

In [36]:
print(modelRFR.toDebugString)

RandomForestClassificationModel (uid=RandomForestClassifier_458fbcd580cd9f9d2d95) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 0 <= 2.3921406478026457)
     If (feature 1 <= 1.307489624066756)
      If (feature 3 <= 0.0)
       If (feature 6 <= 0.0)
        Predict: 1.0
       Else (feature 6 > 0.0)
        If (feature 5 <= 0.0)
         Predict: 1.0
        Else (feature 5 > 0.0)
         Predict: 0.0
      Else (feature 3 > 0.0)
       Predict: 1.0
     Else (feature 1 > 1.307489624066756)
      If (feature 5 <= 0.0)
       If (feature 1 <= 2.076601167635436)
        If (feature 1 <= 1.884323281743266)
         Predict: 1.0
        Else (feature 1 > 1.884323281743266)
         Predict: 0.0
       Else (feature 1 > 2.076601167635436)
        If (feature 4 <= 0.2112955410165939)
         Predict: 1.0
        Else (feature 4 > 0.2112955410165939)
         Predict: 1.0
      Else (feature 5 > 0.0)
       If (feature 0 <= 1.1960703239013228)
        If (feature 1 <= 3.922468872200

In [40]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel


gbtc = GBTClassifier().setLabelCol("Survived").setFeaturesCol("scaledFeatures").setMaxIter(10)
modelGBTC = gbtc.fit(df_train)
predictionsGBTC = modelGBTC.transform(df_test)

predictionsGBKG = predictionsGBTC.select('PassengerId', 'prediction').withColumnRenamed('prediction', 'Survived').withColumn("Survived", col("Survived").cast(IntegerType()))
predictionsGBKG.write.save(path="data/preds3", 
                          format="com.databricks.spark.csv", header=True)


In [78]:
predictionsKG.show(5)

+-----------+--------+
|PassengerId|Survived|
+-----------+--------+
|        892|       0|
|        893|       0|
|        894|       0|
|        895|       0|
|        896|       1|
+-----------+--------+
only showing top 5 rows



In [94]:
from pyspark.sql.functions import *
answer = df_answer.select('PassengerId', 'Survived')
predicted = df_train.select('Survived')

ans_predicted = df_answer.join(predictionsRFR, df_answer.PassengerId == predictionsRFR.PassengerId).drop(predictionsRFR.PassengerId)


In [103]:
ans_toev = ans_predicted.select('PassengerId', 'Survived', 'Prediction')

SyntaxError: unexpected EOF while parsing (<ipython-input-103-a80a3d336381>, line 1)

In [109]:
ans_toev = ans_toev.withColumn("Prediction", col("Prediction").cast(DoubleType())).withColumn("Survived", col("Survived").cast(DoubleType()))

In [110]:
ans_toev.show(20)

+-----------+--------+----------+
|PassengerId|Survived|Prediction|
+-----------+--------+----------+
|        892|     0.0|       0.0|
|        893|     1.0|       0.0|
|        894|     0.0|       0.0|
|        895|     0.0|       0.0|
|        896|     1.0|       1.0|
|        897|     1.0|       0.0|
|        898|     0.0|       0.0|
|        899|     1.0|       0.0|
|        900|     1.0|       1.0|
|        901|     0.0|       0.0|
|        902|     0.0|       0.0|
|        903|     0.0|       0.0|
|        904|     1.0|       1.0|
|        905|     0.0|       0.0|
|        906|     1.0|       1.0|
|        907|     1.0|       1.0|
|        908|     0.0|       0.0|
|        909|     0.0|       0.0|
|        910|     0.0|       0.0|
|        911|     1.0|       0.0|
+-----------+--------+----------+
only showing top 20 rows



In [82]:
pyspark.sql.functions.col?

In [63]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator


evaluator = BinaryClassificationEvaluator().setLabelCol("prediction").setMetricName("areaUnderROC")

roc = evaluator.evaluate(predictionsRFR )
print(roc)

1.0


In [111]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator().setLabelCol("Survived") \
                                        .setPredictionCol("Prediction") \
                                        .setMetricName("accuracy")

accuracy = evaluator.evaluate(ans_toev)

print("Test Error = %s" % (1.0 - accuracy))

Test Error = 0.21770334928229662


In [56]:
BinaryClassificationEvaluator??

### Step 7
- Take a look at the test data - use DataFrame's ***createOrReplaceTempView*** method to perform SQL queries over the data
    - hint: check if there are any NULL values in the dataset - if so, handle them
- Apply the transformations to the test data
    - hint: you can use Pipelines to chain several Estimators/Transformers
    - warning: unfortunately, it is not possible to include the UDF from Step 4 in the Pipeline
- Make predictions using the model previously trained and the transformed test data
- Save it as ***submission.csv*** and submit it to Kaggle
- What was your score?

## Result = ???%