In [64]:
import pyspark
sc = pyspark.SparkContext('local[*]')

!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <ipython-input-1-4bc56731f3c8>:2 

### Step 1
- Load the train and test sets
- Check the schema: do the variables have the right types?
- If not, how can the datasets be loaded correctly?

In [79]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Explicit schema for data/train.csv; every field is nullable.
# Age is declared as FloatType so that it agrees with titanicSchemaTest and
# with the FloatType cast applied later by convert_age.
# NOTE(review): an IntegerType Age presumably cannot represent fractional ages
# in the CSV - confirm against the data.
titanicSchemaTrain = StructType([
    StructField("PassengerId", IntegerType(), True),
    StructField("Survived", IntegerType(), True),
    StructField("Pclass", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Sex", StringType(), True),
    StructField("Age", FloatType(), True),
    StructField("SibSp", IntegerType(), True),
    StructField("Parch", IntegerType(), True),
    StructField("Ticket", StringType(), True),
    StructField("Fare", FloatType(), True),
    StructField("Cabin", StringType(), True),
    StructField("Embarked", StringType(), True),
])

# Explicit schema for data/test.csv: the same fields as the training schema
# minus the "Survived" label. Every field is nullable.
titanicSchemaTest = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("PassengerId", IntegerType()),
        ("Pclass", IntegerType()),
        ("Name", StringType()),
        ("Sex", StringType()),
        ("Age", FloatType()),
        ("SibSp", IntegerType()),
        ("Parch", IntegerType()),
        ("Ticket", StringType()),
        ("Fare", FloatType()),
        ("Cabin", StringType()),
        ("Embarked", StringType()),
    ]
])
# Read both CSV files with their explicit schemas; the first line of each
# file is treated as a header row.
df_train = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .schema(titanicSchemaTrain)
    .options(header=True)
    .load("data/train.csv")
)

df_test = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .schema(titanicSchemaTest)
    .options(header=True)
    .load("data/test.csv")
)

### Step 2
- Explore the features of your dataset
- You can use DataFrame's ***describe*** method to get summary statistics
    - hint: ***toPandas*** may be useful to ease the manipulation of small dataframes
- Are there any ***NaN*** values in your dataset?
- If so, define value/values to fill these ***NaN*** values
    - hint: the ***na*** property of DataFrames provides several methods for handling NA values

### Step 3
- How to handle categorical features?
    - hint: check the Estimators and Transformers
- Assemble all desired features into a Vector using the VectorAssembler Transformer
- Make sure to end up with a DataFrame with two columns: ***Survived*** and ***vFeatures***

In [80]:
# Mean of the training "Age" column, used to fill missing ages.
# describe() reports its summary statistics as strings, so the value is
# converted to a float before it is used as a numeric fill value.
age_mean = float(df_train.describe().toPandas().set_index("summary").loc['mean','Age'])
def remove_useless_features(df):
    """Return ``df`` without its ``Cabin`` column."""
    unused_column = "Cabin"
    trimmed = df.drop(unused_column)
    return trimmed
# Apply remove_useless_features to both the training and the test DataFrame.
df_train = remove_useless_features(df_train)
df_test = remove_useless_features(df_test)
from numpy import NaN
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import isnull, isnan, when, count, col

def average_missing_features(df):
    """Replace null ``age`` values in ``df`` with the module-level ``age_mean``.

    Non-null ages are kept unchanged. Returns the DataFrame produced by
    ``withColumn``.
    """
    age = col('age')
    filled_age = when(age.isNull(), age_mean).otherwise(age)
    return df.withColumn("age", filled_age)


# Fill missing ages in both datasets; both calls use the same age_mean value.
df_train =     average_missing_features(df_train)
df_test  =     average_missing_features(df_test)


### Step 3
- How to handle categorical features?
    - hint: check the Estimators and Transformers
- Assemble all desired features into a Vector using the VectorAssembler Transformer
- Make sure to end up with a DataFrame with two columns: ***Survived*** and ***vFeatures***


In [81]:
from pyspark.ml.feature import StringIndexer
def categorize_df(df, fit_df=None):
    """Index the ``Sex`` and ``Embarked`` string columns of ``df``.

    Each column is encoded with a ``StringIndexer`` into ``SexC`` and
    ``EmbarkedC`` respectively, and the original string columns are dropped.

    Args:
        df: DataFrame containing ``Sex`` and ``Embarked`` columns.
        fit_df: DataFrame the indexers are fitted on. Defaults to ``df``
            itself. Pass the training data when encoding another dataset so
            that both datasets share one label-to-index mapping.

    Returns:
        ``df`` with ``SexC`` and ``EmbarkedC`` added and ``Sex`` and
        ``Embarked`` removed.
    """
    if fit_df is None:
        fit_df = df

    for source_col, indexed_col in (("Sex", "SexC"), ("Embarked", "EmbarkedC")):
        indexer = StringIndexer(inputCol=source_col, outputCol=indexed_col)
        df = indexer.fit(fit_df).transform(df)

    return df.drop("Sex", "Embarked")

# Encode the test set first, with indexers fitted on the training data, while
# df_train still has its "Sex" and "Embarked" columns. Fitting separate
# indexers per dataset does not guarantee that a category gets the same index
# in both.
df_test = categorize_df(df_test, fit_df=df_train)
df_train = categorize_df(df_train)

In [82]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
def convert_age(df):
    """Return ``df`` with its ``age`` column cast to ``FloatType``."""
    age_as_float = col("age").cast(FloatType())
    return df.withColumn("age", age_as_float)
# Cast the "age" column to float in both the training and the test DataFrame.
df_train = convert_age(df_train)
df_test = convert_age(df_test)

    

In [84]:
from pyspark.ml.feature import VectorAssembler

def vectorize(df):
    """Assemble the model features of ``df`` into a ``vFeatures`` vector column.

    Args:
        df: DataFrame containing the columns ``Pclass``, ``age``, ``SibSp``,
            ``Parch``, ``Fare``, ``SexC`` and ``EmbarkedC``.

    Returns:
        A DataFrame with ``PassengerId``, ``vFeatures`` and ``Survived``,
        limited to those of the three columns that exist in ``df``. This
        allows the function to be applied to data without a ``Survived`` label.
    """
    feature_columns = ["Pclass", "age", "SibSp", "Parch", "Fare", "SexC", "EmbarkedC"]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="vFeatures")

    assembled = assembler.transform(df)

    # Only select columns that are actually present: the test schema has no
    # "Survived" column, and selecting it unconditionally would fail there.
    wanted = ("PassengerId", "vFeatures", "Survived")
    present = [name for name in wanted if name in assembled.columns]
    return assembled.select(*present)

df_train = vectorize(df_train)
# TODO(review): the test set is left un-vectorized here; Step 7 asks to check
# it for NULL values before applying the transformations.
#df_test = vectorize(df_test)



In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils

### INSERT YOUR CODE HERE

### Step 4
- In Step 5, you will apply a normalization Estimator
- BUT, it does not accept feature vectors of the Sparse type
- So, it is necessary to apply a User Defined Function to make all feature vectors of type VectorUDT
- In this step, you only have to replace ***YOUR DATAFRAME*** and ***NEW DATAFRAME*** with your variables

In [None]:
from pyspark.sql.functions import UserDefinedFunction
from pyspark.ml.linalg import VectorUDT, Vectors

# UDF that rebuilds its input vector as a dense vector (Vectors.dense over
# x.toArray()); the declared return type is VectorUDT.
to_vec = UserDefinedFunction(lambda x: Vectors.dense(x.toArray()), VectorUDT())

# Template line: substitute real variable names for <NEW DATAFRAME> and
# <YOUR DATAFRAME> before running this cell. The result keeps "Survived" and
# exposes the converted "vFeatures" column under the name "features".
<NEW DATAFRAME> = <YOUR DATAFRAME>.select("Survived", to_vec("vFeatures").alias("features"))

### Step 5
- Apply a normalization Estimator of your choice to the ***features*** vector obtained in Step 4

In [None]:
from pyspark.ml.feature import StandardScaler

### INSERT YOUR CODE HERE

### Step 6
- Train a classifier of your choice (for instance, Random Forest) using your dataset of LabeledPoints
- Make predictions for the training data
- Use the Binary Classification Evaluator to evaluate your model on the training data
- How is your model performing? Try to tune its parameters

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

### INSERT YOUR CODE HERE

### Step 7
- Take a look at the test data - use DataFrame's ***createOrReplaceTempView*** method to perform SQL queries over the data
    - hint: check if there are any NULL values in the dataset - if so, handle them
- Apply the transformations to the test data
    - hint: you can use Pipelines to chain several Estimators/Transformers
    - warning: unfortunately, it is not possible to include the UDF from Step 4 in the Pipeline
- Make predictions using the model previously trained and the transformed test data
- Save it as ***submission.csv*** and submit it to Kaggle
- What was your score?

In [None]:
from pyspark.ml import Pipeline

### INSERT YOUR CODE HERE

In [None]:
df_predictions = <YOUR PREDICTIONS DATAFRAME>.select("prediction").toPandas().reset_index()
df_predictions['index'] = df_predictions['index'] + 892
df_predictions.columns = ['PassengerId', 'Survived']

df_predictions.to_csv('submission.csv', index=False)

## Result = ???%