In [1]:
import findspark
findspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')

## Session

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Logistic_Regression').getOrCreate()

## Load dataset

In [3]:
raw_data = spark.read.csv('titanic.csv',inferSchema=True,header=True)

raw_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



## Drop some unused features

In [4]:
selected_data = raw_data.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked']).na.drop()

selected_data.show(5)

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
+--------+------+------+----+-----+-----+-------+--------+
only showing top 5 rows



## Split train/test

In [5]:
train_data,test_data = selected_data.randomSplit([0.7,0.3])

## String Indexer, One-hot encoder, Vector Assembler
Example

    A B C ---string indexer---> 0 1 2 ---One hot---> [1,0,0] [0,1,0] [0,0,1]


In [6]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

sex_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
sex_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [7]:
from pyspark.ml.feature import VectorAssembler

# Define assembler 
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'SexVec',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
        'EmbarkVec'],
    outputCol='features')

## Logistic Regression

In [8]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features',labelCol='Survived')

## Pipeline model
- A sequence of transformation

In [9]:
from pyspark.ml import Pipeline

pipeline = Pipeline(
    stages=[
        sex_indexer, embark_indexer,
        sex_encoder,embark_encoder,
        assembler,
        lr])

In [10]:
# Train
model = pipeline.fit(train_data)

## Evaluate Testset

In [11]:
test_results = model.transform(test_data)

test_results.select('Survived','prediction').show(20)

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='prediction', # y_pred
    labelCol='Survived')           # y
roc = evaluator.evaluate(test_results)

roc

0.7515040650406504