# Spark Logistic Regression

## predict whether a passenger survived the titanic

In [3]:
#imports
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('titanic').getOrCreate()

#data
data = spark.read.csv('../Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/titanic.csv',inferSchema=True,header=True)
print(data.columns, '\n')
from pyspark.ml.feature import (VectorAssembler,OneHotEncoder,StringIndexer)
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')
assembler = VectorAssembler(inputCols=['Pclass','SexVec','Age','SibSp','Parch','Fare','EmbarkVec'],
                            outputCol='features')

#split
my_cols = data.select(['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'])
my_final_data = my_cols.na.drop()
train, test = my_final_data.randomSplit([0.7,.3])

#model
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,gender_encoder,embark_encoder,assembler,log_reg_titanic])
fit_model = pipeline.fit(train)
results = fit_model.transform(test)

#evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')
auc = evaluator.evaluate(results)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
accuracy = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Survived',metricName='accuracy')
f1 = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='Survived',metricName='f1')
acc = accuracy.evaluate(results)
f1_score = f1.evaluate(results)
print('area under curve: ',auc)
print('accuracy: ',acc)
print('f1 score: ',f1_score, '\n')

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'] 

area under curve:  0.8090807888040712
accuracy:  0.8149779735682819
f1 score:  0.8147050144383787 

