## Developing a classification model to predict whether a Titanic passenger survived, based on the available passenger information

In [1]:
#import the necessary libraries
#importing SparkSession
from pyspark.sql import SparkSession

In [2]:
#importing the StringIndexer to index the categorical variables
from pyspark.ml.feature import StringIndexer

In [3]:
#importing the one-hot encoder
from pyspark.ml.feature import OneHotEncoder

In [4]:
#importing the VectorAssembler to group the selected features into a vector
from pyspark.ml.feature import VectorAssembler

In [5]:
#importing the pipeline feature from pyspark.ml
from pyspark.ml import Pipeline

In [6]:
#import the logistic regression classifier
from pyspark.ml.classification import LogisticRegression

In [7]:
#import the evaluators
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [8]:
#create the SparkSession for this notebook, or reuse an already-running one
spark = (
    SparkSession.builder
    .appName('titanic')
    .getOrCreate()
)

In [9]:
#read the Titanic CSV: the first row is the header and column types are inferred
all_data = spark.read.csv(
    '11.titanic.csv',
    header=True,
    inferSchema=True,
)

In [10]:
#preview the loaded data (show() prints the first rows as a text table)
all_data.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [11]:
#print each column's name, inferred type and nullability
all_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



#### PassengerId, Name, and Ticket aren't useful for our analysis
#### Fare and Cabin carry largely overlapping information, and Cabin is mostly null, so only Fare is kept
#### Sex and Embarked are categorical columns that need to be indexed

In [12]:
#categorical columns (Sex, Embarked) are indexed first; this indexer handles Sex
gender_indexer = StringIndexer(
    outputCol='Gender_indexed',
    inputCol='Sex',
)

In [13]:
#second indexer: Embarked -> Embarked_indexed
embarked_indexer = StringIndexer(
    outputCol='Embarked_indexed',
    inputCol='Embarked',
)

In [14]:
#one-hot encode the indexed columns, starting with gender

#Gender_indexed -> Gender_coded
gender_encoder = OneHotEncoder(
    outputCol='Gender_coded',
    inputCol='Gender_indexed',
)

In [15]:
#Embarked_indexed -> Embarked_coded
embarked_encoder = OneHotEncoder(
    outputCol='Embarked_coded',
    inputCol='Embarked_indexed',
)

In [16]:
#handle the missing data

#keep only the modelling columns plus the label;
#PassengerId, Name, Ticket and Cabin are left out
data_with_na = all_data.select(
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Survived']
)

#discard every row that still contains a null, then summarise what remains
final_data = data_with_na.dropna()
final_data.describe().show()

+-------+------------------+------+-----------------+------------------+-------------------+------------------+--------+------------------+
|summary|            Pclass|   Sex|              Age|             SibSp|              Parch|              Fare|Embarked|          Survived|
+-------+------------------+------+-----------------+------------------+-------------------+------------------+--------+------------------+
|  count|               712|   712|              712|               712|                712|               712|     712|               712|
|   mean| 2.240168539325843|  null|29.64209269662921|0.5140449438202247|0.43258426966292135| 34.56725140449432|    null|0.4044943820224719|
| stddev|0.8368543166903446|  null|14.49293290032352|0.9306921267673427| 0.8541814457454133|52.938648174710906|    null|0.4911389472541192|
|    min|                 1|female|             0.42|                 0|                  0|               0.0|       C|                 0|
|    max|           

In [17]:
#gather the raw numeric columns and the one-hot encoded columns into a single 'features' vector
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Pclass', 'Gender_coded', 'Age', 'SibSp',
        'Parch', 'Fare', 'Embarked_coded',
    ],
)

In [18]:
#logistic regression classifier: reads the 'features' column, uses 'Survived' as the label
log_reg_titanic = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [19]:
#chain every stage into one pipeline: index -> encode -> assemble -> classify
pipeline = Pipeline(stages=[
    gender_indexer,
    embarked_indexer,
    gender_encoder,
    embarked_encoder,
    assembler,
    log_reg_titanic,
])

In [20]:
#split the data 70/30 into a training set and a held-out test set
#the fixed seed keeps the split (and therefore the fitted model and the reported
#metric) repeatable across Restart & Run All; without it every run draws a new split
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [21]:
#fit every pipeline stage (indexers, encoders, assembler, logistic regression) on the training split only
logistic_model = pipeline.fit(train_data)

### It is time to evaluate the model

In [22]:
#apply the fitted pipeline to the held-out test split to obtain predictions
results_titanic = logistic_model.transform(test_data)

In [23]:
#list the columns after the transform: the original inputs plus the columns added by the pipeline stages
results_titanic.columns

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked',
 'Survived',
 'Gender_indexed',
 'Embarked_indexed',
 'Gender_coded',
 'Embarked_coded',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [24]:
#compare the true label with the predicted probability and class, sorted by the true label
(
    results_titanic
    .orderBy('Survived')
    .select('Survived', 'probability', 'prediction')
    .show()
)

+--------+--------------------+----------+
|Survived|         probability|prediction|
+--------+--------------------+----------+
|       0|[0.32664816146146...|       1.0|
|       0|[0.19057007121971...|       1.0|
|       0|[0.44962191140249...|       1.0|
|       0|[0.59616724000094...|       0.0|
|       0|[0.86514958918796...|       0.0|
|       0|[0.68784176655471...|       0.0|
|       0|[0.10238342082074...|       1.0|
|       0|[0.61745806779585...|       0.0|
|       0|[0.44178145135362...|       1.0|
|       0|[0.61865757127774...|       0.0|
|       0|[0.60414872493951...|       0.0|
|       0|[0.73232124140844...|       0.0|
|       0|[0.48337278734553...|       1.0|
|       0|[0.68947418434891...|       0.0|
|       0|[0.59555812007396...|       0.0|
|       0|[0.74259880303905...|       0.0|
|       0|[0.41838702585122...|       1.0|
|       0|[0.75936303175054...|       0.0|
|       0|[0.51640422563491...|       0.0|
|       0|[0.81727346104569...|       0.0|
+--------+-

In [25]:
#evaluator that scores the 'rawPrediction' column against the true 'Survived' label
eval_results_titanic = BinaryClassificationEvaluator(
    labelCol='Survived',
    rawPredictionCol='rawPrediction',
)

In [26]:
#score the test-set predictions of the logistic regression model
#no metricName was set on the evaluator, so this is its default metric
#(presumably area under the ROC curve -- confirm against the pyspark docs)
AUC = eval_results_titanic.evaluate(results_titanic)
print(AUC)

0.8740079365079367


### The model did a pretty good job of predicting the outcome (AUC ≈ 0.87 on the held-out test set)