In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sb # dataviz and exploratory analysis

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    inputPaths = [os.path.join(folder, name) for name in names]
    for inputPath in inputPaths:
        print(inputPath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition data into two DataFrames: one for training and one for testing.
# The paths match the Kaggle Notebook layout; adjust them if you run this elsewhere.

train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

# DataFrame.info() prints its report itself and returns None, so it is called bare:
# wrapping it in print() would emit a stray "None" after each report.
train.info()
print()
test.info()

# Exploratory Analysis

In [None]:
# Name and PassengerId are discarded because they are not expected to teach the model anything useful.

trainDropAltIndexes = train.drop(columns=['PassengerId', 'Name'])

# info() prints directly and returns None; calling it bare avoids printing a stray "None".
trainDropAltIndexes.info()

In [None]:
# Cabin is severely lacking, but it may be worth keeping if merely having one makes a difference.
# First, collapse the column into two labels: 'Specified' for any existing value, 'Unspecified' for nulls.

trainDropCabin = trainDropAltIndexes.copy()
hasCabin = trainDropCabin['Cabin'].notnull()
trainDropCabin['Cabin'] = hasCabin.map({True: 'Specified', False: 'Unspecified'})
trainDropCabin.groupby(['Cabin'])['Survived'].count()

In [None]:
# Does having a cabin affect survivability? Count the rows for each (Cabin, Survived) pair.

cabinSurvival = trainDropCabin.groupby(['Cabin', 'Survived'])
cabinSurvival.size()

In [None]:
# Since we've seen that it does, we'll keep the trainDropCabin DataFrame.
# Does Fare affect survivability? Split it into two quantile bins (Low/High) to find out.

trainDropFare = trainDropCabin.copy()
quantiles = pd.qcut(trainDropFare['Fare'], q=2, labels=['Low', 'High'])
trainDropFare['Fare'] = quantiles

# Fare is now categorical. observed=False is passed explicitly to keep the current grouping
# behaviour without relying on the deprecated implicit default for categorical keys.
trainDropFare.groupby(['Fare', 'Survived'], observed=False).size()

In [None]:
# Does the port of embarkation affect survivability? Count the rows for each (Embarked, Survived) pair.

embarkedSurvival = trainDropFare.groupby(['Embarked', 'Survived'])
embarkedSurvival.size()

In [None]:
# Embarking at port C clearly made a difference, but it might just be a spurious correlation,
# so the column is dropped.

# errors='ignore' keeps this cell idempotent: the frame is reassigned to itself, so re-running
# the cell after Embarked is already gone would otherwise raise a KeyError.
trainDropFare = trainDropFare.drop(columns=['Embarked'], errors='ignore')
trainDropFare.info()

In [None]:
# Still pending: treating the missing Age values and analysing Sex, Age, SibSp and Parch.
# Start with Sex: count the rows for each (Sex, Survived) pair.

trainDropFare.groupby(by=['Sex', 'Survived']).size()

In [None]:
# Women clearly had an edge in survivability.
# Does SibSp affect survivability?

trainDropSex = trainDropFare.copy()
sibSpSurvival = trainDropSex.groupby(['SibSp', 'Survived'])
sibSpSurvival.size()

In [None]:
# Having exactly one SibSp apparently made some difference in larger groups, but the effect was feeble.
# SibSp is dropped; perhaps Parch gives us more information.

# drop() returns a new DataFrame, so trainDropSex itself is left unchanged.
trainDropSibSp = trainDropSex.drop(columns=['SibSp'])
trainDropSibSp.groupby(['Parch', 'Survived']).size()

In [None]:
# Having at least one parent or child evened the odds, so Parch is kept as a two-level
# feature: 'None' for a count of 0 and 'Some' for any positive count.

# Keyed on "count > 0" instead of a fixed 0..6 lookup, so larger counts are labelled 'Some'
# rather than silently becoming null.
dictParch = {False: 'None', True: 'Some'}

trainDropParch = trainDropSibSp.copy()
# Assign the column directly: writing strings into the integer column through
# .loc[:, 'Parch'] relies on an implicit dtype upcast that pandas has deprecated.
trainDropParch['Parch'] = trainDropParch['Parch'].gt(0).map(dictParch)
trainDropParch.groupby(['Parch', 'Survived']).size()

In [None]:
# As for age, it's not useful to keep it as a numeric value, so let's turn it into a categorical column.
# Since the Titanic crew did evacuate children first, people up to 13 years old (Age < 14) count as children.

trainDropAge = trainDropParch.copy()
# Compare the whole column instead of testing type(element)==float per value: the result no
# longer depends on the Python type pandas yields while iterating (an integer-typed Age column
# would otherwise be labelled 'NotChild' throughout). A missing Age (NaN) compares False and
# therefore still falls into 'NotChild'.
ageAdjust = trainDropAge['Age'].lt(14).map({True: 'Child', False: 'NotChild'}).tolist()
trainDropAge['Age'] = ageAdjust
trainDropAge.groupby(['Age', 'Survived']).size()

In [None]:
# Pclass needs no adjustment for now; the counts below are used to judge its effect on survivability.

pclassSurvival = trainDropAge.groupby(['Pclass', 'Survived'])
pclassSurvival.size()

In [None]:
# Ticket works more like an index, since class and fare are already present, so it is disregarded.

# drop() returns a new DataFrame, so trainDropAge itself is left unchanged.
trainDropFinal = trainDropAge.drop(columns=['Ticket'])
trainDropFinal.info()

In [None]:
# All columns have now been adjusted; preview the first five rows of the result.

trainDropFinal.head(n=5)

# Final adjustments

In [None]:
# The models need numbers, not text, so every label is replaced by a numeric code.
# 1.0 goes to the value that increases survivability, lower values to those that do not.

encodings = {
    'Cabin': {'Specified':1.0, 'Unspecified':0.0},
    'Fare': {'High':1.0, 'Low':0.0},
    'Parch': {'Some':1.0, 'None':0.0},
    'Age': {'Child':1.0, 'NotChild':0.0},
    'Sex': {'female':1.0, 'male':0.0},
    'Pclass': {1:1.0, 2:0.5, 3:0.0},
}

trainFinal = trainDropFinal.copy()
for column, encoding in encodings.items():
    trainFinal[column] = trainFinal[column].map(encoding)

trainFinal.head()

In [None]:
# At long last, we just need to create a method that will treat the input test data just like the training data.

def treatData(dfInput):
    """Apply the notebook's feature engineering to a raw passenger DataFrame.

    Parameters
    ----------
    dfInput : pandas.DataFrame
        Frame containing the columns PassengerId, Name, Ticket, SibSp, Embarked,
        Cabin, Fare, Parch, Age, Sex and Pclass. Any other column (for example
        Survived) is passed through and only has its nulls filled.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy without PassengerId, Name, Ticket, SibSp and Embarked, in which
        Cabin, Fare, Parch, Age, Sex and Pclass are encoded as numbers between
        0.0 and 1.0 and every remaining null is replaced by 0.0.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing from ``dfInput``.
    """
    # Work on a copy so the caller's DataFrame keeps its original contents.
    dfAux = dfInput.copy()

    # Dropping unnecessary data.
    dfAux = dfAux.drop(['PassengerId', 'Name', 'Ticket', 'SibSp', 'Embarked'], axis=1)

    # Cabin: 1.0 when a cabin is recorded, 0.0 when it is null.
    dfAux['Cabin'] = dfAux['Cabin'].notnull().astype(float)

    # Fare: two quantile bins of *this* frame, Low -> 0.0 and High -> 1.0.
    # NOTE(review): the cut point is recomputed from whatever frame is passed in, so the
    # training and test sets may be split at different fares -- confirm this is intended.
    quantilesFare = pd.qcut(dfAux['Fare'], 2, ['Low', 'High'])
    dfAux['Fare'] = quantilesFare.map({'High': 1.0, 'Low': 0.0})

    # Parch: 1.0 for any positive count, 0.0 otherwise. A plain comparison is used because
    # the previous fixed 0..6 lookup turned any larger count into a null, which the final
    # fill then encoded as 0.0 (no parents/children).
    dfAux['Parch'] = dfAux['Parch'].gt(0).astype(float)

    # Age: 1.0 for children (Age < 14), 0.0 otherwise. The column-wise comparison works for
    # integer and float columns alike, and a missing Age (NaN) compares False -> 0.0.
    dfAux['Age'] = dfAux['Age'].lt(14).astype(float)

    # Sex and Pclass: fixed lookups; values outside the lookup become null and are filled below.
    dfAux['Sex'] = dfAux['Sex'].map({'female': 1.0, 'male': 0.0})
    dfAux['Pclass'] = dfAux['Pclass'].map({1: 1.0, 2: 0.5, 3: 0.0})

    # Filling any remaining null values (e.g. a missing Fare) with 0.0.
    return dfAux.fillna(0.0)

In [None]:
# Check that treatData reproduces the step-by-step preprocessing of the training data.
# compare() returns only the cells that differ, so an empty result means the frames match.

trainDifferences = trainFinal.compare(treatData(train))
# Fail loudly instead of relying on someone reading the printed output.
assert trainDifferences.empty, 'treatData(train) does not match trainFinal'
print(trainDifferences)

In [None]:
# Check that the method treats the null values of the test DataFrame.

testTreated = treatData(test)
testTreated.info()

# Oversampling

In [None]:
# If the classes are imbalanced, the training data may need oversampling; check the class sizes.

survivedGroups = trainFinal.groupby('Survived')
survivedGroups.size()

In [None]:
# Oversample the survivors (Survived==1) by duplicating randomly chosen rows until they match
# the size of the largest class. The number of extra rows is derived from the class counts
# (replacing the hard-coded 207) and the draw is seeded so the cell is reproducible.
# NOTE(review): these duplicates are added before the train/test split further down, so copies
# of the same row can end up in both splits and inflate the reported accuracy -- consider
# oversampling only the training split.
survivors = trainFinal.query('Survived==1')
nMissing = int(trainFinal['Survived'].value_counts().max() - len(survivors))
# Sampling with replacement is only needed when more rows are requested than survivors exist.
sample = survivors.sample(nMissing, replace=nMissing > len(survivors), random_state=42)
trainOversampled = pd.concat([trainFinal, sample], axis=0)
trainOversampled.groupby('Survived').size()

# Train-Test Split

In [None]:
# In this first attempt I'll be using SKLearn Decision Tree to create my model, so let's import it.
# I will also split the train DataFrame in model training and testing, so let's import that too from SKLearn.

import sklearn.tree as tree
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

In [None]:
# X will be a DataFrame with our independent variables, and Y our dependent one.

# Firstly, X will have our non-target columns.
X = trainOversampled[['Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'Cabin']]

# We want to find out which passengers have survived, so Survived is our Y.
Y = trainOversampled['Survived']

# Create the training and initial testing splits (75% / 25%, stratified on Y).
# random_state fixes the shuffle so the split, and every score computed from it, is reproducible.
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.25, stratify=Y, random_state=42)

# Decision Tree model

In [None]:
# Create the model and train it with the trainX and trainY splits.
# random_state pins the estimator's internal randomness, so refitting on the same data
# produces the same tree and the same score.
modelTree = tree.DecisionTreeClassifier(criterion='entropy', max_features=None, random_state=42)
modelTree.fit(trainX, trainY)

# Accuracy of the predictions for the held-out testX split, as a percentage.
accuracyTree = modelTree.score(testX, testY)*100
print('Accuracy of %.2f%%' % accuracyTree)

Without a fixed random seed, this model's accuracy score is rather inconsistent between runs, ranging from about 79% to roughly 85%.

Setting criterion to 'entropy' and max_features to 0.3 reduced this inconsistency.

In [None]:
# Confusion matrix of the predictions for the held-out split, with the model's class labels on both axes.
predictedY = modelTree.predict(testX)
classLabels = modelTree.classes_
confusionMatrix = metrics.confusion_matrix(testY, predictedY, labels=classLabels)
matrixDisplay = metrics.ConfusionMatrixDisplay(confusionMatrix, display_labels=classLabels)
matrixDisplay.plot()

# Test DataFrame

In [None]:
# Time to test it for real, at last. The test data first goes through the same treatment.

testAdjust = treatData(test)
# Select the feature columns explicitly, in the order the model was trained on, so the
# prediction does not depend on treatData returning exactly those columns in that order.
testPrediction = modelTree.predict(testAdjust[list(trainX.columns)])

In [None]:
# Build the submission file: one row per test passenger with its predicted Survived value.
# The two columns are paired by position (predictions come back in row order), so the result
# does not depend on the index labels of `test` lining up with a fresh 0..n-1 index.
testSubmit = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy(),
    'Survived': testPrediction,
})
testSubmit.to_csv('./submission.csv',index=False)

In [None]:
# Read the file back to confirm what was written.
submissionCheck = pd.read_csv('./submission.csv')
submissionCheck