In [1]:
import pandas as pd
import numpy as np
import datetime as DT

In [2]:
# Load the Kaggle Titanic files; PassengerId becomes the index of every frame so
# that rows can later be aligned by passenger.
traindata = pd.read_csv('train.csv', index_col='PassengerId', header=0)
testdata = pd.read_csv('test.csv', index_col='PassengerId', header=0)
sample_submission = pd.read_csv('gender_submission.csv', index_col='PassengerId', header=0)

# Start from a copy of the sample submission so the index and column layout match
# the expected format, and blank the answers until the model fills them in.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
my_submission = sample_submission.copy()
my_submission['Survived'] = np.nan

Import all data into Pandas DataFrames. To ensure that my submission complies with all form requirements, set submission to a copy of the sample submission with all outputs set to NaN (for now).

In [3]:
# Preview the first rows of the training frame to check the loaded columns.
traindata.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Naive Bayes Model (very simplified proof of concept)
Implement a Naive Bayes Model to forecast survival:<br><br>
The purpose of this is to implement a Naive Bayes Model, not do a rigorous analysis. 
Therefore, I will only consider 3 factors that I know (from experience with this data set) have a high predictive value, and that are presented very cleanly in the data (no feature generation required).<br>

The model trains on DEMISE, rather than SURVIVAL. There are about 1.6x as many demise observations as survival observations (549 vs. 342 out of 891), which improves model accuracy. Testing shows that this improved model accuracy from 77% to 80% during back-testing, and by 2% on Kaggle.

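Example: the chance of demise for a 3rd-class female passenger under 12 is

$$
P(\text{demise} \mid \text{Pclass3}, \text{female}, \text{under12}) =
\frac{P(\text{Pclass3} \mid \text{demise}) \times P(\text{female} \mid \text{demise}) \times P(\text{under12} \mid \text{demise}) \times P(\text{demise})}
     {P(\text{Pclass3}) \times P(\text{female}) \times P(\text{under12})}
$$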

The four cells below define these variables from df traindata.

In [4]:
# Boolean mask over the training rows: True where the passenger did not survive.
demise = traindata['Survived'].eq(0)
# Prior P(demise): flagged rows divided by the total number of training rows.
global_chance_of_demise = demise.sum() / len(traindata.index)
print('global_chance_of_demise', global_chance_of_demise)

global_chance_of_demise 0.6161616161616161


In [14]:
# Boolean mask: True for passengers travelling in 3rd class.
Pclass3 = traindata['Pclass'].eq(3)
# Likelihood P(Pclass3 | demise): 3rd-class passengers among those who died.
Pclass3_given_demise = (demise & Pclass3).sum() / demise.sum()
# Marginal P(Pclass3) over the whole training set.
chance_Pclass3 = Pclass3.sum() / len(traindata.index)
print("Pclass3_given_demise", Pclass3_given_demise)
print("chance_Pclass3", chance_Pclass3)

Pclass3_given_demise 0.6775956284153005
chance_Pclass3 0.5510662177328844


In [6]:
# Boolean mask: True for female passengers.
female = traindata['Sex'].eq('female')
# Likelihood P(female | demise): women among the passengers who died.
female_given_demise = (demise & female).sum() / demise.sum()
# Marginal P(female) over the whole training set.
chance_female = female.sum() / len(traindata.index)
print("female_given_demise %s" % female_given_demise)
print("chance_female %s" % chance_female)

female_given_demise 0.14754098360655737
chance_female 0.35241301907968575


In [7]:
# Boolean mask: True for passengers younger than 12. A missing Age compares as
# False, so passengers with unknown age are counted as "not under 12".
under12 = traindata['Age'].lt(12.0)
# Likelihood P(under12 | demise): children among the passengers who died.
under12_given_demise = (demise & under12).sum() / demise.sum()
# Marginal P(under12) over the whole training set.
chance_under12 = under12.sum() / len(traindata.index)
print("under12_given_demise: %s" % under12_given_demise)
print("chance_under12: %s" % chance_under12)

under12_given_demise: 0.052823315118397086
chance_under12: 0.07631874298540965


In [8]:
def NaiveBayesPredict(row):
    """Score one passenger's chance of demise with the notebook's Naive Bayes formula.

    Three binary features are read from ``row``: whether ``Sex`` is ``'female'``,
    whether ``Pclass`` is 3, and whether ``Age`` is below 12. For each feature the
    matching conditional likelihood given demise (or its complement) and the
    matching marginal (or its complement) are selected from the module-level
    statistics computed in the earlier cells.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['Sex']``, ``row['Pclass']`` and ``row['Age']``.
        An ``Age`` of NaN compares False and is treated as "not under 12".

    Returns
    -------
    float
        (product of the three likelihoods * prior of demise) divided by the
        product of the three marginals. The value is not clamped or
        renormalised, so it can exceed 1.
    """
    # Evaluate each feature once; both numerator and denominator reuse the result.
    is_female = row['Sex'] == 'female'
    is_third_class = row['Pclass'] == 3
    is_child = row['Age'] < 12.0

    # Numerator terms: P(feature value | demise), plus the prior P(demise).
    class_likelihood = Pclass3_given_demise if is_third_class else 1-Pclass3_given_demise
    sex_likelihood = female_given_demise if is_female else 1-female_given_demise
    age_likelihood = under12_given_demise if is_child else 1-under12_given_demise
    prior = global_chance_of_demise

    # Denominator terms: unconditional P(feature value).
    class_marginal = chance_Pclass3 if is_third_class else 1-chance_Pclass3
    sex_marginal = chance_female if is_female else 1-chance_female
    age_marginal = chance_under12 if is_child else 1-chance_under12

    numerator = (class_likelihood*sex_likelihood*age_likelihood)*prior
    denominator = class_marginal*sex_marginal*age_marginal
    return numerator/denominator

# Model test
To test this model, we will generate a column "PredictedSurvival" in the original training data set.
How well does our model predict survival on the "training" set?
Output value of the cell is the prediction accuracy.

In [9]:
# Cutoff passed to PredictSurvival: a demise score strictly above this value is
# labelled 0 (did not survive), anything else is labelled 1.
# This constant was found through trial-and-error.
BAYESIAN_PROB_THRESHOLD_FOR_DEMISE = 0.50


def PredictSurvival(dataset, thresh):
    """Turn per-passenger demise scores into 0/1 survival labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per passenger; each row is handed to ``NaiveBayesPredict``.
    thresh : float
        Cutoff on the demise score.

    Returns
    -------
    pandas.Series
        Indexed like ``dataset``: 0 where the score is strictly greater than
        ``thresh``, otherwise 1.
    """
    # Row-wise scoring (axis=1 passes each row to the scorer).
    demise_scores = dataset.apply(NaiveBayesPredict, axis=1)

    def to_survival_label(score):
        # A high demise score means "did not survive" (0).
        if score > thresh:
            return 0
        return 1

    return demise_scores.apply(to_survival_label)


# Back-test: label the training rows themselves and store the labels next to the
# recorded outcome.
traindata['PredictedSurvival'] = PredictSurvival(
    traindata,
    BAYESIAN_PROB_THRESHOLD_FOR_DEMISE,
)
print(traindata[['Survived', 'PredictedSurvival']].head())

# Accuracy = share of training rows whose predicted label equals the recorded one.
accuracy = (traindata['Survived'] == traindata['PredictedSurvival']).sum() / traindata.index.size
print('accuracy:', accuracy)

             Survived  PredictedSurvival
PassengerId                             
1                   0                  0
2                   1                  1
3                   1                  1
4                   1                  1
5                   0                  0
accuracy: 0.8002244668911336


# Generate predictions

We will now use this model to generate our submission to Kaggle.

In [10]:
# Label the Kaggle test passengers; assignment aligns on the PassengerId index.
my_submission['Survived'] = PredictSurvival(testdata, BAYESIAN_PROB_THRESHOLD_FOR_DEMISE)

# Write a timestamped submission file. The time is formatted without ':' because
# a colon is not a valid file-name character on Windows, and the stray space
# before the extension has been dropped.
my_submission.to_csv('Submission DG ' + DT.datetime.now().strftime('%d-%b %H%M') + '.csv')

# Model improvements
Future improvements to this model:
* [DONE] Predict demise, not survival, as that sample size is larger (549 = 891 - 342)
* Proper EDA, to determine the most predictive factors in the data -- or at least quantify variance% captured with proposed factors.
* Further research into modifying Naive Bayes for correlating features (The independence assumption between features used likely does not hold).