In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats
matplotlib.style.use('ggplot')

# NameToCategory
Here, I create a function that's used for creating a "Title" feature by pulling out `Mr`, `Miss`, etc. 

Titles that only a few passengers carry are bucketed into a catch-all `Other` group.

In [3]:
# pull out name into something more useful. 
import re
mrPattern = re.compile('.*Mr\..*')
missPattern = re.compile('.*Miss\..*')
masterPattern = re.compile('.*Master\..*')
mrsPattern = re.compile('.*Mrs\..*')
donPattern = re.compile('.*Don\..*')
revPattern = re.compile('.*Rev\..*')
drPattern = re.compile('.*Dr\..*')
mmePattern = re.compile('.*Mme\..*')
msPattern = re.compile('.*Ms\..*')
majorPattern = re.compile('.*Major\..*')
ladyPattern = re.compile('.*Lady\..*')
sirPattern = re.compile('.*Sir\..*')
mllePattern = re.compile('.*Mlle\..*')
colPattern = re.compile('.*Col\..*')
captPattern = re.compile('.*Capt\..*')
countessPattern = re.compile('.*Countess\..*')
jonkheerPattern = re.compile('.*Jonkheer\..*')

def nameToCategory(name):
    if (mrPattern.match(name)):
        return 'Mr'
    elif (jonkheerPattern.match(name)):
        return 'Other'
    elif (countessPattern.match(name)):
        return 'Other'
    elif (captPattern.match(name)):
        return 'Other'
    elif (missPattern.match(name)):
        return 'Miss'
    elif (masterPattern.match(name)):
        return 'Master'
    elif (mrsPattern.match(name)):
        return 'Mrs'
    elif (donPattern.match(name)):
        return 'Other'
    elif (revPattern.match(name)):
        return 'Other'
    elif (drPattern.match(name)):
        return 'Other'
    elif (mmePattern.match(name)):
        return 'Mrs'
    elif (msPattern.match(name)):
        return 'Miss'
    elif (majorPattern.match(name)):
        return 'Other'
    elif (ladyPattern.match(name)):
        return 'Other'
    elif (sirPattern.match(name)):
        return 'Other'
    elif (mllePattern.match(name)):
        return 'Miss'
    elif (colPattern.match(name)):
        return 'Other'
    return 'Other'
#    raise Exception(name)

In [4]:
import math

def scrub(filePath):
    data = pd.read_csv(filePath)
    char_cabin = data['Cabin'].astype(str)
    new_cabin = np.array([cabin[0] for cabin in char_cabin])
    data['Cabin'] = pd.Categorical(new_cabin)

    c1Median = data.Age[data.Pclass == 1].median()
    c2Median = data.Age[data.Pclass == 2].median()
    c3Median = data.Age[data.Pclass == 3].median()

    def medianFor(row):
        if (row['Pclass'] == 1):
            return c1Median
        elif (row['Pclass'] == 2):
            return c2Median
        elif (row['Pclass'] == 3):
            return c3Median
        else:
            raise Exception('Goofed')
    
    def updateAge(row):
        if (math.isnan(row['Age'])):
            median = medianFor(row)
            row['Age'] = median
        return row
    
    # Update the missing ages with the median
    data = data.apply(updateAge, axis=1)
    
    new_embarked = np.where(data['Embarked'].isnull()
                           , 'S'
                           , data['Embarked'])
    
    data['Embarked'] = new_embarked
    
    data['Title'] = data['Name'].apply(nameToCategory)
    
    
    return data

    
titanic_train = scrub('train.csv')

titanic_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,n,S,Other
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,24.0,1,2,W./C. 6607,23.45,n,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C,C,Mr
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,n,Q,Mr


In [5]:
temp = pd.read_csv('train.csv')
temp[temp['Fare'].isnull()].count()


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [6]:
from sklearn import linear_model
from sklearn import preprocessing

In [7]:
label_encoder = label_encoder = preprocessing.LabelEncoder()

def trainFeaturesFor(df):
    encoded_sex = label_encoder.fit_transform(df["Sex"])
    encoded_class = label_encoder.fit_transform(df["Pclass"])
    encoded_cabin = label_encoder.fit_transform(df["Cabin"])
    encoded_title = label_encoder.fit_transform(df["Title"])
    encoded_parch = label_encoder.fit_transform(df["Parch"])

    train_features = pd.DataFrame([ encoded_class
                                  , encoded_cabin
                                  , encoded_sex
                                  , encoded_title
                                  , encoded_parch
                                  , df["Age"]
                                  ]).T
    return train_features

def trainModel(df):
    train_features = trainFeaturesFor(df)
    log_model = linear_model.LogisticRegression()
    log_model.fit( X = train_features
                 , y = df["Survived"])
    return log_model

log_model = trainModel(titanic_train)

preds = log_model.predict(X=trainFeaturesFor(titanic_train))

print(log_model.intercept_)
print(log_model.coef_)

[ 3.61009667]
[[-0.95636873 -0.06094102 -2.47985096 -0.07196262 -0.15789623 -0.02725761]]


In [8]:
pd.crosstab(preds,titanic_train["Survived"])

Survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,468,103
1,81,239


In [9]:
log_model.score( X=trainFeaturesFor(titanic_train)
               , y=titanic_train['Survived'])

0.79349046015712688

In [10]:
titanic_test = scrub('test.csv')
test_features = trainFeaturesFor(titanic_test)
test_preds = log_model.predict(X=test_features)


In [11]:

submission = pd.DataFrame({ "PassengerId": titanic_test["PassengerId"]
                          , "Survived":test_preds})

submission.to_csv( "submission.csv"
                 , index=False)

# Take 2: This time with a held-out validation split
## And also trying out a Neural Net

In [12]:
import math

def scrub(filePath):
    data = pd.read_csv(filePath)
    char_cabin = data['Cabin'].astype(str)
    new_cabin = np.array([cabin[0] for cabin in char_cabin])
    data['Cabin'] = pd.Categorical(new_cabin)

    c1Median = data.Age[data.Pclass == 1].median()
    c2Median = data.Age[data.Pclass == 2].median()
    c3Median = data.Age[data.Pclass == 3].median()

    def medianFor(row):
        if (row['Pclass'] == 1):
            return c1Median
        elif (row['Pclass'] == 2):
            return c2Median
        elif (row['Pclass'] == 3):
            return c3Median
        else:
            raise Exception('Goofed')
    
    def updateAge(row):
        if (math.isnan(row['Age'])):
            median = medianFor(row)
            row['Age'] = median
        return row
    
    # Update the missing ages with the median
    data = data.apply(updateAge, axis=1)
    
    new_embarked = np.where(data['Embarked'].isnull()
                           , 'S'
                           , data['Embarked'])
    
    data['Embarked'] = new_embarked
    
    data['Title'] = data['Name'].apply(nameToCategory)
    
    
    return data

    
completeDf = scrub('train.csv')

completeDf.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,n,S,Other
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,24.0,1,2,W./C. 6607,23.45,n,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C,C,Mr
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,n,Q,Mr


## Splitting the data into train and test

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split( completeDf
                               , test_size=0.2
                               , random_state=1)

[len(train), len(test)]

[712, 179]

## Training and scoring the model

In [14]:
log_model = trainModel(train)

preds = log_model.predict(X=trainFeaturesFor(train))

log_train_score = log_model.score( X=trainFeaturesFor(train)
                                  , y=train['Survived'])

print(log_train_score)
pd.crosstab(preds,train["Survived"])

0.800561797753


Survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,384,83
1,59,186


## Scoring the model on the test data

In [15]:
test_preds = log_model.predict(X=trainFeaturesFor(test))

log_score = log_model.score( X=trainFeaturesFor(test)
                            , y=test['Survived'])
print(log_score)
pd.crosstab(test_preds, test["Survived"])

0.787709497207


Survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,90,22
1,16,51


# Setting up data to be Neural Net friendly

In [33]:
nn_df = scrub('train.csv')

## Dropping Name and Ticket because they get in the way
nn_df = nn_df.drop(['Name', 'Ticket', 'Fare'], axis=1)

## Getting dummies variables for the categorical data
nn_df = pd.get_dummies(nn_df)

print(nn_df.columns)

nn_df.tail()

Index([u'PassengerId', u'Survived', u'Pclass', u'Age', u'SibSp', u'Parch',
       u'Sex_female', u'Sex_male', u'Cabin_A', u'Cabin_B', u'Cabin_C',
       u'Cabin_D', u'Cabin_E', u'Cabin_F', u'Cabin_G', u'Cabin_T', u'Cabin_n',
       u'Embarked_C', u'Embarked_Q', u'Embarked_S', u'Title_Master',
       u'Title_Miss', u'Title_Mr', u'Title_Mrs', u'Title_Other'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Cabin_A,Cabin_B,...,Cabin_T,Cabin_n,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other
886,887,0,2,27.0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
887,888,1,1,19.0,0,0,1,0,0,1,...,0,0,0,0,1,0,1,0,0,0
888,889,0,3,24.0,1,2,1,0,0,0,...,0,1,0,0,1,0,1,0,0,0
889,890,1,1,26.0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
890,891,0,3,32.0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,1,0,0


## Splitting into train and test

In [28]:
from sklearn.model_selection import train_test_split

nn_train, nn_test = train_test_split( nn_df
                                     , test_size=0.2
                                     , random_state=1)

[len(train), len(test)]

[712, 179]

In [29]:
from sklearn.neural_network import MLPClassifier

X = nn_train.drop('Survived', axis=1)
y = nn_train.Survived

clf = MLPClassifier( solver='lbfgs'
                    , activation='logistic'
                    , alpha=1e-3
                    , hidden_layer_sizes=(25, 25)
                    , random_state=1)
clf.fit(X, y)

MLPClassifier(activation='logistic', alpha=0.001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(25, 25), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [30]:
nn_preds = clf.predict(nn_test.drop('Survived', axis=1))

nn_train_score = clf.score( X=nn_train.drop('Survived', axis=1)
                          , y=nn_train['Survived'])

nn_score = clf.score( X=nn_test.drop('Survived', axis=1)
                     , y=nn_test["Survived"])

print(nn_score)

pd.crosstab(nn_preds, nn_test["Survived"])

0.77094972067


Survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,95,30
1,11,43


In [31]:
print([log_train_score, log_score])
print([nn_train_score, nn_score])


[0.800561797752809, 0.78770949720670391]
[0.8202247191011236, 0.77094972067039103]


## Setting up data for submission

In [37]:
nn_titanic_test = scrub('test.csv')


## Dropping Name and Ticket because they get in the way
nn_titanic_test = nn_titanic_test.drop(['Name', 'Ticket', 'Fare'], axis=1)

## Getting dummies variables for the categorical data
nn_titanic_test = pd.get_dummies(nn_titanic_test)

nn_titanic_test.insert(13, 'Cabin_T', 0)

print(nn_titanic_test.columns)

submit_preds = clf.predict( X=nn_titanic_test)

# Check if there is anything bad
# nn_titanic_test.isnull().any()

submission = pd.DataFrame({ "PassengerId": nn_titanic_test["PassengerId"]
                          , "Survived":submit_preds})

submission.to_csv( "submission.csv"
                 , index=False)

Index([u'PassengerId', u'Pclass', u'Age', u'SibSp', u'Parch', u'Sex_female',
       u'Sex_male', u'Cabin_A', u'Cabin_B', u'Cabin_C', u'Cabin_D', u'Cabin_E',
       u'Cabin_F', u'Cabin_T', u'Cabin_G', u'Cabin_n', u'Embarked_C',
       u'Embarked_Q', u'Embarked_S', u'Title_Master', u'Title_Miss',
       u'Title_Mr', u'Title_Mrs', u'Title_Other'],
      dtype='object')
