In [8]:
import pandas as pd
import numpy as np
import os

In [32]:
# Rare titles are folded into the four common ones.
# NOTE(review): 'the' is presumably the first token of a multi-word title
# (e.g. "the Countess.") -- confirm that 'Mr.' is the intended group for it.
TITLE_ALIASES = {
    'Mr.': ['Don.', 'Rev.', 'Dr.', 'Major.', 'Sir.', 'Col.', 'Capt.', 'the', 'Jonkheer.'],
    'Mrs.': ['Mme.', 'Lady.', 'Mlle.', 'Dona.'],
    'Miss.': ['Ms.'],
}

# Age assigned to passengers whose Age is missing, keyed by normalised title.
DEFAULT_AGE_BY_TITLE = {'Master.': 6, 'Miss.': 21, 'Mr.': 33, 'Mrs.': 36}


def clean_passengers(raw):
    """Return a cleaned copy of a passenger frame; ``raw`` is left untouched.

    Reads the columns SibSp, Parch, Name, Cabin, Embarked and Age, adds
    Family, Title and Deck, fills missing Cabin/Embarked/Age values, and
    drops Ticket, Cabin and Name.

    Working on a copy and assigning columns back (instead of calling
    ``fillna(..., inplace=True)`` on ``frame.Column``) avoids chained
    in-place modification, which pandas may apply to a temporary object
    rather than to the frame itself.
    """
    df = raw.copy()

    # create a family column that's sibs/spouses + parch
    df['Family'] = df['SibSp'] + df['Parch']

    # extract titles: first whitespace-separated token after the comma in Name
    df['Title'] = df['Name'].str.partition(',')[2].str.split().str[0]

    # create a new cabin group for unknown
    df['Cabin'] = df['Cabin'].fillna('Zed')

    # create a Deck column based on cabin values (A..G, etc); unknown becomes 'Z'
    df['Deck'] = df['Cabin'].str[0]

    # only a few empty values, fill with most likely departure city
    df['Embarked'] = df['Embarked'].fillna('S')

    # collapse rare titles into Mr. / Mrs. / Miss.
    for canonical, variants in TITLE_ALIASES.items():
        df.loc[df['Title'].isin(variants), 'Title'] = canonical

    # fill missing ages with something reasonable from title
    for title, age in DEFAULT_AGE_BY_TITLE.items():
        df.loc[df['Age'].isnull() & (df['Title'] == title), 'Age'] = age

    # change deck T to known deck as it's an outlier that's screwing up training for some reason
    # TODO: need to understand why this is the case
    df.loc[df['Deck'] == 'T', 'Deck'] = 'C'

    # drop maybe useless columns
    return df.drop(columns=['Ticket', 'Cabin', 'Name'])


# load data and clean it up
train_data = clean_passengers(pd.read_csv("../data/train.csv"))
test_data = clean_passengers(pd.read_csv("../data/test.csv"))

# train rows first, then test rows; the original row labels are kept
combined_data = pd.concat([train_data, test_data])

combined_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family,Title,Deck
0,1,0.0,3,male,22.0,1,0,7.25,S,1,Mr.,Z
1,2,1.0,1,female,38.0,1,0,71.2833,C,1,Mrs.,C
2,3,1.0,3,female,26.0,0,0,7.925,S,0,Miss.,Z
3,4,1.0,1,female,35.0,1,0,53.1,S,1,Mrs.,C
4,5,0.0,3,male,35.0,0,0,8.05,S,0,Mr.,Z


In [33]:
# Number of distinct values per column across the combined train + test frame.
print("Unique Values", combined_data.nunique(), sep="\n")

Unique Values
PassengerId    1309
Survived          2
Pclass            3
Sex               2
Age              98
SibSp             7
Parch             8
Fare            281
Embarked          3
Family            9
Title             4
Deck              8
dtype: int64


In [34]:
# Per-column count of missing values left in the training frame.
train_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Family         0
Title          0
Deck           0
dtype: int64

In [35]:
# take a look at the nulls for a column
#combined_data[combined_data['Embarked'].isnull()]
#combined_data.SibSp.unique()
#combined_data.Parch.unique()
#combined_data[combined_data.SibSp == combined_data.Parch]


In [36]:
# take a look at some easy patterns (class, male/female) to see if there are patterns
def surviving_percent(row_filter, data):
    """Compute the survival rate of the rows of ``data`` selected by ``row_filter``.

    Parameters
    ----------
    row_filter : row indexer passed unchanged to ``data.loc[...]``, e.g. a
        boolean mask. The notebook passes ``()`` to select every row.
    data : DataFrame with a ``Survived`` column.

    Returns
    -------
    (percent, survived, total) where ``survived`` is the sum of the selected
    ``Survived`` values, ``total`` the number of selected rows, and
    ``percent`` is ``100 * survived / total``. When no rows are selected,
    ``percent`` is NaN instead of raising ``ZeroDivisionError``.
    """
    result = data.loc[row_filter]["Survived"]
    total = len(result)
    if total == 0:
        # An empty selection has no meaningful rate; report NaN rather than crash.
        return float("nan"), 0, 0
    # Sum once and reuse it for both the percentage and the raw count.
    survived = sum(result)
    return (100 * survived / total), survived, total

def print_surviving_percent(filter_desc, row_filter, data):
    """Print one summary line with the survival rate of a subset of ``data``.

    Parameters
    ----------
    filter_desc : text describing the subset, inserted into the printed line.
    row_filter : row indexer forwarded to ``surviving_percent``.
    data : DataFrame forwarded to ``surviving_percent``.
    """
    percent, survived, total = surviving_percent(row_filter, data)
    # Fixed-point with one decimal: a bare '.3' precision means three
    # significant digits and would render 100.0 as '1e+02'.
    print(f'% of {filter_desc} survived: {percent:.1f}% ({survived} of {total})')

data = train_data

# Masks and labels shared by the class/sex breakdowns below.
is_male = (data.Sex == 'male')
is_female = (data.Sex == 'female')
class_labels = {1: 'first', 2: 'second', 3: 'third'}

# Overall and by sex.
print_surviving_percent('total', (), data)
print_surviving_percent('men', is_male, data)
print_surviving_percent('women', is_female, data)
print('')

# By passenger class.
for pclass, label in class_labels.items():
    print_surviving_percent(f'{label} class', (data.Pclass == pclass), data)
print('')

# By passenger class, men only.
for pclass, label in class_labels.items():
    print_surviving_percent(f'{label} class men', (data.Pclass == pclass) & is_male, data)
print('')

# By passenger class, women only.
for pclass, label in class_labels.items():
    print_surviving_percent(f'{label} class women', (data.Pclass == pclass) & is_female, data)

# Breakdowns by missing Age / missing Cabin are not run here: the cleaning
# cell fills in missing ages and drops the Cabin column.

# Cumulative by family size (SibSp + Parch).
for limit in range(1, 6):
    print_surviving_percent(f'family < {limit}', (data.Family < limit), data)
print('')

# Cumulative by number of parents/children aboard.
for limit in range(1, 6):
    print_surviving_percent(f'Parent/children < {limit}', (data.Parch < limit), data)
print('')

# By port of embarkation.
for port in ('S', 'C', 'Q'):
    print_surviving_percent(f'people embarked {port}', (data.Embarked == port), data)



% of total survived: 38.4% (342 of 891)
% of men survived: 18.9% (109 of 577)
% of women survived: 74.2% (233 of 314)

% of first class survived: 63.0% (136 of 216)
% of second class survived: 47.3% (87 of 184)
% of third class survived: 24.2% (119 of 491)

% of first class men survived: 36.9% (45 of 122)
% of second class men survived: 15.7% (17 of 108)
% of third class men survived: 13.5% (47 of 347)

% of first class women survived: 96.8% (91 of 94)
% of second class women survived: 92.1% (70 of 76)
% of third class women survived: 50.0% (72 of 144)
% of family < 1 survived: 30.4% (163 of 537)
% of family < 2 survived: 36.1% (252 of 698)
% of family < 3 survived: 38.9% (311 of 800)
% of family < 4 survived: 40.0% (332 of 829)
% of family < 5 survived: 39.7% (335 of 844)

% of Parent/children < 1 survived: 34.4% (233 of 678)
% of Parent/children < 2 survived: 37.4% (298 of 796)
% of Parent/children < 3 survived: 38.6% (338 of 876)
% of Parent/children < 4 survived: 38.7% (341 of 881)

In [39]:
vals = combined_data.Deck
print(f'decks: {vals.unique()}')

# Survival rate per deck letter in the training data. Deck 'T' is not
# reported: the cleaning cell remaps it to 'C'. 'Z' is the placeholder deck
# for passengers with no recorded cabin.
for deck in ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'Z'):
    print_surviving_percent(f'Deck {deck}', (train_data.Deck == deck), data)


decks: ['Z' 'C' 'E' 'G' 'D' 'A' 'B' 'F']
% of Deck A survived: 46.7% (7 of 15)
% of Deck B survived: 74.5% (35 of 47)
% of Deck C survived: 58.3% (35 of 60)
% of Deck D survived: 75.8% (25 of 33)
% of Deck E survived: 75.0% (24 of 32)
% of Deck F survived: 61.5% (8 of 13)
% of Deck G survived: 50.0% (2 of 4)
% of Deck Z survived: 30.0% (206 of 687)


In [40]:
 # get unique titles remaining after the cleaning cell's normalisation
titles = combined_data.Title.unique()
titles

array(['Mr.', 'Mrs.', 'Miss.', 'Master.'], dtype=object)

In [41]:
# Sanity check: list any rows still assigned to deck 'T' after cleaning.
on_deck_t = (combined_data.Deck == 'T')
combined_data[on_deck_t]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family,Title,Deck


In [42]:
# For each frame, print the passenger count and mean age per title,
# followed by a blank line between frames.
for frame in (combined_data, train_data, test_data):
    for title in frame.Title.unique():
        ages = frame[frame.Title == title].Age
        count = len(ages)
        mean_age = sum(ages) / count
        print(f'{title} : {count}, avg age: {mean_age}')
    print()

Mr. : 784, avg age: 32.85395408163265
Mrs. : 202, avg age: 36.73267326732673
Miss. : 262, avg age: 21.647290076335878
Master. : 61, avg age: 5.550491803278687

Mr. : 539, avg age: 33.01762523191095
Mrs. : 129, avg age: 35.72868217054263
Miss. : 183, avg age: 21.65573770491803
Master. : 40, avg age: 4.716749999999999

Mr. : 245, avg age: 32.49387755102041
Mrs. : 73, avg age: 38.50684931506849
Miss. : 79, avg age: 21.627721518987343
Master. : 21, avg age: 7.138571428571429



In [45]:
#train
from sklearn.ensemble import RandomForestClassifier

# Target and the feature columns fed to the model.
y = train_data.Survived
features = ['Pclass','Sex','Age','SibSp','Parch','Embarked','Family','Title','Deck']

# One-hot encode the categorical features.
X = pd.get_dummies(train_data[features])

# The test set is encoded separately, so a category seen in only one of the
# two frames yields different dummy columns. Reindex onto the training
# columns (same set, same order): missing dummies are filled with 0 and
# columns unknown to the model are dropped.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# NOTE(review): presumably n_estimators/max_depth of earlier submissions;
# the current settings below are 200/10 -- confirm and record them here.
#sub 1: 100/5
#sub 2: 200/50

# random_state is fixed so repeated runs produce the same forest.
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Write the submission file with one row per test passenger.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [25]:
# Column labels of the training frame.
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'Family', 'Title', 'Deck'],
      dtype='object')

In [26]:
# Column labels of the test frame.
test_data.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked', 'Family', 'Title', 'Deck'],
      dtype='object')

In [29]:
# Column labels of the one-hot encoded training matrix.
# NOTE(review): the saved output lists Deck_T although the cleaning cell
# remaps deck 'T' to 'C', and this cell's execution count is lower than the
# cleaning cell's -- the output is likely stale; re-run to confirm.
X.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Family', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master.',
       'Title_Miss.', 'Title_Mr.', 'Title_Mrs.', 'Deck_A', 'Deck_B', 'Deck_C',
       'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_Z'],
      dtype='object')

In [30]:
# Column labels of the one-hot encoded test matrix.
X_test.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Family', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master.',
       'Title_Miss.', 'Title_Mr.', 'Title_Mrs.', 'Deck_A', 'Deck_B', 'Deck_C',
       'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_Z'],
      dtype='object')