In [436]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

In [437]:
# load the data and split into training and validation sets
# test_data will be used in final submission
# combined_data is created for data exploration but not used for training/validation/testing
full_train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

combined_data = pd.concat([train_data, test_data])
train_data, validation_data = train_test_split(full_train_data, test_size=0.2)
all_datasets = [train_data, validation_data, test_data, combined_data, full_train_data]

combined_data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Deck,Name,Ticket,Cabin
598,599,0.0,3,male,32.0,0,0,7.2250,C,Mr.,x,,,
839,840,1.0,1,male,32.0,0,0,29.7000,C,Mr.,C,,,
472,473,1.0,2,female,33.0,1,2,27.7500,S,Mrs.,x,,,
527,528,0.0,1,male,32.0,0,0,221.7792,S,Mr.,C,,,
53,54,1.0,2,female,29.0,1,0,26.0000,S,Mrs.,x,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,male,,0,0,8.0500,S,,,"Spector, Mr. Woolf",A.5. 3236,
414,1306,,1,female,39.0,0,0,108.9000,C,,,"Oliva y Ocana, Dona. Fermina",PC 17758,C105
415,1307,,3,male,38.5,0,0,7.2500,S,,,"Saether, Mr. Simon Sivertsen",SOTON/O.Q. 3101262,
416,1308,,3,male,,0,0,8.0500,S,,,"Ware, Mr. Frederick",359309,


In [438]:
# how many nulls?
combined_data.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Sex               0
Age              86
SibSp             0
Parch             0
Fare              1
Embarked          0
Title           418
Deck            418
Name            710
Ticket          710
Cabin          1037
dtype: int64

In [439]:
# number of unique values for each column?
combined_data.nunique()

PassengerId    1128
Survived          2
Pclass            3
Sex               2
Age              95
SibSp             7
Parch             8
Fare            269
Embarked          3
Title            16
Deck              8
Name            418
Ticket          363
Cabin            76
dtype: int64

In [440]:
# unique values and value counts for some columns
d = combined_data
for c in ['Survived', 'Pclass', 'Sex', 'SibSp']:
    print(d[c].value_counts(dropna=False, sort=True),'\n')

0.0    436
NaN    418
1.0    274
Name: Survived, dtype: int64 

3    618
1    278
2    232
Name: Pclass, dtype: int64 

male      729
female    399
Name: Sex, dtype: int64 

0    759
1    280
2     36
4     22
3     18
8      7
5      6
Name: SibSp, dtype: int64 



In [441]:
# unique values and value counts for some more columns
d = combined_data
for c in ['Parch', 'Embarked']:
    print(d[c].value_counts(dropna=False, sort=True), '\n')

#bin the ages
print(d.Age.value_counts(dropna=False, sort=False, bins=10))


0    855
1    152
2     99
3      8
5      5
4      5
6      2
9      2
Name: Parch, dtype: int64 

S    777
C    236
Q    115
Name: Embarked, dtype: int64 

(0.0892, 8.153]      69
(8.153, 16.136]      57
(16.136, 24.119]    262
(24.119, 32.102]    315
(32.102, 40.085]    146
(40.085, 48.068]     92
(48.068, 56.051]     54
(56.051, 64.034]     36
(64.034, 72.017]      9
(72.017, 80.0]        2
Name: Age, dtype: int64


In [442]:
# extract titles from names
for d in all_datasets:
    title = d.Name.str.partition(',')[2].str.split().str[0]   
    d['Title'] = title
print(combined_data.Title.value_counts(dropna=False, sort=True), '\n')


NaN        710
Mr.        240
Miss.       78
Mrs.        72
Master.     21
Col.         2
Rev.         2
Ms.          1
Dr.          1
Dona.        1
Name: Title, dtype: int64 



In [443]:
# extract deck from cabin
for d in all_datasets:
    # drop row with problematic Cabin
    d.loc[d.Cabin == 'T','Cabin']=  pd.NA
     # create a new cabin group for nulls (representing unknown cabin)
    d.Cabin.fillna('x', inplace=True)
    # create a Deck column based on cabin values (A..G, etc)
    d['Deck'] = d.Cabin.str[0]

print(combined_data.Deck.value_counts(dropna=False, sort=True))


x    1037
C      35
B      18
D      13
E       9
F       8
A       7
G       1
Name: Deck, dtype: int64


In [444]:
# replace lesser used titles with reasonable generics
for d in all_datasets:
    d.loc[(d.Title.isin(['Don.', 'Rev.', 'Dr.', 'Major.', 'Sir.', 'Col.', 'Capt.', 'the', 'Jonkheer.'])), 'Title'] = "Mr."
    d.loc[(d.Title.isin(['Mme.', 'Lady.', 'Mlle.', 'Dona.'])), 'Title'] = "Mrs."
    d.loc[(d.Title.isin(['Ms.' ])), 'Title'] = "Miss."

print(combined_data.Title.value_counts(dropna=False, sort=True))

NaN        710
Mr.        245
Miss.       79
Mrs.        73
Master.     21
Name: Title, dtype: int64


In [445]:
# Fill in missing ages with something reasonable based on title

# get the mean ages for each title and assign those to the missing values for each of those titles
mean_ages = dict()
for title in combined_data.Title.unique():
    mean =  combined_data[combined_data.Title == title].Age.mean()
    print (f"{title} mean Age = {mean:.0f}")
    mean_ages[title] = mean

for d in all_datasets:
   for title in mean_ages.keys():
      d.loc[d.Age.isnull() & (d.Title == title), 'Age'] = mean_ages[title]


nan mean Age = nan
Mr. mean Age = 32
Mrs. mean Age = 39
Miss. mean Age = 22
Master. mean Age = 7


In [446]:
# drop maybe uselsess columns and incomplete rows
for d in all_datasets:
   d.drop(['Ticket', 'Cabin', 'Name'], axis=1, inplace=True)
   d.dropna(subset=['Fare', 'Embarked'], inplace=True)
   # problematic single value row in training set
   #d = d[d.Deck != 'T']
# lets make sure there are no nulls (other than Survived in combined data set)
combined_data.isnull().sum()

PassengerId      0
Survived       417
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
Title          710
Deck             0
dtype: int64

In [447]:
# look at some basic statistics

#return survived, total for some row filter
def survived(row_filter, data):
    result = data.loc[row_filter]["Survived"]
    return sum(result), len(result)
results = []
d = full_train_data
for filter_name, filter in [
    ['Males', d.Sex == 'male'],
    ['Females', d.Sex == 'female'],
    ['1st class', d.Pclass == 1],
    ['2nd class', d.Pclass == 2],
    ['3rd class', d.Pclass == 3],
    ['1st class females', (d.Pclass == 1) & (d.Sex == 'female')],
    ['3rd class males', (d.Pclass == 3) & (d.Sex == 'male')],
    ['Children under 10', d.Age <= 10],
    ['Adults over 50', d.Age >= 50],
    ['Embarking from S', d.Embarked == 'S'],
    ['Embarking from Q', d.Embarked == 'Q'],
    ['Embarking from C', d.Embarked == 'C'],
    ['Title of Mr', d.Title == 'Mr.'],
    ['Title of Mrs', d.Title == 'Mrs.'],
    ['Title of Miss', d.Title == 'Miss.'],
    ['Title of Master', d.Title == 'Master.'],
    ['Deck A', d.Deck == 'A'],
    ['Deck B', d.Deck == 'B'],
    ['Deck C', d.Deck == 'C'],
    ['Deck D', d.Deck == 'D'],
    ['Deck E', d.Deck == 'E'],
    ['Deck F', d.Deck == 'F'],
    ['Deck G', d.Deck == 'G'],
    ['Deck unknown', d.Deck == 'x'],
    ]:
    
    surv, total = survived(filter, full_train_data)
    results.append({"Filter": filter_name,"% Survival": 100*surv/total, "Survived": surv, "Total": total})

pd.DataFrame(results, columns = ["Filter", "% Survival", "Survived", "Total"] ).style.format({'% Survival':'{:.1f}%'})


Unnamed: 0,Filter,% Survival,Survived,Total
0,Males,18.9%,109,577
1,Females,74.0%,231,312
2,1st class,62.6%,134,214
3,2nd class,47.3%,87,184
4,3rd class,24.2%,119,491
5,1st class females,96.7%,89,92
6,3rd class males,13.5%,47,347
7,Children under 10,58.8%,40,68
8,Adults over 50,35.6%,26,73
9,Embarking from S,33.7%,217,644


In [448]:
# another look ad the cleaned data
combined_data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Deck
598,599,0.0,3,male,32.000000,0,0,7.2250,C,,x
839,840,1.0,1,male,32.000000,0,0,29.7000,C,,x
472,473,1.0,2,female,33.000000,1,2,27.7500,S,,x
527,528,0.0,1,male,32.000000,0,0,221.7792,S,,x
53,54,1.0,2,female,29.000000,1,0,26.0000,S,,x
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,male,32.340426,0,0,8.0500,S,Mr.,x
414,1306,,1,female,39.000000,0,0,108.9000,C,Mrs.,C
415,1307,,3,male,38.500000,0,0,7.2500,S,Mr.,x
416,1308,,3,male,32.340426,0,0,8.0500,S,Mr.,x


In [537]:
# train and test on validation set
from sklearn.ensemble import RandomForestClassifier

generate_submission = False

if generate_submission:
    test_data_for_run = test_data
    train_data_for_run = full_train_data
else:
    test_data_for_run = validation_data
    train_data_for_run = train_data #full_train_data

#train
y = train_data_for_run.Survived
features = ['Pclass','Sex','Age','SibSp','Parch','Embarked','Title', 'Deck']


X = pd.get_dummies(train_data_for_run[features])
X_test = pd.get_dummies(test_data_for_run[features])

model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1, min_samples_split=4)
model.fit(X, y)
predictions = model.predict(X_test)

if generate_submission:
    output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
    output.to_csv('submission.csv', index=False)
    print("Submission saved!")
else:
    predictions = predictions.tolist()
    ground_truth = test_data_for_run.Survived.to_list()
    results = list(p ^ g for p,g in zip(predictions, ground_truth))

    accuracy = (len(results)-sum(results))/len(results)
    print(f'accuracy: {accuracy}')


accuracy: 0.8212290502793296
