In [33]:
%load_ext autoreload
%autoreload 2

import random

import pandas as pd

from titanic import TitanicHelper


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
# Build the lookup helper from the reference passenger data stored as parquet.
# Every later cell uses this object for exact lookups and closest-name matching.
helper = TitanicHelper("./data/passengers.parquet")

In [35]:
# Probe the exact-name lookup with a name spelled the way Kaggle writes it
# (note the "Mr." with a trailing dot). The recorded output is nan, i.e. the
# lookup found no entry under this exact spelling.
helper.did_they_survive("Braund, Mr. Owen Harris")

nan

In [36]:
# Load the Kaggle training split; it carries the true "Survived" label that the
# lookup-based predictions are compared against further down.
train = pd.read_csv("./data/train.csv")
# Bare last expression so the notebook renders the (truncated) frame.
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


The prediction is just an exact match of each passenger's name against the "lookup" table.

In [37]:
# Look every training name up verbatim in the reference table. The result is
# cast to the nullable "Int64" dtype so that names without an exact match stay
# as <NA> in the prediction column.
exact_lookup = train["Name"].apply(helper.did_they_survive)
train["prediction"] = exact_lookup.astype("Int64")

In [38]:
# dropna=False keeps the <NA> bucket in the tally, so names that found no exact
# match are counted instead of being silently left out.
train["prediction"].value_counts(dropna=False)

prediction
<NA>    891
Name: count, dtype: Int64

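Who didn't we match? Everyone, as it turns out: all 891 predictions are missing, so exact matching found nobody. Here is a sample of the unmatched passengers: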

In [39]:
# Reproducible five-row sample (fixed random_state) of the passengers whose
# exact-name lookup produced no prediction.
still_missing = train["prediction"].isna()
train.loc[still_missing].sample(n=5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,prediction
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C,
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S,
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S,
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S,
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C,


- Sometimes the reference data just flat out doesn't have someone, e.g. `Dantcheff, Mr. Ristiu`.
- Sometimes the names are different (e.g. `Miss Ellen Mary` in the passenger list vs. `Mockler, Miss. Helen Mary "Ellie"` on Kaggle).

Try to use edit distance to match the remaining people to their closest equivalents

In [40]:
# Rows that the exact lookup left without a prediction.
unmatched = train["prediction"].isna()

# Ask the helper for the closest reference name, but only for those rows; rows
# outside the mask are left without a "closest_match" value.
train.loc[unmatched, "closest_match"] = train.loc[unmatched, "Name"].apply(
    helper.get_closest_match
)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,prediction,closest_match
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,,"Braund, Mr Owen Harris"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,,"Cumings, Mrs Florence Briggs"
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,,"Heikkinen, Miss Laina"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,,"Futrelle, Mr Jacques Heath"
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,,"Allen, Mr William Henry"


Predict those people now

In [41]:
# Rows that still have no prediction after the exact-name pass.
needs_prediction = train["prediction"].isna()

# Look up the outcome under the closest reference name instead of the Kaggle
# spelling, keeping the nullable integer dtype of the target column.
fallback_outcomes = (
    train.loc[needs_prediction, "closest_match"]
    .apply(helper.did_they_survive)
    .astype("Int64")
)
train.loc[needs_prediction, "prediction"] = fallback_outcomes

# Re-tally, including <NA>, to confirm whether anyone is still unmatched.
train["prediction"].value_counts(dropna=False)

prediction
0    587
1    304
Name: count, dtype: Int64

How many mistakes do we make for the people we successfully matched by name?

In [42]:
# Work on an independent copy restricted to rows that actually have a prediction.
predictions = train.loc[train["prediction"].notna()].copy()

# True = the looked-up outcome agrees with Kaggle's label, False = it does not.
is_correct = predictions["prediction"].eq(predictions["Survived"])
is_correct.value_counts()

True     821
False     70
Name: count, dtype: Int64

Who did we "predict" incorrectly?

In [43]:
# Keep only the rows where the looked-up outcome contradicts Kaggle's label.
is_wrong = predictions["prediction"].ne(predictions["Survived"])
predictions.loc[is_wrong]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,prediction,closest_match
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,"Heikkinen, Miss Laina"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,"Futrelle, Mr Jacques Heath"
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0,"Nasser, Mr Nicholas"
22,23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q,0,"McGowan, Miss Anna Louise"
25,26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",female,38.0,1,5,347077,31.3875,,S,0,"Asplund, Mr Carl Oscar Vilhelm Gustafsson"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842,843,1,1,"Serepeca, Miss. Augusta",female,30.0,0,0,113798,31.0000,,C,0,"Meyer, Mr August"
852,853,0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,,C,1,"Wells, Miss Joan"
856,857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S,0,"Wick, Colonel George Dennick"
857,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.5500,E17,S,0,"Daly, Mr Peter Dennis"


Turns out even a (near-)exact name match can give the wrong answer, because Kaggle and the Titanic website disagree about whether some passengers survived!

Let's try matching the test set

In [44]:
# Load the Kaggle test split: the same passenger columns as the training data
# but without a "Survived" label, which is what gets filled in below.
test = pd.read_csv("./data/test.csv")
# Bare last expression so the notebook renders the (truncated) frame.
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


Get the closest matched names for everyone regardless of whether they're an exact match

In [45]:
# Unlike the training pass, skip the exact-match step and resolve every test
# name straight to whatever reference name helper.get_closest_match picks.
test["closest_match"] = test["Name"].apply(helper.get_closest_match)

# Preview a few rows to eyeball the Kaggle name next to its matched name.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,closest_match
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,"Kelly, Mr James"
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,"Wilkes, Mrs Ellen"
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,"Myles, Mr Thomas Francis"
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,"Wirz, Mr Albert"
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,"Hirvonen, Mrs Helga Elisabeth Lindqvist"


In [46]:
# Look up the outcome recorded under each matched reference name and store it
# as a nullable integer, so any failed lookup would surface as <NA>.
matched_outcomes = test["closest_match"].apply(helper.did_they_survive)
test["Survived"] = matched_outcomes.astype("Int64")

# Tally including <NA> to confirm every test passenger received a label.
test["Survived"].value_counts(dropna=False)

Survived
0    275
1    143
Name: count, dtype: Int64

In [47]:
# "Survived" is a nullable Int64 column, so a failed lookup would be written as
# an empty cell. Fail loudly here instead of producing a submission file with
# missing labels.
assert test["Survived"].notna().all(), "some test passengers have no Survived prediction"

# Write only the two submission columns; index=False keeps the DataFrame's row
# index out of the CSV.
test[["PassengerId", "Survived"]].to_csv("./data/test_submission_2.csv", index=False)