In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Let pandas render up to 50 columns so the wide one-hot encoded frames built
# later in the notebook are shown in full instead of being elided.
pd.set_option('display.max_columns', 50)

In [2]:
# Load the training data. The path is relative, so it resolves against the
# working directory the notebook kernel was started in.
titanic_df = pd.read_csv("../data/train.csv")

In [3]:
# Render the freshly loaded frame to inspect its columns and spot missing
# values (e.g. NaN entries in Age and Cabin) before any cleaning is done.
display(titanic_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Impute missing ages with the median age of passengers sharing the same
# class and sex, and keep an indicator column recording which ages were
# originally missing.
#
# NOTE: this cell is not idempotent. Re-running it after Age has been filled
# resets NaN_Age to all zeros; re-run the data-loading cell first.
age_is_missing = titanic_df["Age"].isnull()

# transform("median") returns one value per row, aligned to the frame's own
# index, so it can be passed straight to fillna. (groupby(...).apply(...)
# prepends the group keys to the index on pandas >= 2.0, which makes the
# result impossible to assign back to the frame.)
group_median_age = titanic_df.groupby(["Pclass", "Sex"])["Age"].transform("median")

titanic_df["NaN_Age"] = age_is_missing.astype(int)
titanic_df["Age"] = titanic_df["Age"].fillna(group_median_age)

This next code block splits the cabin information into a letter and a number. Here, I assume that the letter and number combination reflects the cabin's location on the ship, and that this location is related to the chance of surviving.

In [5]:
# Replace missing cabins with the placeholder "N0", so that every value has a
# first character (the deck letter, or 'N' for "no cabin") and, for the
# placeholder, the number '0'.
titanic_df["Cabin"] = titanic_df["Cabin"].fillna("N0")
titanic_df["Cabin_Letter"] = titanic_df["Cabin"].str[0]

# Here, it gets a bit trickier because some people have several rooms.
# A regular expression keeps just the first letter+number match and throws the
# others away. I assume the number of rooms is correlated with columns like
# fare, and that it therefore isn't very important. There are also not that
# many people that have several rooms, so using that data runs the risk of
# overfitting.
# expand=False makes str.extract return a Series instead of a one-column frame.
first_cabin_number = titanic_df["Cabin"].str.extract(r"\w(\d{1,3})\s?.*", expand=False)

# Values without any digits do not match and come back as NaN; map them to 0,
# the same number used for the "N0" placeholder. The fill is done on the
# intermediate Series and assigned back (no inplace=True on a column selection,
# which is a no-op under pandas copy-on-write), and the fill value is the
# string "0" so the column stays homogeneous before the integer cast.
titanic_df["Cabin_Number"] = first_cabin_number.fillna("0").astype(int)

Here, I make some variables categorical. It could be that the cabin letter should be numerical rather than
categorical, because the decks are on top of each other, and it could be that height is the important part.
However, because I don't want to look further into deck placement, I just do it this way.

In [6]:
# One-hot encode the categorical columns. Each listed column is replaced by
# one indicator column per distinct value, named "<column>_<value>".
categorical_cols = ["Cabin_Letter", "Sex", "Embarked"]

# dtype=int pins the indicators to 0/1 integers. Without it the dtype depends
# on the installed pandas version (uint8 before 2.0, bool from 2.0 onwards).
titanic_df = pd.get_dummies(
    titanic_df,
    columns=categorical_cols,
    prefix=categorical_cols,
    prefix_sep='_',
    dtype=int,
)

In [7]:
# Feature matrix: the columns handed to the model, in a fixed order.
X = titanic_df.loc[
    :,
    [
        # Passenger attributes (only the female indicator of Sex is kept)
        "Pclass", "Age", "NaN_Age", "SibSp", "Parch", "Fare", "Sex_female",
        # Cabin number and one indicator per cabin letter ('N' = no cabin)
        "Cabin_Number",
        "Cabin_Letter_A", "Cabin_Letter_B", "Cabin_Letter_C",
        "Cabin_Letter_D", "Cabin_Letter_E", "Cabin_Letter_F",
        "Cabin_Letter_G", "Cabin_Letter_N", "Cabin_Letter_T",
        # Port of embarkation indicators
        "Embarked_C", "Embarked_Q", "Embarked_S",
    ],
]

# Target vector: the survival label.
Y = titanic_df.loc[:, "Survived"]

In [8]:
# Render the assembled feature matrix to check that the engineered columns
# (NaN_Age, Cabin_Number and the one-hot indicators) look as expected.
display(X)

Unnamed: 0,Pclass,Age,NaN_Age,SibSp,Parch,Fare,Sex_female,Cabin_Number,Cabin_Letter_A,Cabin_Letter_B,Cabin_Letter_C,Cabin_Letter_D,Cabin_Letter_E,Cabin_Letter_F,Cabin_Letter_G,Cabin_Letter_N,Cabin_Letter_T,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,0,1,0,7.2500,0,0,0,0,0,0,0,0,0,1,0,0,0,1
1,1,38.0,0,1,0,71.2833,1,85,0,0,1,0,0,0,0,0,0,1,0,0
2,3,26.0,0,0,0,7.9250,1,0,0,0,0,0,0,0,0,1,0,0,0,1
3,1,35.0,0,1,0,53.1000,1,123,0,0,1,0,0,0,0,0,0,0,0,1
4,3,35.0,0,0,0,8.0500,0,0,0,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,0,13.0000,0,0,0,0,0,0,0,0,0,1,0,0,0,1
887,1,19.0,0,0,0,30.0000,1,42,0,1,0,0,0,0,0,0,0,0,0,1
888,3,21.5,1,1,2,23.4500,1,0,0,0,0,0,0,0,0,1,0,0,0,1
889,1,26.0,0,0,0,30.0000,0,148,0,0,1,0,0,0,0,0,0,1,0,0


In [9]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the same rows land in the train and test sets on
# every Restart & Run All and downstream scores are comparable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)