In [1]:
# Autoreload (mode 2) re-imports edited modules before each cell executes,
# so changes to local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
import pandas as pd
# Read the training set from the notebook's working directory.
training_data_path = "./train.csv"
training_data = pd.read_csv(training_data_path)

In [2]:
# Summary statistics; the output below covers the numeric columns only.
training_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


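Noting down some useful metrics (read off the `describe()` output above):
- About 38.4% of the passengers in the training set survived (mean of Survived ≈ 0.384).
- At least half of the passengers were in 3rd class (median Pclass is 3).
- At least half of the passengers had no siblings or spouse on board (median SibSp is 0). SibSp counts siblings as well as spouses, so this alone does not tell us how many passengers were unmarried.
- At least 75% of the passengers had no parents or children on board (the 75th percentile of Parch is 0).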

In [3]:
# Preview the first rows to see every column, including the non-numeric ones.
training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Comments:
- Name, Ticket, Cabin, and Embarked look less informative than the other columns, so they are not used as features below.
- SibSp and Parch are zero for most passengers, so they only carry information for some rows.

In [4]:
# Names of the columns that contain at least one missing value.
training_data.columns[training_data.isna().any(axis=0)]

Index(['Age', 'Cabin', 'Embarked'], dtype='object')

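Using \['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'\] as the model features. Of these, Age has missing values that preprocessing must handle, and Sex is a text column that needs to be encoded as numbers.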

In [5]:
# List the distinct categories of Sex to decide how to encode it.
training_data['Sex'].unique()

array(['male', 'female'], dtype=object)

Preprocessing:

In [6]:
from preprocess import train_preprocess

# Columns fed to the model; the same list is reused later for the test set.
training_features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

# train_preprocess hands back four objects: processed train/validation
# feature frames followed by the matching train/validation targets.
(
    train_X_processed,
    val_X_processed,
    train_y,
    val_y,
) = train_preprocess(training_data, training_features)

In [7]:
# Show the first rows of the processed training frame, then of the
# validation frame, as plain text (one frame after the other).
print(
    train_X_processed.head(),
    val_X_processed.head(),
    sep="\n",
)

   Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male
0     3.0  28.0    0.0    0.0   7.8958         0.0       1.0
1     3.0  17.0    4.0    2.0   7.9250         1.0       0.0
2     3.0  30.0    1.0    0.0  16.1000         0.0       1.0
3     3.0  22.0    0.0    0.0   7.2500         0.0       1.0
4     2.0  45.0    0.0    0.0  13.5000         1.0       0.0
   Pclass   Age  SibSp  Parch      Fare  Sex_female  Sex_male
0     3.0  29.9    0.0    0.0   14.4583         0.0       1.0
1     3.0  29.9    0.0    0.0    7.5500         0.0       1.0
2     3.0   7.0    4.0    1.0   29.1250         0.0       1.0
3     1.0  29.9    1.0    0.0  146.5208         1.0       0.0
4     3.0  29.0    0.0    2.0   15.2458         1.0       0.0


In [8]:
from sklearn.ensemble import RandomForestClassifier
# Default hyperparameters; random_state is pinned so refitting on the same
# data reproduces the same forest.
model_randomforest = RandomForestClassifier(random_state=1)
model_randomforest.fit(train_X_processed, train_y)

In [9]:
from sklearn.metrics import accuracy_score
# Score the fitted forest on the validation frames returned by train_preprocess
# (presumably a held-out split — confirm in preprocess.py).
y_pred = model_randomforest.predict(val_X_processed)
print(accuracy_score(val_y, y_pred))

0.8385650224215246


In [10]:
# Score the test set with the fitted forest and assemble the submission frame
# (one row per test passenger: PassengerId plus the predicted Survived label).
test_data_path = "./test.csv"
test_data = pd.read_csv(test_data_path)
from preprocess import test_preprocess
# Reuse the training feature list for the test-side preprocessing.
X = test_preprocess(test_data, training_features)
# pd.concat(axis=1) aligns on index: if preprocessing dropped rows, the result
# would be silently padded with NaN and Survived would turn into floats.
# Fail loudly here instead of writing a broken submission later.
assert len(X) == len(test_data), (
    f"test_preprocess returned {len(X)} rows for {len(test_data)} passengers"
)
# Build the prediction frame with its final column name and a fresh 0..n-1
# index, which matches the index read_csv gave test_data.
survived = pd.DataFrame({'Survived': model_randomforest.predict(X)})
df_subm = pd.concat([test_data['PassengerId'], survived], axis=1)
print(df_subm)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         1
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


In [11]:
# Write the submission file; index=False keeps the row index out of the CSV.
df_subm.to_csv('./out.csv',index=False)