In [85]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

# TODO: use S3 to to fetch data from.
submission = pd.read_csv('./data/gender_submission.csv')
training = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

training['train_test'] = 1
test['train_test'] = 0
test['Survived'] = np.NaN
all_data = pd.concat([training,test])


# Modelling
I will use all data, because it is historical, and I can do rundom split testing. 

In [104]:
# From Feature Engineering section
all_data['cabin_num'] = training.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
all_data['cabin_adv'] = training.Cabin.apply(lambda x: str(x)[0])

# Fill na's with median
all_data.Age = all_data.Age.fillna(training.Age.median())
all_data.Fare = all_data.Fare.fillna(training.Fare.median())

# log norm of fare
all_data['norm_fare'] = np.log(all_data.Fare+1)

# converted fare to category for pd.get_dummies()
all_data.Pclass = all_data.Pclass.astype(str)

#created dummy variables from categories (also can use OneHotEncoder)
all_dummies = pd.get_dummies(all_data[['Pclass','Sex','Age','SibSp','Parch','norm_fare','Embarked','cabin_adv','cabin_num','train_test', 'Survived']])
all_dummies

Unnamed: 0,Age,SibSp,Parch,norm_fare,cabin_num,train_test,Survived,Pclass_1,Pclass_2,Pclass_3,...,Embarked_S,cabin_adv_A,cabin_adv_B,cabin_adv_C,cabin_adv_D,cabin_adv_E,cabin_adv_F,cabin_adv_G,cabin_adv_T,cabin_adv_n
0,22.0,1,0,2.110213,0,1,0.0,False,False,True,...,True,False,False,False,False,False,False,False,False,True
1,38.0,1,0,4.280593,1,1,1.0,True,False,False,...,False,False,False,True,False,False,False,False,False,False
2,26.0,0,0,2.188856,0,1,1.0,False,False,True,...,True,False,False,False,False,False,False,False,False,True
3,35.0,1,0,3.990834,1,1,1.0,True,False,False,...,True,False,False,True,False,False,False,False,False,False
4,35.0,0,0,2.202765,0,1,0.0,False,False,True,...,True,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,0,0,2.202765,0,0,,False,False,True,...,True,False,False,False,False,False,False,False,False,True
414,39.0,0,0,4.699571,0,0,,True,False,False,...,False,False,False,False,False,False,False,False,False,True
415,38.5,0,0,2.110213,0,0,,False,False,True,...,True,False,False,False,False,False,False,False,False,True
416,28.0,0,0,2.202765,0,0,,False,False,True,...,True,False,False,False,False,False,False,False,False,True


In [90]:
features = ['Age', 'SibSp', 'Parch', 'norm_fare', 'cabin_num', 'train_test',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'cabin_adv_A',
       'cabin_adv_B', 'cabin_adv_C', 'cabin_adv_D', 'cabin_adv_E',
       'cabin_adv_F', 'cabin_adv_G', 'cabin_adv_T', 'cabin_adv_n']
train_data
train_data = all_dummies[all_dummies.train_test == 1]

X_train = train_data[features]
y_train = train_data.Survived

test_data = all_dummies[all_dummies.train_test == 0]

test_X = test_data[features]
test_y = submission.Survived

## Split Testing

In [91]:
# from sklearn.model_selection import train_test_split

# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=1)
y_train

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

## DecisionTreeRegressor

In [92]:
from sklearn.tree import DecisionTreeRegressor


model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)


In [93]:
print(f"On test data {model.predict(X_train.head(10))}")
print(f"Actual values {list(y_train.head(10))}")

On test data [0. 1. 1. 1. 0. 0. 0. 0. 1. 1.]
Actual values [0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]


In [94]:
from sklearn.metrics import mean_absolute_error

predicted = model.predict(test_X)
print(f"Predicted from test dataset {predicted[:10]}")
print(f"Actual val from test dataset {list(test_y[:10])}")

print("MAE", mean_absolute_error(predicted, test_y))

Predicted from test dataset [0.  0.  1.  0.5 1.  0.  0.  0.  1.  0. ]
Actual val from test dataset [0, 1, 0, 0, 1, 0, 1, 0, 1, 0]
MAE 0.21082535885167464


## Random Forest Tree

In [95]:
from sklearn.ensemble import RandomForestRegressor

model_forest = RandomForestRegressor(random_state=0)
model_forest.fit(X_train, y_train)

In [98]:
forest_predict = model_forest.predict(test_X)

print("Random Forest predict: ", [round(i) for i in forest_predict[:10]])
print("Actual values: ", list(test_y[:10]))
print("MAE", mean_absolute_error(forest_predict, test_y))
print("MAE", mean_absolute_error([round(i) for i in forest_predict], test_y))

Random Forest predict:  [0, 0, 1, 0, 1, 0, 0, 0, 1, 0]
Actual values:  [0, 1, 0, 0, 1, 0, 1, 0, 1, 0]
MAE 0.23173786089953766
MAE 0.14832535885167464


In [103]:
res_df = pd.DataFrame({"PassengerId": test.PassengerId, "Survived": [round(i) for i in forest_predict]})
# Rename the columns
res_df.columns = ['PassengerId', 'Survived']

# Display the transposed DataFrame
res_df.to_csv('output.csv', index=False)


##### 85.17% Accuracy