In [144]:
import numpy as np
import pandas as pd

In [145]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [146]:
print(train.shape)

(891, 12)


In [147]:
print(test.shape)

(418, 11)


In [148]:
print(train.columns)
print(test.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [149]:
#train.head(30)

In [150]:
#print('did not survive: ', train[train['Survived']==0].value_counts()
#train[(train['Survived']==1) & (train['Pclass']==3) & (train['Sex']=='male')]

In [151]:
# calculate the majority class, and the accuracy if always predict that class
mask = train['Survived']==0
train[mask].shape
print(549/891) # percentage to beat from dummy

0.6161616161616161


In [152]:
# remove unwanted columns
feature_to_keep_train = ['PassengerId', 'Survived', 'Age', 'Pclass', 'Sex', 'SibSp', 'Parch','Fare','Embarked']
feature_to_keep_test = ['PassengerId', 'Pclass', 'Age', 'Sex','SibSp', 'Parch','Fare','Embarked']

train = train[feature_to_keep_train]
test = test[feature_to_keep_test]
train.head()

Unnamed: 0,PassengerId,Survived,Age,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,1,0,22.0,3,male,1,0,7.25,S
1,2,1,38.0,1,female,1,0,71.2833,C
2,3,1,26.0,3,female,0,0,7.925,S
3,4,1,35.0,1,female,1,0,53.1,S
4,5,0,35.0,3,male,0,0,8.05,S


In [153]:
# create one hot columns
train = pd.get_dummies(train, columns=["Pclass",'Sex', "Embarked"], prefix=["Pclass", 'Sex', "Embarked"])
test = pd.get_dummies(test, columns=["Pclass",'Sex', "Embarked"], prefix=["Pclass", 'Sex', "Embarked"])
train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,3,1,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,5,0,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [154]:
test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,34.5,0,0,7.8292,0,0,1,0,1,0,1,0
1,893,47.0,1,0,7.0,0,0,1,1,0,0,0,1
2,894,62.0,0,0,9.6875,0,1,0,0,1,0,1,0
3,895,27.0,0,0,8.6625,0,0,1,0,1,0,0,1
4,896,22.0,1,1,12.2875,0,0,1,1,0,0,0,1


In [155]:
#### special consideration for filling up age #########
train['Age'] = train['Age'].fillna(value=30)
test['Age'] = test['Age'].fillna(value=30)

In [156]:
# splitting training set and dev set
from sklearn.utils import shuffle
train = shuffle(train)

m, _ = train.shape  # 819, number of rows in training set
m_train = 712 # about 80% of the training data. dev set will be approx. 20% of the training data

X_train = train.iloc[:m_train,2:]
y_train = train.iloc[:m_train,1]

X_dev = train.iloc[m_train:m,2:]
y_dev = train.iloc[m_train:m,1]

X_test_id = test.iloc[:,0]
X_test = test.drop(columns=['PassengerId'])
X_test = X_test.fillna(0)

In [157]:
X_test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,34.5,0,0,7.8292,0,0,1,0,1,0,1,0
1,47.0,1,0,7.0,0,0,1,1,0,0,0,1
2,62.0,0,0,9.6875,0,1,0,0,1,0,1,0
3,27.0,0,0,8.6625,0,0,1,0,1,0,0,1
4,22.0,1,1,12.2875,0,0,1,1,0,0,0,1


In [158]:
print(X_train.shape)
print(X_dev.shape)
print(X_test.shape)

(712, 12)
(179, 12)
(418, 12)


In [159]:
X_train['Fare'].head(10)

499      7.7958
4        8.0500
208      7.7500
46      15.5000
538     14.5000
350      9.2250
387     13.0000
22       8.0292
325    135.6333
530     26.0000
Name: Fare, dtype: float64

In [160]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train[['SibSp','Fare','Parch']] = scaler.fit_transform(X_train[['SibSp','Fare','Parch']])
X_dev[['SibSp','Fare','Parch']] = scaler.fit_transform(X_dev[['SibSp','Fare','Parch']])
X_test[['SibSp','Fare','Parch']] = scaler.fit_transform(X_test[['SibSp','Fare','Parch']])
X_train.head(10)
# we must apply the scaling to the test set that we computed for the training set
#test = scaler.fit_transform(test)

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
499,24.0,0.0,0.0,0.015216,0,0,1,0,1,0,0,1
4,35.0,0.0,0.0,0.015713,0,0,1,0,1,0,0,1
208,16.0,0.0,0.0,0.015127,0,0,1,1,0,0,1,0
46,30.0,0.125,0.0,0.030254,0,0,1,0,1,0,1,0
538,30.0,0.0,0.0,0.028302,0,0,1,0,1,0,0,1
350,23.0,0.0,0.0,0.018006,0,0,1,0,1,0,0,1
387,36.0,0.0,0.0,0.025374,0,1,0,1,0,0,0,1
22,15.0,0.0,0.0,0.015672,0,0,1,1,0,0,1,0
325,36.0,0.0,0.0,0.264739,1,0,0,1,0,1,0,0
530,2.0,0.125,0.166667,0.050749,0,1,0,1,0,0,0,1


In [161]:
######## Random Gradient Boosting Trees ########

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
params = {'learning_rate':[0.04, 0.05, 0.06, 0.07, 0.1, 0.15, 0.2], 'n_estimators':[20,40,60,100,120]}

clf = GradientBoostingClassifier()

# grid search
grid_clf_acc = GridSearchCV(clf, param_grid=params, scoring='accuracy')
grid_clf_acc.fit(X_train, y_train)

print(np.array(grid_clf_acc.cv_results_['mean_test_score']))
print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

[0.81601124 0.81741573 0.81039326 0.80337079 0.81460674 0.81039326
 0.81601124 0.80477528 0.81320225 0.81179775 0.81741573 0.81039326
 0.80477528 0.81320225 0.81460674 0.81741573 0.80617978 0.81179775
 0.81039326 0.81741573 0.81179775 0.80898876 0.81039326 0.81320225
 0.81882022 0.80898876 0.81601124 0.81601124 0.81460674 0.81320225
 0.80477528 0.81039326 0.80758427 0.80898876 0.81320225]
Grid best parameter (max. accuracy):  {'learning_rate': 0.1, 'n_estimators': 120}
Grid best score (accuracy):  0.8188202247191011


In [162]:
####### Evaluate model with dev set ######## 
from sklearn.metrics import accuracy_score
clf = GradientBoostingClassifier(learning_rate=0.08, n_estimators=40, max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_dev)
accuracy_score(y_dev, y_pred)

0.8156424581005587

In [163]:
####### predict the test set #######

y_test = clf.predict(X_test)
output = pd.concat([pd.Series(X_test_id), pd.Series(y_test)], axis=1)

In [164]:
output.columns = ['PassengerId','Survived']
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [165]:
y_test.shape

(418,)

In [166]:
output[output.isnull().any(axis=1)]

Unnamed: 0,PassengerId,Survived


In [167]:
np.all(np.isfinite(output))

True

In [168]:
output = output.set_index('PassengerId')
output.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [169]:
output.to_csv('predict.csv')