In [192]:
import numpy as np
import pandas as pd

In [193]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [194]:
print(train.shape)

(891, 12)


In [195]:
print(test.shape)

(418, 11)


In [196]:
print(train.columns)
print(test.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [197]:
# calculate the majority class, and the accuracy if always predict that class
mask = train['Survived']==0
train[mask].shape
print(549/891) # percentage to beat from dummy

0.6161616161616161


In [198]:
# remove unwanted columns
feature_to_keep_train = ['PassengerId', 'Survived', 'Age', 'Pclass', 'Sex', 'SibSp', 'Parch','Fare','Embarked']
feature_to_keep_test = ['PassengerId', 'Pclass', 'Age', 'Sex','SibSp', 'Parch','Fare','Embarked']

train = train[feature_to_keep_train]
test = test[feature_to_keep_test]
train.head()

Unnamed: 0,PassengerId,Survived,Age,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,1,0,22.0,3,male,1,0,7.25,S
1,2,1,38.0,1,female,1,0,71.2833,C
2,3,1,26.0,3,female,0,0,7.925,S
3,4,1,35.0,1,female,1,0,53.1,S
4,5,0,35.0,3,male,0,0,8.05,S


In [199]:
# create one hot columns
train = pd.get_dummies(train, columns=["Pclass",'Sex', "Embarked"], prefix=["Pclass", 'Sex', "Embarked"])
test = pd.get_dummies(test, columns=["Pclass",'Sex', "Embarked"], prefix=["Pclass", 'Sex', "Embarked"])
train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,3,1,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,5,0,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [200]:
test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,34.5,0,0,7.8292,0,0,1,0,1,0,1,0
1,893,47.0,1,0,7.0,0,0,1,1,0,0,0,1
2,894,62.0,0,0,9.6875,0,1,0,0,1,0,1,0
3,895,27.0,0,0,8.6625,0,0,1,0,1,0,0,1
4,896,22.0,1,1,12.2875,0,0,1,1,0,0,0,1


In [201]:
#### special consideration for filling up age #########
train['Age'] = train['Age'].fillna(value=30)
test['Age'] = test['Age'].fillna(value=30)

In [202]:
# splitting training set and dev set
from sklearn.utils import shuffle
train = shuffle(train)

m, _ = train.shape  # 819, number of rows in training set
m_train = 712 # about 80% of the training data. dev set will be approx. 20% of the training data

X_train = train.iloc[:m_train,2:]
y_train = train.iloc[:m_train,1]

X_dev = train.iloc[m_train:m,2:]
y_dev = train.iloc[m_train:m,1]

X_test_id = test.iloc[:,0]
X_test = test.drop(columns=['PassengerId'])
X_test = X_test.fillna(0)

In [203]:
X_test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,34.5,0,0,7.8292,0,0,1,0,1,0,1,0
1,47.0,1,0,7.0,0,0,1,1,0,0,0,1
2,62.0,0,0,9.6875,0,1,0,0,1,0,1,0
3,27.0,0,0,8.6625,0,0,1,0,1,0,0,1
4,22.0,1,1,12.2875,0,0,1,1,0,0,0,1


In [204]:
print(X_train.shape)
print(X_dev.shape)
print(X_test.shape)

(712, 12)
(179, 12)
(418, 12)


In [205]:
# scaling some columns using minMaxScaler

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train[['SibSp','Fare','Parch']] = scaler.fit_transform(X_train[['SibSp','Fare','Parch']])
X_dev[['SibSp','Fare','Parch']] = scaler.fit_transform(X_dev[['SibSp','Fare','Parch']])
X_test[['SibSp','Fare','Parch']] = scaler.fit_transform(X_test[['SibSp','Fare','Parch']])
X_train.head(10)

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
446,13.0,0.0,0.166667,0.038061,0,1,0,1,0,0,0,1
42,30.0,0.0,0.0,0.015412,0,0,1,0,1,1,0,0
30,40.0,0.0,0.0,0.054107,1,0,0,0,1,1,0,0
204,18.0,0.0,0.0,0.015713,0,0,1,0,1,0,0,1
406,51.0,0.0,0.0,0.015127,0,0,1,0,1,0,0,1
302,19.0,0.0,0.0,0.0,0,0,1,0,1,0,0,1
186,30.0,0.125,0.0,0.030254,0,0,1,1,0,0,1,0
562,28.0,0.0,0.0,0.02635,0,1,0,0,1,0,0,1
376,22.0,0.0,0.0,0.014151,0,0,1,1,0,0,0,1
107,30.0,0.0,0.0,0.015176,0,0,1,0,1,0,0,1


In [206]:
######## Random Gradient Boosting Trees ########

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
params = {'learning_rate':[0.04, 0.05, 0.06, 0.07, 0.1, 0.15, 0.2], 'n_estimators':[20,40,60,100,120]}

clf = GradientBoostingClassifier()

# grid search
grid_clf_acc = GridSearchCV(clf, param_grid=params, scoring='accuracy')
grid_clf_acc.fit(X_train, y_train)

print(np.array(grid_clf_acc.cv_results_['mean_test_score']))
print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

[0.81460674 0.81601124 0.82022472 0.82022472 0.81601124 0.81460674
 0.81601124 0.82022472 0.81460674 0.81179775 0.81601124 0.8244382
 0.82162921 0.81601124 0.81460674 0.81320225 0.81741573 0.81882022
 0.81320225 0.81179775 0.81179775 0.82162921 0.80898876 0.80898876
 0.80477528 0.8244382  0.81460674 0.80758427 0.80196629 0.78932584
 0.81741573 0.81320225 0.80898876 0.7991573  0.79213483]
Grid best parameter (max. accuracy):  {'learning_rate': 0.06, 'n_estimators': 40}
Grid best score (accuracy):  0.824438202247191


In [207]:
####### Evaluate model with dev set ######## 
from sklearn.metrics import accuracy_score
clf = GradientBoostingClassifier(learning_rate=0.08, n_estimators=40, max_depth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_dev)
accuracy_score(y_dev, y_pred)

0.8435754189944135

In [208]:
####### predict the test set #######
y_test = clf.predict(X_test)
output = pd.concat([pd.Series(X_test_id), pd.Series(y_test)], axis=1)

In [209]:
output.columns = ['PassengerId','Survived']
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [210]:
# output[output.isnull().any(axis=1)]
# np.all(np.isfinite(output))

In [211]:
output = output.set_index('PassengerId')
output.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [212]:
output.to_csv('predict.csv')