In [176]:
import numpy as np
import pandas as pd

In [177]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df2 = pd.read_csv('gender_submission.csv')

In [178]:
print(train.shape)

(891, 12)


In [179]:
print(train.columns)
print(test.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [180]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [181]:
# calculate the majority class, and the accuracy if always predict that class
mask = train['Survived']==0
train[mask].shape
print(549/891) # percentage to beat from dummy

0.6161616161616161


In [182]:
# remove unwanted columns
feature_to_keep_train = ['PassengerId', 'Survived', 'Pclass', 'Sex', 'SibSp', 'Parch','Fare','Embarked']
feature_to_keep_test = ['PassengerId', 'Pclass', 'Sex','SibSp', 'Parch','Fare','Embarked']

train = train[feature_to_keep_train]
test = test[feature_to_keep_test]
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,1,0,3,male,1,0,7.25,S
1,2,1,1,female,1,0,71.2833,C
2,3,1,3,female,0,0,7.925,S
3,4,1,1,female,1,0,53.1,S
4,5,0,3,male,0,0,8.05,S


In [183]:
# normalize column: Fare
print(train['Fare'].head())
#train['Fare'] = (train['Fare'] - train['Fare'].mean())/train['Fare'].std()**2
print(train['Fare'].head())

0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64
0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64


In [184]:
# create one hot columns
train = pd.get_dummies(train, columns=["Pclass", "Sex", "Embarked"], prefix=["Pclass", "Sex", "Embarked"])
test = pd.get_dummies(test, columns=["Pclass", "Sex", "Embarked"], prefix=["Pclass", "Sex", "Embarked"])
train.head()

Unnamed: 0,PassengerId,Survived,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,1,0,7.25,0,0,1,0,1,0,0,1
1,2,1,1,0,71.2833,1,0,0,1,0,1,0,0
2,3,1,0,0,7.925,0,0,1,1,0,0,0,1
3,4,1,1,0,53.1,1,0,0,1,0,0,0,1
4,5,0,0,0,8.05,0,0,1,0,1,0,0,1


In [185]:
# splitting training set and dev set
from sklearn.utils import shuffle
train = shuffle(train)

m, _ = train.shape  # 819, number of rows in training set
m_train = 712 # about 80% of the training data. dev set will be approx. 20% of the training data

X_train = train.iloc[:m_train,2:]
X_dev = train.iloc[m_train:m,2:]
X_train.head()

Unnamed: 0,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
841,0,0,10.5,0,1,0,0,1,0,0,1
446,0,1,19.5,0,1,0,1,0,0,0,1
134,0,0,13.0,0,1,0,0,1,0,0,1
348,1,1,15.9,0,0,1,0,1,0,0,1
296,0,0,7.2292,0,0,1,0,1,1,0,0


In [186]:
# create Y train and dev set
y_train = train.iloc[:m_train,1]
y_dev = train.iloc[m_train:m,1]

print(type(y_train))
print(y_train.size + y_dev.size)

<class 'pandas.core.series.Series'>
891


In [187]:
######## Random Gradient Boosting Trees ########

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
params = {'learning_rate':[0.07, 0.1, 0.15, 0.2], 'n_estimators':[120,180,240]}

clf = GradientBoostingClassifier()

# grid search
grid_clf_acc = GridSearchCV(clf, param_grid=params, scoring='accuracy')
grid_clf_acc.fit(X_train, y_train)

print(np.array(grid_clf_acc.cv_results_['mean_test_score']))
print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

[0.8005618  0.8005618  0.80477528 0.80477528 0.80337079 0.7991573
 0.80617978 0.80477528 0.80617978 0.79494382 0.79775281 0.79775281]
Grid best parameter (max. accuracy):  {'learning_rate': 0.15, 'n_estimators': 120}
Grid best score (accuracy):  0.8061797752808989


In [223]:
####### Evaluate model with dev set ######## 
from sklearn.metrics import accuracy_score
clf = GradientBoostingClassifier(learning_rate=0.07, n_estimators=120, max_depth=7)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_dev)
accuracy_score(y_dev, y_pred)

  y = column_or_1d(y, warn=True)


0.8324022346368715

In [189]:
print(X_train.shape)
print(type(X_train))
print(y_train.shape)
print(type(y_train))

(712, 11)
<class 'pandas.core.frame.DataFrame'>
(712,)
<class 'pandas.core.series.Series'>


In [190]:
X_train = np.array(X_train)
y_train = np.array(y_train)
y_train = y_train.reshape((m_train,1))
print(X_train.shape)
print(type(X_train))
print(y_train.shape)
print(type(y_train))

(712, 11)
<class 'numpy.ndarray'>
(712, 1)
<class 'numpy.ndarray'>


In [191]:
test.head()

Unnamed: 0,PassengerId,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,0,0,7.8292,0,0,1,0,1,0,1,0
1,893,1,0,7.0,0,0,1,1,0,0,0,1
2,894,0,0,9.6875,0,1,0,0,1,0,1,0
3,895,0,0,8.6625,0,0,1,0,1,0,0,1
4,896,1,1,12.2875,0,0,1,1,0,0,0,1


In [192]:
test.shape

(418, 12)

In [193]:
#test['Fare'] = (test['Fare'] - test['Fare'].mean())/test['Fare'].std()**2

In [194]:
output = test.PassengerId
X_test = test.iloc[:,1:]

print(X_test.columns)
print(X_dev.columns)

test.head(10)
X_test = X_test.fillna(value=0)
y_test = clf.predict(X_test)

Index(['SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')
Index(['SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [195]:
np.any(np.isnan(X_test))

False

In [199]:
y_test = pd.Series(y_test)

In [213]:
output = pd.concat([output, y_test], axis=1)

In [214]:
output.columns = ['PassengerId','Survived']
output = output.set_index('PassengerId')

In [215]:
output.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
,0.0
,1.0
,0.0
,0.0
,1.0


In [216]:
output.to_csv('predict.csv')