In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

import string
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [278]:
df_train = pd.read_csv('train.csv')
df_train.name = 'Training Set'
df_test = pd.read_csv('test.csv')
df_test.name = 'Training Set'
df_all = pd.concat([df_train, df_test], sort=True)
df_all.name = 'All Set' 
df_all.reset_index(inplace=True, drop=True)
dfs = [df_train, df_test]

In [279]:
print(df_train.shape)
df_train.head(3)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [280]:
print(df_test.shape)
df_test.head(3)

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [281]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 97.2+ KB


In [282]:
def display_missing(df):
    print('{}'.format(df.name))
    for col in df.columns.tolist():          
        print('{} column missing values: {}'.format(col, df[col].isnull().sum()))
    print('\n')

In [283]:
for df in dfs:
    display_missing(df)

Training Set
PassengerId column missing values: 0
Survived column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 177
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 0
Cabin column missing values: 687
Embarked column missing values: 2


Training Set
PassengerId column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 86
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 1
Cabin column missing values: 327
Embarked column missing values: 0




# 1. Missing Values

In [284]:
df_all_corr = df_all.corr().abs().unstack().sort_values(kind='quicksort', ascending=False).reset_index()

In [285]:
df_all_corr.rename(columns={'level_0':'Feature_1', 'level_1':'Feature_2', 0:'Coeficiente Correlation'}, inplace=True )

In [286]:
df_all_corr[df_all_corr['Feature_1']=='Age']

Unnamed: 0,Feature_1,Feature_2,Coeficiente Correlation
6,Age,Age,1.0
9,Age,Pclass,0.408106
17,Age,SibSp,0.243699
22,Age,Fare,0.17874
25,Age,Parch,0.150917
29,Age,Survived,0.077221
41,Age,PassengerId,0.028814


In [287]:
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass']).median()['Age']

In [288]:
age_by_pclass_sex.head()

Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
Name: Age, dtype: float64

In [289]:
df_all['Age'] = df_all.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

In [290]:
df_all[df_all['Embarked'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
61,38.0,B28,,80.0,"Icard, Miss. Amelie",0,62,1,female,0,1.0,113572
829,62.0,B28,,80.0,"Stone, Mrs. George Nelson (Martha Evelyn)",0,830,1,female,0,1.0,113572


In [291]:
df_all['Embarked'] = df_all['Embarked'].fillna('S')

In [292]:
df_all[df_all['Fare'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
1043,60.5,,S,,"Storey, Mr. Thomas",0,1044,3,male,0,,3701


In [293]:
med_fare = df_all.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]

In [294]:
df_all['Fare'] = df_all['Fare'].fillna(med_fare)

In [295]:
df_all['Deck'] = df_all['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

In [296]:
df_all.head(2)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Deck
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,M
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,C


In [297]:
df_all.Deck.value_counts()

M    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: Deck, dtype: int64

In [298]:
df_all['Deck'].loc[df_all['Deck'] == 'T'] = 'M'

In [299]:
df_all.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Deck
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,M
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,C
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,M
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,C
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,M


In [300]:
df_all.Deck.value_counts()

M    1015
C      94
B      65
D      46
E      41
A      22
F      21
G       5
Name: Deck, dtype: int64

In [301]:
df_all[['Deck','Survived']].groupby(['Deck'], as_index=False).mean().sort_values(by='Deck', ascending=True)

Unnamed: 0,Deck,Survived
0,A,0.466667
1,B,0.744681
2,C,0.59322
3,D,0.757576
4,E,0.75
5,F,0.615385
6,G,0.5
7,M,0.299419


In [302]:
df_all.drop(['Cabin'], inplace=True, axis=1)

In [303]:
display_missing(df_all)

All Set
Age column missing values: 0
Embarked column missing values: 0
Fare column missing values: 0
Name column missing values: 0
Parch column missing values: 0
PassengerId column missing values: 0
Pclass column missing values: 0
Sex column missing values: 0
SibSp column missing values: 0
Survived column missing values: 418
Ticket column missing values: 0
Deck column missing values: 0




In [304]:
df_all.head()

Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Deck
0,22.0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,M
1,38.0,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,C
2,26.0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,M
3,35.0,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,C
4,35.0,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,M


# 2. Correlations

In [305]:
df_train = df_all.loc[:890]
df_test = df_all.loc[891:]
df_test.drop(columns=['Survived'], inplace=True)
dfs = [df_train, df_test]

In [306]:
df_train['Survived'].value_counts()

0.0    549
1.0    342
Name: Survived, dtype: int64

In [307]:
df_train_corr = df_train.corr().abs().unstack().sort_values(kind='quicksort', ascending=False).reset_index()

In [308]:
df_train_corr.rename(columns={'level_0':'Feature_1', 'level_1':'Feature_2', 0:'Correlation'}, inplace=True)

In [309]:
df_train_corr.drop(df_train_corr.iloc[1::2].index, inplace=True)

In [310]:
df_train_corr_nd = df_train_corr.drop(df_train_corr[df_train_corr['Correlation']==1.0].index)

In [311]:
corr = df_train_corr_nd['Correlation'] > 0.1
df_train_corr_nd[corr]

Unnamed: 0,Feature_1,Feature_2,Correlation
8,Pclass,Fare,0.5495
10,Age,Pclass,0.417667
12,Parch,SibSp,0.414838
14,Survived,Pclass,0.338481
16,Survived,Fare,0.257307
18,SibSp,Age,0.249747
20,Parch,Fare,0.216225
22,Age,Parch,0.176733
24,SibSp,Fare,0.159651
26,Age,Fare,0.124061


# 3. Feature Engineering

In [312]:
df_all = pd.concat([df_train, df_test], sort=True)

In [313]:
df_all['Fare'] = pd.qcut(df_all['Fare'], 4)

In [314]:
df_all['Age'] = pd.qcut(df_all['Age'], 5)

In [315]:
df_all.head(2)

Unnamed: 0,Age,Deck,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,"(21.0, 25.0]",M,S,"(-0.001, 7.896]","Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,"(29.5, 40.0]",C,C,"(31.275, 512.329]","Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599


In [316]:
df_all['Fare'].value_counts()

(-0.001, 7.896]      338
(14.454, 31.275]     328
(31.275, 512.329]    323
(7.896, 14.454]      320
Name: Fare, dtype: int64

In [317]:
df_all['Family_Size'] = df_all['Parch'] + df_all['SibSp'] + 1

In [318]:
df_all.head(2)

Unnamed: 0,Age,Deck,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size
0,"(21.0, 25.0]",M,S,"(-0.001, 7.896]","Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,2
1,"(29.5, 40.0]",C,C,"(31.275, 512.329]","Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,2


In [319]:
df_all['Title'] = df_all['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

In [320]:
df_all.head(2)

Unnamed: 0,Age,Deck,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size,Title
0,"(21.0, 25.0]",M,S,"(-0.001, 7.896]","Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,2,Mr
1,"(29.5, 40.0]",C,C,"(31.275, 512.329]","Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,2,Mrs


In [321]:
df_all['Is_Married'] = 0
df_all['Is_Married'].loc[df_all['Title'] == 'Mrs'] = 1

In [322]:
df_all.head(2)

Unnamed: 0,Age,Deck,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size,Title,Is_Married
0,"(21.0, 25.0]",M,S,"(-0.001, 7.896]","Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,2,Mr,0
1,"(29.5, 40.0]",C,C,"(31.275, 512.329]","Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,2,Mrs,1


In [323]:
df_all['Title'] .value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Major             2
Ms                2
Jonkheer          1
Mme               1
Don               1
the Countess      1
Dona              1
Lady              1
Sir               1
Capt              1
Name: Title, dtype: int64

In [324]:
df_all['Title']  = df_all['Title'].replace(['Lady', 'the Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df_all['Title']  = df_all['Title'].replace('Mlle', 'Miss')
df_all['Title']  = df_all['Title'].replace('Ms', 'Miss')
df_all['Title']  = df_all['Title'].replace('Mme', 'Mrs')

In [325]:
df_all['Title'] .value_counts()

Mr        757
Miss      264
Mrs       198
Master     61
Rare       29
Name: Title, dtype: int64

In [326]:
family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 5: 'Medium', 6: 'Medium', 7: 'Large', 8: 'Large', 11: 'Large'}
df_all['Family_Size_Grouped'] = df_all['Family_Size'].map(family_map)

In [327]:
df_all['Is_Alone'] = 0
df_all['Is_Alone'].loc[df_all['Family_Size'] == 1] = 1

In [328]:
df_train = df_all.loc[:890]
df_test = df_all.loc[891:]
dfs = [df_train, df_test]

In [329]:
df_all.head(2)

Unnamed: 0,Age,Deck,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size,Title,Is_Married,Family_Size_Grouped,Is_Alone
0,"(21.0, 25.0]",M,S,"(-0.001, 7.896]","Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,2,Mr,0,Small,0
1,"(29.5, 40.0]",C,C,"(31.275, 512.329]","Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,2,Mrs,1,Small,0


In [330]:
print(df_train.shape)
print(df_test.shape)
df_train.head(2)

(891, 17)
(418, 17)


Unnamed: 0,Age,Deck,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Family_Size,Title,Is_Married,Family_Size_Grouped,Is_Alone
0,"(21.0, 25.0]",M,S,"(-0.001, 7.896]","Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,2,Mr,0,Small,0
1,"(29.5, 40.0]",C,C,"(31.275, 512.329]","Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,2,Mrs,1,Small,0


# 4. Label Enconding

In [333]:
str_features = ['Age','Deck','Embarked','Fare','Sex', 'Title', 'Family_Size_Grouped']

for df in dfs:
    for feature in str_features:        
        df[feature] = LabelEncoder().fit_transform(df[feature])

In [334]:
cat_features = ['Deck','Embarked','Sex','Pclass', 'Title', 'Family_Size_Grouped']
encoded_features = []

for df in dfs:
    for feature in cat_features:
        encoded_feat = OneHotEncoder().fit_transform(df[feature].values.reshape(-1, 1)).toarray()
        n = df[feature].nunique()
        cols = ['{}_{}'.format(feature, n) for n in range(1, n + 1)]
        encoded_df = pd.DataFrame(encoded_feat, columns=cols)
        encoded_df.index = df.index
        encoded_features.append(encoded_df)

df_train = pd.concat([df_train, *encoded_features[:6]], axis=1)
df_test = pd.concat([df_test, *encoded_features[6:]], axis=1)

In [335]:
print(df_train.shape)
print(df_test.shape)

(891, 42)
(418, 42)


In [336]:
df_train.head(2)

Unnamed: 0,Age,Deck,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,Pclass_3,Title_1,Title_2,Title_3,Title_4,Title_5,Family_Size_Grouped_1,Family_Size_Grouped_2,Family_Size_Grouped_3,Family_Size_Grouped_4
0,1,7,2,0,"Braund, Mr. Owen Harris",0,1,3,1,1,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,2,0,3,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [337]:
drop_cols = ['Name', 'Ticket', 'PassengerId', 'Pclass', 'Sex', 'Embarked', 'Title', 'Family_Size', 'Family_Size_Grouped', 'SibSp', 'Parch', 'Deck']

In [338]:
df_train = df_train.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)

In [339]:
print(df_train.shape)
print(df_test.shape)

(891, 30)
(418, 30)


In [340]:
X = df_train.drop("Survived", axis=1)
y = df_train["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# 5. Predict Models

In [341]:
# Logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, y_train) * 100, 2)
print('Accuracy: ', acc_log)
print(roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1]))

Accuracy:  83.47
0.879694941822


In [342]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
print('Accuracy: ', acc_decision_tree)
print('AUC: ', roc_auc_score(y_test, decision_tree.predict_proba(X_test)[:,1]))

Accuracy:  91.65
AUC:  0.793693856635


In [343]:
# Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, y_train)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print('Accuracy: ', acc_random_forest)
print('AUC: ', roc_auc_score(y_test, random_forest.predict_proba(X_test)[:,1]))

Accuracy:  91.65
AUC:  0.855579670827


In [344]:
clf = RandomForestClassifier(criterion='gini', 
                            n_estimators=700,
                            max_depth=4,
                            min_samples_split=19,
                            min_samples_leaf=6, 
                            max_features='auto', 
                            oob_score=True, 
                            n_jobs=-1,
                            verbose=1) 

clf.fit(StandardScaler().fit_transform(X_train), y_train)
print('RandomForestClassifier oob score: {}'.format(clf.oob_score_))

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:    0.4s finished


RandomForestClassifier oob score: 0.8105939004815409


In [345]:
# Boosting
from sklearn import ensemble
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(X_test)))
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))

             precision    recall  f1-score   support

        0.0       0.87      0.88      0.88       171
        1.0       0.79      0.76      0.77        97

avg / total       0.84      0.84      0.84       268

0.884517996021


In [346]:
tuned_parameters = [{'learning_rate': [0.001, 0.01, 0.1],
                     'max_depth' : [1, 3, 10],
                     'min_samples_split': [2, 10],
                     'n_estimators': [10, 100, 500],
                     'subsample': [0.5, 1]}]

In [347]:
clf = GridSearchCV(ensemble.GradientBoostingClassifier(), tuned_parameters, cv=5, verbose=10)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10, subsample=0.5, score=0.6048387096774194, total=   0.0s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=10,

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s


[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.6048387096774194, total=   0.0s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.6048387096774194, total=   0.0s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=1 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=2, n_estimators=100, subsample=1, score=0.608, total=   0.0s
[

[CV]  learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.808, total=   0.2s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.8, total=   0.2s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.776, total=   0.2s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.782258064516129, total=   0.2s
[CV] learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.75, total=   0.2s
[CV] 

[CV]  learning_rate=0.001, max_depth=3, min_samples_split=2, n_estimators=500, subsample=1, score=0.782258064516129, total=   0.4s
[CV] learning_rate=0.001, max_depth=3, min_samples_split=2, n_estimators=500, subsample=1 
[CV]  learning_rate=0.001, max_depth=3, min_samples_split=2, n_estimators=500, subsample=1, score=0.7661290322580645, total=   0.4s
[CV] learning_rate=0.001, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.001, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.001, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV]

[CV]  learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.608, total=   0.3s
[CV] learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.608, total=   0.3s
[CV] learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.608, total=   0.3s
[CV] learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.6048387096774194, total=   0.3s
[CV] learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.6048387096774194, to

[CV]  learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=100, subsample=1, score=0.6048387096774194, total=   0.2s
[CV] learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=100, subsample=1 
[CV]  learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=100, subsample=1, score=0.6048387096774194, total=   0.2s
[CV] learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.792, total=   1.1s
[CV] learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.784, total=   1.1s
[CV] learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.001, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.856,

[CV]  learning_rate=0.01, max_depth=1, min_samples_split=2, n_estimators=500, subsample=1, score=0.8, total=   0.1s
[CV] learning_rate=0.01, max_depth=1, min_samples_split=2, n_estimators=500, subsample=1 
[CV]  learning_rate=0.01, max_depth=1, min_samples_split=2, n_estimators=500, subsample=1, score=0.808, total=   0.1s
[CV] learning_rate=0.01, max_depth=1, min_samples_split=2, n_estimators=500, subsample=1 
[CV]  learning_rate=0.01, max_depth=1, min_samples_split=2, n_estimators=500, subsample=1, score=0.7983870967741935, total=   0.1s
[CV] learning_rate=0.01, max_depth=1, min_samples_split=2, n_estimators=500, subsample=1 
[CV]  learning_rate=0.01, max_depth=1, min_samples_split=2, n_estimators=500, subsample=1, score=0.7580645161290323, total=   0.1s
[CV] learning_rate=0.01, max_depth=1, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=1, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.0

[CV]  learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.792, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.76, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.848, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.8145161290322581, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.7983870967741935, total=   0.0s
[CV] le

[CV]  learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=100, subsample=1, score=0.7741935483870968, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.784, total=   0.4s
[CV] learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.776, total=   0.4s
[CV] learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.824, total=   0.4s
[CV] learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.8387096774193549, total=   0.4s

[CV]  learning_rate=0.01, max_depth=10, min_samples_split=2, n_estimators=500, subsample=1, score=0.776, total=   2.4s
[CV] learning_rate=0.01, max_depth=10, min_samples_split=2, n_estimators=500, subsample=1 
[CV]  learning_rate=0.01, max_depth=10, min_samples_split=2, n_estimators=500, subsample=1, score=0.8225806451612904, total=   2.3s
[CV] learning_rate=0.01, max_depth=10, min_samples_split=2, n_estimators=500, subsample=1 
[CV]  learning_rate=0.01, max_depth=10, min_samples_split=2, n_estimators=500, subsample=1, score=0.8387096774193549, total=   2.3s
[CV] learning_rate=0.01, max_depth=10, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=10, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV] learning_rate=0.01, max_depth=10, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=10, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.608, total=   0.0s
[CV] le

[CV]  learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.816, total=   0.0s
[CV] learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.84, total=   0.0s
[CV] learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.8306451612903226, total=   0.0s
[CV] learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.7903225806451613, total=   0.0s
[CV] learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=1 
[CV]  learning_rate=0.1, max_depth=1, min_samples_split=2, n_estimators=100, subsample=1, score=0.848, total=   0.0s
[CV] learning_rate=0

[CV]  learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.856, total=   0.2s
[CV] learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.832, total=   0.2s
[CV] learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.816, total=   0.2s
[CV] learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.8306451612903226, total=   0.2s
[CV] learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=1, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.7903225806451613, total=   0.2s
[CV] l

[CV]  learning_rate=0.1, max_depth=3, min_samples_split=2, n_estimators=500, subsample=1, score=0.8225806451612904, total=   0.3s
[CV] learning_rate=0.1, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.808, total=   0.0s
[CV] learning_rate=0.1, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.752, total=   0.0s
[CV] learning_rate=0.1, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.824, total=   0.0s
[CV] learning_rate=0.1, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=3, min_samples_split=10, n_estimators=10, subsample=0.5, score=0.8225806451612904, total=   0.0s
[CV] learning_rat

[CV]  learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=10, subsample=1, score=0.8306451612903226, total=   0.0s
[CV] learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=10, subsample=1 
[CV]  learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=10, subsample=1, score=0.8306451612903226, total=   0.0s
[CV] learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.824, total=   0.4s
[CV] learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.776, total=   0.4s
[CV] learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=10, min_samples_split=2, n_estimators=100, subsample=0.5, score=0.776, total=   0.4s
[CV] learning_r

[CV]  learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=100, subsample=1, score=0.8, total=   0.3s
[CV] learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=100, subsample=1 
[CV]  learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=100, subsample=1, score=0.8225806451612904, total=   0.3s
[CV] learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=100, subsample=1 
[CV]  learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=100, subsample=1, score=0.8467741935483871, total=   0.3s
[CV] learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.816, total=   1.1s
[CV] learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=10, min_samples_split=10, n_estimators=500, subsample=0.5, score=0.784, total=   1.2s
[CV] lear

[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed:  3.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'learning_rate': [0.001, 0.01, 0.1], 'max_depth': [1, 3, 10], 'min_samples_split': [2, 10], 'n_estimators': [10, 100, 500], 'subsample': [0.5, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [348]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_test, clf.predict(X_test)))
print()

Best parameters set found on development set:

{'learning_rate': 0.1, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.5}

Grid scores on development set:

0.607 (+/-0.003) for {'learning_rate': 0.001, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 10, 'subsample': 0.5}
0.607 (+/-0.003) for {'learning_rate': 0.001, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 10, 'subsample': 1}
0.607 (+/-0.003) for {'learning_rate': 0.001, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.5}
0.607 (+/-0.003) for {'learning_rate': 0.001, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 1}
0.783 (+/-0.040) for {'learning_rate': 0.001, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 500, 'subsample': 0.5}
0.783 (+/-0.040) for {'learning_rate': 0.001, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 500, 'subsample': 1}
0.607 (+/-0.003) for {'learning_rate': 0.001, 'max_depth': 1, 'min_samples_

In [349]:
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1]))

0.873244106831


In [350]:
df_test = df_test.drop(['Survived'], axis=1)

In [351]:
y_pred = clf.predict(StandardScaler().fit_transform(df_test)).astype(int)

# 5. Submission

In [358]:
PassengerId = pd.read_csv('test.csv')

In [359]:
submission_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
submission_df['PassengerId'] = PassengerId['PassengerId']
submission_df['Survived'] = y_pred

In [360]:
submission_df.head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1


In [361]:
submission_df.to_csv('submissions.csv', header=True, index=False)
submission_df.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
