In [1]:
import os

import numpy as np
import pandas as pd
import keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

DIR_TRAIN = os.getcwd() + "\\data\\train.csv"
DIR_TEST = os.getcwd() + "\\data\\test.csv"

test_names = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
train_names = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin',
               'Embarked']

Using TensorFlow backend.


In [2]:
def loadData(is_train, dir):
    if is_train:
        names = train_names
    else:
        names = test_names
    data = pd.read_csv(dir, header=0, names=names)
    return data

In [3]:
def splitData(datas, labels, splite):
    return train_test_split(datas, labels, test_size=splite, random_state=42)

In [4]:
data_train = loadData(is_train=True, dir=DIR_TRAIN)

In [5]:
data_test = loadData(is_train=False,dir=DIR_TEST)

In [6]:
x_train = data_train.drop(['PassengerId','Ticket','Survived','Cabin'],axis = 1)
y_train = data_train['Survived']

In [7]:
print(x_train.head())
print(y_train.head())

   Pclass                                               Name     Sex   Age  \
0       3                            Braund, Mr. Owen Harris    male  22.0   
1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
2       3                             Heikkinen, Miss. Laina  female  26.0   
3       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   
4       3                           Allen, Mr. William Henry    male  35.0   

   SibSp  Parch     Fare Embarked  
0      1      0   7.2500        S  
1      1      0  71.2833        C  
2      0      0   7.9250        S  
3      1      0  53.1000        S  
4      0      0   8.0500        S  
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [8]:
x_val = data_test.drop(['PassengerId','Ticket','Cabin'],axis=1)

In [9]:
print(x_train.shape)
print(x_val.shape)
data = x_train.append(x_val)
print(data.shape)

(891, 8)
(418, 8)
(1309, 8)


In [10]:
data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [11]:
data['Title'] = data['Name'].map(lambda x: x.split(', ')[1].split('.')[0])
data.sample(20)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
28,1,"Brady, Mr. John Bertram",male,41.0,0,0,30.5,S,Mr
196,3,"Mernagh, Mr. Robert",male,,0,0,7.75,Q,Mr
644,3,"Baclini, Miss. Eugenie",female,0.75,2,1,19.2583,C,Miss
17,3,"Assaf, Mr. Gerios",male,21.0,0,0,7.225,C,Mr
531,3,"Toufik, Mr. Nakli",male,,0,0,7.2292,C,Mr
402,1,"Gibson, Miss. Dorothy Winifred",female,22.0,0,1,59.4,C,Miss
176,2,"Sincock, Miss. Maude",female,20.0,0,0,36.75,S,Miss
51,3,"Nosworthy, Mr. Richard Cater",male,21.0,0,0,7.8,S,Mr
91,3,"Andreasson, Mr. Paul Edvin",male,20.0,0,0,7.8542,S,Mr
722,2,"Gillespie, Mr. William Henry",male,34.0,0,0,13.0,S,Mr


In [12]:
data = data.drop('Name',axis=1)
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,male,22.0,1,0,7.25,S,Mr
1,1,female,38.0,1,0,71.2833,C,Mrs
2,3,female,26.0,0,0,7.925,S,Miss
3,1,female,35.0,1,0,53.1,S,Mrs
4,3,male,35.0,0,0,8.05,S,Mr


In [13]:
data['FamilySize'] = data['SibSp']+data['Parch']+1
data.sample(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize
880,2,female,25.0,0,1,26.0,S,Mrs,2
264,2,male,30.0,0,0,12.7375,C,Mr,1
121,3,male,,1,0,7.75,Q,Mr,2
21,2,male,34.0,0,0,13.0,S,Mr,1
231,3,male,29.0,0,0,7.775,S,Mr,1
575,3,male,19.0,0,0,14.5,S,Mr,1
865,2,female,42.0,0,0,13.0,S,Mrs,1
145,3,male,31.0,3,0,18.0,S,Mr,4
275,1,female,63.0,1,0,77.9583,S,Miss,2
387,2,female,36.0,0,0,13.0,S,Miss,1


In [14]:
data['IsAlone'] = 1
data['IsAlone'].loc[data['FamilySize'] > 1] = 0
data.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
60,3,male,22.0,0,0,7.2292,C,Mr,1,1
773,3,male,,0,0,7.225,C,Mr,1,1
410,3,female,,0,0,7.75,Q,Miss,1,1
203,3,male,45.5,0,0,7.225,C,Mr,1,1
56,3,male,35.0,0,0,7.8958,S,Mr,1,1
638,3,female,41.0,0,5,39.6875,S,Mrs,6,0
685,2,male,25.0,1,2,41.5792,C,Mr,4,0
711,1,male,,0,0,26.55,S,Mr,1,1
172,3,female,1.0,1,1,11.1333,S,Miss,3,0
223,3,male,,0,0,7.8958,S,Mr,1,1


In [15]:
print('-' *10)
print(x_train.isnull().sum())
print('-' *10)
print(x_val.isnull().sum())

----------
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64
----------
Pclass       0
Name         0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [16]:
print(data['Title'].value_counts())
#将Title小于10的变为Msic

title_names = (data['Title'].value_counts()<10)

data['Title'] = data['Title'].map(lambda x: 'Msic' if title_names.loc[x] == True else x)
data.sample(20)

Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Mlle              2
Major             2
Ms                2
Don               1
Mme               1
Sir               1
the Countess      1
Capt              1
Lady              1
Dona              1
Jonkheer          1
Name: Title, dtype: int64


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
408,3,male,21.0,0,0,7.775,S,Mr,1,1
725,3,male,20.0,0,0,8.6625,S,Mr,1,1
625,1,male,61.0,0,0,32.3208,S,Mr,1,1
289,3,male,,0,0,8.05,S,Mr,1,1
619,2,male,26.0,0,0,10.5,S,Mr,1,1
370,1,male,25.0,1,0,55.4417,C,Mr,2,0
301,3,male,,2,0,23.25,Q,Mr,3,0
101,3,male,,0,0,7.8958,S,Mr,1,1
762,3,male,20.0,0,0,7.2292,C,Mr,1,1
335,1,male,30.0,0,0,26.0,S,Mr,1,1


In [17]:
age_mean = data['Age'].mean()
fare_mean = data['Fare'].mean()
print('age mean : {}    fare mean : {}'.format(age_mean,fare_mean))

age mean : 29.881137667304014    fare mean : 33.29547928134557


In [18]:
#mode 泛数
data['Embarked'].mode()

0    S
dtype: object

In [19]:
data['Embarked'].fillna(data['Embarked'].mode()[0],inplace=True)
data = data.fillna({'Age':age_mean,'Fare':fare_mean})

In [20]:
print(data.isnull().sum())

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
Title         0
FamilySize    0
IsAlone       0
dtype: int64


In [21]:
data['Embarked'] = pd.Categorical(data['Embarked'])
data['Embarked'] = data['Embarked'].cat.codes+1

data['Sex'] = pd.Categorical(data['Sex'])
data['Sex'] = data['Sex'].cat.codes+1

data['Title'] = pd.Categorical(data['Title'])
data['Title'] = data['Title'].cat.codes+1

In [22]:
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
0,3,2,22.0,1,0,7.25,3,3,2,0
1,1,1,38.0,1,0,71.2833,1,4,2,0
2,3,1,26.0,0,0,7.925,3,2,1,1
3,1,1,35.0,1,0,53.1,3,4,2,0
4,3,2,35.0,0,0,8.05,3,3,1,1


In [23]:
age_bin = [-1,0,5,12,18,25,35,60,120]
group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
age_cat = pd.cut(data['Age'],bins=age_bin,labels=group_names)
data['Age'] = age_cat.cat.codes

In [24]:
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
0,3,2,4,1,0,7.25,3,3,2,0
1,1,1,6,1,0,71.2833,1,4,2,0
2,3,1,5,0,0,7.925,3,2,1,1
3,1,1,5,1,0,53.1,3,4,2,0
4,3,2,5,0,0,8.05,3,3,1,1


In [25]:
fare_bin = (-1, 0, 8, 15, 31, 1000)
group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
fare_cat = pd.cut(data['Fare'],bins=fare_bin,labels=group_names)
data['Fare'] = fare_cat.cat.codes
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
0,3,2,4,1,0,1,3,3,2,0
1,1,1,6,1,0,4,1,4,2,0
2,3,1,5,0,0,1,3,2,1,1
3,1,1,5,1,0,4,3,4,2,0
4,3,2,5,0,0,2,3,3,1,1


In [26]:
if os.path.exists('./working/train.csv'):
    os.remove('./working/train.csv')
    
save_train = data.iloc[0:891].copy()
save_train['Survived'] = y_train
save_train.to_csv('./working/train.csv',index=False)

if os.path.exists('./working/test.csv'):
    os.remove('./working/test.csv')
    
save_val = data.iloc[891:].copy()
save_val.to_csv('./working/test.csv',index=False)

In [27]:
def generator_data():
    x_train = pd.read_csv('./working/train.csv', header=0,)
    y_train = x_train.pop('Survived') 
    x_train,x_test,y_train,y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=33)
    
    y_train = np.asarray(y_train)
    x_train = np.asarray(x_train)

    y_test = np.asarray(y_test)
    y_train = np.asarray(y_train)
    
    return x_train,y_train,x_test,y_test

In [28]:
x_train_,y_train_,x_test_,y_test_ = generator_data()
x_train_

array([[3, 2, 5, ..., 3, 1, 1],
       [2, 1, 6, ..., 4, 3, 0],
       [3, 2, 4, ..., 3, 1, 1],
       ...,
       [2, 2, 4, ..., 3, 2, 0],
       [3, 2, 4, ..., 3, 1, 1],
       [2, 2, 5, ..., 3, 1, 1]], dtype=int64)

In [50]:
from hyperopt import Trials, STATUS_OK, tpe,hp
from hyperopt.fmin import fmin
from hyperas import optim
from hyperas.distributions import choice, uniform


def create_model(x_train,y_train,x_test,y_test):
    '''
    :return: keras model
    '''
    model = keras.models.Sequential()
    model.add(keras.layers.Dense({{choice([16,32,64,128])}}, activation='relu',kernel_regularizer=keras.regularizers.l1_l2(l1=0.001,l2=0.001)))
    model.add(keras.layers.Dropout({{uniform(0.1,0.8)}}))
    model.add(keras.layers.Dense({{choice([16,32,64,128])}}, activation='relu',kernel_regularizer=keras.regularizers.l1_l2(l1=0.001,l2=0.001)))
    model.add(keras.layers.Dropout({{uniform(0.1,0.8)}}))
    model.add(keras.layers.Dense({{choice([16,32,64,128])}}, activation='relu',kernel_regularizer=keras.regularizers.l1_l2(l1=0.001,l2=0.001)))
    model.add(keras.layers.Dropout({{uniform(0.1,0.8)}}))
    model.add(keras.layers.Dense(1,activation='sigmoid'))
#     model.add(keras.layers.Dense(8, activation='relu'))
    model.compile(optimizer=keras.optimizers.RMSprop(), loss=keras.losses.binary_crossentropy, metrics=[keras.metrics.binary_accuracy])
    
    history = model.fit(x_train,y_train,batch_size=128,validation_data=[x_test,y_test],epochs=20)
    
    validation_acc = np.amax(history.history['val_binary_accuracy']) 
    
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

In [51]:
??optim.minimize

In [52]:
keras.backend.clear_session()

best_run, best_model= optim.minimize(model=create_model,
                                          data=generator_data,
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials(),notebook_name='titanic_v2')



>>> Imports:
#coding=utf-8

try:
    import os
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import keras
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from hyperopt import Trials, STATUS_OK, tpe, hp
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform
except:
    pass

try:
    from keras.models import load_model
except:
    pass

try:
    import xgboost as xgb
except:
    pass

try:
    from sklearn.model_selection import StratifiedKFold
except:
    pass

try:
    import hyperopt.pyll.stochastic
except:
    pass

try:
    from sklearn.metrics import make_scorer
except:
    pass

>>> Hyperas search space:

def get_space():
    return {
        'Dense': hp.choice('Dense', [16,32,64,128]),
        'Dropout': hp.uniform('

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 712 samples, validate on 179 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 712 samples, validate on 179 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 712 samples, validate on 179 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [53]:
print("Evalutation of best performing model:")
print(best_model.evaluate(x_test_,y_test_))
print("Best performing model chosen hyper-parameters:")
print(best_run)

Evalutation of best performing model:
[0.8496508391875794, 0.7318435767509418]
Best performing model chosen hyper-parameters:
{'Dense': 2, 'Dense_1': 0, 'Dense_2': 3, 'Dropout': 0.38886345096749875, 'Dropout_1': 0.4391118666124083, 'Dropout_2': 0.11825586217831917}


In [54]:
best_model.save('./working/best_model.h5')

In [55]:
from keras.models import load_model
model = load_model('./working/best_model.h5')

In [56]:
model.evaluate(x_test_,y_test_,batch_size=64)



[0.849650854837961, 0.7318435797478233]

#### xgboost

In [65]:
import xgboost as xgb  
from sklearn.model_selection import StratifiedKFold
import hyperopt.pyll.stochastic
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

In [58]:
def gini(truth, predictions):
    g = np.asarray(np.c_[truth, predictions, np.arange(len(truth)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(truth) + 1) / 2.
    return gs / len(truth)

def gini_xgb(predictions, truth):
    truth = truth.get_label()
    return 'gini', -1.0 * gini(truth, predictions) / gini(truth, truth)

def gini_lgb(truth, predictions):
    score = gini(truth, predictions) / gini(truth, truth)
    return 'gini', score, True

def gini_sklearn(truth, predictions):
    return gini(truth, predictions) / gini(truth, truth)

gini_scorer = make_scorer(gini_sklearn, greater_is_better=True, needs_proba=True)

In [112]:
xgb_space={
  #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
            'learning_rate': hp.choice('learning_rate',[.01, .03, .05, .1, .25]), #default: .3
            'max_depth': hp.choice('max_depth',[1,2,4,6,8,10]), #default 2
            'n_estimators':hp.choice('grid_n_estimator',[10, 50, 100, 300]) , 
            'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
            'gamma': hp.uniform('gamma', 0.0, 0.5),
}

In [113]:
hyperopt.pyll.stochastic.sample(xgb_space)

{'colsample_bytree': 0.5059774069533536,
 'gamma': 0.25090304858874624,
 'learning_rate': 0.25,
 'max_depth': 2,
 'n_estimators': 50}

In [114]:
def xgb_objective(params):
    param = {
        'gamma':"{:.3f}".format(params['gamma']),
        'colsample_bytree':"{:.3f}".format(params['colsample_bytree']),
        'learning_rate':params['learning_rate'],
        'n_estimators':params['n_estimators'],
        'max_depth':params['max_depth']
    }
    
    clf = xgb.XGBClassifier(seed=0,** param)
    
    score = cross_val_score(clf, x_train_, y_train_, scoring=gini_scorer, cv=StratifiedKFold()).mean()
    print("Gini {:.3f} params {}".format(score, params))
    return score

In [115]:
xgb_best =fmin(fn=xgb_objective,
            space=xgb_space,
            algo=tpe.suggest,
            max_evals=20)



Gini 0.699 params {'colsample_bytree': 0.4591725362551607, 'gamma': 0.17689873629271607, 'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 10}
Gini 0.719 params {'colsample_bytree': 0.9259237692643127, 'gamma': 0.2954896374846197, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 10}
Gini 0.677 params {'colsample_bytree': 0.7347267819815047, 'gamma': 0.009514868292543444, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100}
Gini 0.693 params {'colsample_bytree': 0.5379487706695507, 'gamma': 0.44549637243973556, 'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 100}
Gini 0.710 params {'colsample_bytree': 0.9571077216993986, 'gamma': 0.28291546661859346, 'learning_rate': 0.03, 'max_depth': 4, 'n_estimators': 50}
Gini 0.713 params {'colsample_bytree': 0.6467897677195162, 'gamma': 0.2968993646304432, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 10}
Gini 0.695 params {'colsample_bytree': 0.3001702130077853, 'gamma': 0.2903123227531913, 'learning_rate': 0.1, '



Gini 0.680 params {'colsample_bytree': 0.5808757120037438, 'gamma': 0.1672106626200895, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}




Gini 0.679 params {'colsample_bytree': 0.6073293646773715, 'gamma': 0.15376039065544467, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}
Gini 0.719 params {'colsample_bytree': 0.9586320130168144, 'gamma': 0.05761775719369472, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 10}
Gini 0.723 params {'colsample_bytree': 0.6269196811198354, 'gamma': 0.019427652487297553, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 10}
Gini 0.675 params {'colsample_bytree': 0.5022863378480525, 'gamma': 0.37224895141323827, 'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 300}
Gini 0.711 params {'colsample_bytree': 0.8294260744834123, 'gamma': 0.45786624291147, 'learning_rate': 0.03, 'max_depth': 4, 'n_estimators': 100}




Gini 0.717 params {'colsample_bytree': 0.907589958621102, 'gamma': 0.1648322450919782, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
Gini 0.719 params {'colsample_bytree': 0.6924411627332555, 'gamma': 0.4906142288743329, 'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 50}




In [116]:
print(xgb_best)

{'colsample_bytree': 0.5793219363822731, 'gamma': 0.030289511477984088, 'grid_n_estimator': 3, 'learning_rate': 0, 'max_depth': 0}


In [84]:
import lightgbm as lgbm

In [85]:
lgb_space = {
    'num_leaves': hp.quniform('num_leaves', 8, 128, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}

In [92]:
def lgbm_objective(params):
    params = {
        'num_leaves': int(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }
    
    clf = lgbm.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.01,
        **params
    )
    
    score = cross_val_score(clf, x_train_, y_train_, scoring=gini_scorer, cv=StratifiedKFold()).mean()
    print("Gini {:.3f} params {}".format(score, params))
    return score

In [94]:
lgbm_best = fmin(fn=lgbm_objective,space=lgb_space,max_evals=20,algo=tpe.suggest)



Gini 0.706 params {'num_leaves': 106, 'colsample_bytree': '0.795'}




Gini 0.713 params {'num_leaves': 26, 'colsample_bytree': '0.542'}




Gini 0.706 params {'num_leaves': 94, 'colsample_bytree': '0.745'}




Gini 0.709 params {'num_leaves': 50, 'colsample_bytree': '0.680'}




Gini 0.715 params {'num_leaves': 124, 'colsample_bytree': '0.445'}




Gini 0.706 params {'num_leaves': 82, 'colsample_bytree': '0.780'}




Gini 0.709 params {'num_leaves': 42, 'colsample_bytree': '0.690'}




Gini 0.700 params {'num_leaves': 84, 'colsample_bytree': '0.878'}




Gini 0.706 params {'num_leaves': 70, 'colsample_bytree': '0.723'}




Gini 0.709 params {'num_leaves': 64, 'colsample_bytree': '0.660'}




Gini 0.712 params {'num_leaves': 58, 'colsample_bytree': '0.379'}




Gini 0.712 params {'num_leaves': 126, 'colsample_bytree': '0.397'}




Gini 0.709 params {'num_leaves': 16, 'colsample_bytree': '0.622'}




Gini 0.706 params {'num_leaves': 112, 'colsample_bytree': '0.776'}




Gini 0.713 params {'num_leaves': 116, 'colsample_bytree': '0.502'}




Gini 0.700 params {'num_leaves': 76, 'colsample_bytree': '0.868'}




Gini 0.709 params {'num_leaves': 20, 'colsample_bytree': '0.679'}




Gini 0.706 params {'num_leaves': 20, 'colsample_bytree': '0.701'}




Gini 0.715 params {'num_leaves': 14, 'colsample_bytree': '0.475'}




Gini 0.713 params {'num_leaves': 84, 'colsample_bytree': '0.520'}


In [95]:
print(lgbm_best)

{'colsample_bytree': 0.8783036158249162, 'num_leaves': 84.0}


In [102]:
from sklearn.ensemble import RandomForestClassifier

In [106]:
def rf_objective(params):
    params = {'n_estimators': int(params['n_estimators']), 'max_depth': int(params['max_depth'])}
    clf = RandomForestClassifier(n_jobs=4, class_weight='balanced', **params)
    score = cross_val_score(clf, x_train_,y_train_, scoring=gini_scorer, cv=StratifiedKFold()).mean()
    print("Gini {:.3f} params {}".format(score, params))
    return score

In [107]:
rf_space = {
    'n_estimators': hp.quniform('n_estimators', 25, 500, 25),
    'max_depth': hp.quniform('max_depth', 1, 10, 1)
}

In [108]:
rf_best = fmin(fn=rf_objective,
            space=rf_space,
            algo=tpe.suggest,
            max_evals=20)



Gini 0.710 params {'n_estimators': 425, 'max_depth': 4}




Gini 0.652 params {'n_estimators': 250, 'max_depth': 1}




Gini 0.698 params {'n_estimators': 350, 'max_depth': 8}




Gini 0.719 params {'n_estimators': 175, 'max_depth': 5}




Gini 0.724 params {'n_estimators': 375, 'max_depth': 6}




Gini 0.695 params {'n_estimators': 425, 'max_depth': 2}




Gini 0.713 params {'n_estimators': 125, 'max_depth': 7}




Gini 0.693 params {'n_estimators': 275, 'max_depth': 2}




Gini 0.720 params {'n_estimators': 250, 'max_depth': 5}




Gini 0.711 params {'n_estimators': 275, 'max_depth': 4}
Gini 0.697 params {'n_estimators': 25, 'max_depth': 3}




Gini 0.719 params {'n_estimators': 475, 'max_depth': 6}




Gini 0.720 params {'n_estimators': 350, 'max_depth': 5}
Gini 0.705 params {'n_estimators': 25, 'max_depth': 3}




Gini 0.727 params {'n_estimators': 300, 'max_depth': 6}




Gini 0.717 params {'n_estimators': 475, 'max_depth': 7}




Gini 0.724 params {'n_estimators': 375, 'max_depth': 6}




Gini 0.695 params {'n_estimators': 175, 'max_depth': 2}




Gini 0.708 params {'n_estimators': 475, 'max_depth': 3}




Gini 0.721 params {'n_estimators': 225, 'max_depth': 6}


In [109]:
print(rf_best)

{'max_depth': 1.0, 'n_estimators': 250.0}


In [110]:
rf_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=250,
    max_depth=1
)

In [111]:
lgbm_model = lgbm.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.01,
    num_leaves=84,
    colsample_bytree=0.88
)

In [117]:
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=1,
    colsample_bytree=0.58,
    gamma=0.05
)

In [118]:
from keras.wrappers.scikit_learn import KerasClassifier

In [194]:
def build_keras_fn(epcho=20):
    model = load_model('./working/best_model.h5')
    return model

In [182]:
class MyKerasClassifier(KerasClassifier):
    def predict(self, x, **kwargs):
        p = super(MyKerasClassifier, self).predict(x, **kwargs)
#         print('predict')
#         print(p.shape)
        if p.shape[1] == 1:
#             print(p[:,0].shape)
            return p[:,0]
        return p
    def predict_proba(self, x, **kwargs):
        probs = super(MyKerasClassifier, self).predict_proba(x, **kwargs)
#         print('predict_proba')
#         print(probs.shape)
        return probs
    

In [195]:
keras_clf = MyKerasClassifier(build_fn=build_keras_fn)

In [196]:
vote_est = [
    #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
   
    ('rfc', rf_model),
    
    ('lgbm', lgbm_model),
    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
   ('xgb', xgb_model),
    
    ('keras',keras_clf)

]

In [197]:
from sklearn.model_selection import ShuffleSplit
cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

In [198]:
from sklearn import ensemble
from sklearn import model_selection

In [199]:
print(x_train_.shape)
print(y_train_.shape)

(712, 10)
(712,)


In [200]:
import types

In [201]:
 isinstance(build_keras_fn, types.FunctionType)

True

In [202]:
#Hard Vote or majority rules w/Tuned Hyperparameters
grid_hard = ensemble.VotingClassifier(estimators = vote_est , voting = 'hard')
grid_hard_cv = model_selection.cross_validate(grid_hard,x_train_,y_train_, cv  = cv_split)
grid_hard.fit(x_train_,y_train_)

print("Hard Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}". format(grid_hard_cv['train_score'].mean()*100)) 
print("Hard Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}". format(grid_hard_cv['test_score'].mean()*100))
print("Hard Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- {:.2f}". format(grid_hard_cv['test_score'].std()*100*3))
print('-'*10)

#Soft Vote or weighted probabilities w/Tuned Hyperparameters
grid_soft = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')
grid_soft_cv = model_selection.cross_validate(grid_soft, x_train_,y_train_, cv  = cv_split)
grid_soft.fit(x_train_,y_train_)

print("Soft Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}". format(grid_soft_cv['train_score'].mean()*100)) 
print("Soft Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}". format(grid_soft_cv['test_score'].mean()*100))
print("Soft Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- {:.2f}". format(grid_soft_cv['test_score'].std()*100*3))
print('-'*10)

Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
(214,)
(427,)
Epoch 1/1
Hard Voting w/Tuned Hyperparameters Training w/bin score mean: 81.85
Hard Voting w/Tuned Hyperparameters Test w/bin score mean: 82.34
Hard Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- 3.70
----------




Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Soft Voting w/Tuned Hyperparameters Training w/bin score mean: 83.75
Soft Voting w/Tuned Hyperparameters Test w/bin score mean: 82.38
Soft Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- 4.86
----------




In [229]:
predictions = grid_hard.predict(np.asarray(data[891:]))
predictions= predictions.reshape((418,1))
predictions

(418,)


array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
    

In [230]:
ids = data_test['PassengerId'].copy()
new_output = ids.to_frame()
new_output

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
5,897
6,898
7,899
8,900
9,901


In [231]:
new_output['Survived'] = predictions
new_output.sample(10)

Unnamed: 0,PassengerId,Survived
88,980,1
234,1126,0
33,925,1
320,1212,0
348,1240,0
411,1303,1
358,1250,0
344,1236,0
60,952,0
183,1075,0


In [232]:


new_output.to_csv('./working/my_submit.csv',index=False)