In [5]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import xgboost as xgb

In [6]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3)

In [11]:
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=1000, max_depth=2, learning_rate=0.5, nthread=7)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8314606741573034

In [13]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [14]:
param = {'max_depth': 2, 'eta': 0.5, 'silent': 1, 'objective': 'binary:logistic'}
param['nthread'] = 7
param['eval_metric'] = 'auc'
evallist = [(dtest, 'eval'), (dtrain, 'train')]
plst = param.items()

In [15]:
num_round = 50
bst = xgb.train(plst, dtrain, num_round, evallist)

[0]	eval-auc:0.841095	train-auc:0.83032
[1]	eval-auc:0.878886	train-auc:0.856715
[2]	eval-auc:0.882222	train-auc:0.860867
[3]	eval-auc:0.886995	train-auc:0.86653
[4]	eval-auc:0.883201	train-auc:0.872533
[5]	eval-auc:0.88519	train-auc:0.878433
[6]	eval-auc:0.886934	train-auc:0.881503
[7]	eval-auc:0.889015	train-auc:0.89201
[8]	eval-auc:0.889259	train-auc:0.892757
[9]	eval-auc:0.88877	train-auc:0.895702
[10]	eval-auc:0.898439	train-auc:0.902999
[11]	eval-auc:0.894584	train-auc:0.90393
[12]	eval-auc:0.893972	train-auc:0.904883
[13]	eval-auc:0.895685	train-auc:0.906777
[14]	eval-auc:0.896695	train-auc:0.908634
[15]	eval-auc:0.8959	train-auc:0.911671
[16]	eval-auc:0.890453	train-auc:0.912277
[17]	eval-auc:0.89082	train-auc:0.912959
[18]	eval-auc:0.890575	train-auc:0.914394
[19]	eval-auc:0.892931	train-auc:0.916673
[20]	eval-auc:0.887852	train-auc:0.919845
[21]	eval-auc:0.885282	train-auc:0.919683
[22]	eval-auc:0.883446	train-auc:0.920435
[23]	eval-auc:0.886444	train-auc:0.920446
[24]	eval-a

In [16]:
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

In [17]:
(sum((ypred>0.5) == y_test)) / 267.0

0.8689138576779026

In [18]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import numpy as np

In [19]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [23]:
import lightgbm as lgb

estimator = lgb.LGBMClassifier(num_leaves=2)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.05, 0.5, 1],
    'n_estimators': [20, 40, 60, 80, 100, 120]
}

gbm = GridSearchCV(estimator, param_grid, cv=5, scoring='roc_auc')

gbm.fit(X, y)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'learning_rate': 0.5, 'n_estimators': 80}


In [24]:
sum(gbm.best_estimator_.predict(X_test) == y_test) / (len(y_test)*1.0)

0.8576779026217228

In [25]:
sum(
    (gbm.best_estimator_.predict_proba(X_test)[:,1]> 0.51 ) == y_test) / (len(y_test)*1.0)

0.8614232209737828

In [26]:

from xgboost.sklearn import XGBClassifier
estimator = XGBClassifier()

In [27]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}

In [28]:

clf = GridSearchCV(estimator, parameters, n_jobs=5, 
                   cv=5, 
                   scoring='roc_auc',
                   verbose=2, refit=True)

clf.fit(X, y)
print('Best parameters found by grid search are:', gbm.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.


Best parameters found by grid search are: {'learning_rate': 0.5, 'n_estimators': 80}


[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    2.8s remaining:    4.2s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    2.8s finished


In [29]:
sum(clf.best_estimator_.predict(X_test) == y_test) / (len(y_test)*1.0)

0.797752808988764

In [30]:
sum(
    (clf.best_estimator_.predict_proba(X_test)[:,1]> 0.51 ) == y_test) / (len(y_test)*1.0)

0.8014981273408239