In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, manifold
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

%matplotlib inline

# Seed NumPy's global random number generator once, up front, with a named constant.
SEED = 8888
np.random.seed(SEED)

## Load different feature sets

In [None]:
# Basic feature set: read the train and test CSVs in one pass, then the parsed labels.
train_dat, test_dat = map(
    pd.read_csv,
    ('features/basic_train.csv', 'features/basic_test.csv'),
)

train_labels = np.load('train_labels_parsed.npy')

In [None]:
# Zeros/ones feature set: overwrites train_dat, test_dat and train_labels.
feature_paths = ['features/zeros_ones_train.csv', 'features/zeros_ones_test.csv']
train_dat, test_dat = [pd.read_csv(path) for path in feature_paths]

train_labels = np.load('features/train_labels.npy')

In [None]:
# Category-sums feature set: overwrites train_dat, test_dat and train_labels.
train_dat, test_dat = (
    pd.read_csv(csv_path)
    for csv_path in ('features/cat_sums_train.csv', 'features/cat_sums_test.csv')
)

train_labels = np.load('features/train_labels.npy')

In [None]:
# Trimmed feature set. Only the test split is loaded here: the train and label loads
# below are commented out, so train_dat / train_labels keep whatever an earlier cell
# assigned.
# NOTE(review): this leaves train_dat and test_dat coming from different feature files;
# presumably the training side is trimmed later via a column list -- confirm before
# relying on predictions made from this test_dat.
#train_dat = pd.read_csv('features/trimmed_train.csv')
test_dat = pd.read_csv('features/trimmed_test.csv')

#train_labels = np.load('features/train_labels.npy')

## Hold out data for testing

In [None]:
# `samp` is the row index where the training slice ends; rows from `samp` onward are the
# hold-out. With a fraction of 1.0 every row is used for training.
n_total = len(train_dat)
samp = int(n_total * 1.0)
print(n_total, 'total samples')
print(samp, 'used for training')

In [None]:
# Preview the first five rows of the training features.
train_dat.head(5)

In [None]:
# Wrap the training slice [0, samp) and the hold-out slice [samp, end) as DMatrix objects.
# `.values` is used instead of `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; `.values` works on old and new pandas alike.
xg_train = xgb.DMatrix(train_dat.iloc[:samp].values, label=train_labels[:samp])
xg_test = xgb.DMatrix(train_dat.iloc[samp:].values, label=train_labels[samp:])

In [None]:
# Booster configuration for the model trained on every feature column.
param = dict(
    objective='binary:logistic',
    eval_metric='auc',
    booster='gbtree',
    nthread=4,
    max_depth=8,
    colsample_bytree=0.77,
    subsample=0.83,
    eta=0.023,
    gamma=0.0001,
)

# Only the training matrix is monitored; the (xg_test, 'test') pair is left out of the
# watchlist, so early stopping is evaluated on the training AUC.
watchlist = [(xg_train, 'train')]
num_round = 1800

bst = xgb.train(
    param,
    xg_train,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=1000,
)
bst.save_model('{}rounds.model'.format(num_round))

In [None]:
# Score the test set with the full-feature booster and write a submission file.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
xg_pred = xgb.DMatrix(test_dat.values)
print(len(test_dat))

pred = bst.predict(xg_pred)
sample = pd.read_csv('sample_submission.csv')
# Printed next to len(test_dat) above so a row-count mismatch is visible at a glance.
print(len(sample))
# Bracket assignment always targets the DataFrame column; attribute assignment would
# silently create a plain Python attribute if the column name were ever missing.
sample['QuoteConversion_Flag'] = pred
sample.to_csv('xgb_gamma.csv', index=False)

### Training on 95% data
- Ones and zeros sums - 1800 rounds 0.969 cv (0.96778 LB)
- Category sums - 1800 rounds 0.969296 cv (0.96770 LB)
- Trimmed - 1800 rounds 0.969328 cv (0.956 LB)

In [None]:
# Restore a previously saved booster from disk rather than retraining it.
bst = xgb.Booster(model_file='pure_06.model')

In [None]:
# Feature-importance chart for `bst`, resized to a tall 10 x 50 inch figure.
importance_ax = xgb.plot_importance(bst)
importance_ax.figure.set_size_inches((10, 50))

### Trim unimportant features

In [None]:
# Rank features by the booster's fscore, highest first.
cols = sorted(bst.get_fscore().items(), key=lambda x: x[1], reverse=True)
print(cols)
# Trim off unimportant features
cols = cols[:230]
# Transforms feature names (f0, f1, ..., f276) to column names (SalesField8, ...)
cols = [train_dat.columns[int(name.replace('f', ''))] for name, _ in cols]

# Select the kept columns once, then slice rows positionally with .iloc.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
train_trimmed = train_dat[cols]
xg_train = xgb.DMatrix(train_trimmed.iloc[:samp].values, label=train_labels[:samp])
xg_test = xgb.DMatrix(train_trimmed.iloc[samp:].values, label=train_labels[samp:])

In [None]:
# Persist the selected column names so they can be reloaded without the booster.
np.save(file='important_cols.npy', arr=cols)

In [None]:
# Reload the saved column names as a plain Python list.
cols = [col_name for col_name in np.load('important_cols.npy')]

In [None]:
# Booster configuration for the model trained on the trimmed column set.
param = dict(
    objective='binary:logistic',
    eval_metric='auc',
    nthread=4,
    max_depth=6,
    colsample_bytree=0.75,
    subsample=0.83,
    min_child_weight=4,
    eta=0.01,
)

# Both matrices are monitored; xgboost uses the last entry ('test') for early stopping.
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 6000

model = xgb.train(
    param,
    xg_train,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=150,
)
model.save_model('trimmed_{}rounds.model'.format(num_round))

In [None]:
# Score the test set with the trimmed-feature model and write a submission file.
#
# The model was trained on train_dat[cols], and a DMatrix built from a bare array
# matches features by position. The test matrix must therefore use the same columns in
# the same order; passing every column of test_dat would feed the model the wrong
# features.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
xg_pred = xgb.DMatrix(test_dat[cols].values)

pred = model.predict(xg_pred)
sample = pd.read_csv('sample_submission.csv')
# Bracket assignment always targets the DataFrame column.
sample['QuoteConversion_Flag'] = pred
sample.to_csv('xgb_trimmed_06.csv', index=False)

#### 0.966560 after 953 rounds using params for pure_03 (`trimmed_03.model`, `trimmed_03.csv`)

- 0.9593 after 10 rounds and removing   20 least important features
- 0.9594 after 10 rounds and removing   30 least important features
- 0.959489 after 10 rounds and removing 40 least important features
- 0.959701 after 10 rounds and removing 50 least important features
- 0.963020 after 100 rounds and removing 50 least important features

In [None]:
# Feature-importance chart for `model`, resized to a tall 10 x 50 inch figure.
importance_ax = xgb.plot_importance(model)
importance_ax.figure.set_size_inches((10, 50))

## Feature Engineering
Try grouping similar fields (PersonalField, GeographicField, ...) and making interaction features for each group.

In [None]:
# Restrict both splits to the selected columns; these frames are the starting point
# for engineered features.
train_eng = train_dat.loc[:, cols]
test_eng = test_dat.loc[:, cols]

# Features sorted by fscore in ascending order (least used first).
imp_col = sorted(bst.get_fscore().items(), key=lambda x: x[1])
# Choose the most important features to create new interaction features
# (drops the 100 lowest-scoring entries).
int_col = imp_col[100:]

# A stray '-' line that used to sit here made the whole cell a syntax error; removed.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0, and the
# column Index is converted to a plain list for `feature_names`.
eng_feature_names = list(train_eng.columns)
xg_train = xgb.DMatrix(train_eng.iloc[:samp].values, label=train_labels[:samp],
                       feature_names=eng_feature_names)
xg_test = xgb.DMatrix(train_eng.iloc[samp:].values, label=train_labels[samp:],
                      feature_names=eng_feature_names)
train_eng.head()

In [None]:
# Booster configuration for the model trained on the engineered frame.
# The original dict was missing the comma after the 'booster' entry, which made the
# cell a syntax error.
param = {   'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'nthread':4,
            'booster':'gbtree',
            'max_depth':6,
            'colsample_bytree':0.77,
            'subsample':0.83,
            'min_child_weight':5,
            'eta':0.023,
        }
# Both matrices are monitored; xgboost uses the last entry ('test') for early stopping.
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 1800
model = xgb.train(param, xg_train, num_round, watchlist, early_stopping_rounds=100)

In [None]:
# Save the engineered-feature model; the file name records the requested round count.
eng_model_path = 'eng_{}rounds.model'.format(num_round)
model.save_model(eng_model_path)

- 0.959776 best after 10 rounds
- 0.962689 best after 100 rounds

In [None]:
# Feature-importance chart for the engineered-feature `model`, resized to 10 x 50 inches.
importance_ax = xgb.plot_importance(model)
importance_ax.figure.set_size_inches((10, 50))

In [None]:
# Score the engineered test frame and write a submission file.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0, and the
# column Index is converted to a plain list for `feature_names`.
xg_pred = xgb.DMatrix(test_eng.values, feature_names=list(test_eng.columns))
pred = model.predict(xg_pred)
sample = pd.read_csv('sample_submission.csv')
# Bracket assignment always targets the DataFrame column.
sample['QuoteConversion_Flag'] = pred
sample.to_csv('eng_02.csv', index=False)

In [None]:
# 2-D t-SNE embedding of rows 1000-2999, coloured by label.
#
# TSNE has no separate transform step: fit_transform() always learns a fresh embedding
# for the data it is given. The earlier fit() on rows 0-999 was thrown away by the
# fit_transform() call, so it is dropped here; the plotted embedding is unchanged.
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
tsne = manifold.TSNE(n_components=2, random_state=1010)
embed_rows = slice(1000, 3000)
Y = tsne.fit_transform(train_eng.iloc[embed_rows].values)
color = train_labels[embed_rows]
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)

#### Hyperparameter searching

[See kaggle script](https://www.kaggle.com/phunter/flavours-of-physics/gridsearchcv-with-feature-in-xgboost/files)

In [None]:
from sklearn import cross_validation, metrics, ensemble
from sklearn.grid_search import GridSearchCV

In [None]:
# Use the first 95% of rows for training and the remaining 5% as a hold-out.
samp = int(len(train_dat)*0.95)
print(samp)
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
xg_train = xgb.DMatrix(train_dat.iloc[:samp].values, label=train_labels[:samp])
xg_test = xgb.DMatrix(train_dat.iloc[samp:].values, label=train_labels[samp:])

# Estimator to tune; every hyperparameter comes from the grid below.
m = xgb.XGBClassifier()

# Each key maps to the list of candidate values; only n_estimators has more than one.
parameters = dict(
    nthread=[4],
    objective=['binary:logistic'],
    learning_rate=[0.05],  # so called `eta` value
    max_depth=[6],
    min_child_weight=[5],
    silent=[1],
    subsample=[0.8],
    colsample_bytree=[0.75],
    n_estimators=[500, 750, 1000],  # number of trees
    seed=[80085],
)

def auc_score(clf, X, y):
    """Return the ROC AUC of `clf` on (X, y).

    Takes column 1 of `clf.predict_proba(X)` as the score and compares it against the
    true labels `y` with `metrics.roc_auc_score`. The (estimator, X, y) signature
    matches what the `scoring` argument of GridSearchCV calls.
    """
    return metrics.roc_auc_score(y, clf.predict_proba(X)[:, 1])


# 5-fold stratified grid search scored by ROC AUC.
#
# The folds must be built from the same labels that are handed to fit(). The original
# code built them from train_labels[samp:] (the hold-out labels), so the fold indices
# only covered the first len(train_labels) - samp training rows and were stratified on
# labels belonging to different samples.
cv_folds = cross_validation.StratifiedKFold(train_labels[:samp],
                                            n_folds=5, shuffle=True)
clf = GridSearchCV(m, parameters, n_jobs=1,
                   cv=cv_folds,
                   scoring=auc_score,
                   verbose=2, refit=True)

clf.fit(train_dat.iloc[:samp], train_labels[:samp])

# grid_scores_ entries are (parameters, mean_validation_score, cv_scores) tuples;
# keep the one with the highest mean score.
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)

In [None]:
# Report the winning hyperparameters in alphabetical order, one per line.
for param_name, param_value in sorted(best_parameters.items(), key=lambda item: item[0]):
    print("%s: %r" % (param_name, param_value))