In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve,auc
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb
from multiprocessing import Pool, cpu_count



In [2]:
# Read the gzipped training and test tables.
train, test = (pd.read_csv(path) for path in ('train.gz', 'test.gz'))

In [3]:
# Treat the -1 sentinel as missing so the column statistics below skip it.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
train = train.replace(-1, np.nan)
# Per-column median and mean over the non-missing values. transform_df uses
# them as thresholds for its *_median_range / *_mean_range indicator columns.
d_median = train.median(axis=0)
d_mean = train.mean(axis=0)
# Put the -1 sentinel back in place of the missing values.
train = train.fillna(-1)
# Distinct values of every feature column (everything except id/target);
# transform_df consults this to decide which columns get one-hot flags.
one_hot = {c: list(train[c].unique()) for c in train.columns if c not in ['id','target']}

In [4]:
def transform_df(df, medians=None, means=None, categories=None):
    """Return a copy of ``df`` with engineered feature columns appended.

    Columns added:
      * ``ps_car_13_x_ps_reg_03`` -- product of ``ps_car_13`` and ``ps_reg_03``.
      * ``negative_one_vals`` -- per-row count of feature values equal to -1.
      * ``<col>_median_range`` / ``<col>_mean_range`` -- 0/1 flags, set when a
        feature whose name does not contain ``_bin`` exceeds its reference
        median / mean.
      * ``<col>_oh_<val>`` -- 0/1 one-hot flags for every column that has
        between 3 and 6 distinct reference values.

    Parameters
    ----------
    df : DataFrame-like
        Input rows. Must contain ``ps_car_13``, ``ps_reg_03`` and every column
        named in ``categories``. ``id`` and ``target`` are never treated as
        features.
    medians, means : mapping or Series, optional
        Column name -> threshold. Default to the notebook-level ``d_median``
        and ``d_mean``.
    categories : dict, optional
        Column name -> list of distinct values. Defaults to the notebook-level
        ``one_hot``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left unmodified.
    """
    if medians is None:
        medians = d_median
    if means is None:
        means = d_mean
    if categories is None:
        categories = one_hot

    # Work on an explicit copy so new columns never leak into the caller's
    # frame (pd.DataFrame(df) alone does not guarantee a copy).
    df = pd.DataFrame(df).copy()
    # Snapshot the feature list before any engineered column is added.
    dcol = [c for c in df.columns if c not in ['id','target']]
    df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
    df['negative_one_vals'] = np.sum((df[dcol]==-1).values, axis=1)
    for c in dcol:
        if '_bin' not in c:
            # Builtin int replaces the np.int alias removed in NumPy 1.24.
            df[c+str('_median_range')] = (df[c].values > medians[c]).astype(int)
            df[c+str('_mean_range')] = (df[c].values > means[c]).astype(int)
    for c in categories:
        # Only low-cardinality, non-binary columns are one-hot encoded.
        if 2 < len(categories[c]) < 7:
            for val in categories[c]:
                df[c+'_oh_' + str(val)] = (df[c].values == val).astype(int)
    return df

In [None]:
def multi_transform(df):
    """Run ``transform_df`` over ``df`` in parallel, one row chunk per CPU.

    The rows are cut into ``cpu_count()`` contiguous chunks, each chunk is
    transformed in a worker process, and the results are concatenated back in
    their original order with a fresh 0..n-1 index.

    Prints the frame shape before and after, plus the worker count.
    Returns the transformed DataFrame.
    """
    print('Init Shape: ', df.shape)
    n_workers = cpu_count()
    print(n_workers)
    # Split row positions instead of the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which recent pandas
    # deprecates. The chunk boundaries are the same either way.
    row_chunks = np.array_split(np.arange(len(df)), n_workers)
    pieces = [df.iloc[rows] for rows in row_chunks]
    # The context manager shuts the pool down even if a worker raises, so
    # failed runs do not leave worker processes behind.
    with Pool(n_workers) as p:
        transformed = p.map(transform_df, pieces)
    # ignore_index=True already yields a clean RangeIndex.
    df = pd.concat(transformed, axis=0, ignore_index=True)
    print('After Shape: ', df.shape)
    return df

In [5]:
def gini(y, pred):
    """Gini coefficient of scores ``pred`` against labels ``y``: ``2 * AUC - 1``.

    The ROC curve is computed with label 1 as the positive class.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, pred, pos_label=1)
    area = auc(false_pos_rate, true_pos_rate)
    return 2 * area - 1

In [6]:
def gini_xgb(pred, y):
    """Gini metric in the ``(name, value)`` form passed to ``xgb.train`` as ``feval``.

    ``pred`` holds the predictions; ``y`` is the data object whose
    ``get_label()`` supplies the true labels.
    """
    labels = y.get_label()
    score = gini(labels, pred)
    return 'gini', score

In [7]:
# Parameter set handed to xgb.train.
params = {
    'eta': 0.02,
    'max_depth': 4,
    'objective': 'binary:logistic',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 0.77,
    'scale_pos_weight': 1.6,
    'gamma': 10,
    'reg_alpha': 8,
    'reg_lambda': 1.3,
    'eval_metric': 'auc',
    'seed': 99,
    'silent': True,
}
# Hold out 25% of the rows for validation. The whole frame is split (target
# column included); the labels are separated from the features later on.
x1, x2, y1, y2 = train_test_split(
    train,
    train['target'],
    test_size=0.25,
    random_state=99,
)

In [8]:
# Engineer features for the training split, the validation split and the
# test set (multi_transform is the multi-process alternative).
x1, x2, test = (transform_df(frame) for frame in (x1, x2, test))

In [9]:
# Model features: every column except the identifier, the label, and
# anything whose name starts with 'ps_calc_'.
col = [
    c for c in x1.columns
    if c not in ('id', 'target') and not c.startswith('ps_calc_')
]
print(x1.shape, x2.shape)

(446409, 189) (148803, 189)


In [11]:
# Remove duplicates just in case: a row is flagged when its values over `col`
# occur more than once in the transformed training set. keep=False flags
# every copy, so all of them are dropped from both splits.
#tdups = multi_transform(train)
tdups = transform_df(train)
dup_mask = tdups.duplicated(subset=col, keep=False)
dups = tdups[dup_mask]
dup_ids = dups['id'].values

x1 = x1[~x1['id'].isin(dup_ids)]
x2 = x2[~x2['id'].isin(dup_ids)]
print(x1.shape, x2.shape)

(446409, 189) (148803, 189)


In [12]:
# Take the labels from the (possibly filtered) splits, then keep only the
# model feature columns.
y1, y2 = x1['target'], x2['target']
x1, x2 = x1[col], x2[col]

In [13]:
# Build each DMatrix once. The training matrix is shared by the watchlist and
# xgb.train instead of being constructed (and held in memory) twice.
dtrain = xgb.DMatrix(x1, y1)
dvalid = xgb.DMatrix(x2, y2)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
# Up to 5000 boosting rounds, logging every 50. The custom gini metric is
# maximised, and training stops after 200 rounds without improvement.
model = xgb.train(params, dtrain, 5000, watchlist, feval=gini_xgb, maximize=True, verbose_eval=50, early_stopping_rounds=200)

[0]	train-gini:0.160714	valid-gini:0.155906
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 200 rounds.
[50]	train-gini:0.240407	valid-gini:0.236128
[100]	train-gini:0.252383	valid-gini:0.245
[150]	train-gini:0.267676	valid-gini:0.253018
[200]	train-gini:0.278681	valid-gini:0.259201
[250]	train-gini:0.287017	valid-gini:0.264416
[300]	train-gini:0.293689	valid-gini:0.268962
[350]	train-gini:0.299458	valid-gini:0.2723
[400]	train-gini:0.30407	valid-gini:0.274121
[450]	train-gini:0.307311	valid-gini:0.275897
[500]	train-gini:0.310462	valid-gini:0.277034
[550]	train-gini:0.313355	valid-gini:0.277711
[600]	train-gini:0.315898	valid-gini:0.278514
[650]	train-gini:0.318179	valid-gini:0.278938
[700]	train-gini:0.320075	valid-gini:0.279287
[750]	train-gini:0.322275	valid-gini:0.279825
[800]	train-gini:0.323857	valid-gini:0.280056
[850]	train-gini:0.325472	valid-gini:0.280408
[900]	train-gini:0.327018	valid-gin

In [14]:
# Score the test set. With objective 'binary:logistic' the predictions are
# already probabilities in [0, 1], so they are stored as-is. No exp(p) - 1
# back-transform is applied: the model was fit on the raw 0/1 target, not on
# log1p of it, and clipping that transform would tie every p > ln 2 at 1.
# NOTE(review): the +45 scores with trees past the early-stopping optimum --
# magic number, confirm it is intentional.
test['target'] = model.predict(xgb.DMatrix(test[col]), ntree_limit=model.best_ntree_limit+45)

In [15]:
# Write the XGBoost predictions as an (id, target) CSV with 5 decimals.
test.to_csv('xgb_submission.csv', columns=['id', 'target'], index=False, float_format='%.5f')

In [16]:
#LightGBM
def gini_lgb(preds, dtrain):
    """Gini metric in the 3-tuple form passed to ``lgb.train`` as ``feval``.

    The score of ``preds`` is divided by the score the labels obtain against
    themselves. Returns ``('gini', value, True)``.
    """
    labels = list(dtrain.get_label())
    achieved = gini(labels, preds)
    # NOTE(review): with the AUC-based gini in this notebook a perfect ranking
    # should score 1, making this divisor a no-op -- confirm before removing.
    best_possible = gini(labels, labels)
    return 'gini', achieved / best_possible, True

In [17]:
# Parameter set handed to lgb.train. This rebinds `params`, replacing the
# XGBoost dictionary defined earlier.
params = {
    'learning_rate': 0.02,
    'max_depth': 4,
    'boosting': 'gbdt',
    'objective': 'binary',
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'metric': 'auc',
    'is_training_metric': False,
    'seed': 99,
}

In [18]:
# Fit LightGBM for up to 1000 rounds, validating on the hold-out split,
# logging every 50 rounds and stopping after 200 rounds without improvement.
lgb_train = lgb.Dataset(x1, label=y1)
lgb_valid = lgb.Dataset(x2, label=y2)
model2 = lgb.train(params, lgb_train, 1000, lgb_valid, verbose_eval=50, feval=gini_lgb, early_stopping_rounds=200)


Training until validation scores don't improve for 200 rounds.
[50]	valid_0's auc: 0.619807	valid_0's gini: 0.239614
[100]	valid_0's auc: 0.622392	valid_0's gini: 0.244784
[150]	valid_0's auc: 0.625078	valid_0's gini: 0.250156
[200]	valid_0's auc: 0.628524	valid_0's gini: 0.257048
[250]	valid_0's auc: 0.63162	valid_0's gini: 0.26324
[300]	valid_0's auc: 0.634464	valid_0's gini: 0.268929
[350]	valid_0's auc: 0.636321	valid_0's gini: 0.272643
[400]	valid_0's auc: 0.637643	valid_0's gini: 0.275286
[450]	valid_0's auc: 0.638319	valid_0's gini: 0.276638
[500]	valid_0's auc: 0.639261	valid_0's gini: 0.278522
[550]	valid_0's auc: 0.639561	valid_0's gini: 0.279121
[600]	valid_0's auc: 0.640341	valid_0's gini: 0.280683
[650]	valid_0's auc: 0.640551	valid_0's gini: 0.281102
[700]	valid_0's auc: 0.640748	valid_0's gini: 0.281495
[750]	valid_0's auc: 0.640659	valid_0's gini: 0.281317
[800]	valid_0's auc: 0.640597	valid_0's gini: 0.281194
[850]	valid_0's auc: 0.640909	valid_0's gini: 0.281818
[900]

In [19]:
# Score the test set at the best early-stopping iteration. With objective
# 'binary' the predictions are already probabilities in [0, 1], so they are
# stored as-is. No exp(p) - 1 back-transform is applied: the model was fit on
# the raw 0/1 target, not on log1p of it, and clipping that transform would
# tie every p > ln 2 at 1.
test['target'] = model2.predict(test[col], num_iteration=model2.best_iteration)

In [20]:
# Write the LightGBM predictions as an (id, target) CSV with 5 decimals.
test.to_csv('lgb_submission.csv', columns=['id', 'target'], index=False, float_format='%.5f')

In [21]:
# Reload both single-model submissions for blending.
df1, df2 = (pd.read_csv(name) for name in ('xgb_submission.csv', 'lgb_submission.csv'))

In [22]:
# Equal-weight blend of the two submissions, matched on id.
# The merge suffixes give the second file's overlapping columns a trailing
# '_' without renaming df2 in place, so re-running this cell is safe.
blend = pd.merge(df1, df2, how='left', on='id', suffixes=('', '_'))
for c in df1.columns:
    if c != 'id':
        blend[c] = (blend[c] * 0.5)  + (blend[c+'_'] * 0.5)
# Keep only the submission columns (drops the '_' helper columns).
blend = blend[df1.columns]
# The average is written out unchanged; no exp(x) - 1 step is applied,
# because the inputs are predictions, not log1p-transformed values.

In [23]:
# Write the blended predictions with 5 decimals and no index column.
blend.to_csv('blend1.csv', float_format='%.5f', index=False)