In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

import os
print(os.listdir("../input"))

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

plt.style.use('seaborn')
sns.set(font_scale=1)
import gc

try:
    import cudf as gd
    import nvstrings
    from librmm_cffi import librmm
    from nvstring_workaround import get_unique_tokens,on_gpu,get_token_counts,is_in
    from cudf_workaround import unique,rename_col,to_pandas,merge
except:
    print('cudf not imported')

['kfolds.pkl', 'loo_mean.csv', 'sample_submission.csv', 'test.csv', 'train.csv']
cudf not imported


In [2]:
# Column-name -> dtype mapping for the 200 `var_<i>` columns, all float32.
# Only referenced by the commented-out, dtype-aware `read_csv` calls below.
var_dtypes = {'var_'+str(i): 'float32' for i in range(200)}

In [39]:
# Fix the global NumPy RNG so every later `np.random.*` call (e.g. the
# shuffles inside `augment`) is repeatable from a fresh kernel.
random_state = 42
np.random.seed(random_state)
# Alternative loaders that force the `var_*` columns to float32 via `var_dtypes`
# (currently disabled; the active calls below let pandas infer dtypes):
# df_train = pd.read_csv('../input/train.csv', dtype=var_dtypes)
# df_test = pd.read_csv('../input/test.csv', dtype=var_dtypes)

# Load the raw train/test tables with pandas' default dtype inference.
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

In [10]:
def augment(x,y,t=1):
    """Oversample the positive class by shuffling every feature column independently.

    For each of the `t` rounds, the rows with `y > 0` are copied and each
    column of the copy is permuted on its own, so every synthetic row mixes
    feature values taken from different positive rows. The synthetic rows are
    appended to `x` with label 1.

    Parameters
    ----------
    x : 2-D numpy array, one row per sample.
    y : 1-D numpy array of labels aligned with the rows of `x`.
    t : number of synthetic copies of the positive rows to append.

    Returns
    -------
    (x_aug, y_aug) : `x` followed by `t` shuffled copies of its positive rows,
        and `y` followed by the matching ones. When `t <= 0` the inputs are
        returned unchanged.

    Notes
    -----
    * Uses the global NumPy RNG (`np.random.shuffle`); seed it for repeatability.
    * The previous version also shuffled `t//2` copies of the negative rows but
      never used them (negative augmentation was deliberately disabled). That
      dead work has been removed. The returned arrays are unchanged, but for
      `t >= 2` the global RNG state after the call now differs, because fewer
      random numbers are consumed.
    """
    if t <= 0:
        # Nothing to add; the old code crashed here in np.vstack([]).
        return x, y

    positives = x[y > 0]          # boolean indexing already yields a copy
    n_pos, n_cols = positives.shape

    synthetic = []
    for _ in range(t):
        shuffled = np.empty_like(positives)
        ids = np.arange(n_pos)
        for c in range(n_cols):
            # `ids` is re-shuffled in place for every column, so each column
            # gets its own permutation of the positive rows.
            np.random.shuffle(ids)
            # Index a single column instead of copying the whole matrix.
            shuffled[:, c] = positives[ids, c]
        synthetic.append(shuffled)

    xs = np.vstack(synthetic)
    ys = np.ones(xs.shape[0])
    x = np.vstack([x, xs])
    y = np.concatenate([y, ys])
    return x,y

In [11]:
# Model inputs: every training column except 'target' and 'ID_code'
# (presumably the label and the row identifier - confirm against the data).
features = list(filter(lambda name: name not in ['target', 'ID_code'], df_train.columns))
# Debugging shortcut: restrict to a couple of columns first, e.g.
#features = ['var_12','var_81']
# (later, possibly one feature at a time).
X_test = df_test[features].values

In [66]:
def mtr_encodes(x,y,xt,xte,names,window_sizes=[20,50]):
    """Append rolling target-mean encodings of every column to train/valid/test.

    Each column of `x` is encoded with `mtr_encode`, which returns
    `len(window_sizes)` new columns per split. The new columns are stacked to
    the right of the original features.

    Parameters
    ----------
    x, xt, xte : 2-D arrays (train, validation, test) with the same columns.
    y : training labels aligned with the rows of `x`.
    names : list of column names, one per column of `x`; it is not modified.
    window_sizes : passed through to `mtr_encode`; also used to label the new
        columns as '<name>_mtr_<window_size>'.

    Returns
    -------
    (x_out, xt_out, xte_out, names_out) : the widened arrays and the extended
        copy of `names`.
    """
    all_names = names.copy()
    base_tr, base_va, base_te = x.copy(), xt.copy(), xte.copy()
    tr_parts, va_parts, te_parts = [base_tr], [base_va], [base_te]

    for idx in range(base_tr.shape[1]):
        feat = all_names[idx]
        print('feature mtr encoding',feat)
        enc_tr, enc_va, enc_te = mtr_encode(
            base_tr[:, idx], y, base_va[:, idx], base_te[:, idx], window_sizes
        )
        tr_parts.append(enc_tr)
        va_parts.append(enc_va)
        te_parts.append(enc_te)
        for ws in window_sizes:
            all_names.append('%s_mtr_%d'%(feat, ws))

    return np.hstack(tr_parts), np.hstack(va_parts), np.hstack(te_parts), all_names

def mtr_encode(x,y,xt,xte,window_sizes):
    """Rolling target-mean encoding of one feature, two-fold for the train rows.

    The training rows are split into two stratified halves (fixed seed 42).
    Each half is encoded with statistics computed from the other half, so no
    training row is encoded from its own label. Validation (`xt`) and test
    (`xte`) values are encoded with statistics from the full training column.

    Returns
    -------
    (x_enc, xt_enc, xte_enc) : `x_enc` is float32 with shape
        (len(x), len(window_sizes)); the other two are whatever `mtr_pd`
        returns for the validation and test inputs.
    """
    row_ids = np.arange(x.shape[0])
    half_a, half_b, y_a, y_b = train_test_split(
        row_ids, y, test_size=0.5, random_state=42, stratify=y
    )

    oof = np.zeros([x.shape[0], len(window_sizes)]).astype(np.float32)
    # Fit on half A, encode half B ...
    oof[half_b] = mtr_pd(x[half_a], y_a, x[half_b], None, window_sizes)[1]
    # ... then fit on half B, encode half A.
    oof[half_a] = mtr_pd(x[half_b], y_b, x[half_a], None, window_sizes)[1]

    # Validation/test encodings use every training row.
    xt_enc, xte_enc = mtr_pd(x, y, xt, xte, window_sizes)[1:]
    return oof, xt_enc, xte_enc

def mtr_pd(x,y,xt,xte,window_sizes):
    """Rolling target-mean encoding of a single feature using pandas.

    Steps:
      1. Average `y` per distinct value of `x`.
      2. Sort the distinct values and smooth the per-value means with a
         trailing rolling mean. The window length is
         `n_distinct // window_size` for every entry of `window_sizes`.
      3. Look the smoothed means up (left merge on the exact value) for `x`,
         `xt` and, when given, `xte`. Values never seen in `x` become NaN.

    Parameters
    ----------
    x : 1-D numpy array of training feature values.
    y : 1-D numpy array of labels aligned with `x`.
    xt : 1-D array of validation feature values.
    xte : 1-D array of test feature values, or None to skip the test lookup.
    window_sizes : iterable of positive ints; larger values give shorter
        rolling windows.

    Returns
    -------
    (x_enc, xt_enc, xte_enc) : 2-D arrays with one column per window size;
        `xte_enc` is None when `xte` is None.

    Side effects
    ------------
    When `xte` is not None, the share of unmatched (NaN) test and validation
    rows is appended to 'mtr_null.txt' in the working directory.
    """
    col = 'mean_y'
    tr = pd.DataFrame()
    tr['y'] = y.astype(np.float32)
    tr['x'] = x
    df = tr.groupby('x').agg({'y':'mean'})
    df.columns = [col]
    df = df.reset_index()
    df = df.sort_values('x')

    cols = []
    n_distinct = df.shape[0]
    for ws in window_sizes:
        # With fewer distinct values than `ws` the integer division gives 0 and
        # rolling(0, min_periods=1) raises; fall back to a window of one.
        i = max(1, n_distinct // ws)
        df['mtr_%d'%i] = df[col].rolling(i,min_periods=1).mean()
        # Two window sizes can collapse to the same length; the name is then
        # listed twice so the output keeps len(window_sizes) columns.
        cols.append('mtr_%d'%i)

    tr = tr.merge(df,on='x',how='left')
    te = pd.DataFrame()
    te['x'] = xt
    te = te.merge(df,on='x',how='left')

    if xte is not None:
        tes = pd.DataFrame()
        tes['x'] = xte
        tes = tes.merge(df,on='x',how='left')
        test_ratio = tes[cols[0]].isnull().sum()*1.0/tes.shape[0]
        xte = tes[cols].values
        del tes
        valid_ratio = te[cols[0]].isnull().sum()*1.0/te.shape[0]
        # Context manager guarantees the log is closed even if a write fails.
        with open('mtr_null.txt','a') as f:
            f.write('test null ratio %.4f\n'%(test_ratio))
            f.write('valid null ratio %.4f\n\n'%(valid_ratio))
    x,xt = tr[cols].values,te[cols].values
    return x,xt,xte

def mtr_gd(x,y,xt,xte,window_sizes=[500,1000]):
    """cudf-backed variant of the rolling target-mean encoding.

    Group-by and merges run through `gd` (cudf) and the `merge` / `to_pandas`
    helpers from `cudf_workaround`; the rolling mean itself is computed in
    pandas. Unlike `mtr_pd`, the entries of `window_sizes` are used directly
    as rolling window lengths, and missing training values of `x` are filled
    with 0 before grouping.

    Parameters
    ----------
    x, y : training feature values and aligned labels (numpy arrays).
    xt : validation feature values.
    xte : test feature values, or None to skip the test lookup.
    window_sizes : rolling window lengths, one output column each.

    Returns
    -------
    (x_enc, xt_enc, xte_enc) : 2-D arrays with one column per window size;
        `xte_enc` is None when `xte` is None.

    Side effects
    ------------
    When `xte` is not None, appends the test/validation NaN ratios to
    'mtr_null.txt'.

    Notes
    -----
    Requires the optional cudf imports at the top of the notebook to have
    succeeded; otherwise `gd`, `merge` and `to_pandas` are undefined.
    """
    # NOTE(review): the aggregate column is never renamed, so this relies on
    # the installed cudf naming groupby().agg({'y':'mean'}) output 'mean_y' -
    # confirm for the cudf version in use.
    col = 'mean_y'
    tr = gd.DataFrame()
    tr['y'] = y.astype(np.float32)
    tr['x'] = np.ascontiguousarray(x)#.astype(np.float32)
    tr['x'] = tr['x'].fillna(0)
    #print(tr['x'].to_pandas().isnull().sum())
    df = tr.groupby('x').agg({'y':'mean'})
    df = df.sort_values('x')
    pdf = to_pandas(df)

    cols = []
    for i in window_sizes:
        pdf['mtr_%d'%i] = pdf[col].rolling(i,min_periods=1).mean()
        cols.append('mtr_%d'%i)
    del df
    # Move the smoothed table back to the GPU frame type for the merges.
    df = gd.from_pandas(pdf)
    tr = merge(tr,df,on='x',how='left')
    tr = to_pandas(tr)

    te = gd.DataFrame()
    te['x'] = np.ascontiguousarray(xt)
    te = merge(te,df,on='x',how='left')
    te = to_pandas(te)
    if xte is not None:
        tes = gd.DataFrame()
        tes['x'] = np.ascontiguousarray(xte)
        tes = merge(tes,df,on='x',how='left')
        tes = to_pandas(tes)
        test_ratio = tes[cols[0]].isnull().sum()*1.0/tes.shape[0]
        xte = tes[cols].values
        del tes
        valid_ratio = te[cols[0]].isnull().sum()*1.0/te.shape[0]
        # Context manager guarantees the log is closed even if a write fails.
        with open('mtr_null.txt','a') as f:
            f.write('test null ratio %.4f\n'%(test_ratio))
            f.write('valid null ratio %.4f\n\n'%(valid_ratio))
    x,xt = tr[cols].values,te[cols].values
    return x,xt,xte

# Test different windows by features

In [106]:
# Window search: for every base feature and every candidate window, train an
# XGBoost model on the 200 raw features plus that single encoded column and
# record the best validation score reached with early stopping.
#
# NOTE(review): `X_t`, `y_t`, `X_valid0`, `y_valid` and `xgb_params` are not
# defined in any cell visible here; they must exist in the kernel (with the
# 'var_<i>_mtr_<w>' columns already present) before this cell runs.
# NOTE(review): the saved output below lists windows 5, 15, ..., 105, which
# does not match np.arange(1, 16, 1) - the cell was probably edited after its
# last run; confirm the intended window grid.
auc_scores_all = []
num_vars = ['var_%s'%(idx) for idx in range(200)]
for idx in range(200):
    for w in np.arange(1, 16, 1):
        mtr_var = 'var_%s_mtr_%s'%(idx, w)
        print(mtr_var)
        full_vars = num_vars + [mtr_var]
        train_dataset = xgb.DMatrix(X_t[full_vars],y_t)
        valid_dataset = xgb.DMatrix(X_valid0[full_vars],y_valid)
        watchlist = [(train_dataset, 'train'), (valid_dataset, 'valid')]
        xgb_clf = xgb.train(xgb_params,
                                          train_dataset,
                                          evals=watchlist,
                                          num_boost_round=12000,
                                          early_stopping_rounds=100,
                                          verbose_eval=0
                                          )
        best_iteration = xgb_clf.best_iteration
        # Presumably AUC (it is stored under an 'auc' column later); the actual
        # metric is whatever `xgb_params` configures - confirm.
        best_score = xgb_clf.best_score
        print(best_score)
        auc_scores_all.append([idx,w,mtr_var,best_score])
        # Drop the only reference and collect; calling `__del__()` by hand is
        # unnecessary because `del` + gc already finalizes the booster.
        del xgb_clf
        gc.collect()

var_0_mtr_5
0.89956
var_0_mtr_15
0.89969
var_0_mtr_25
0.899543
var_0_mtr_35
0.899505
var_0_mtr_45
0.899455
var_0_mtr_55
0.899534
var_0_mtr_65
0.899278
var_0_mtr_75
0.899494
var_0_mtr_85
0.899343
var_0_mtr_95
0.89944
var_0_mtr_105
0.898899
var_1_mtr_5
0.899367
var_1_mtr_15
0.899474
var_1_mtr_25
0.899259
var_1_mtr_35
0.899406
var_1_mtr_45
0.899351
var_1_mtr_55
0.899465
var_1_mtr_65
0.899492
var_1_mtr_75
0.899427
var_1_mtr_85
0.899318
var_1_mtr_95
0.899323
var_1_mtr_105
0.899447
var_2_mtr_5
0.899671
var_2_mtr_15
0.899554
var_2_mtr_25
0.899344
var_2_mtr_35
0.89943
var_2_mtr_45
0.899487
var_2_mtr_55
0.899416
var_2_mtr_65
0.899408
var_2_mtr_75
0.899611
var_2_mtr_85
0.899466
var_2_mtr_95
0.899347
var_2_mtr_105
0.899054
var_3_mtr_5
0.899427
var_3_mtr_15
0.899305
var_3_mtr_25
0.899418
var_3_mtr_35
0.898875
var_3_mtr_45
0.899412
var_3_mtr_55
0.899372
var_3_mtr_65
0.899234
var_3_mtr_75
0.899494
var_3_mtr_85
0.899406
var_3_mtr_95
0.899349
var_3_mtr_105
0.899428
var_4_mtr_5
0.899461
var_4_mtr_15
0.

0.899338
var_33_mtr_5
0.899342
var_33_mtr_15
0.899444
var_33_mtr_25
0.899447
var_33_mtr_35
0.899047
var_33_mtr_45
0.899398
var_33_mtr_55
0.899337
var_33_mtr_65
0.899164
var_33_mtr_75
0.899536
var_33_mtr_85
0.899058
var_33_mtr_95
0.899393
var_33_mtr_105
0.899316
var_34_mtr_5
0.89895
var_34_mtr_15
0.899274
var_34_mtr_25
0.899288
var_34_mtr_35
0.899304
var_34_mtr_45
0.899435
var_34_mtr_55
0.89941
var_34_mtr_65
0.899367
var_34_mtr_75
0.899403
var_34_mtr_85
0.899225
var_34_mtr_95
0.899466
var_34_mtr_105
0.899012
var_35_mtr_5
0.899578
var_35_mtr_15
0.899542
var_35_mtr_25
0.899492
var_35_mtr_35
0.899434
var_35_mtr_45
0.899328
var_35_mtr_55
0.899472
var_35_mtr_65
0.899294
var_35_mtr_75
0.899471
var_35_mtr_85
0.899337
var_35_mtr_95
0.899332
var_35_mtr_105
0.899489
var_36_mtr_5
0.899118
var_36_mtr_15
0.899228
var_36_mtr_25
0.899337
var_36_mtr_35
0.899583
var_36_mtr_45
0.89937
var_36_mtr_55
0.899373
var_36_mtr_65
0.899467
var_36_mtr_75
0.899317
var_36_mtr_85
0.899078
var_36_mtr_95
0.899238
var_36

0.899436
var_65_mtr_65
0.899432
var_65_mtr_75
0.899436
var_65_mtr_85
0.899436
var_65_mtr_95
0.899436
var_65_mtr_105
0.899436
var_66_mtr_5
0.899493
var_66_mtr_15
0.899422
var_66_mtr_25
0.899338
var_66_mtr_35
0.899589
var_66_mtr_45
0.899329
var_66_mtr_55
0.899287
var_66_mtr_65
0.899491
var_66_mtr_75
0.899245
var_66_mtr_85
0.899417
var_66_mtr_95
0.899401
var_66_mtr_105
0.899415
var_67_mtr_5
0.899294
var_67_mtr_15
0.899605
var_67_mtr_25
0.899681
var_67_mtr_35
0.899423
var_67_mtr_45
0.899566
var_67_mtr_55
0.899538
var_67_mtr_65
0.899365
var_67_mtr_75
0.899468
var_67_mtr_85
0.899313
var_67_mtr_95
0.899259
var_67_mtr_105
0.899368
var_68_mtr_5
0.899425
var_68_mtr_15
0.89894
var_68_mtr_25
0.899444
var_68_mtr_35
0.899439
var_68_mtr_45
0.899436
var_68_mtr_55
0.899436
var_68_mtr_65
0.899436
var_68_mtr_75
0.899436
var_68_mtr_85
0.899341
var_68_mtr_95
0.899331
var_68_mtr_105
0.899328
var_69_mtr_5
0.899309
var_69_mtr_15
0.899437
var_69_mtr_25
0.899436
var_69_mtr_35
0.899436
var_69_mtr_45
0.899516
var

0.899455
var_98_mtr_15
0.899469
var_98_mtr_25
0.899442
var_98_mtr_35
0.899333
var_98_mtr_45
0.899357
var_98_mtr_55
0.899451
var_98_mtr_65
0.899406
var_98_mtr_75
0.899436
var_98_mtr_85
0.899436
var_98_mtr_95
0.899443
var_98_mtr_105
0.89944
var_99_mtr_5
0.899414
var_99_mtr_15
0.899512
var_99_mtr_25
0.899298
var_99_mtr_35
0.899412
var_99_mtr_45
0.899515
var_99_mtr_55
0.898913
var_99_mtr_65
0.899182
var_99_mtr_75
0.898896
var_99_mtr_85
0.89937
var_99_mtr_95
0.899286
var_99_mtr_105
0.899548
var_100_mtr_5
0.899444
var_100_mtr_15
0.899433
var_100_mtr_25
0.899421
var_100_mtr_35
0.89943
var_100_mtr_45
0.899436
var_100_mtr_55
0.899302
var_100_mtr_65
0.899324
var_100_mtr_75
0.899414
var_100_mtr_85
0.899454
var_100_mtr_95
0.89944
var_100_mtr_105
0.899455
var_101_mtr_5
0.899436
var_101_mtr_15
0.899436
var_101_mtr_25
0.899424
var_101_mtr_35
0.899309
var_101_mtr_45
0.899436
var_101_mtr_55
0.899441
var_101_mtr_65
0.899436
var_101_mtr_75
0.899436
var_101_mtr_85
0.899436
var_101_mtr_95
0.899421
var_101_

0.899304
var_129_mtr_45
0.899384
var_129_mtr_55
0.899413
var_129_mtr_65
0.899252
var_129_mtr_75
0.899387
var_129_mtr_85
0.899442
var_129_mtr_95
0.899127
var_129_mtr_105
0.899393
var_130_mtr_5
0.899376
var_130_mtr_15
0.899253
var_130_mtr_25
0.899378
var_130_mtr_35
0.899459
var_130_mtr_45
0.899365
var_130_mtr_55
0.899463
var_130_mtr_65
0.899323
var_130_mtr_75
0.899389
var_130_mtr_85
0.899237
var_130_mtr_95
0.899273
var_130_mtr_105
0.898848
var_131_mtr_5
0.89955
var_131_mtr_15
0.899546
var_131_mtr_25
0.899442
var_131_mtr_35
0.899195
var_131_mtr_45
0.899544
var_131_mtr_55
0.899725
var_131_mtr_65
0.899296
var_131_mtr_75
0.899439
var_131_mtr_85
0.899102
var_131_mtr_95
0.899213
var_131_mtr_105
0.899326
var_132_mtr_5
0.899769
var_132_mtr_15
0.899272
var_132_mtr_25
0.899405
var_132_mtr_35
0.899446
var_132_mtr_45
0.899491
var_132_mtr_55
0.899321
var_132_mtr_65
0.8993
var_132_mtr_75
0.898933
var_132_mtr_85
0.899191
var_132_mtr_95
0.899312
var_132_mtr_105
0.899477
var_133_mtr_5
0.899394
var_133_mt

0.899433
var_160_mtr_65
0.899418
var_160_mtr_75
0.899423
var_160_mtr_85
0.899457
var_160_mtr_95
0.899381
var_160_mtr_105
0.899457
var_161_mtr_5
0.899323
var_161_mtr_15
0.899436
var_161_mtr_25
0.899436
var_161_mtr_35
0.899474
var_161_mtr_45
0.899436
var_161_mtr_55
0.899419
var_161_mtr_65
0.899437
var_161_mtr_75
0.899433
var_161_mtr_85
0.899436
var_161_mtr_95
0.899436
var_161_mtr_105
0.899436
var_162_mtr_5
0.899514
var_162_mtr_15
0.899423
var_162_mtr_25
0.899291
var_162_mtr_35
0.899378
var_162_mtr_45
0.899479
var_162_mtr_55
0.899443
var_162_mtr_65
0.899516
var_162_mtr_75
0.899313
var_162_mtr_85
0.899454
var_162_mtr_95
0.899459
var_162_mtr_105
0.899437
var_163_mtr_5
0.899524
var_163_mtr_15
0.899385
var_163_mtr_25
0.899243
var_163_mtr_35
0.899413
var_163_mtr_45
0.899286
var_163_mtr_55
0.89944
var_163_mtr_65
0.899505
var_163_mtr_75
0.899436
var_163_mtr_85
0.899436
var_163_mtr_95
0.899375
var_163_mtr_105
0.899355
var_164_mtr_5
0.899352
var_164_mtr_15
0.899543
var_164_mtr_25
0.899759
var_164_

0.899264
var_191_mtr_85
0.899331
var_191_mtr_95
0.899449
var_191_mtr_105
0.899357
var_192_mtr_5
0.899341
var_192_mtr_15
0.899379
var_192_mtr_25
0.899339
var_192_mtr_35
0.8989
var_192_mtr_45
0.899447
var_192_mtr_55
0.899558
var_192_mtr_65
0.899551
var_192_mtr_75
0.899528
var_192_mtr_85
0.89933
var_192_mtr_95
0.899305
var_192_mtr_105
0.899494
var_193_mtr_5
0.89932
var_193_mtr_15
0.899151
var_193_mtr_25
0.899385
var_193_mtr_35
0.899409
var_193_mtr_45
0.89939
var_193_mtr_55
0.899385
var_193_mtr_65
0.899373
var_193_mtr_75
0.899284
var_193_mtr_85
0.899218
var_193_mtr_95
0.899522
var_193_mtr_105
0.899327
var_194_mtr_5
0.899559
var_194_mtr_15
0.899557
var_194_mtr_25
0.899459
var_194_mtr_35
0.899346
var_194_mtr_45
0.899488
var_194_mtr_55
0.899398
var_194_mtr_65
0.898896
var_194_mtr_75
0.899294
var_194_mtr_85
0.899259
var_194_mtr_95
0.899348
var_194_mtr_105
0.899297
var_195_mtr_5
0.899501
var_195_mtr_15
0.899485
var_195_mtr_25
0.899497
var_195_mtr_35
0.899376
var_195_mtr_45
0.899376
var_195_mtr_

In [114]:
# One row per (feature, window) run collected by the search loop.
auc_scores_df = pd.DataFrame(
    data=auc_scores_all,
    columns=['feature_index', 'window', 'feature_name', 'auc'],
)

In [117]:
# Collapse 'var_<i>_mtr_<w>' to the base name 'var_<i>' (first two
# underscore-separated tokens), then persist the search results.
auc_scores_df['feature_name'] = auc_scores_df['feature_name'].apply(
    lambda name: '_'.join((name.split('_')[0], name.split('_')[1]))
)
auc_scores_df.to_csv('mtr_windows.csv', index=False)