In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

import os
print(os.listdir("../input"))

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

plt.style.use('seaborn')
sns.set(font_scale=1)

# Optional GPU stack: cuDF (aliased as `gd`) plus the local workaround helpers.
# These names are only needed by `mtr_gd`; when any of the imports fails the
# notebook just reports it and carries on without them.
try:
    import cudf as gd
    import nvstrings
    from librmm_cffi import librmm
    from nvstring_workaround import get_unique_tokens,on_gpu,get_token_counts,is_in
    from cudf_workaround import unique,rename_col,to_pandas,merge
except Exception:
    # `except Exception` instead of a bare `except` so KeyboardInterrupt and
    # SystemExit still propagate instead of being reported as a missing import.
    print('cudf not imported')

['sample_submission.csv', 'test.csv', 'train.csv']
cudf not imported


In [8]:
# Seed NumPy's global RNG so the np.random.shuffle calls made later are repeatable.
random_state = 42
np.random.seed(random_state)
# Load the train and test tables from the input directory.
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

In [9]:
def augment(x,y,t=1, include_raw=True):
    """Create synthetic positive rows by shuffling each feature column independently.

    For each of the ``t`` rounds the rows of ``x`` whose label is ``> 0`` are
    copied and every column is permuted with its own random order, so each
    synthetic row mixes feature values taken from different positive rows.
    Shuffling uses NumPy's global RNG (``np.random.shuffle``); seed it
    beforehand for reproducible output.

    Parameters
    ----------
    x : np.ndarray, 2-D
        Feature matrix (rows are samples, columns are features).
    y : array-like, 1-D
        Labels aligned with the rows of ``x``; rows with ``y > 0`` are the positives.
    t : int
        Number of shuffled copies of the positive rows to generate. ``t <= 0``
        produces no synthetic rows.
    include_raw : bool
        When True the original ``x`` / ``y`` are returned with the synthetic
        rows appended; when False only the synthetic rows are returned.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Feature matrix and label vector. Synthetic labels are all ``1.0``.
    """
    n_features = x.shape[1]
    # The positive subset does not change between rounds, so select it once.
    positives = x[y>0]
    n_pos = positives.shape[0]

    xs = []
    for _ in range(t):
        x1 = positives.copy()
        ids = np.arange(n_pos)
        for c in range(n_features):
            # `ids` is reshuffled in place for every column, so each column
            # gets a different permutation.
            np.random.shuffle(ids)
            # Index the single column directly instead of materialising a
            # permuted copy of the whole matrix for every column.
            x1[:,c] = x1[ids,c]
        xs.append(x1)

    if xs:
        xs = np.vstack(xs)
    else:
        # np.vstack([]) raises; return a correctly shaped empty block instead.
        xs = np.empty((0, n_features), dtype=x.dtype)
    ys = np.ones(xs.shape[0])

    if include_raw:
        return np.vstack([x,xs]), np.concatenate([y,ys])
    return xs,ys

In [10]:
# Every column except the label ('target') and the row identifier ('ID_code')
# is used as a model input.
features = [col for col in df_train.columns if col not in ['target', 'ID_code']]
#features = ['var_12','var_81'] # check these 2 features first
# in future we can check 1 feature at a time
# Test-set feature matrix as a NumPy array, columns in the order of `features`.
X_test = df_test[features].values

In [11]:
def mtr_encodes(x,y,xt,xte,names,window_sizes=[20,50]):
    """Append rolling mean-target-rate columns for every feature column.

    Each column of ``x`` / ``xt`` / ``xte`` is passed to ``mtr_encode`` and the
    encoded columns it returns are stacked to the right of the raw features.

    Parameters
    ----------
    x, xt, xte : np.ndarray, 2-D
        Feature matrices with the same number of columns; all three are
        forwarded column by column to ``mtr_encode``.
    y : array-like
        Labels aligned with the rows of ``x``.
    names : list of str
        Names of the raw feature columns; the caller's list is not modified.
    window_sizes : list of int
        Forwarded to ``mtr_encode``; one encoded column per entry per feature.

    Returns
    -------
    (x, xt, xte, names)
        The three widened matrices and the extended list of column names,
        where each new column is called ``<feature>_mtr_<window_size>``.
    """
    raw_train, raw_valid, raw_test = x.copy(), xt.copy(), xte.copy()
    train_blocks = [raw_train]
    valid_blocks = [raw_valid]
    test_blocks = [raw_test]
    all_names = names.copy()

    n_features = raw_train.shape[1]
    for col_idx in range(n_features):
        base_name = all_names[col_idx]
        print('feature mtr encoding',base_name)
        encoded = mtr_encode(
            raw_train[:,col_idx],
            y,
            raw_valid[:,col_idx],
            raw_test[:,col_idx],
            window_sizes,
        )
        train_blocks.append(encoded[0])
        valid_blocks.append(encoded[1])
        test_blocks.append(encoded[2])
        for ws in window_sizes:
            all_names.append('%s_mtr_%d'%(base_name,ws))

    return (
        np.hstack(train_blocks),
        np.hstack(valid_blocks),
        np.hstack(test_blocks),
        all_names,
    )

def mtr_encode(x,y,xt,xte,window_sizes):
    """Mean-target-rate encode one feature column.

    The training rows are split into two stratified halves (fixed
    ``random_state=42``); each half is encoded with statistics computed by
    ``mtr_pd`` on the other half, so no training row is encoded from its own
    label. ``xt`` and ``xte`` are encoded with statistics from all of ``x``.

    Returns
    -------
    (train_enc, xt_enc, xte_enc)
        ``train_enc`` is a float32 array of shape ``(len(x), len(window_sizes))``;
        the other two are whatever ``mtr_pd`` returns for ``xt`` and ``xte``.
    """
    row_ids = np.arange(x.shape[0])
    half_a, half_b, y_a, y_b = train_test_split(
        row_ids, y, test_size=0.5, random_state=42, stratify=y
    )

    train_enc = np.zeros([x.shape[0],len(window_sizes)]).astype(np.float32)
    # Statistics from half A encode half B ...
    train_enc[half_b] = mtr_pd(x[half_a],y_a,x[half_b],None,window_sizes)[1]
    # ... and statistics from half B encode half A.
    train_enc[half_a] = mtr_pd(x[half_b],y_b,x[half_a],None,window_sizes)[1]

    # Validation / test columns use statistics from the full training column.
    full_fit = mtr_pd(x,y,xt,xte,window_sizes)
    return train_enc,full_fit[1],full_fit[2]

def mtr_pd(x,y,xt,xte,window_sizes):
    """Rolling mean-target-rate encoding of one feature, computed with pandas.

    The mean of ``y`` is computed for every distinct value of ``x``; the
    distinct values are sorted and the per-value means are smoothed with a
    trailing rolling mean. One smoothed column is produced per entry of
    ``window_sizes``, using a window of ``n_distinct // window_size`` distinct
    values (at least 1). The smoothed values are then looked up for ``x``,
    ``xt`` and, when given, ``xte``; values never seen in ``x`` become NaN.

    Parameters
    ----------
    x : array-like
        Feature values used to fit the statistics.
    y : np.ndarray or pd.Series
        Labels aligned with ``x`` (must support ``.astype``).
    xt : array-like
        Feature values to encode (validation side).
    xte : array-like or None
        Optional second set of values to encode (test side). When given, the
        NaN ratios of the first encoded column for ``xte`` and ``xt`` are
        appended to ``mtr_null.txt`` in the working directory.
    window_sizes : sequence of int
        Divisors that determine the rolling window lengths.

    Returns
    -------
    (x_enc, xt_enc, xte_enc)
        2-D arrays with ``len(window_sizes)`` columns; ``xte_enc`` is ``None``
        when ``xte`` is ``None``.
    """
    col = 'mean_y'
    tr = pd.DataFrame()
    tr['y'] = y.astype(np.float32)
    tr['x'] = x
    df = tr.groupby('x').agg({'y':'mean'})
    df.columns = [col]
    df = df.reset_index()
    df = df.sort_values('x')

    cols = []
    n_distinct = df.shape[0]
    for k, ws in enumerate(window_sizes):
        # A feature with fewer distinct values than `ws` would give a window of
        # 0, which pandas rejects (min_periods=1 > window=0); clamp to 1 so the
        # encoding degrades to the unsmoothed per-value mean.
        span = max(1, n_distinct//ws)
        # Index-qualified name so two window sizes that map to the same span
        # still get separate columns.
        name = 'mtr_%d_%d'%(k,span)
        df[name] = df[col].rolling(span,min_periods=1).mean()
        cols.append(name)

    # Left merges keep the row order of the frame being encoded.
    tr = tr.merge(df,on='x',how='left')
    te = pd.DataFrame()
    te['x'] = xt
    te = te.merge(df,on='x',how='left')

    if xte is not None:
        tes = pd.DataFrame()
        tes['x'] = xte
        tes = tes.merge(df,on='x',how='left')
        test_ratio = tes[cols[0]].isnull().sum()*1.0/tes.shape[0]
        valid_ratio = te[cols[0]].isnull().sum()*1.0/te.shape[0]
        xte = tes[cols].values
        # Context manager so the log file is closed even if a write fails.
        with open('mtr_null.txt','a') as f:
            f.write('test null ratio %.4f\n'%(test_ratio))
            f.write('valid null ratio %.4f\n\n'%(valid_ratio))
    x,xt = tr[cols].values,te[cols].values
    return x,xt,xte

def mtr_gd(x,y,xt,xte,window_sizes=[500,1000]):
    """cuDF variant of the rolling mean-target-rate encoding.

    Mirrors ``mtr_pd`` but does the group-by and the merges with cuDF
    (imported as ``gd``) and the ``to_pandas`` / ``merge`` helpers from
    ``cudf_workaround``. Unlike ``mtr_pd``, the entries of ``window_sizes``
    are used directly as rolling window lengths, and missing values in ``x``
    are replaced by 0 before grouping.

    Parameters
    ----------
    x : array-like
        Feature values used to fit the statistics.
    y : array-like
        Labels aligned with ``x`` (must support ``.astype``).
    xt : array-like
        Feature values to encode (validation side).
    xte : array-like or None
        Optional second set of values to encode (test side). When given, the
        NaN ratios of the first encoded column for ``xte`` and ``xt`` are
        appended to ``mtr_null.txt`` in the working directory.
    window_sizes : sequence of int
        Rolling window lengths, one encoded column per entry.

    Returns
    -------
    (x_enc, xt_enc, xte_enc)
        2-D arrays with ``len(window_sizes)`` columns; ``xte_enc`` is ``None``
        when ``xte`` is ``None``.
    """
    col = 'mean_y'
    tr = gd.DataFrame()
    tr['y'] = y.astype(np.float32)
    tr['x'] = np.ascontiguousarray(x)#.astype(np.float32)
    tr['x'] = tr['x'].fillna(0)
    #print(tr['x'].to_pandas().isnull().sum())
    df = tr.groupby('x').agg({'y':'mean'})
    df = df.sort_values('x')
    pdf = to_pandas(df)

    # The rolling mean is computed on the pandas copy of the grouped frame.
    # NOTE(review): this reads pdf['mean_y'], i.e. it assumes the aggregated
    # column comes back named 'mean_y' — confirm against the cuDF version in use.
    cols = []
    for i in window_sizes:
        pdf['mtr_%d'%i] = pdf[col].rolling(i,min_periods=1).mean()
        cols.append('mtr_%d'%i)
    del df
    df = gd.from_pandas(pdf)
    tr = merge(tr,df,on='x',how='left')
    tr = to_pandas(tr)

    te = gd.DataFrame()
    te['x'] = np.ascontiguousarray(xt)
    te = merge(te,df,on='x',how='left')
    te = to_pandas(te)
    if xte is not None:
        tes = gd.DataFrame()
        tes['x'] = np.ascontiguousarray(xte)
        tes = merge(tes,df,on='x',how='left')
        tes = to_pandas(tes)
        test_ratio = tes[cols[0]].isnull().sum()*1.0/tes.shape[0]
        valid_ratio = te[cols[0]].isnull().sum()*1.0/te.shape[0]
        xte = tes[cols].values
        del tes
        # Context manager so the log file is closed even if a write fails.
        with open('mtr_null.txt','a') as f:
            f.write('test null ratio %.4f\n'%(test_ratio))
            f.write('valid null ratio %.4f\n\n'%(valid_ratio))
    x,xt = tr[cols].values,te[cols].values
    return x,xt,xte

In [12]:
# Booster parameters shared by both xgb.train calls in the cross-validation loop.
xgb_params =  {
    'objective': 'binary:logistic',
    #'objective':'reg:linear',
    # NOTE(review): 'gpu_hist' presumably requires a GPU-enabled XGBoost build — confirm.
    'tree_method': 'gpu_hist',
    'eta':0.1,
    'nthread': 16,
    # NOTE(review): 'num_class' is normally a multi-class setting; verify it is
    # needed (or tolerated) together with 'binary:logistic'.
    'num_class':1,
    # Depth-1 trees: every tree is a single split.
    'max_depth': 1,
    'silent':1,
    'subsample':0.5,
    'colsample_bytree': 0.5,
    'min_child_weight':100,
    'eval_metric':'auc',
}

In [None]:
# 5-fold stratified cross-validation. Per fold:
#   1. add mean-target-rate (mtr) columns to the train / valid / test features,
#   2. train a first model on the mtr features,
#   3. generate shuffled positive rows and keep those the first model scores
#      at or above the lowest-decile threshold,
#   4. retrain on the original + selected synthetic rows,
#   5. store the out-of-fold and test predictions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
# .copy(): `oof` and `predictions` get new columns below; without the copy they
# are slices of df_train / df_test and the writes trigger SettingWithCopy
# behaviour (and may not be applied at all under copy-on-write).
oof = df_train[['ID_code', 'target']].copy()
# Float initial value so probabilities are not written into an integer column.
oof['predict'] = 0.0
predict_col = oof.columns.get_loc('predict')
predictions = df_test[['ID_code']].copy()
val_aucs = []
feature_importance_df = pd.DataFrame()

for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train, df_train['target'])):
    X_train, y_train = df_train.iloc[trn_idx][features], df_train.iloc[trn_idx]['target']
    X_valid, y_valid = df_train.iloc[val_idx][features], df_train.iloc[val_idx]['target']
    
    # N repetitions per fold are averaged; with N = 1 this is a single run.
    N = 1
    p_valid,yp = 0,0
    for i in range(N):
        # mtr
        X_train0, X_valid0, X_test0,names = mtr_encodes(X_train.values,y_train,X_valid.values,
                                X_test,names=features,window_sizes=[20, 50])
        print("Training on mtr data...")
        train_dataset = xgb.DMatrix(X_train0,y_train)
        valid_dataset = xgb.DMatrix(X_valid0,y_valid)
        watchlist = [(train_dataset, 'train'), (valid_dataset, 'valid')]
        xgb_mtr_clf = xgb.train(xgb_params,
                                          train_dataset,
                                          evals=watchlist,
                                          num_boost_round=12000,
                                          early_stopping_rounds=1000,
                                          verbose_eval=100
                                          )
        # augmentation
        print("Augmenting....")
        augments = 10
        X_t, y_t = augment(X_train0, y_train.values, t=augments, include_raw=False)
        print("Augmented data shape:", X_t.shape)
        # Score the synthetic rows with the first model and drop those below
        # the int(100/augments)-th percentile of the scores.
        X_t_prob = xgb_mtr_clf.predict(xgb.DMatrix(X_t))
        threshold = np.percentile(X_t_prob, int(100/augments))
        
        X_t = np.vstack((X_train0,
                   X_t[X_t_prob>=threshold,:]))
        y_t = np.hstack((y_train, y_t[X_t_prob>=threshold]))
        print("Selected augmented data shape:", X_t.shape)
        
        X_t = pd.DataFrame(X_t,columns=names).astype('float32')
        X_valid0 = pd.DataFrame(X_valid0,columns=names).astype('float32')
        X_test0 = pd.DataFrame(X_test0,columns=names).astype('float32')
        print(X_t.shape,X_valid0.shape,X_test0.shape)
        assert X_t.shape[1]==X_valid0.shape[1]
        assert X_t.shape[1]==X_test0.shape[1]

        train_dataset = xgb.DMatrix(X_t,y_t)
        valid_dataset = xgb.DMatrix(X_valid0,y_valid)
        watchlist = [(train_dataset, 'train'), (valid_dataset, 'valid')]
        xgb_clf = xgb.train(xgb_params,
                                          train_dataset,
                                          evals=watchlist,
                                          num_boost_round=12000,
                                          early_stopping_rounds=1000,
                                          verbose_eval=100
                                          )
        # NOTE(review): predictions use 50 trees beyond the early-stopping best
        # iteration — confirm this offset is intentional.
        best_iteration = xgb_clf.best_iteration + 50
        
        p_valid += xgb_clf.predict(valid_dataset, ntree_limit=best_iteration)
        yp += xgb_clf.predict(xgb.DMatrix(X_test0), ntree_limit=best_iteration)

    # Positional write (val_idx holds row positions from skf.split) instead of
    # the chained `oof['predict'][val_idx] = ...`.
    oof.iloc[val_idx, predict_col] = p_valid/N
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    predictions['fold{}'.format(fold+1)] = yp/N

In [None]:
# submission
# Final score = row-wise mean of the per-fold prediction columns (every column
# of `predictions` other than 'ID_code' and 'target').
predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
# Keep the per-fold predictions alongside the averaged score.
predictions.to_csv('all_predictions.csv', index=None)
# Submission file: ID_code plus the averaged target.
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']
sub_df.to_csv("submission.csv", index=False)
# Out-of-fold predictions for the training rows.
oof.to_csv('oof.csv', index=False)