# Install lightgbm

In [2]:
!/home/user/anaconda3/bin/pip install lightgbm



# Download from kaggle with kaggle driver

In [62]:
%%bash
/home/user/anaconda3/bin/kaggle competitions download -c santander-customer-transaction-prediction

train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
Downloading sample_submission.csv.zip to /home/user/work/Santader

test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


  0%|          | 0.00/463k [00:00<?, ?B/s]100%|██████████| 463k/463k [00:00<00:00, 5.92MB/s]


In [76]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import zipfile

import os
import subprocess

from datetime import datetime
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from IPython.core.display import display, HTML, display_html
display(HTML("<style>.container { width:99% !important; }</style>")) # set full window wide cells (only in this notebook)

def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)
    
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

# Make dir and unzip all

In [17]:
#zip_ref = zipfile.ZipFile('train.csv.zip', 'r')
#zip_ref.extractall()
#zip_ref.close()

os.mkdir('input')
zipfile.ZipFile('train.csv.zip', 'r').extractall('input/')
zipfile.ZipFile('test.csv.zip', 'r').extractall('input/')
zipfile.ZipFile('sample_submission.csv.zip', 'r').extractall('input/')

In [3]:
%%bash
ls -s

итого 250640
     4 input
   464 sample_submission.csv.zip
    16 Santader.ipynb
124920 test.csv.zip
125236 train.csv.zip


In [2]:
%%bash
ls input/ -s

итого 592140
  2628 sample_submission.csv
294460 test.csv
295052 train.csv


In [4]:



#START

In [9]:
path='input/'
df_train=pd.read_csv(path+'train.csv').drop('ID_code',axis=1)
df_test=pd.read_csv(path+'test.csv').drop('ID_code',axis=1)
len(train[train.target==0]), len(train[train.target==1])

(179902, 20098)

In [2]:
## Inspiration from
#https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment
def augment(df_train,num_n=1,num_p=2):
    newtrain=[train]
    
    for i in range(num_n):
        n=df_train[df_train.target==0]
        newtrain.append(n.apply(lambda x: x.values.take(np.random.permutation(len(n)))))
    
    for i in range(num_p):
        p=df_train[df_train.target==1]
        newtrain.append(p.apply(lambda x: x.values.take(np.random.permutation(len(p)))))
    return pd.concat(newtrain)
#df=oversample(train,2,1)

In [3]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': -1,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
    #'scaleposweight': 1,
    #'is_unbalance': True
}

In [6]:
%%time
#LightGBM PREDICTION
result=np.zeros(df_test.shape[0])

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=777)
for counter, (train_index, valid_index) in enumerate(rskf.split(df_train, df_train.target), 1):
    #print (counter)
    
    #Train data
    t=df_train.iloc[train_index]
    #param['scaleposweight'] = sum(t.target==0) / sum(t.target>0) #
    t=augment(t)
    trn_data = lgb.Dataset(t.drop("target",axis=1), label=t.target)
    
    #Validation data
    v=train.iloc[valid_index]
    val_data = lgb.Dataset(v.drop("target",axis=1), label=v.target)
    print (counter)
    #Training
    model = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 4000)
    result += model.predict(df_test)

1
Training until validation scores don't improve for 4000 rounds.
[500]	training's auc: 0.888273	valid_1's auc: 0.880517
[1000]	training's auc: 0.893134	valid_1's auc: 0.884581
[1500]	training's auc: 0.896485	valid_1's auc: 0.887492
[2000]	training's auc: 0.899162	valid_1's auc: 0.889545
[2500]	training's auc: 0.901348	valid_1's auc: 0.891479
[3000]	training's auc: 0.903076	valid_1's auc: 0.892915
[3500]	training's auc: 0.904741	valid_1's auc: 0.894421
[4000]	training's auc: 0.906188	valid_1's auc: 0.895592
[4500]	training's auc: 0.907434	valid_1's auc: 0.896519
[5000]	training's auc: 0.908628	valid_1's auc: 0.897403
[5500]	training's auc: 0.909676	valid_1's auc: 0.898092
[6000]	training's auc: 0.910662	valid_1's auc: 0.898767
[6500]	training's auc: 0.911604	valid_1's auc: 0.899313
[7000]	training's auc: 0.912477	valid_1's auc: 0.89979
[7500]	training's auc: 0.913294	valid_1's auc: 0.900155
[8000]	training's auc: 0.914072	valid_1's auc: 0.900502
[8500]	training's auc: 0.9148	valid_1's 

[17000]	training's auc: 0.924635	valid_1's auc: 0.902081
[17500]	training's auc: 0.925141	valid_1's auc: 0.902082
[18000]	training's auc: 0.925647	valid_1's auc: 0.90205
[18500]	training's auc: 0.926152	valid_1's auc: 0.902064
[19000]	training's auc: 0.926658	valid_1's auc: 0.902075
[19500]	training's auc: 0.927155	valid_1's auc: 0.902065
[20000]	training's auc: 0.927641	valid_1's auc: 0.902031
[20500]	training's auc: 0.928129	valid_1's auc: 0.901992
[21000]	training's auc: 0.928614	valid_1's auc: 0.901952
Early stopping, best iteration is:
[17119]	training's auc: 0.924757	valid_1's auc: 0.902104
5
Training until validation scores don't improve for 4000 rounds.
[500]	training's auc: 0.888108	valid_1's auc: 0.882371
[1000]	training's auc: 0.893017	valid_1's auc: 0.887069
[1500]	training's auc: 0.896327	valid_1's auc: 0.890014
[2000]	training's auc: 0.8991	valid_1's auc: 0.892224
[2500]	training's auc: 0.901368	valid_1's auc: 0.894037
[3000]	training's auc: 0.903112	valid_1's auc: 0.8953

[10000]	training's auc: 0.916203	valid_1's auc: 0.902606
[10500]	training's auc: 0.916856	valid_1's auc: 0.902674
[11000]	training's auc: 0.917476	valid_1's auc: 0.902734
[11500]	training's auc: 0.918092	valid_1's auc: 0.902734
[12000]	training's auc: 0.918678	valid_1's auc: 0.902697
[12500]	training's auc: 0.91926	valid_1's auc: 0.90269
[13000]	training's auc: 0.919839	valid_1's auc: 0.902646
[13500]	training's auc: 0.920419	valid_1's auc: 0.902609
[14000]	training's auc: 0.920977	valid_1's auc: 0.902563
[14500]	training's auc: 0.921532	valid_1's auc: 0.902547
[15000]	training's auc: 0.922068	valid_1's auc: 0.902497
Early stopping, best iteration is:
[11407]	training's auc: 0.917972	valid_1's auc: 0.902758
9
Training until validation scores don't improve for 4000 rounds.
[500]	training's auc: 0.887971	valid_1's auc: 0.881639
[1000]	training's auc: 0.892903	valid_1's auc: 0.885777
[1500]	training's auc: 0.896179	valid_1's auc: 0.889013
[2000]	training's auc: 0.898878	valid_1's auc: 0.8

In [7]:
#SAVE
submission = pd.read_csv(path/'sample_submission.csv')
submission['target'] = result/counter
filename="{:%Y-%m-%d_%H_%M}_sub.csv".format(datetime.now())
submission.to_csv(filename, index=False)

with zipfile.ZipFile(filename+'.zip', 'w', zipfile.ZIP_DEFLATED) as newzip:
    newzip.write(filename)
print(filename+'.zip')

2019-03-30_16_32_sub.csv.zip


In [172]:
#TRAIN CALC MIN/MAX percent in each feathure
arr_min0, arr_min1, arr_max0, arr_max1, arr_count = [], [], [], [], []
for col in df_train.iloc[:,1:]:
    arr_min0.append(df_train[col][df_train['target']==0].min())
    arr_min1.append(df_train[col][df_train['target']==1].min())
    arr_max0.append(df_train[col][df_train['target']==0].max())
    arr_max1.append(df_train[col][df_train['target']==1].max())
    arr_count.append(len(df_train[col].value_counts()))
df_minmax = pd.DataFrame({ 'min0':arr_min0, 'min1':arr_min1, 'max1':arr_max1, 'max0':arr_max0, 'count':arr_count, 'diff':abs(np.array(arr_min0)-np.array(arr_max0))/100 })
df_minmax['diff_min'] = round(abs(df_minmax['min0']-df_minmax['min1'])/df_minmax['diff'], 2)#.astype(int)
df_minmax['diff_max'] = round(abs(df_minmax['max1']-df_minmax['max0'])/df_minmax['diff'], 2)#.astype(int)
display_side_by_side(df_minmax.head(), df_minmax.tail())
df_minmax['diff_min'].mean(), df_minmax['diff_max'].mean(), df_minmax['diff_min'].median(), df_minmax['diff_max'].median(), df_minmax['diff_min'].std(), df_minmax['diff_max'].std()

Unnamed: 0,min0,min1,max1,max0,count,diff,diff_min,diff_max
0,0.4084,0.4528,19.4583,20.315,94672,0.199066,0.22,4.3
1,-15.0434,-14.037,9.0298,10.3768,108932,0.254202,3.96,5.3
2,2.1171,2.9462,18.2941,19.353,86555,0.172359,4.81,6.14
3,-0.0402,0.374,12.7069,13.1883,74597,0.132285,3.13,3.64
4,5.0748,5.8762,15.6925,16.6714,63515,0.115966,6.91,8.44

Unnamed: 0,min0,min1,max1,max0,count,diff,diff_min,diff_max
195,-5.261,-5.0185,4.0881,4.2729,57870,0.095339,2.54,1.94
196,-14.2096,-14.0204,17.1614,18.3215,125560,0.325311,0.58,3.57
197,5.9606,6.119,11.7069,12.0004,40537,0.060398,2.62,4.86
198,6.2993,6.5587,25.8571,26.0791,94153,0.197798,1.31,1.12
199,-38.8528,-38.8528,24.5646,28.5007,149430,0.673535,0.0,5.84


(4.487950000000001,
 4.445450000000001,
 4.12,
 4.225,
 2.752507910339662,
 2.8568042372113536)

In [173]:
#TEST CALC MIN/MAX percent in each feathure
pr = 2 #percents for cut from min and max
arr_min, arr_max, arr_count = [], [], []
for col in df_test.iloc[:,:]:
    arr_min.append(df_test[col].min())
    arr_max.append(df_test[col].max())
    arr_count.append(len(df_test[col].value_counts()))
df_minmax = pd.DataFrame({ 'min':arr_min, 'max':arr_max, 'count':arr_count, 'pr':abs(np.array(arr_min)-np.array(arr_max))/100 })
df_minmax['diff_min_pr'] = df_minmax['min'] + pr*df_minmax['pr']#.astype(int)
df_minmax['diff_max_pr'] = df_minmax['max'] - pr*df_minmax['pr']#.astype(int)
display_side_by_side(df_minmax.head(), df_minmax.tail())
#df_minmax['diff_min'].mean(), df_minmax['diff_max'].mean(), df_minmax['diff_min'].median(), df_minmax['diff_max'].median(), df_minmax['diff_min'].std(), df_minmax['diff_max'].std()

Unnamed: 0,min,max,count,pr,diff_min_pr,diff_max_pr
0,0.1887,22.3234,65580,0.221347,0.631394,21.880706
1,-15.0434,9.3851,71661,0.244285,-14.55483,8.89653
2,2.3552,18.7141,61865,0.163589,2.682378,18.386922
3,-0.0224,13.142,56507,0.131644,0.240888,12.878712
4,5.4844,16.0371,49995,0.105527,5.695454,15.826046

Unnamed: 0,min,max,count,pr,diff_min_pr,diff_max_pr
195,-4.9119,4.5454,46482,0.094573,-4.722754,4.356254
196,-13.9442,15.9207,78038,0.298649,-13.346902,15.323402
197,6.1696,12.2758,34817,0.061062,6.291724,12.153676
198,6.584,26.5384,65262,0.199544,6.983088,26.139312
199,-39.4578,27.9074,85933,0.673652,-38.110496,26.560096


In [174]:
submission = pd.read_csv('2019-03-28_18_23_sub.csv') #0.901
#df_test_t = pd.concat([submission['target'].round().astype(int), df_test], axis=1)
df_test_t = pd.concat([submission['target'], df_test], axis=1)
len(df_test_t[df_test_t['target']>0.5])

12275

In [175]:
idx_for_trim = []
for idx,col in enumerate(df_test_t.iloc[:,1:]):
    diff_min_pr = df_minmax.loc[idx]['diff_min_pr']
    diff_max_pr = df_minmax.loc[idx]['diff_max_pr']
    #print(idx, df_test_t[(df_test_t[col]<diff_min_pr)|(df_test_t[col]>diff_max_pr)]['target'].sum(), df_minmax.loc[idx]['min'], diff_min_pr, diff_max_pr, df_minmax.loc[idx]['max'])
    idx_for_trim.append( df_test_t[(df_test_t[col]<diff_min_pr)|(df_test_t[col]>diff_max_pr)]['target'].index )
    
    #display_side_by_side(df_test_t[[col,'target']].sort_values(col, ascending=False).head(), df_test_t[[col,'target']].sort_values(col, ascending=True).head())
idx_for_trim = np.hstack(idx_for_trim)
len(idx_for_trim), len(np.unique(idx_for_trim))
idx_for_trim = np.unique(idx_for_trim)
print(len(idx_for_trim), len(np.unique(idx_for_trim)), 'Found class 1 mistakes:', len(df_test_t.loc[idx_for_trim][df_test_t['target']>0.5]))

2322 2322 Found class 1 mistakes: 190


  if sys.path[0] == '':


In [176]:
#Replace class 1 mistakes with class 0
idx_for_trim = df_test_t.loc[idx_for_trim][df_test_t['target']>0.5].index
df_test_t['target'].loc[idx_for_trim] = df_test_t['target'].loc[idx_for_trim] - 0.2

  


In [177]:
#TEST must be zero
len(df_test_t.loc[idx_for_trim][df_test_t['target']>0.5])

  


92

In [178]:
#SAVE
submission['target'] = df_test_t['target']
filename="{:%Y-%m-%d_%H_%M}_sub.csv".format(datetime.now())
submission.to_csv(filename, index=False)

with zipfile.ZipFile(filename+'.zip', 'w', zipfile.ZIP_DEFLATED) as newzip:
    newzip.write(filename)
print(filename+'.zip')

2019-04-01_22_58_sub.csv.zip


In [None]:
#END











In [None]:
import matplotlib.pyplot as plt
%pylab inline
%config InlineBackend.figure_format = 'retina'

fig, ax = plt.subplots(figsize=(70,35))

ax.plot(df_minmax['count'])
ax.legend(loc='upper left')
ax.grid()
plt.show()

%%time
sub = model.predict(train.iloc[:,1:])
sub = pd.DataFrame(sub)
sub.sum()

In [147]:
sub = pd.DataFrame(sub)
len(sub), len(sub[sub[0]<0.4]), len(sub[sub[0]>=0.5]), len(sub[sub[0]>0.9])

(200000, 188023, 10401, 1450)

In [151]:
sum_moved = pd.read_csv(name)
sum_moved.head()

Unnamed: 0,ID_code,target
0,test_0,0.103143
1,test_1,0.207697
2,test_2,0.173284
3,test_3,0.199731
4,test_4,0.044954


In [2]:
submission = pd.read_csv('2019-03-27_19_27_sub.csv')
submission.head()

Unnamed: 0,ID_code,target
0,test_0,0.103143
1,test_1,0.207697
2,test_2,0.173284
3,test_3,0.199731
4,test_4,0.044954


In [29]:
submission = pd.read_csv('2019-03-28_18_23_sub.csv.zip')
submission.head()

Unnamed: 0,ID_code,target
0,test_0,0.182981
1,test_1,0.315891
2,test_2,0.269531
3,test_3,0.355173
4,test_4,0.081307


In [3]:
sum_moved = submission.copy()
#len(sum_moved[(submission['target']<0.5)&(submission['target']>0.48)])
sum_moved['target'] = sum_moved['target']*1.5
sum_moved['target'][sum_moved['target']>0.998877] = 0.998877
len(sum_moved), len(sum_moved[sum_moved['target']<0.5]), len(sum_moved[sum_moved['target']>=0.5]), len(sum_moved[sum_moved['target']>0.9]), len(sum_moved[sum_moved['target']>1])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(200000, 187609, 12391, 3958, 0)

In [27]:
sum_moved = submission.copy()
print('original 0.901', len(sum_moved), len(sum_moved[sum_moved['target']<0.5]), len(sum_moved[sum_moved['target']>=0.5]), len(sum_moved[sum_moved['target']>0.8]), len(sum_moved[sum_moved['target']>1]))#len(sum_moved[(submission['target']<0.5)&(submission['target']>0.48)])
sum_moved['target'][sum_moved['target']>0.8] = sum_moved['target'][sum_moved['target']>0.8] - 0.1
#sum_moved['target'] = sum_moved['target'].apply(lambda x: x>0.8 - 0.1 if x>0.8 else x)
print('after 0.8 - .1', len(sum_moved), len(sum_moved[sum_moved['target']<0.5]), len(sum_moved[sum_moved['target']>=0.5]), len(sum_moved[sum_moved['target']>0.8]), len(sum_moved[sum_moved['target']>1]))

original 0.901 200000 187725 12275 2929 0
after 0.8 - .1 200000 187725 12275 1222 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
sum_moved = submission.copy()
for i in np.arange(0.0,1,0.1): print(i, i+.1, len(sum_moved[(sum_moved['target']>=i)&(sum_moved['target']<i+.1)]))
print(len(sum_moved[sum_moved['target']<0.5]),'\n')
sum_moved['target'] = sum_moved['target']*0.95
for i in np.arange(0.0,1,0.1): print(i, i+.1, len(sum_moved[(sum_moved['target']>=i)&(sum_moved['target']<i+.1)]))
print(len(sum_moved[sum_moved['target']<0.5]),'\n')

0.0 0.1 121854
0.1 0.2 34290
0.2 0.30000000000000004 16237
0.30000000000000004 0.4 9263
0.4 0.5 6081
0.5 0.6 4045
0.6000000000000001 0.7000000000000001 3055
0.7000000000000001 0.8 2246
0.8 0.9 1707
0.9 1.0 1222
187725 

0.0 0.1 127514
0.1 0.2 33210
0.2 0.30000000000000004 15262
0.30000000000000004 0.4 8674
0.4 0.5 5545
0.5 0.6 3693
0.6000000000000001 0.7000000000000001 2711
0.7000000000000001 0.8 1987
0.8 0.9 1404
0.9 1.0 0
190205 



In [40]:
sum_moved = submission.copy()
print(len(sum_moved), len(sum_moved[sum_moved['target']<0.5]), len(sum_moved[sum_moved['target']>=0.5]), len(sum_moved[sum_moved['target']>0.9]), len(sum_moved[sum_moved['target']>1]))
sum_moved['target'] = sum_moved['target']+0.08
sum_moved['target'][sum_moved['target']>0.9] = 0.9
len(sum_moved), len(sum_moved[sum_moved['target']<0.5]), len(sum_moved[sum_moved['target']>=0.5]), len(sum_moved[sum_moved['target']>0.9]), len(sum_moved[sum_moved['target']>1])

200000 187725 12275 1222 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(200000, 183071, 16929, 0, 0)

In [41]:
filename="{:%Y-%m-%d_%H_%M}_sub.csv".format(datetime.now())
sum_moved.to_csv(filename, index=False)

with zipfile.ZipFile(filename+'.zip', 'w', zipfile.ZIP_DEFLATED) as newzip:
    newzip.write(filename)
print(filename+'.zip')

2019-03-29_13_08_sub.csv.zip


In [5]:
%%bash
ls -s

итого 332484
  6200 2019-03-27_15_47_sub.csv
  2340 2019-03-27_15_47_sub.csv.zip
  6200 2019-03-27_19_27_sub.csv
  2340 2019-03-27_19_27_sub.csv.zip
  2308 2019-03-27_22_35_sub.csv.zip
  6184 2019-03-27_22_40_sub.csv
  2336 2019-03-27_22_40_sub.csv.zip
  6188 2019-03-27_22_49_sub.csv
  2336 2019-03-27_22_49_sub.csv.zip
  6136 2019-03-28_08_46_sub.csv
  2316 2019-03-28_08_46_sub.csv.zip
  6156 2019-03-28_18_23_sub.csv
  2336 2019-03-28_18_23_sub.csv.zip
  6156 2019-03-28_21_42_sub.csv
  2336 2019-03-28_21_42_sub.csv.zip
  6160 2019-03-28_21_46_sub.csv
  2336 2019-03-28_21_46_sub.csv.zip
  6160 2019-03-29_12_06_sub.csv
  2336 2019-03-29_12_06_sub.csv.zip
     4 input
   464 sample_submission.csv.zip
  1436 Santader.ipynb
     8 Santader_R.ipynb
  1556 subprocess.ipynb
124920 test.csv.zip
125236 train.csv.zip


In [18]:
%%bash
/home/user/anaconda3/bin/kaggle competitions list

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       2596           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge      10529           False  
house-prices-advanced-regression-techniques    2030-01-01 00:00:00  Getting Started  Knowledge       4211           False  
imagenet-object-localization-challenge         2029-12-31 07:00:00  Research         Knowledge         36           False  
competitive-data-science-predict-future-sales  2019-12-31 23:59:00  Playground           Kudos       2614           False  
two-sigma-financial-news                       2019-07-15 23:59:00  Featured          $100,000       2927           False  
aerial-c

In [74]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-27_15_47_sub.csv.zip -m "02 lgbm augment"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.28M [00:00<?, ?B/s]  5%|▌         | 120k/2.28M [00:00<00:01, 1.23MB/s] 60%|██████    | 1.38M/2.28M [00:00<00:00, 1.69MB/s]100%|██████████| 2.28M/2.28M [00:03<00:00, 640kB/s] 


In [107]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-27_19_27_sub.csv.zip -m "03 lgbm augment"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.28M [00:00<?, ?B/s]  3%|▎         | 80.0k/2.28M [00:00<00:02, 809kB/s] 61%|██████    | 1.39M/2.28M [00:00<00:00, 1.13MB/s]100%|██████████| 2.28M/2.28M [00:05<00:00, 452kB/s] 


In [199]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-27_22_49_sub.csv.zip -m "05 lgbm augment *1.1"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.28M [00:00<?, ?B/s]  6%|▌         | 136k/2.28M [00:00<00:01, 1.27MB/s] 70%|███████   | 1.60M/2.28M [00:00<00:00, 1.75MB/s]100%|██████████| 2.28M/2.28M [00:05<00:00, 476kB/s] 


In [5]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-28_08_46_sub.csv.zip -m "06 lgbm augment *1.5"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.26M [00:00<?, ?B/s]  4%|▍         | 104k/2.26M [00:00<00:02, 1.06MB/s] 73%|███████▎  | 1.66M/2.26M [00:00<00:00, 1.46MB/s]100%|██████████| 2.26M/2.26M [00:10<00:00, 222kB/s] 


In [15]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-28_18_23_sub.csv.zip -m "07 lgbm augment 3x"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.28M [00:00<?, ?B/s]  3%|▎         | 80.0k/2.28M [00:00<00:02, 796kB/s] 60%|█████▉    | 1.36M/2.28M [00:00<00:00, 1.11MB/s]100%|██████████| 2.28M/2.28M [00:08<00:00, 293kB/s] 


In [30]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-28_21_42_sub.csv.zip -m "08 lgbm augment 3x targ>0.8-0.1"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.28M [00:00<?, ?B/s]  6%|▌         | 144k/2.28M [00:00<00:01, 1.32MB/s] 75%|███████▍  | 1.70M/2.28M [00:00<00:00, 1.81MB/s]100%|██████████| 2.28M/2.28M [00:08<00:00, 294kB/s] 


In [6]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-29_12_06_sub.csv.zip -m "09 lgbm augment 3x targ*0.95"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.28M [00:00<?, ?B/s]  3%|▎         | 80.0k/2.28M [00:00<00:02, 807kB/s] 60%|█████▉    | 1.37M/2.28M [00:00<00:00, 1.12MB/s]100%|██████████| 2.28M/2.28M [00:03<00:00, 636kB/s] 


In [42]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-29_12_06_sub.csv.zip -m "09 lgbm augment 3x targ-0.08"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.22M [00:00<?, ?B/s]  6%|▋         | 144k/2.22M [00:00<00:01, 1.35MB/s] 75%|███████▍  | 1.66M/2.22M [00:00<00:00, 1.86MB/s]100%|██████████| 2.22M/2.22M [00:10<00:00, 227kB/s] 


In [42]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-29_13_08_sub.csv.zip -m "10 lgbm augment 3x targ+0.08"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.22M [00:00<?, ?B/s]  6%|▋         | 144k/2.22M [00:00<00:01, 1.35MB/s] 75%|███████▍  | 1.66M/2.22M [00:00<00:00, 1.86MB/s]100%|██████████| 2.22M/2.22M [00:10<00:00, 227kB/s] 


In [78]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-30_18_14_sub.csv.zip -m "11 lgbm augment 9x min and max trim 3%"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/484k [00:00<?, ?B/s] 17%|█▋        | 80.0k/484k [00:00<00:00, 817kB/s]100%|██████████| 484k/484k [00:07<00:00, 65.7kB/s]


In [79]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-30_16_32_sub.csv.zip -m "12 lgbm augment 9x"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.27M [00:00<?, ?B/s]  6%|▌         | 144k/2.27M [00:00<00:01, 1.37MB/s] 78%|███████▊  | 1.77M/2.27M [00:00<00:00, 1.88MB/s]100%|██████████| 2.27M/2.27M [00:08<00:00, 294kB/s] 


In [98]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-30_18_31_sub.csv.zip -m "13 lgbm augment 2x min and max trim 0.5%"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/477k [00:00<?, ?B/s] 28%|██▊       | 136k/477k [00:00<00:00, 1.22MB/s]100%|██████████| 477k/477k [00:04<00:00, 101kB/s] 


In [105]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-31_09_55_sub.csv.zip -m "14 lgbm augment 2x min and max trim 3% (target float!)"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.27M [00:00<?, ?B/s]  2%|▏         | 56.0k/2.27M [00:00<00:04, 549kB/s] 60%|█████▉    | 1.35M/2.27M [00:00<00:01, 770kB/s]100%|██████████| 2.27M/2.27M [00:06<00:00, 372kB/s]


In [126]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-31_10_10_sub.csv.zip -m "15 lgbm augment 2x min and max trim 1% (target float!)"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.26M [00:00<?, ?B/s]  6%|▌         | 144k/2.26M [00:00<00:01, 1.37MB/s] 73%|███████▎  | 1.66M/2.26M [00:00<00:00, 1.88MB/s]100%|██████████| 2.26M/2.26M [00:05<00:00, 397kB/s] 


In [171]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-03-31_10_25_sub.csv.zip -m "16 lgbm augment 2x min and max trim 2% 0.49 (err befor)"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.27M [00:00<?, ?B/s]  6%|▌         | 144k/2.27M [00:00<00:01, 1.35MB/s] 73%|███████▎  | 1.65M/2.27M [00:00<00:00, 1.86MB/s]100%|██████████| 2.27M/2.27M [00:10<00:00, 217kB/s] 


In [179]:
%%bash
/home/user/anaconda3/bin/kaggle competitions submit santander-customer-transaction-prediction -f 2019-04-01_22_58_sub.csv.zip -m "17 lgbm augment 2x min and max trim 2% target-0.2"

Successfully submitted to Santander Customer Transaction Prediction

  0%|          | 0.00/2.27M [00:00<?, ?B/s]  6%|▌         | 136k/2.27M [00:00<00:01, 1.23MB/s] 70%|██████▉   | 1.59M/2.27M [00:00<00:00, 1.69MB/s]100%|██████████| 2.27M/2.27M [00:03<00:00, 695kB/s] 


In [112]:
subprocess.check_output(["ls", "-l", "/dev/null"])

b'crw-rw-rw- 1 root root 1, 3 \xd0\xbc\xd0\xb0\xd1\x80 27 11:07 /dev/null\n'


#cmd = '/home/user/anaconda3/bin/kaggle submit santander-customer-transaction-prediction -f '+filename+'.zip -m "01 lgbm augment"'
#os.system(cmd)
import subprocess
cmd = ['/home/user/anaconda3/bin/kaggle',  'submit', 'santander-customer-transaction-prediction', '-f', '2019-03-27_22_40_sub.csv.zip', '-m' '"04 lgbm augment *1.1"']
subprocess.check_output(cmd)

In [None]:



#COMPARE 2 SUBs

In [24]:
submission1 = pd.read_csv('2019-03-27_15_47_sub.csv') #02
submission2 = pd.read_csv('2019-03-28_18_23_sub.csv') #0
submission2['target1'] = submission1['target']
submission2['diff'] = submission1['target'] - submission2['target']
submission2['tg1'] = submission1['target'].round().astype(int)
submission2['tg2'] = submission2['target'].round().astype(int)
submission2['tg'] = (submission2['tg1']!=submission2['tg2']).astype(int)
len(submission2[submission2['tg']==1])

6170

In [28]:
a = submission2[(submission2['tg']==1)&(submission2['tg2']==1)]
a['diff'].mean()

-0.17320328205207516

In [23]:
submission2[submission2['tg']==1].head(100)

Unnamed: 0,ID_code,target,target1,diff,tg1,tg2,tg
20,test_20,0.504109,0.312896,-0.191214,0,1,1
32,test_32,0.679486,0.460856,-0.218629,0,1,1
138,test_138,0.520818,0.35739,-0.163428,0,1,1
161,test_161,0.542096,0.345987,-0.196109,0,1,1
297,test_297,0.56346,0.384543,-0.178918,0,1,1
323,test_323,0.5472,0.378211,-0.168989,0,1,1
355,test_355,0.660541,0.484281,-0.176259,0,1,1
361,test_361,0.595429,0.37524,-0.220189,0,1,1
375,test_375,0.535741,0.37756,-0.158181,0,1,1
404,test_404,0.599366,0.391805,-0.207561,0,1,1
