# Module

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
from tqdm import tqdm_notebook as tqdm
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore")
gc.enable()

In [2]:
pd.set_option('max_rows', 500)
pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)

# Load Data

In [3]:
train_raw = pd.read_csv('./data/train.csv')
test_raw = pd.read_csv('./data/test.csv')
train_raw.shape, test_raw.shape

((200000, 202), (200000, 201))

In [4]:
train = train_raw.copy()
test = test_raw.copy()

In [6]:
train0 = train[ train['target']==0 ].copy()
train1 = train[ train['target']==1 ].copy()
train.sample(5)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,var_99,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
26959,train_26959,1,10.902,-2.4469,8.3611,9.0721,11.1254,-20.7582,5.9289,23.052,-1.7027,7.7488,7.2279,9.7263,13.9879,6.6539,6.7474,14.5837,8.9329,-15.3794,14.8353,13.061,17.6761,5.3006,6.8549,2.7601,15.0941,13.4273,9.5878,-1.5014,6.4776,6.5432,3.326,15.1735,1.8596,12.0803,11.125,-5.2507,0.0273,2.8159,11.2346,-0.2105,-5.4246,0.8354,11.2028,11.0027,21.867,-6.5235,13.7263,-4.7502,7.5214,2.9336,13.0326,31.1174,1.5176,6.0495,6.8654,16.7275,17.0697,6.5828,8.6548,9.5882,5.4915,-10.0686,2.885,3.9191,4.3162,8.6379,5.1823,10.5702,5.0336,-12.3759,47.238,0.4197,1.0019,22.8037,12.7163,20.2165,8.3896,13.4361,6.7104,12.9666,-2.0214,14.0614,-19.4478,-11.2834,-15.2922,16.2145,5.7705,15.6333,9.0827,5.0037,12.2056,7.0629,6.9441,10.5599,13.373,0.4399,23.9972,35.027,2.3498,-1.8746,-13.667,16.7196,10.8343,1.6356,11.6453,2.8869,11.3984,29.9287,14.3255,10.4788,12.2098,5.2938,1.9017,8.6342,1.0938,0.5469,2.4565,41.7947,5.1012,7.254,33.818,11.3582,-9.1117,6.043,7.2604,12.8605,12.955,1.2358,0.3965,21.2929,13.9372,1.1065,9.5562,6.5202,2.0454,-12.4567,31.3434,13.563,-0.739,1.4853,3.0026,13.4852,7.0206,8.863,7.9419,3.4923,9.0049,2.0253,4.1752,-2.9151,16.8085,5.1612,8.2249,19.9676,2.5991,3.6567,14.14,-9.0241,33.0255,11.3018,31.5758,5.8212,5.8359,11.6741,-6.2794,26.2844,2.8498,-9.7853,11.5471,5.7151,-1.378,3.6194,5.5033,0.2845,20.0954,8.4425,2.3138,9.1526,-4.5214,9.6166,-3.5431,10.4218,10.7097,7.1401,4.0658,-3.1097,10.8821,13.6953,10.9401,0.7766,6.1334,3.3935,0.7549,4.3573,11.6641,-0.3244,0.7191,7.2878,16.5992,-16.9856
173891,train_173891,0,13.3032,1.1754,13.2192,4.367,12.085,2.7608,6.0757,20.1818,4.1484,7.2188,5.6963,3.7118,14.0395,11.1978,10.1512,14.9258,10.6049,-13.4389,11.9389,19.0922,18.4121,24.3287,2.9952,2.8679,10.5971,13.131,-11.8389,-3.8325,6.0759,7.3266,-6.3761,11.4195,-3.3733,19.47,11.7235,10.1116,-1.9405,8.4649,11.5309,-2.4327,-7.9744,15.6393,12.7105,11.4638,16.418,-3.5777,16.0417,-16.6211,9.4083,15.6793,11.1974,10.7947,0.6036,6.643,-9.9924,9.2036,21.8421,6.6956,-5.0313,8.9595,21.9896,-12.9194,-1.4197,3.5659,4.1546,5.5033,7.6317,4.2737,5.0157,3.5613,6.8697,0.9265,4.2854,12.1615,5.26,13.6117,11.7925,14.8217,7.2413,14.5759,7.1642,16.3678,3.9759,7.2153,7.8924,25.8948,3.1484,6.0144,8.4022,3.4543,-11.6191,7.0121,8.0093,11.2178,12.0815,0.2751,14.8108,25.5508,1.6796,0.8257,-14.6943,12.0771,32.6644,1.425,14.207,3.1395,8.4447,23.9606,14.1778,16.8255,5.6815,7.0013,4.5756,7.629,1.6926,1.5316,4.3341,-0.4634,-8.7961,-0.6554,13.7371,10.9188,0.611,6.236,5.5618,12.1664,12.5922,-1.1465,2.4968,14.6335,12.4407,0.5668,6.0508,6.672,-0.0189,-16.8432,8.3989,4.8334,1.3067,14.3227,2.8765,-0.215,16.8496,12.0155,7.291,-0.0795,5.9013,-13.8575,3.9404,14.8896,16.0818,10.1469,7.4469,17.1492,11.7665,-8.5909,12.9294,-11.2214,0.3409,7.0251,28.6479,5.6409,6.3248,15.0092,8.2047,29.8779,2.5668,7.371,4.0931,5.5576,0.0302,-2.3377,-4.0859,9.403,23.6842,14.0052,8.0848,10.3231,4.3466,3.7783,-8.4755,8.7426,-9.8162,6.5983,1.3993,-2.6748,8.6991,-15.9381,11.13,-0.0111,0.7598,12.4358,3.1302,4.1393,17.4902,0.4249,2.3029,7.8723,13.2712,7.9219
45152,train_45152,1,12.2178,3.9364,15.0638,10.9422,9.7375,2.3761,7.1635,11.7227,-0.2645,9.7269,2.6608,-3.6812,14.404,-0.7822,6.3081,14.1215,9.0843,4.4591,20.1248,1.8876,5.2081,19.2709,5.6346,3.493,17.2351,13.5027,0.9559,-1.3952,4.9081,5.623,-8.8506,10.4616,-1.5444,14.5228,10.8281,7.9614,2.3565,5.4237,14.6092,-0.7157,-15.9212,10.2209,12.3989,11.614,9.6838,9.1874,14.4297,3.0801,15.6655,27.715,11.7417,2.0394,0.0525,6.3403,-9.236,3.0912,12.2774,6.409,0.2003,7.1624,9.0494,-12.409,-1.6963,0.7561,7.151,-3.7458,5.007,5.6167,5.0336,-2.6985,32.3287,0.828,-4.8845,25.4249,2.5835,3.3261,9.9095,23.44,4.3169,14.5332,-0.2883,11.5027,-10.1575,-7.5738,-11.3227,15.2356,8.7101,10.6332,6.8358,4.8989,-18.6744,7.0406,14.3364,10.9329,11.5592,-0.0245,-2.3514,4.9156,1.6103,-2.6253,-2.4093,10.1997,21.4549,1.7584,15.3118,3.6723,10.6089,12.6373,14.2537,24.2324,10.5433,5.077,1.7811,12.7998,3.05,6.1783,4.7365,21.1048,-2.6987,1.9112,31.8392,10.3373,-4.2211,-4.4665,-1.2139,11.9755,12.5441,2.4024,-1.5624,18.9825,12.6797,0.8642,8.0154,6.9788,-6.7964,3.6096,28.2736,3.1492,4.3689,7.4252,1.2,4.3913,15.9672,12.0224,7.4071,8.2812,14.4268,-9.7011,3.8368,9.3881,16.6222,6.8156,11.4875,12.262,7.4904,1.7695,13.4245,0.8724,6.4204,14.7026,17.8632,5.773,2.9057,13.8586,-1.8612,23.7331,3.4174,-15.9435,7.8665,5.8433,-0.8372,3.0035,32.8518,4.4599,26.4862,15.3976,-4.6832,7.4394,2.6296,6.6509,1.1062,9.7227,-14.766,17.5572,3.7922,-2.4487,5.8885,0.4461,19.2793,1.6845,-3.464,5.661,2.6182,8.9014,15.6532,2.3686,-6.8534,8.3334,12.3782,9.676
8967,train_8967,0,17.1943,0.1494,8.1002,6.9122,11.7216,-0.9111,5.444,14.5797,7.1473,7.844,-3.9841,-4.7199,14.512,10.919,5.9938,14.3318,10.8827,-4.3723,21.303,24.7496,20.7682,8.2345,3.1805,2.2933,5.75,13.4217,-4.8443,-3.7143,5.7148,3.257,-11.2193,9.6985,-2.2137,18.6297,11.0625,0.4291,3.1473,5.953,7.314,4.0777,-3.0511,13.4941,10.7375,11.5428,4.9272,1.7116,8.3052,-17.5209,16.8772,8.8395,13.3617,21.6272,2.2753,6.0259,7.3849,6.361,11.1939,6.2692,5.0587,8.9603,12.1276,-2.2405,1.1631,0.147,6.4025,-2.7397,5.564,10.5743,5.0147,-7.9767,14.8545,1.1604,8.5567,23.9222,15.1655,14.6956,5.5799,12.5879,4.6045,13.4616,9.9914,14.236,-3.2754,12.7062,-5.4787,15.3009,10.9194,9.3681,10.3095,3.8719,-25.2351,6.8621,15.9821,10.7224,8.1774,-0.152,5.6554,19.0873,2.3769,-0.8094,1.6957,9.3155,36.6746,1.7862,11.8865,3.5677,5.9126,9.0164,14.1497,15.4005,11.6409,6.8123,5.3129,7.8471,2.8794,-3.2717,4.722,12.7853,10.7842,6.9457,16.8443,8.352,7.5387,11.2865,6.0204,13.2836,12.79,6.9343,0.8682,13.3133,12.2878,0.5145,6.2833,6.7985,-12.4679,-4.8045,8.928,27.5809,0.2634,16.828,-3.3445,0.7503,17.1752,13.252,9.9541,-2.7644,6.7181,-7.9165,3.7315,-8.8511,16.6173,7.3716,6.3549,18.5859,8.5522,-6.3456,12.8852,-5.972,19.6098,14.6986,21.5576,5.5863,4.7819,11.7195,3.7631,17.4106,2.7835,-13.5526,5.3855,5.596,-3.985,-1.3277,8.6155,7.104,18.8669,7.4147,2.0089,12.6842,-9.8806,7.286,4.4112,8.9732,9.3226,1.7624,12.3594,-4.9206,5.9684,4.2373,18.7243,0.191,5.4366,7.9873,-1.0887,8.8598,12.7524,-0.6813,3.0511,9.6746,16.8054,-15.8877
123856,train_123856,0,4.3182,-3.6016,9.6069,8.026,9.0816,-2.9328,5.7113,14.6678,-3.079,7.7294,10.8861,-3.7007,14.011,7.8415,6.4046,13.7853,6.4548,-13.786,-0.3073,2.3533,13.5765,15.8497,7.3787,3.5811,4.9301,13.6765,-5.4625,-1.3586,4.0193,3.1062,-18.3221,9.5575,0.6229,20.2985,10.9751,1.9137,2.5365,3.4283,14.8633,-6.8137,-5.4395,12.2656,10.1622,11.245,11.3682,19.1252,11.8757,-13.5835,2.1491,19.2855,12.37,15.7744,4.9834,5.2122,6.6962,8.8937,18.3322,7.156,-1.1298,9.3219,22.3223,-24.3183,1.8904,1.5039,5.0378,-3.3587,6.1007,17.4377,5.0287,-6.7386,37.2241,0.5318,-2.787,14.0991,-17.9684,16.061,-8.4812,20.864,7.9384,14.0411,9.9663,13.9942,-16.9289,-8.5702,1.2344,13.3458,18.9948,10.252,10.9485,0.6612,-26.3433,6.9479,10.7903,11.0756,8.4716,0.6261,14.7505,6.8433,1.8856,-3.2751,-4.4183,13.6232,36.1842,1.6348,8.5829,4.9169,10.4617,20.972,14.0625,16.9344,8.0296,7.5662,4.2921,-1.6679,2.9132,-1.6329,2.9193,17.5138,-2.299,6.6262,16.2742,10.3072,6.5372,7.8939,2.6258,12.3622,14.1391,-3.1357,-0.755,17.9086,12.1641,0.3597,5.6169,7.0515,-13.1164,-9.5871,25.757,24.9416,6.0488,17.0177,7.8844,12.2212,11.0516,17.089,8.5086,0.1381,7.7559,-10.3496,3.7366,18.3024,19.3423,14.2391,11.4478,15.9475,5.7999,-0.3113,12.5829,-3.9216,32.5271,10.8209,32.0835,5.9124,6.3872,15.5494,0.3276,11.807,2.4312,3.2989,6.1795,5.6956,-6.3005,-4.4744,12.7232,3.6184,16.008,9.5234,-3.1079,15.9633,-3.824,-0.0584,-3.7881,10.0685,3.3928,4.1134,-2.309,-0.2763,14.745,-10.3317,11.4135,0.7507,3.5122,9.6423,2.7121,3.9655,11.3242,1.8677,9.7003,9.2057,13.4518,8.0379


In [7]:
col_list = train.columns[2:]

In [6]:
# CALCULATE MEANS AND STANDARD DEVIATIONS
s = [0]*200
m = [0]*200
for i in range(200):
    s[i] = np.std(train['var_'+str(i)])
    m[i] = np.mean(train['var_'+str(i)])
    
# CALCULATE PROB(TARGET=1 | X)
def getp(i,x):
    c = 3 #smoothing factor
    a = len( train1[ (train1['var_'+str(i)]>x-s[i]/c)&(train1['var_'+str(i)]<x+s[i]/c) ] ) 
    b = len( train0[ (train0['var_'+str(i)]>x-s[i]/c)&(train0['var_'+str(i)]<x+s[i]/c) ] )
    # if a+b<500: return 0.1 #smoothing factor
    # RETURN PROBABILITY
    return a / (a+b)
    # ALTERNATIVELY RETURN ODDS
    # return a / b
    
# SMOOTH A DISCRETE FUNCTION
def smooth(x,st=1):
    for j in range(st):
        x2 = np.ones(len(x)) * 0.1
        for i in range(len(x)-2):
            x2[i+1] = 0.25*x[i]+0.5*x[i+1]+0.25*x[i+2]
        x = x2.copy()
    return x

In [17]:
# DRAW PLOTS, YES OR NO
Picture = False
# DATA HAS Z-SCORE RANGE OF -4.5 TO 4.5
rmin=-5; rmax=5; 
# CALCULATE PROBABILITIES FOR 501 BINS
res=501
# STORE PROBABILITIES IN PR
pr = 0.1 * np.ones((200,res))
pr2 = pr.copy()
xr = np.zeros((200,res))
xr2 = xr.copy()
ct2 = 0
for j in tqdm(range(50)):
    if Picture: plt.figure(figsize=(15,8))
    for v in range(4):
        ct = 0
        # CALCULATE PROBABILITY FUNCTION FOR VAR
        for i in np.linspace(rmin,rmax,res):
            pr[v+4*j,ct] = getp(v+4*j,m[v+4*j]+i*s[v+4*j])
            xr[v+4*j,ct] = m[v+4*j]+i*s[v+4*j]
            xr2[v+4*j,ct] = i
            ct += 1
        if Picture:
            # SMOOTH FUNCTION FOR PRETTIER DISPLAY
            # BUT USE UNSMOOTHED FUNCTION FOR PREDICTION
            pr2[v+4*j,:] = smooth(pr[v+4*j,:],res//10)
            # DISPLAY PROBABILITY FUNCTION
            plt.subplot(2, 4, ct2%4+5)
            plt.plot(xr[v+4*j,:],pr2[v+4*j,:],'-')
            plt.title('P( t=1 | var_'+str(v+4*j)+' )')
            xx = plt.xlim()
            # DISPLAY TARGET DENSITIES
            plt.subplot(2, 4, ct2%4+1)            
            sns.distplot(train0['var_'+str(v+4*j)], label = 't=0')
            sns.distplot(train1['var_'+str(v+4*j)], label = 't=1')
            plt.title('var_'+str(v+4*j))
            plt.legend()
            plt.xlim(xx)
            plt.xlabel('')
        ct2 += 1
    if Picture: plt.show()

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [127]:
def get_pred(data):

    on = np.nonzero(xr[int(col.split('_')[-1])] > data)[0][0]
    pred = (pr[int(col[-1])][on] + pr[int(col[-1])][on + 1])/2

    return pred

In [131]:
prob_dic = {}

for col in tqdm(col_list):

    temp = train[col].value_counts().to_frame()

    temp['pred'] = temp.index.map(get_pred)
    temp = np.around(temp, 5)
    del temp[col]

    temp = temp.to_dict()['pred']
    
    prob_dic[col] = temp

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [166]:
pb_idx = np.load('./data_temp/public_LB.npy')
pv_idx = np.load('./data_temp/private_LB.npy')

In [167]:
test_pb = test.iloc[pb_idx].sort_index().copy()
test_pv = test.iloc[pv_idx].sort_index().copy()

test_real = test_pb.append(test_pv)

In [168]:
data = train.append(test_real)

In [138]:
unique_df = data[['ID_code']]

In [139]:
for col in tqdm(col_list):
    unique_df[col] = data[col].map(((data[col].value_counts() == 1) * 1).to_dict())

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [174]:
# unique_data = data[['ID_code', 'target']]
for col in tqdm(col_list):
    data[col + '_unique'] = np.around(data[col] * unique_df[col], 4)
#     unique_data[col] = np.around(data[col] * unique_df[col], 4)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [176]:
for col in tqdm(col_list):
    data[col + '_prob'] = data[col + '_unique'].map(prob_dic[col])

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [179]:
train = data[~data.target.isna()]
test = data[data.target.isna()]

In [180]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [183]:
target = train['target']

In [184]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average': False,
    'boost': 'gbdt',
    'feature_fraction_seed': 47,
    'feature_fraction': 0.041,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    'num_threads': 8
}

In [185]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

train_columns = [c for c in train.columns if c not in ['ID_code', 'target']]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target.values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])

    num_round = 30000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=400, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
    print("CV score: {:<8.5f}".format(roc_auc_score(target.values[val_idx], oof_lgb[val_idx])))
    
print("CV score: {:<8.5f}".format(roc_auc_score(target.values, oof_lgb)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[400]	training's auc: 0.890433	valid_1's auc: 0.872906
[800]	training's auc: 0.901113	valid_1's auc: 0.882236
[1200]	training's auc: 0.907121	valid_1's auc: 0.886424
[1600]	training's auc: 0.912618	valid_1's auc: 0.890582
[2000]	training's auc: 0.916292	valid_1's auc: 0.892593
[2400]	training's auc: 0.919965	valid_1's auc: 0.894464
[2800]	training's auc: 0.922948	valid_1's auc: 0.895916
[3200]	training's auc: 0.925863	valid_1's auc: 0.897298
[3600]	training's auc: 0.9286	valid_1's auc: 0.898492
[4000]	training's auc: 0.931252	valid_1's auc: 0.89941
[4400]	training's auc: 0.933689	valid_1's auc: 0.900249
[4800]	training's auc: 0.936123	valid_1's auc: 0.901017
[5200]	training's auc: 0.93854	valid_1's auc: 0.902014
[5600]	training's auc: 0.940774	valid_1's auc: 0.902838
[6000]	training's auc: 0.942989	valid_1's auc: 0.903607
[6400]	training's auc: 0.945074	valid_1's auc: 0.904235
[6800]	training's auc: 0.947184	valid_