In [0]:
def featurization(X, train_test_real=None):
    """Append four engineered columns per input column (200 -> 1000 columns).

    For every column ``f`` of ``X`` the following columns are added:

    a. ``f + 'dup_count'``: how many times the row's value occurs in the
       reference data, capped at 10 (uint8). Values that never occur in the
       reference data get 0.
    b. ``f + '_dup_value_2'``: the original value where the duplicate count
       is greater than 2, otherwise 0.
    c. ``f + '_dup_value_4'``: the original value where the duplicate count
       is greater than 4, otherwise 0.
    d. ``f + 'distance_of_mean'``: the value minus the reference mean of that
       feature where the duplicate count is greater than 1, otherwise 0
       (float32).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns only; every column must also exist in the reference
        data. The frame is not modified.
    train_test_real : pandas.DataFrame, optional
        Reference data used for the value counts and the means. When omitted
        it is unpickled from the Drive path below (presumably the train rows
        plus the real test rows, judging by the file name -- confirm).

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` followed by the a/b/c columns interleaved per feature
        and then all the ``distance_of_mean`` columns. Column names and order
        are kept exactly as before because the saved models rely on them.

    Raises
    ------
    KeyError
        If a column of ``X`` is missing from the reference data.
    """
    import pickle

    import numpy as np
    from tqdm import tqdm

    if train_test_real is None:
        filename = '/content/drive/My Drive/proj_1/train_test_real.sav'
        # NOTE(review): unpickling executes arbitrary code; only load a file
        # that was produced by this project.
        with open(filename, 'rb') as handle:
            train_test_real = pickle.load(handle)

    features = list(X.columns)
    # Work on a copy so the caller's frame keeps its original columns.
    out = X.copy()

    ##https://www.kaggle.com/super13579/lgbm-with-duplicate-flag-value-0-923?scriptVersionId=12330297
    for f in tqdm(features):
        count = train_test_real[f].value_counts(dropna=True)
        # Unseen values map to NaN; treat them as "never seen" (0) instead of
        # letting min(10, NaN) silently turn them into the maximum count.
        dup_count = out[f].map(count).fillna(0).clip(upper=10).astype(np.uint8)
        out[f + 'dup_count'] = dup_count
        # Only the 0/1 mask is float32; the product keeps the dtype of X[f],
        # matching what the models were fitted on.
        out[f + '_dup_value_2'] = out[f] * (dup_count > 2).astype(np.float32)
        out[f + '_dup_value_4'] = out[f] * (dup_count > 4).astype(np.float32)
    for f in tqdm(features):
        centred = out[f] - train_test_real[f].mean()
        is_duplicated = (out[f + 'dup_count'] > 1).astype(np.int64)
        out[f + 'distance_of_mean'] = (centred * is_duplicated).astype(np.float32)

    return out

In [0]:
def final_fun_score(X,Y):
    """Return the ROC AUC of the saved LightGBM ensemble on labelled data.

    Mounts Google Drive, removes the ``target`` and ``ID_code`` columns (when
    present), builds the engineered features with ``featurization`` and
    averages the predictions of the five boosters stored on Drive.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw input rows. The caller's frame is left untouched, so the function
        can be called repeatedly on the same data.
    Y : array-like
        True target values, in the same row order as ``X``.

    Returns
    -------
    float
        ROC AUC between ``Y`` and the averaged ensemble prediction.
    """
    import warnings

    import lightgbm as lgb
    from google.colab import drive
    from sklearn.metrics import roc_auc_score

    warnings.filterwarnings("ignore")
    drive.mount('/content/drive')

    # Non-mutating drop: an in-place drop strips the columns from the
    # caller's frame and makes a second call fail with KeyError.
    X = X.drop(columns=['target','ID_code'], errors='ignore')
    print('Shape of input data before featurization'+str(X.shape))
    X=featurization(X)

    print('Shape of input data after featurization'+str(X.shape))

    n_models = 5  # boosters saved as model_1000_iteration_0 .. _4
    pred=0
    for i in range(n_models):
        lm =lgb.Booster(model_file='/content/drive/My Drive/proj_1/model_1000_iteration_{}.sav'.format(i))
        pred+=lm.predict(X)

    # Plain average of the individual model outputs.
    y_pred=pred/n_models

    val_auc=roc_auc_score(Y, y_pred)
    print('auc:'+str(val_auc))

    return val_auc
    

In [0]:
def final_fun_predict(X):
    """Return the averaged ensemble prediction for every row of ``X``.

    Mounts Google Drive, removes the ``target`` and ``ID_code`` columns when
    they are present (unlabelled data has no ``target``), builds the
    engineered features with ``featurization`` and averages the predictions
    of the five boosters stored on Drive.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw input rows. The caller's frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        Single column ``0`` holding the averaged prediction per row, with a
        fresh 0..n-1 index (the index of ``X`` is not carried over).
    """
    import warnings

    import lightgbm as lgb
    import pandas as pd
    from google.colab import drive

    warnings.filterwarnings("ignore")
    drive.mount('/content/drive')

    # Non-mutating, tolerant drop: prediction input normally lacks 'target',
    # and an in-place drop would alter the caller's frame.
    X = X.drop(columns=['target','ID_code'], errors='ignore')
    print('Shape of input data before featurization'+str(X.shape))

    X=featurization(X)

    print('Shape of input data after featurization'+str(X.shape))

    n_models = 5  # boosters saved as model_1000_iteration_0 .. _4
    pred=0
    for i in range(n_models):
        lm =lgb.Booster(model_file='/content/drive/My Drive/proj_1/model_1000_iteration_{}.sav'.format(i))
        pred+=lm.predict(X)

    # Plain average of the individual model outputs.
    y_pred=pred/n_models
    y_pred=pd.DataFrame(y_pred)

    return y_pred
    

### Sample Test runs

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount('/content/drive')
tr_data = pd.read_csv('/content/drive/My Drive/proj_1/train.csv')
y = tr_data['target']
# Fixed seed: without it every run holds out different rows, so the reported
# score cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    tr_data, y, test_size=0.20, stratify=y, random_state=42
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Score the saved ensemble on the hold-out split; the value returned is the ROC AUC.
acc = final_fun_score(X=X_test, Y=y_test)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Shape of input data before featurization(40000, 200)


100%|██████████| 200/200 [00:18<00:00, 10.61it/s]
100%|██████████| 200/200 [00:04<00:00, 31.72it/s]


Shape of input data after featurization(40000, 1000)
auc:0.873956281682195


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount('/content/drive')
tr_data = pd.read_csv('/content/drive/My Drive/proj_1/train.csv')
y = tr_data['target']
# Fixed seed: without it every run holds out different rows, so the sample
# below would change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    tr_data, y, test_size=0.20, stratify=y, random_state=42
)
# Explicit copy so that processing the sample can never touch X_test.
X_1 = X_test.head(10).copy()
print(X_1.shape)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(10, 202)


In [11]:
# Run the saved ensemble on the 10-row sample.
pred = final_fun_predict(X=X_1)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Shape of input data before featurization(10, 200)


100%|██████████| 200/200 [00:07<00:00, 28.00it/s]
100%|██████████| 200/200 [00:00<00:00, 403.95it/s]


Shape of input data after featurization(10, 1000)


In [12]:
# Show the averaged ensemble output for the 10 sample rows.
# NOTE(review): the recorded values are all ~1.0 although most of the sampled
# rows shown further down have target 0 -- confirm the scale of the models' output.
print(pred)

          0
0  0.999977
1  0.999505
2  0.999985
3  0.999850
4  0.999906
5  0.999996
6  0.999975
7  0.999803
8  0.999876
9  0.999985


In [13]:
# First ten hold-out rows, for comparison with the predictions above.
X_test.iloc[:10]

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,...,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
165242,train_165242,0,6.5401,6.9738,9.4903,4.2416,11.7262,-3.3781,5.7018,11.058,4.7492,7.7976,-3.2499,-1.9315,13.8411,13.127,10.0855,14.0525,11.8369,-6.46,28.7318,18.4723,8.4144,6.0276,5.4655,2.8181,8.7724,13.4928,-7.2366,-2.8442,4.2513,5.6648,-1.8235,9.7287,1.6504,13.5633,10.9556,-0.5956,-0.0495,9.9555,...,19.1838,5.5893,5.6105,22.479,-12.4451,18.4421,2.6163,3.7576,9.9422,5.5241,-3.2202,-5.9092,20.6231,-5.8306,31.1829,12.6432,-9.823,8.9954,10.1563,-1.0242,1.2692,7.6314,26.4278,6.0648,-1.9358,-5.5244,4.114,-5.0133,9.0874,1.9311,5.0661,4.9965,2.451,8.6059,24.723,-2.3989,-2.6068,7.8623,18.0975,-3.1107
183855,train_183855,0,14.4793,6.5271,11.8568,7.7547,9.4347,-10.7928,6.193,16.5088,-0.1384,8.0878,6.0408,-15.0521,13.8099,5.5319,9.1261,14.8759,8.3637,-7.5907,14.5604,17.8963,16.5899,24.7629,3.3028,2.8696,18.3363,13.641,4.7528,-1.3007,6.4022,0.7586,-0.6048,12.5149,2.1115,18.2849,12.1742,2.1408,-1.2391,3.9627,...,14.1989,5.2896,6.9696,18.069,4.8239,18.5645,2.8969,-5.6944,3.9681,5.9238,-2.454,-0.615,32.2492,-0.0686,14.1989,10.061,7.8918,13.2197,1.0714,2.2161,-1.9457,10.1755,3.2342,13.9767,10.4543,3.4785,11.4362,-23.2323,14.4725,0.5902,-1.4867,5.0037,-0.6196,4.1695,10.9079,0.2663,-3.2531,9.9544,12.0928,-5.3351
34281,train_34281,0,5.8153,2.5905,10.617,7.2382,10.171,-13.8336,5.1741,12.8028,3.442,7.7383,-7.068,1.5839,13.8073,-1.0241,10.5851,14.4826,9.4762,13.4384,18.6199,4.1579,11.7768,26.4847,8.0858,3.3785,16.7824,13.3396,-2.813,-1.3079,6.3173,1.8239,2.9717,9.6539,-0.3331,13.9822,10.7967,3.8681,-1.9491,5.4673,...,44.7116,5.4186,8.0983,9.6701,-1.342,24.819,3.1861,-4.0691,4.6626,5.5485,-2.7329,-2.9546,22.2648,2.3774,12.6566,10.5496,-13.9433,13.0101,4.5808,4.7241,-16.1507,10.6453,17.55,5.6711,5.9815,-8.3606,19.0042,-14.7104,10.5606,0.6383,8.3933,13.2317,2.4654,0.712,11.5857,-2.0781,5.97,8.0142,19.728,-19.703
37995,train_37995,0,13.8082,-6.6342,10.012,7.3657,12.7025,-19.7165,5.869,17.4265,-1.8203,8.0585,-3.256,-8.4971,14.1146,6.5567,8.3723,15.2609,10.7289,-13.7423,19.0783,29.4252,9.6909,18.071,4.684,3.9801,8.3228,13.9832,-9.4166,-1.5942,5.409,7.1228,-0.8216,11.2056,0.8096,17.5465,12.1802,5.0986,5.651,6.591,...,23.755,5.8838,5.7237,21.6093,4.6248,14.1545,3.1037,-13.7615,5.0461,5.2769,-8.4268,6.8882,2.7511,-4.1276,23.6601,10.3585,-0.2458,15.1104,-8.363,-1.2713,0.0341,10.1279,-8.1238,5.5238,-4.2951,-4.0066,6.3221,-8.9449,14.2305,1.0721,7.7442,7.5805,1.6728,-0.2045,14.5061,-1.408,-3.3222,8.9219,12.757,1.9783
176993,train_176993,0,14.7594,-0.4673,12.4909,7.6346,10.5156,3.8747,4.7999,18.3569,4.2355,6.6202,3.0013,4.4642,14.0948,4.398,4.126,15.0608,9.0174,-11.5914,12.7684,1.2928,9.6934,26.5008,4.4104,3.2319,7.0888,13.7848,1.3505,-1.2567,5.3412,5.5523,-10.4225,8.0081,-6.2652,20.5355,11.1206,12.9148,4.4517,4.732,...,31.6641,5.7752,5.7382,10.7202,-0.6046,22.1915,3.1006,-11.046,2.6558,6.0333,-4.9314,-6.8749,24.1146,-4.5609,16.8676,9.5928,4.1708,8.1873,-9.5083,-1.1483,6.7695,9.2168,-9.0832,13.7306,23.2406,-11.6543,7.8886,-15.1914,19.4216,-0.4717,-3.3828,8.0503,-0.5326,7.7072,18.8132,0.7689,3.3208,8.7829,20.0415,-1.3041
133070,train_133070,1,15.3286,1.8768,10.7884,7.1206,11.4307,-14.7345,6.9288,23.8428,1.866,8.5872,10.0652,7.9304,13.6169,5.7943,10.8404,15.2302,8.8149,0.8605,25.0935,30.1167,11.1841,5.0003,0.2341,3.1198,10.4635,13.6536,-5.5721,0.6416,6.5786,3.8808,-12.0674,10.313,-0.9572,21.1588,11.2392,10.6627,3.0159,7.0862,...,19.0704,5.7514,5.9618,7.8909,-7.945,9.3043,3.1154,-12.7285,1.5399,6.212,7.7677,-11.5562,13.6292,-0.5624,2.3074,12.7461,2.8775,8.5169,-10.3368,3.9909,-5.5958,9.7983,-5.1784,11.8922,17.0799,-5.6855,6.8541,-9.7669,22.7382,0.2941,0.5396,9.7266,3.3731,6.1632,16.9903,-2.2596,6.3937,9.0087,16.3739,9.5168
121780,train_121780,0,10.4105,4.1128,10.8918,8.9215,13.0706,-2.9346,6.1087,13.9742,3.7364,5.9349,4.5088,-8.2624,14.0457,10.3581,2.4877,14.6476,6.566,-9.7393,18.3612,14.9365,17.8619,9.3981,6.1479,3.5421,11.187,13.4035,-1.4392,0.4802,4.5283,2.6787,-16.7703,9.3946,-3.759,9.7475,10.6808,2.0329,3.2902,6.1079,...,19.7515,5.4807,4.5458,15.6096,-5.2243,29.0177,3.2271,2.8931,3.7812,5.6002,-5.3139,-3.3129,31.4225,-5.6097,26.0476,7.4862,6.5031,12.1432,5.7591,3.5864,-0.2958,9.1506,-4.9477,14.4601,19.9908,-0.4191,7.811,-18.8672,16.5002,-1.1067,-4.7583,14.9562,2.3717,2.8024,21.1654,-0.6561,5.4924,9.5786,14.6464,-7.4558
76790,train_76790,0,7.907,2.5839,8.9171,5.1452,12.5271,10.881,6.2358,14.8463,-2.7941,6.9351,-5.5187,-7.5344,14.021,13.9607,6.0382,14.9084,7.2807,-7.3303,17.3338,13.172,23.0447,6.9843,1.6557,2.8155,6.619,14.0863,-5.279,0.5922,6.2785,4.8071,-13.7903,11.3907,-1.3462,12.3521,12.0321,-4.0203,3.4203,3.9808,...,31.886,5.7804,4.3865,2.0603,-11.3717,21.435,3.0945,-2.6811,3.6084,6.0556,-2.8327,0.056,25.8805,0.3718,12.035,11.7885,-1.4443,12.608,3.2407,5.7886,-8.7799,8.8687,6.524,7.387,22.1128,-1.635,10.3023,-21.0704,19.7757,-0.9242,4.7077,5.908,2.6341,2.4511,22.6324,1.677,7.0375,10.0287,17.6514,-5.6856
10702,train_10702,0,9.855,4.2773,9.7551,8.1443,12.6756,-1.7684,5.9037,15.8464,2.6871,7.1397,6.449,-6.0605,13.7994,13.3688,8.6393,15.373,6.9979,-16.6929,5.9041,-2.8853,13.7455,18.9334,-0.2615,3.5302,5.1055,13.9199,-2.1279,-3.5694,4.9306,7.7362,-22.3772,6.8822,-3.3274,20.4003,11.2671,8.2556,4.0844,7.9006,...,4.8899,5.6139,6.3343,11.0734,-9.2761,16.3448,2.2565,-9.5831,3.9435,5.6032,-5.0539,-0.2583,31.9259,-6.3078,29.0442,8.0844,1.5228,13.0565,-1.6197,6.5406,-14.7116,7.6638,6.9658,3.8675,6.146,5.5417,7.9683,-9.0915,18.3883,-0.0362,6.9816,6.645,2.4722,1.4964,17.6621,-2.0757,-3.1008,9.2763,16.5364,-16.5693
55455,train_55455,0,8.2779,6.6452,13.403,4.8533,8.9099,-4.7034,4.3028,17.4599,-5.3796,9.3989,8.5484,-5.0477,14.1826,8.5235,10.0892,14.6891,8.8836,1.3031,2.4519,14.1932,18.3876,22.3851,4.6508,2.8862,12.1908,13.2972,-5.7415,-1.1654,6.001,2.4952,-6.9582,11.1287,5.5969,8.9401,11.6099,8.7117,1.8725,1.8035,...,31.9511,5.5399,5.1452,4.1643,0.402,24.1176,2.5,3.1565,8.5115,5.7919,-3.4466,5.8993,32.4552,6.7993,10.4327,7.8656,12.3105,9.9048,0.9065,2.5282,-9.6539,12.8674,3.8019,12.815,9.7829,-5.237,12.4697,-0.4842,23.4898,1.3041,-0.3,3.0239,1.2322,-2.0048,18.3084,-0.7034,2.149,7.6664,13.4285,4.4595
