<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import joblib
import pandas as pd 
from lightgbm import LGBMClassifier, create_tree_digraph
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option("display.max_columns", None)

In [2]:
def pr_auc(y_true, y_pred):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded unchanged to ``precision_recall_curve``.
    y_pred : array-like
        Scores for the positive class (callers in this notebook pass
        ``predict_proba(...)[:, 1]``).

    Returns
    -------
    float
        ``auc(recall, precision)``, i.e. the area under the curve with recall
        on the x-axis and precision on the y-axis.
    """
    # The thresholds returned by precision_recall_curve are not needed here.
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

In [3]:
# Load the persisted production model from disk.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
prod_model = joblib.load("model_30_features_dev_june_july_twice.joblib")

In [4]:
# Inspect the hyper-parameters the loaded model is configured with.
prod_model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'gain',
 'learning_rate': 0.1,
 'max_depth': 4,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 300,
 'n_jobs': -1,
 'num_leaves': 30,
 'objective': None,
 'random_state': 2022,
 'reg_alpha': 0.75,
 'reg_lambda': 0.25,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 20000,
 'subsample_freq': 0,
 'is_unbalance': True}

In [5]:
# Monthly feature frames, one pickle per month. Month names are Turkish:
#   "oot" (presumably out-of-time): kasım=Nov, aralık=Dec, ocak=Jan, subat=Feb,
#                                   mart=Mar, nisan=Apr, mayıs=May
#   "recent":                       agustos=Aug, eylul=Sep, ekim=Oct, kasım=Nov
# The names on the left and the files on the right are paired by position.
# NOTE(review): read_pickle unpickles the file — only load trusted files.
(
    X_oot_kasım,
    X_oot_aralık,
    X_oot_ocak,
    X_oot_subat,
    X_oot_mart,
    X_oot_nisan,
    X_oot_mayıs,
    X_recent_agustos,
    X_recent_eylul,
    X_recent_ekim,
    X_recent_kasım,
) = [
    pd.read_pickle(path)
    for path in (
        "X_oot_kasım.pkl",
        "X_oot_aralık.pkl",
        "X_oot_ocak.pkl",
        "X_oot_subat.pkl",
        "X_oot_mart.pkl",
        "X_oot_nisan.pkl",
        "X_oot_mayıs.pkl",
        "X_recent_agustos.pkl",
        "X_recent_eylul.pkl",
        "X_recent_ekim.pkl",
        "X_recent_kasım.pkl",
    )
]

# Per-month "most recent" loads, left disabled:
#X_most_recent_aralık = pd.read_pickle("X_most_recent_aralık.pkl")
#X_most_recent_ocak = pd.read_pickle("X_most_recent_ocak.pkl")

In [6]:
# Monthly target vectors, one pickle per month, mirroring the feature files.
# Month names are Turkish:
#   "oot" (presumably out-of-time): kasım=Nov, aralık=Dec, ocak=Jan, subat=Feb,
#                                   mart=Mar, nisan=Apr, mayıs=May
#   "recent":                       agustos=Aug, eylul=Sep, ekim=Oct, kasım=Nov
# The names on the left and the files on the right are paired by position.
# NOTE(review): read_pickle unpickles the file — only load trusted files.
(
    y_oot_kasım,
    y_oot_aralık,
    y_oot_ocak,
    y_oot_subat,
    y_oot_mart,
    y_oot_nisan,
    y_oot_mayıs,
    y_recent_agustos,
    y_recent_eylul,
    y_recent_ekim,
    y_recent_kasım,
) = [
    pd.read_pickle(path)
    for path in (
        "y_oot_kasım.pkl",
        "y_oot_aralık.pkl",
        "y_oot_ocak.pkl",
        "y_oot_subat.pkl",
        "y_oot_mart.pkl",
        "y_oot_nisan.pkl",
        "y_oot_mayıs.pkl",
        "y_recent_agustos.pkl",
        "y_recent_eylul.pkl",
        "y_recent_ekim.pkl",
        "y_recent_kasım.pkl",
    )
]

# Per-month "most recent" loads, left disabled:
#y_most_recent_aralık = pd.read_pickle("y_most_recent_aralık.pkl")
#y_most_recent_ocak = pd.read_pickle("y_most_recent_ocak.pkl")

In [7]:
# Evaluation set used for every PR-AUC figure in this notebook.
X_most_recent_final = pd.read_pickle("X_most_recent_final.pkl")  # December–January data

y_most_recent_final = pd.read_pickle("y_most_recent_final.pkl")  # December–January target

In [8]:
# Monthly feature batches in the order the mini-batch loops consume them.
# Must stay positionally aligned with `targets`.
datasets = [
    X_oot_kasım,       # November
    X_oot_aralık,      # December
    X_oot_ocak,        # January
    X_oot_subat,       # February
    X_oot_mart,        # March
    X_oot_nisan,       # April
    X_oot_mayıs,       # May
    X_recent_agustos,  # August
    X_recent_eylul,    # September
    X_recent_ekim,     # October
    X_recent_kasım,    # November
]

In [9]:
# Monthly target batches; entry k is the label vector for `datasets[k]`.
targets = [
    y_oot_kasım,       # November
    y_oot_aralık,      # December
    y_oot_ocak,        # January
    y_oot_subat,       # February
    y_oot_mart,        # March
    y_oot_nisan,       # April
    y_oot_mayıs,       # May
    y_recent_agustos,  # August
    y_recent_eylul,    # September
    y_recent_ekim,     # October
    y_recent_kasım,    # November
]

**PROD MODEL PREDICTIONS**

In [10]:
# Column 1 of predict_proba (score for class 1), from the model exactly as loaded from disk.
probas_most_recent = prod_model.predict_proba(X_most_recent_final)[:,1]

In [11]:
# PR-AUC of the loaded production model on the most-recent set.
pr_auc(y_most_recent_final, probas_most_recent)

0.18331771999643165

**BATCH MODEL PREDICTIONS**

In [12]:
# Single combined training batch (presumably dev + oot + recent months, per the file name — confirm).
X_dev_oot_recent_batch = pd.read_pickle("X_dev_oot_recent_batch.pkl")

y_dev_oot_recent_batch = pd.read_pickle("y_dev_oot_recent_batch.pkl")

# Display both shapes to check that features and target have the same number of rows.
X_dev_oot_recent_batch.shape, y_dev_oot_recent_batch.shape

((2510749, 30), (2510749,))

In [13]:
%%time
# Refit on the whole combined batch (no init_model is passed).
# NOTE: this fits `prod_model` in place, so from here on the name no longer
# refers to the model loaded from disk until it is reloaded.
prod_model.fit(X_dev_oot_recent_batch, y_dev_oot_recent_batch)

CPU times: user 4min 36s, sys: 17.5 s, total: 4min 54s
Wall time: 9.93 s


LGBMClassifier(importance_type='gain', is_unbalance=True, max_depth=4,
               n_estimators=300, num_leaves=30, random_state=2022,
               reg_alpha=0.75, reg_lambda=0.25, subsample_for_bin=20000)

In [14]:
# Scores from the refit (batch) model. This reuses the name `probas_most_recent`,
# so the scores of the originally loaded model are overwritten.
probas_most_recent = prod_model.predict_proba(X_most_recent_final)[:,1]

In [15]:
# PR-AUC of the batch-refit model on the most-recent set.
pr_auc(y_most_recent_final, probas_most_recent)

0.1164785576599589

**UNDERSAMPLING BATCH MODEL**

In [16]:
# Randomly drop majority-class rows until minority/majority = 0.02; seeded so the sample is repeatable.
rus = RandomUnderSampler(sampling_strategy = 0.02, random_state = 2023)

In [17]:
# Undersample the combined batch; the originals are kept under their own names.
X_dev_oot_recent_batch_resampled, y_dev_oot_recent_batch_resampled = rus.fit_resample(X_dev_oot_recent_batch, 
                                                                                      y_dev_oot_recent_batch)

In [18]:
# Class counts before undersampling.
Counter(y_dev_oot_recent_batch)

Counter({0: 2493302, 1: 17447})

In [19]:
# Class counts after undersampling.
Counter(y_dev_oot_recent_batch_resampled)

Counter({0: 872350, 1: 17447})

In [20]:
# Mean of the target before undersampling (the positive rate, given the 0/1 labels counted above).
y_dev_oot_recent_batch.mean()

0.006948922413192239

In [21]:
# Mean of the target after undersampling (the positive rate, given the 0/1 labels counted above).
y_dev_oot_recent_batch_resampled.mean()

0.0196078431372549

In [22]:
%%time
# Refit on the undersampled batch (no init_model is passed).
# NOTE: again fits `prod_model` in place, replacing the previous fit held under this name.
prod_model.fit(X_dev_oot_recent_batch_resampled, y_dev_oot_recent_batch_resampled)

CPU times: user 1min 39s, sys: 4.11 s, total: 1min 43s
Wall time: 3.21 s


LGBMClassifier(importance_type='gain', is_unbalance=True, max_depth=4,
               n_estimators=300, num_leaves=30, random_state=2022,
               reg_alpha=0.75, reg_lambda=0.25, subsample_for_bin=20000)

In [23]:
# Column 1 of predict_proba (score for class 1) from the model refit on the undersampled batch.
probas_most_recent_undersampled_model = prod_model.predict_proba(X_most_recent_final)[:,1]

In [24]:
# PR-AUC of the undersampled-batch model on the most-recent set.
pr_auc(y_most_recent_final, probas_most_recent_undersampled_model)

0.20740899717293382

**MINI-BATCH APPROACH**

In [25]:
# Months will be added one at a time, checking performance on MOST_RECENT after each one...

# trials will include: conservative model // aggressive model // twice mini-batching etc...

In [26]:
# Reload the persisted model from disk: the earlier `prod_model.fit(...)` calls refit it in place.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
prod_model = joblib.load("model_30_features_dev_june_july_twice.joblib")

In [27]:
# Display the reloaded estimator to confirm its configuration.
prod_model

LGBMClassifier(importance_type='gain', is_unbalance=True, max_depth=4,
               n_estimators=300, num_leaves=30, random_state=2022,
               reg_alpha=0.75, reg_lambda=0.25, subsample_for_bin=20000)

In [28]:
# CONSERVATIVE MODEL
# Shallow trees (max_depth=2) and only 10 boosting rounds per fit, so each
# monthly update can add little on top of the model it continues from.
# NOTE(review): subsample_for_bin is not set here, whereas the loaded production
# model reports subsample_for_bin=20000 — confirm the difference is intended.
model1 = LGBMClassifier(
    n_estimators=10,
    max_depth=2,
    num_leaves=30,
    reg_alpha=0.75,
    reg_lambda=0.25,
    is_unbalance=True,
    importance_type='gain',
    random_state=2022,
)

In [29]:
# AGGRESSIVE MODEL
# Deeper trees (max_depth=6) and 1000 boosting rounds per fit, so each monthly
# update can reshape the model it continues from substantially.
# NOTE(review): subsample_for_bin is not set here, whereas the loaded production
# model reports subsample_for_bin=20000 — confirm the difference is intended.
model2 = LGBMClassifier(
    n_estimators=1000,
    max_depth=6,
    num_leaves=30,
    reg_alpha=0.75,
    reg_lambda=0.25,
    is_unbalance=True,
    importance_type='gain',
    random_state=2022,
)

In [30]:
# Chain of warm-start sources: the mini-batch loop continues training from models[-1],
# which is the loaded production model at this point.
models = [prod_model]

In [31]:
# Baseline ensemble size. trees_to_dataframe() has one row per node (split or leaf),
# so this is a node count, not a tree count.
prod_model.booster_.trees_to_dataframe().shape[0]

8858

**CONSERVATIVE MODEL MINI-BATCH LEARNING**

In [32]:
# Conservative model, mini-batch learning: the monthly batches are fed one at a
# time, each fit continuing from the latest entry in `models`.
# Not idempotent: re-running this cell keeps stacking trees on the same estimator.
for X_month, y_month in zip(datasets, targets):
    # Continue boosting from the most recent model in the chain.
    model1.fit(X_month, y_month, init_model=models[-1])

    # The same estimator object is appended every time, so from the second
    # iteration on `models[-1]` is `model1` itself.
    models.append(model1)

    # One row per node (split or leaf) of the ensemble: a node count, not a tree count.
    n_nodes = model1.booster_.trees_to_dataframe().shape[0]
    print(n_nodes)

    # Score the most-recent set with column 1 of predict_proba (class 1).
    proba = model1.predict_proba(X_most_recent_final)[:, 1]
    pr_auc_score = pr_auc(y_most_recent_final, proba)
    print("PR_AUC ----->>>", pr_auc_score)

8928
PR_AUC ----->>> 0.18220820818462724
8998
PR_AUC ----->>> 0.18040285201299053
9066
PR_AUC ----->>> 0.18058244235565943
9136
PR_AUC ----->>> 0.18150159397285032
9206
PR_AUC ----->>> 0.18570417551817917
9276
PR_AUC ----->>> 0.18055651787527743
9346
PR_AUC ----->>> 0.17854253920162488
9416
PR_AUC ----->>> 0.17946585778593624
9486
PR_AUC ----->>> 0.18006273348752483
9554
PR_AUC ----->>> 0.18023195766399386
9624
PR_AUC ----->>> 0.17936360891553332


In [33]:
# Conservative model, mini-batch learning (twice): a second pass over the same
# monthly batches. `models[-1]` is whatever the preceding pass left behind, so
# this cell only makes sense when run right after it.
# Not idempotent: re-running this cell keeps stacking trees on the same estimator.
for X_month, y_month in zip(datasets, targets):
    # Continue boosting from the most recent model in the chain.
    model1.fit(X_month, y_month, init_model=models[-1])

    # The same estimator object is appended every time.
    models.append(model1)

    # One row per node (split or leaf) of the ensemble: a node count, not a tree count.
    n_nodes = model1.booster_.trees_to_dataframe().shape[0]
    print(n_nodes)

    # Score the most-recent set with column 1 of predict_proba (class 1).
    proba = model1.predict_proba(X_most_recent_final)[:, 1]
    pr_auc_score = pr_auc(y_most_recent_final, proba)
    print("PR_AUC ----->>>", pr_auc_score)

9694
PR_AUC ----->>> 0.1799176981826055
9764
PR_AUC ----->>> 0.17398563630324138
9834
PR_AUC ----->>> 0.16717154552092323
9904
PR_AUC ----->>> 0.16609650979013016
9974
PR_AUC ----->>> 0.16553702790443273
10044
PR_AUC ----->>> 0.1614102458448117
10114
PR_AUC ----->>> 0.1644787060295191
10184
PR_AUC ----->>> 0.16985010595500571
10254
PR_AUC ----->>> 0.17417893048900704
10324
PR_AUC ----->>> 0.17038906064306747
10394
PR_AUC ----->>> 0.17049461051392306


**AGGRESSIVE MODEL MINI-BATCH LEARNING**

In [34]:
# Reload the persisted model from disk so this section starts from the saved artifact.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
prod_model = joblib.load("model_30_features_dev_june_july_twice.joblib")

In [35]:
# AGGRESSIVE MODEL
# New, unfitted estimator: deeper trees (max_depth=6) and 1000 boosting rounds per fit.
# NOTE(review): subsample_for_bin is not set here, whereas the loaded production
# model reports subsample_for_bin=20000 — confirm the difference is intended.
model2 = LGBMClassifier(
    n_estimators=1000,
    max_depth=6,
    num_leaves=30,
    reg_alpha=0.75,
    reg_lambda=0.25,
    is_unbalance=True,
    importance_type='gain',
    random_state=2022,
)

In [36]:
# Baseline ensemble size. trees_to_dataframe() has one row per node (split or leaf),
# so this is a node count, not a tree count.
prod_model.booster_.trees_to_dataframe().shape[0]

8858

In [37]:
# Reset the warm-start chain so the next loop starts from the freshly reloaded production model.
models = [prod_model]

In [38]:
# Aggressive model, mini-batch learning: the monthly batches are fed one at a
# time, each fit continuing from the latest entry in `models`.
# Not idempotent: re-running this cell keeps stacking trees on the same estimator.
for X_month, y_month in zip(datasets, targets):
    # Continue boosting from the most recent model in the chain.
    model2.fit(X_month, y_month, init_model=models[-1])

    # The same estimator object is appended every time, so from the second
    # iteration on `models[-1]` is `model2` itself.
    models.append(model2)

    # One row per node (split or leaf) of the ensemble: a node count, not a tree count.
    n_nodes = model2.booster_.trees_to_dataframe().shape[0]
    print(n_nodes)

    # Score the most-recent set with column 1 of predict_proba (class 1).
    proba = model2.predict_proba(X_most_recent_final)[:, 1]
    pr_auc_score = pr_auc(y_most_recent_final, proba)
    print("PR_AUC ----->>>", pr_auc_score)

27881
PR_AUC ----->>> 0.14733426461581695
69063
PR_AUC ----->>> 0.0261533402084652
100289
PR_AUC ----->>> 0.0809616678728867
127097
PR_AUC ----->>> 0.1553262317945438
152861
PR_AUC ----->>> 0.18061235394642589
176693
PR_AUC ----->>> 0.17301732793116223
199875
PR_AUC ----->>> 0.1649886659308058
226419
PR_AUC ----->>> 0.2512176175591838
251889
PR_AUC ----->>> 0.22986751301021272
278111
PR_AUC ----->>> 0.24970830309032008
303257
PR_AUC ----->>> 0.20377022983606835


In [39]:
# Aggressive model, mini-batch learning (twice): a second pass over the same
# monthly batches. `models[-1]` is whatever the preceding pass left behind, so
# this cell only makes sense when run right after it.
# Not idempotent: re-running this cell keeps stacking trees on the same estimator.
for X_month, y_month in zip(datasets, targets):
    # Continue boosting from the most recent model in the chain.
    model2.fit(X_month, y_month, init_model=models[-1])

    # The same estimator object is appended every time.
    models.append(model2)

    # One row per node (split or leaf) of the ensemble: a node count, not a tree count.
    n_nodes = model2.booster_.trees_to_dataframe().shape[0]
    print(n_nodes)

    # Score the most-recent set with column 1 of predict_proba (class 1).
    proba = model2.predict_proba(X_most_recent_final)[:, 1]
    pr_auc_score = pr_auc(y_most_recent_final, proba)
    print("PR_AUC ----->>>", pr_auc_score)

321581
PR_AUC ----->>> 0.20168029038367324
332430
PR_AUC ----->>> 0.19323143135278975
348952
PR_AUC ----->>> 0.19405907583703577
367748
PR_AUC ----->>> 0.19843692435485613
367748
PR_AUC ----->>> 0.19843692435485613
386098
PR_AUC ----->>> 0.20550242046622624
403520
PR_AUC ----->>> 0.20032130072617788
425172
PR_AUC ----->>> 0.2415677991941135
447184
PR_AUC ----->>> 0.24934253499335482
469012
PR_AUC ----->>> 0.2537270453052841
489306
PR_AUC ----->>> 0.22858640976285766


**UNDERSAMPLING DATA AND MINI-BATCH LEARNING USING CONSERVATIVE & AGGRESSIVE MODELS**

In [40]:
# Reload the persisted model from disk so this section starts from the saved artifact.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
prod_model = joblib.load("model_30_features_dev_june_july_twice.joblib")

In [41]:
# CONSERVATIVE MODEL
# Re-created so that nothing fitted earlier under the name `model1` carries over:
# shallow trees (max_depth=2) and only 10 boosting rounds per fit.
# NOTE(review): subsample_for_bin is not set here, whereas the loaded production
# model reports subsample_for_bin=20000 — confirm the difference is intended.
model1 = LGBMClassifier(
    n_estimators=10,
    max_depth=2,
    num_leaves=30,
    reg_alpha=0.75,
    reg_lambda=0.25,
    is_unbalance=True,
    importance_type='gain',
    random_state=2022,
)

In [42]:
# AGGRESSIVE MODEL
# Re-created so that nothing fitted earlier under the name `model2` carries over:
# deeper trees (max_depth=6) and 1000 boosting rounds per fit.
# NOTE(review): subsample_for_bin is not set here, whereas the loaded production
# model reports subsample_for_bin=20000 — confirm the difference is intended.
model2 = LGBMClassifier(
    n_estimators=1000,
    max_depth=6,
    num_leaves=30,
    reg_alpha=0.75,
    reg_lambda=0.25,
    is_unbalance=True,
    importance_type='gain',
    random_state=2022,
)

In [43]:
# Baseline ensemble size. trees_to_dataframe() has one row per node (split or leaf),
# so this is a node count, not a tree count.
prod_model.booster_.trees_to_dataframe().shape[0]

8858

In [44]:
# Reset the warm-start chain so the next loop starts from the freshly reloaded production model.
models = [prod_model]

In [45]:
# Randomly drop majority-class rows until minority/majority = 0.02; seeded so the sample is repeatable.
# Applied to each monthly batch separately in the loops below.
rus = RandomUnderSampler(sampling_strategy = 0.02, random_state = 2023)

**conservative model**

In [46]:
# Conservative model, mini-batch learning on undersampled data: each monthly
# batch is undersampled with `rus`, then used to continue training from the
# latest entry in `models`.
# Not idempotent: re-running this cell keeps stacking trees on the same estimator.
for X_month, y_month in zip(datasets, targets):
    # Undersample this month only; the evaluation set below is left untouched.
    X_res, y_res = rus.fit_resample(X_month, y_month)

    # Continue boosting from the most recent model in the chain.
    model1.fit(X_res, y_res, init_model=models[-1])

    # The same estimator object is appended every time, so from the second
    # iteration on `models[-1]` is `model1` itself.
    models.append(model1)

    # One row per node (split or leaf) of the ensemble: a node count, not a tree count.
    n_nodes = model1.booster_.trees_to_dataframe().shape[0]
    print(n_nodes)

    # Score the most-recent set with column 1 of predict_proba (class 1).
    proba = model1.predict_proba(X_most_recent_final)[:, 1]
    pr_auc_score = pr_auc(y_most_recent_final, proba)
    print("PR_AUC ----->>>", pr_auc_score)

8928
PR_AUC ----->>> 0.17910047135750548
8994
PR_AUC ----->>> 0.18119041460614208
9056
PR_AUC ----->>> 0.1803778471082567
9126
PR_AUC ----->>> 0.18147331951125362
9196
PR_AUC ----->>> 0.18364757402300752
9266
PR_AUC ----->>> 0.17904664790256306
9336
PR_AUC ----->>> 0.18029039170952812
9404
PR_AUC ----->>> 0.18212290411704066
9474
PR_AUC ----->>> 0.18427058976783212
9542
PR_AUC ----->>> 0.18726083108917227
9612
PR_AUC ----->>> 0.1860844106669671


In [47]:
# Conservative model, mini-batch learning on undersampled data (twice): a second
# pass over the same monthly batches. `models[-1]` is whatever the preceding
# pass left behind, so this cell only makes sense when run right after it.
# Not idempotent: re-running this cell keeps stacking trees on the same estimator.
for X_month, y_month in zip(datasets, targets):
    # Undersample this month only; the evaluation set below is left untouched.
    X_res, y_res = rus.fit_resample(X_month, y_month)

    # Continue boosting from the most recent model in the chain.
    model1.fit(X_res, y_res, init_model=models[-1])

    # The same estimator object is appended every time.
    models.append(model1)

    # One row per node (split or leaf) of the ensemble: a node count, not a tree count.
    n_nodes = model1.booster_.trees_to_dataframe().shape[0]
    print(n_nodes)

    # Score the most-recent set with column 1 of predict_proba (class 1).
    proba = model1.predict_proba(X_most_recent_final)[:, 1]
    pr_auc_score = pr_auc(y_most_recent_final, proba)
    print("PR_AUC ----->>>", pr_auc_score)

9682
PR_AUC ----->>> 0.1827592803872589
9750
PR_AUC ----->>> 0.17778542432197852
9820
PR_AUC ----->>> 0.17577269995277484
9890
PR_AUC ----->>> 0.17144230363654872
9960
PR_AUC ----->>> 0.16993350999156112
10030
PR_AUC ----->>> 0.16736683571859784
10100
PR_AUC ----->>> 0.1674598485505154
10170
PR_AUC ----->>> 0.1695892111900818
10240
PR_AUC ----->>> 0.17040810843381324
10310
PR_AUC ----->>> 0.17152604258007553
10380
PR_AUC ----->>> 0.17177680532830691


**aggressive model**

In [48]:
# Reload the persisted model from disk so the aggressive run starts from the saved artifact.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
prod_model = joblib.load("model_30_features_dev_june_july_twice.joblib")

In [49]:
# Baseline ensemble size. trees_to_dataframe() has one row per node (split or leaf),
# so this is a node count, not a tree count.
prod_model.booster_.trees_to_dataframe().shape[0]

8858

In [50]:
# Reset the warm-start chain so the next loop starts from the freshly reloaded production model.
models = [prod_model]

In [51]:
# Aggressive model, mini-batch learning on undersampled data: each monthly
# batch is undersampled with `rus`, then used to continue training from the
# latest entry in `models`.
# Not idempotent: re-running this cell keeps stacking trees on the same estimator.
for X_month, y_month in zip(datasets, targets):
    # Undersample this month only; the evaluation set below is left untouched.
    X_res, y_res = rus.fit_resample(X_month, y_month)

    # Continue boosting from the most recent model in the chain.
    model2.fit(X_res, y_res, init_model=models[-1])

    # The same estimator object is appended every time, so from the second
    # iteration on `models[-1]` is `model2` itself.
    models.append(model2)

    # One row per node (split or leaf) of the ensemble: a node count, not a tree count.
    n_nodes = model2.booster_.trees_to_dataframe().shape[0]
    print(n_nodes)

    # Score the most-recent set with column 1 of predict_proba (class 1).
    proba = model2.predict_proba(X_most_recent_final)[:, 1]
    pr_auc_score = pr_auc(y_most_recent_final, proba)
    print("PR_AUC ----->>>", pr_auc_score)

20806
PR_AUC ----->>> 0.16377307818316933
31528
PR_AUC ----->>> 0.02970016991301124
42684
PR_AUC ----->>> 0.025817433742410745
60670
PR_AUC ----->>> 0.034484287115847806
92002
PR_AUC ----->>> 0.08762628490817992
133058
PR_AUC ----->>> 0.10942827962751843
174138
PR_AUC ----->>> 0.13737183976706754
224080
PR_AUC ----->>> 0.18695776242241915
265228
PR_AUC ----->>> 0.2246097834225279
304626
PR_AUC ----->>> 0.22486669112690447
344184
PR_AUC ----->>> 0.2276568738819362


In [52]:
# Aggressive model, mini-batch learning on undersampled data (twice): a second
# pass over the same monthly batches. `models[-1]` is whatever the preceding
# pass left behind, so this cell only makes sense when run right after it.
# Not idempotent: re-running this cell keeps stacking trees on the same estimator.
for X_month, y_month in zip(datasets, targets):
    # Undersample this month only; the evaluation set below is left untouched.
    X_res, y_res = rus.fit_resample(X_month, y_month)

    # Continue boosting from the most recent model in the chain.
    model2.fit(X_res, y_res, init_model=models[-1])

    # The same estimator object is appended every time.
    models.append(model2)

    # One row per node (split or leaf) of the ensemble: a node count, not a tree count.
    n_nodes = model2.booster_.trees_to_dataframe().shape[0]
    print(n_nodes)

    # Score the most-recent set with column 1 of predict_proba (class 1).
    proba = model2.predict_proba(X_most_recent_final)[:, 1]
    pr_auc_score = pr_auc(y_most_recent_final, proba)
    print("PR_AUC ----->>>", pr_auc_score)

373512
PR_AUC ----->>> 0.23331747084630877
397156
PR_AUC ----->>> 0.245622736944434
420984
PR_AUC ----->>> 0.2469456829248192
447554
PR_AUC ----->>> 0.26376297489069417
476006
PR_AUC ----->>> 0.24304957902891067
502054
PR_AUC ----->>> 0.2261246222184953
529790
PR_AUC ----->>> 0.24148953628754213
561732
PR_AUC ----->>> 0.24903061256760672
595512
PR_AUC ----->>> 0.22752530011336497
628230
PR_AUC ----->>> 0.2345308522478957
663428
PR_AUC ----->>> 0.2644345689206906
