In [1]:
import pandas as pd;
import numpy as np;
import joblib;
import optuna
import lightgbm as lgb
from IPython.display import display_html, clear_output, Markdown;
from gc import collect;
from os import system, getpid, walk;
from psutil import Process;
import ctypes;
libc = ctypes.CDLL("libc.so.6");

from pprint import pprint;
from colorama import Fore, Style, init;
from warnings import filterwarnings;
filterwarnings('ignore');

from tqdm.notebook import tqdm;
from sklearn.model_selection import KFold as KF
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_absolute_error

from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR;
from catboost import CatBoostRegressor as CBR;
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import HistGradientBoostingRegressor as HGBR;
from sklearn.metrics import mean_absolute_error as mae, make_scorer;

from sklearn2pmml import PMMLPipeline, sklearn2pmml

In [2]:
%%time   
def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    print(style + color + text + Style.RESET_ALL); 

def GetMemUsage():
    pid = getpid();
    py = Process(pid);
    memory_use = py.memory_info()[0] / 2. ** 30;
    return f"RAM memory GB usage = {memory_use :.4}";
    

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 10 µs


In [3]:
from sklearn import set_config; 
set_config(transform_output = "pandas");
pd.set_option('display.max_columns', 50);
pd.set_option('display.max_rows', 50);

print();
collect();




In [4]:
class CFG:
    version_nb         = 2;
    load_tr_data       = "Y";
    state              = 22;
    LL                 = 'trainV1.pickle';
    PI_LL              = 'trainV2.pickle';
    path               = f"../data/";
    mdl_path           = f'../models/' + ('v1/' if version_nb == 1 else 'v2/');
    
    ftre_imp           = f'../feat_imp/';
    methods            = ["LGBMR", 'XGBR'];
    ML                 = "Y";
    OPTUNA             = "N";
    inference          = "N";
    n_splits           = 5;
    n_repeats          = 1;
    nbrnd_erly_stp     = 100 ;
    mdlcv_mthd         = KF;
    spliter            = tts;

print();
PrintColor(f"--> Configuration done!\n");
collect();


[1m[34m--> Configuration done!
[0m


In [5]:
def ScoreMetric(ytrue, ypred)-> float:    
    return mae(ytrue, ypred);

myscorer = make_scorer(ScoreMetric, greater_is_better = False, needs_proba=False,);

print();
collect();

PrintColor(f"\n" + GetMemUsage(), color = Fore.RED);


[1m[31m
RAM memory GB usage = 0.2395[0m


In [6]:
if (CFG.load_tr_data == "Y" or CFG.ML == "Y"):
    if CFG.version_nb == 1:
        df = pd.read_pickle(CFG.path + CFG.LL)
        y = df[['target']].dropna()
        X = df.drop(['target'], axis=1).dropna()
        PrintColor(f"---> Version with LL target", color = Fore.GREEN)
    else:
        df = pd.read_pickle(CFG.path + CFG.PI_LL)
        y = df[['target']].dropna()
        X = df.drop(['target'], axis=1).dropna()
        PrintColor(f"---> Version with PI-LL target", color = Fore.GREEN)
    PrintColor(f"---> Sampled train shapes = {X.shape}, {y.shape}", 
               color = Fore.RED);

[1m[32m---> Version with PI-LL target[0m
[1m[31m---> Sampled train shapes = (128, 3), (128, 1)[0m


In [7]:
if CFG.OPTUNA == 'Y':
    Xtr, Xdev, ytr, ydev = CFG.spliter(X, y, test_size=0.2, random_state=CFG.state)

In [8]:
if CFG.OPTUNA == 'Y':
    def objective_cbt(trial, xtrain=Xtr, ytrain=ytr, return_info=False):
        cv =  CFG.mdlcv_mthd(n_splits= CFG.n_splits, shuffle = False)

        X_train, y_train = Xtr.values, ytr.values
        y_valid_pred_total = np.zeros(X_train.shape[0])
    
        collect()

        models = []
        valid_score = 0
    
        for train_idx, valid_idx in cv.split(X_train, y_train):
        
            train_data = X_train[train_idx], y_train[train_idx]
            valid_data = X_train[valid_idx], y_train[valid_idx]
        
        #print('train', len(train_idx), 'valid', len(valid_idx))
        
            model, y_pred_valid, log = fit_cbt(trial, train_data, valid_data, num_rounds=1000)
        
            y_valid_pred_total[valid_idx] = y_pred_valid
            models.append(model)
        
            collect()
            valid_score += log["valid/l1"]
    
        valid_score /= len(models)
        if return_info:
            return valid_score, models, y_pred_valid, y_train
        else:
            return valid_score


In [9]:
if CFG.OPTUNA == 'Y':
    def objective_lgb(trial, xtrain=Xtr, ytrain=ytr, return_info=False):
        cv =  CFG.mdlcv_mthd(n_splits= CFG.n_splits, shuffle = False)

        X_train, y_train = Xtr.values, ytr.values
        y_valid_pred_total = np.zeros(X_train.shape[0])
    
        collect()

        models = []
        valid_score = 0
    
        for train_idx, valid_idx in cv.split(X_train, y_train):
        
            train_data = X_train[train_idx], y_train[train_idx]
            valid_data = X_train[valid_idx], y_train[valid_idx]
        
        #print('train', len(train_idx), 'valid', len(valid_idx))
        
            model, y_pred_valid, log = fit_lgbm(trial, train_data, valid_data, num_rounds=1000)
        
            y_valid_pred_total[valid_idx] = y_pred_valid
            models.append(model)
        
            collect()
            valid_score += log["valid/l1"]
    
        valid_score /= len(models)
        if return_info:
            return valid_score, models, y_pred_valid, y_train
        else:
            return valid_score


In [10]:
def fit_cbt(trial, train, val, devices=(-1,), seed=None, num_rounds=1500):
    X_train, y_train = train
    X_valid, y_valid = val
    param = {}
    param['learning_rate'] = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.02, 0.001)
    param['depth'] = trial.suggest_int('depth', 2, 16)
    param['l2_leaf_reg'] = trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10.0)
    param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    param['grow_policy'] = 'Depthwise'
    param['iterations'] = 3000
    param['use_best_model'] = True
    param['eval_metric'] = 'MAE'
    param['od_type'] = 'iter'
    param['od_wait'] = 20
    param['random_state'] = CFG.state
    param['logging_level'] = 'Silent'
    device = devices[0]
    
    if device == -1:
        # use cpu
        pass
    else:
        # use gpu
        print(f'using gpu device_id {device}...')
        params.update({'device': 'gpu', 'gpu_device_id': device})

    model = CBR(**param)

    model.fit(X_train.copy(), y_train.copy(),
                  eval_set=[(X_valid.copy(), y_valid.copy())],
                  early_stopping_rounds=CFG.nbrnd_erly_stp)

    # predictions
    y_pred_valid = model.predict(X_valid)
    
    log = {'train/l1': model.get_best_score()['learn']['MAE'],
           'valid/l1': model.get_best_score()['validation']['MAE']}
    #print(log)
    return model, y_pred_valid, log
    

In [11]:
def fit_lgbm(trial, train, val, devices=(-1,), seed=None, num_rounds=1500):
    """Train Light GBM model"""
    X_train, y_train = train
    X_valid, y_valid = val
    metric = 'l1'
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'objective': 'regression',
        'learning_rate': 0.1,
        "boosting": "gbdt",
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        "bagging_freq": 5,
        "bagging_fraction": trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        "feature_fraction": trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        "metric": metric,
        'verbose': -1,
        'early_stopping': 100
        }
    device = devices[0]
    if device == -1:
        # use cpu
        pass
    else:
        # use gpu
        print(f'using gpu device_id {device}...')
        params.update({'device': 'gpu', 'gpu_device_id': device})

    params['seed'] = seed

    early_stop = 20
    verbose_eval = 20
    
    d_train = lgb.Dataset(X_train, label=y_train)
    d_valid = lgb.Dataset(X_valid, label=y_valid)
    watchlist = [d_train, d_valid]

    #print('training LGB:')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist)

    # predictions
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    
    #print('best_score', model.best_score)
    log = {'train/l1': model.best_score['training']['l1'],
           'valid/l1': model.best_score['valid_1']['l1']}
    return model, y_pred_valid, log
    

In [12]:
def optuna_study(objective):
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100, n_jobs=-1)
    PrintColor(f'---> Best Score: {study.best_value}\n')
    PrintColor(f'---> Best params: {study.best_params}')

In [13]:
if CFG.OPTUNA == 'Y':
    optuna_study(objective_cbt)

In [14]:
if CFG.ML == "Y":
    Mdl_Master = {#'CBR': CBR(objective='MAE', iterations=3000, verbose=0),
                  'LGBMR' : LGBMR(objective='regression_l1', n_estimators=500, verbose=-1),
                  'XGBR': XGBR(**{
                      'objective'          : 'reg:absoluteerror',
                      'random_state'       : CFG.state,
                      'n_estimators'       : 3000,                         
                      'early_stopping_rounds' : CFG.nbrnd_erly_stp,
                     }),
                 }

In [15]:
if CFG.ML == "Y":
    methods = CFG.methods;
    system('mkdir models');
    model_path = CFG.mdl_path;
    cv =  CFG.mdlcv_mthd(n_splits= CFG.n_splits, shuffle = True, random_state= CFG.state)
    Scores = pd.DataFrame(index = range(CFG.n_splits * CFG.n_repeats),
                          columns = methods).fillna(0).astype(np.float32);
    
    FtreImp = pd.DataFrame(index = X.columns, columns = [methods]).fillna(0);

mkdir: cannot create directory ‘models’: File exists


In [16]:
lgb = PMMLPipeline([
	('lgb', LGBMR(objective='regression_l1', n_estimators=500, verbose=-1)),
])

xgb = PMMLPipeline([
    ('xgb', XGBR(**{
                      'objective'          : 'reg:absoluteerror',
                      'random_state'       : CFG.state,
                      'n_estimators'       : 3000,                         
                      'early_stopping_rounds' : CFG.nbrnd_erly_stp,
                     }),)
])

In [17]:
%%time 

if CFG.ML == "Y":
    PrintColor(f"\n{'=' * 25} ML Training {'=' * 25}\n");
    
    for fold_nb, (train_idx, dev_idx) in tqdm(enumerate(cv.split(X, y)), 
                                              f"{CFG.mdlcv_mthd} CV {CFG.n_splits}x{CFG.n_repeats}"
                                             ):
        Xtr  = X.iloc[train_idx];   
        Xdev = X.iloc[dev_idx];
        ytr  = y.iloc[train_idx];
        ydev = y.iloc[dev_idx];
        
        PrintColor(f"-------> Fold{fold_nb} <-------");
 
        for method in methods:
            model = Mdl_Master[method];
            if method == "LGBMR":
                model.fit(Xtr, ytr, 
                          eval_set = [(Xdev, ydev)], 
                          eval_metric = "mae",
                          callbacks = [log_evaluation(0,), 
                                       early_stopping(CFG.nbrnd_erly_stp, verbose = False)], 
                         );
                lgb.fit(Xtr, ytr, 
                          lgb__eval_set = [(Xdev, ydev)], 
                          lgb__eval_metric = "mae",
                          lgb__callbacks = [log_evaluation(0,), 
                          early_stopping(CFG.nbrnd_erly_stp, verbose = False)]);
            elif method == "XGBR":
                model.fit(Xtr, ytr, 
                          eval_set = [(Xdev, ydev)], 
                          verbose = 0, 
                          eval_metric = "mae",
                         ); 

            joblib.dump(model, CFG.mdl_path + f'{method}V{CFG.version_nb}Fold{fold_nb}.model');
            sklearn2pmml(lgb, CFG.mdl_path + f'{method}V{CFG.version_nb}Fold{fold_nb}.pmml', with_repr = True)
            #sklearn2pmml(xgb, CFG.mdl_path + f'{method}V{CFG.version_nb}Fold{fold_nb}.pmml', with_repr = True)
            
            score = ScoreMetric(ydev, model.predict(Xdev));
            Scores.at[fold_nb, method] = score;
            num_space = 6- len(method);
            PrintColor(f"---> {method} {' '* num_space} OOF = {score:.5f}", 
                       color = Fore.MAGENTA);  
            del num_space, score;
            try:
                FtreImp[method] = \
                FtreImp[method].values + (model.feature_importances_ / (CFG.n_splits * CFG.n_repeats));
            except:
                pass;
            
            collect();
            #clear_output();
            
    PrintColor(f"\n---> OOF scores across methods <---\n");
    Scores.index.name = "FoldNb";
    Scores.index = Scores.index + 1;
    display(Scores.style.format(precision = 5).\
            background_gradient(cmap = "Pastel1")
           );
    
    PrintColor(f"\n---> Mean OOF scores across methods <---\n");
    display(Scores.mean());
    
    try: FtreImp.to_csv(CFG.ftre_imp + f"FtreImp_V{CFG.version_nb}.csv");
    except: pass;

[1m[34m
[0m


<class 'sklearn.model_selection._split.KFold'> CV 5x1: 0it [00:00, ?it/s]

[1m[34m-------> Fold0 <-------[0m
[1m[35m---> LGBMR   OOF = 5.29686[0m
[1m[35m---> XGBR    OOF = 6.15290[0m
[1m[34m-------> Fold1 <-------[0m
[1m[35m---> LGBMR   OOF = 5.84267[0m
[1m[35m---> XGBR    OOF = 6.92975[0m
[1m[34m-------> Fold2 <-------[0m
[1m[35m---> LGBMR   OOF = 6.30564[0m
[1m[35m---> XGBR    OOF = 7.66538[0m
[1m[34m-------> Fold3 <-------[0m
[1m[35m---> LGBMR   OOF = 7.83914[0m
[1m[35m---> XGBR    OOF = 7.22571[0m
[1m[34m-------> Fold4 <-------[0m
[1m[35m---> LGBMR   OOF = 8.23845[0m
[1m[35m---> XGBR    OOF = 8.33304[0m
[1m[34m
---> OOF scores across methods <---
[0m


Unnamed: 0_level_0,LGBMR,XGBR
FoldNb,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.29686,6.1529
2,5.84267,6.92975
3,6.30564,7.66538
4,7.83914,7.22571
5,8.23845,8.33304


[1m[34m
---> Mean OOF scores across methods <---
[0m


LGBMR    6.704553
XGBR     7.261357
dtype: float64

CPU times: user 4.75 s, sys: 114 ms, total: 4.86 s
Wall time: 29.3 s


In [18]:
mdl_lbl = [];
for _, _, filename in walk(CFG.mdl_path):
    mdl_lbl.extend(filename);

models = [];
for filename in mdl_lbl:
    models.append(joblib.load(CFG.mdl_path + f"{filename}"));
        
mdl_lbl    = [m.replace(r".model", "") for m in mdl_lbl];
model_dict = {l:m for l,m in zip(mdl_lbl, models)};
PrintColor(f"\n---> Trained models\n");    
pprint(np.array(mdl_lbl), width = 100, indent = 10, depth = 1); 

print();
collect();  
libc.malloc_trim(0);
PrintColor(f"\n" + GetMemUsage(), color = Fore.RED); 

KeyError: 60

In [None]:
if CFG.inference == 'Y':
    test_file = None
    test = pd.read_pickle(CFG.path + test_file)
    sample_prediction = np.mean([model.predict(test) for model in models], 0)