In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMRegressor
from tqdm import tqdm
import optuna
from functools import partial
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

In [16]:
# Scratch check: extend a list with two batches of rows, round-trip it through a
# .npy file, and confirm that averaging over axis 0 yields the per-column mean
# across all stored rows.
t1 = [[1, 1, 1], [3, 3, 3]]
t2 = [[1, 1, 1], [3, 3, 3]]

test_t = []
test_t.extend(t1)
test_t.extend(t2)

np.save('test.npy', test_t)
test_l = np.load('test.npy')
np.array(test_l).mean(axis=0)

array([2., 2., 2.])

In [17]:
# Load the competition data. Forward slashes are used because they resolve on
# Windows, Linux and macOS alike, whereas a backslash-separated relative path is
# only understood as a directory path on Windows.
train = pd.read_csv('../kaggle_data/train.csv')
test = pd.read_csv('../kaggle_data/test.csv')

In [18]:
# Separate the regression target from the inputs; `id` is dropped from both
# feature frames and `target` from the training features.
y_train = train['target']
X_train = train.drop(columns=['id', 'target'])
X_test = test.drop(columns=['id'])

In [19]:
# Names of the columns to label-encode: every training column whose name contains 'cat'.
cat_cols = [column for column in train.columns if 'cat' in column]

def label_encoder(df):
    """Integer-encode every column listed in ``cat_cols`` and return the frame.

    A fresh ``LabelEncoder`` is fitted per column on that column's own values and
    the column is overwritten with the resulting codes. The frame passed in is
    modified in place; the same object is returned for convenience.
    """
    for column in cat_cols:
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [20]:
# Encode train and test in a single pass so that each category value receives the
# same integer code in both frames. Fitting a separate encoder per frame lets the
# two mappings diverge whenever the frames do not contain exactly the same set of
# category values, which silently corrupts the categorical features at predict time.
# `keys` tags each row with its origin so the frames can be split back afterwards
# with their original row indices intact.
combined_encoded = label_encoder(pd.concat([X_train, X_test], keys=['train', 'test']))
X_train = combined_encoded.loc['train'].copy()
X_test = combined_encoded.loc['test'].copy()

In [22]:
# Five-fold cross-validation splitter; shuffling is left off (the KFold default).
split = KFold(n_splits=5, shuffle=False)

In [None]:
# Shared accumulator: `objective` appends one array of test-set predictions per trial.
preds_list_final_iterations = list()

In [27]:
def objective(trial, X, y, name='xgb'):
    """Optuna objective for tuning an ``LGBMRegressor``; returns the hold-out RMSE.

    Parameters
    ----------
    trial : optuna trial
        Used to sample one set of LightGBM hyper-parameters.
    X, y : features and target
        Split 80/20 (``random_state=0``) into a fitting part and a validation part;
        the validation part drives early stopping and produces the returned score.
    name : str
        Unused; kept so existing callers keep working.

    Returns
    -------
    RMSE on the validation part, rounded to 5 decimals (the value Optuna minimises).

    Side effects
    ------------
    Uses the notebook globals ``X_test``, ``test`` and ``preds_list_final_iterations``:
    the fitted model's test-set predictions are appended to the global list, and a
    time-stamped submission CSV holding the mean of *all* predictions accumulated
    so far (every trial, not only the best one) is written on each call.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # the sampled ranges and the order of the suggest calls are unchanged.
    params = {'max_depth': trial.suggest_int('max_depth', 5, 50),
              'n_estimators': trial.suggest_int('n_estimators', 100000, 300000),
              'subsample': trial.suggest_float('subsample', 0.1, 1.0),
              'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
              'learning_rate': trial.suggest_float('learning_rate', 0.00001, 0.1),
              'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 100),
              'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 100),
              'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
              'num_leaves': trial.suggest_int('num_leaves', 10, 2000),
              'n_jobs': -1,
              'metric': 'rmse',
              'max_bin': trial.suggest_int('max_bin', 50, 10000),
              'cat_smooth': trial.suggest_int('cat_smooth', 5, 100),
              'cat_l2': trial.suggest_float('cat_l2', 1e-3, 100, log=True)}

    model = LGBMRegressor(**params)

    # Local names deliberately differ from the notebook-level X_train / y_train so
    # the globals are not shadowed inside this function.
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
              eval_metric=['rmse'],
              early_stopping_rounds=250,
              # Positional indices of the categorical columns in X.
              categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
              verbose=0)

    preds_list_final_iterations.append(model.predict(X_test))

    # Write a submission averaging every trial's predictions collected so far.
    # Forward slashes keep the output path valid on every operating system.
    dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    y_preds_final_iteration = np.array(preds_list_final_iterations).mean(axis=0)
    submission = pd.DataFrame({'id': test.id,
                               'target': y_preds_final_iteration})
    submission.to_csv(f'../kaggle_data/{dt_string}_AUTOML_sub.csv', index=False)

    train_score = np.round(np.sqrt(mean_squared_error(y_tr, model.predict(X_tr))), 5)
    # Scored on the validation split, although the log line labels it "TEST".
    val_score = np.round(np.sqrt(mean_squared_error(y_val, model.predict(X_val))), 5)

    print(f'TRAIN RMSE : {train_score} || TEST RMSE : {val_score}')

    return val_score

In [None]:
# Bind the training data so that Optuna only has to supply the `trial` argument.
optimize = partial(
    objective,
    X=X_train,
    y=y_train,
)

# The objective returns an RMSE, so the study searches for the minimum.
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(func=optimize, n_trials=300)

# NOTE: this cell runs the full 300-trial search; lower n_trials to shorten the
# notebook execution time.

[32m[I 2021-03-04 21:02:59,418][0m A new study created in memory with name: no-name-b5772513-f90e-4603-b6ca-19913345f877[0m
[32m[I 2021-03-04 21:03:13,472][0m Trial 0 finished with value: 0.32992 and parameters: {'max_depth': 7, 'n_estimators': 141543, 'subsample': 0.13194982068985575, 'colsample_bytree': 0.4275351375003753, 'learning_rate': 0.05576040249389689, 'reg_lambda': 10.130556541335842, 'reg_alpha': 43.888243038335254, 'min_child_samples': 100, 'num_leaves': 781, 'max_bin': 3750, 'cat_smooth': 71, 'cat_l2': 0.10415838986547839}. Best is trial 0 with value: 0.32992.[0m


TRAIN RMSE : 0.31778 || TEST RMSE : 0.32992


[32m[I 2021-03-04 21:03:39,153][0m Trial 1 finished with value: 0.33086 and parameters: {'max_depth': 32, 'n_estimators': 234308, 'subsample': 0.17824817621894845, 'colsample_bytree': 0.46499252239286504, 'learning_rate': 0.021048179938482306, 'reg_lambda': 97.2189346518409, 'reg_alpha': 73.88978022185553, 'min_child_samples': 39, 'num_leaves': 262, 'max_bin': 3172, 'cat_smooth': 56, 'cat_l2': 77.09053081006996}. Best is trial 0 with value: 0.32992.[0m


TRAIN RMSE : 0.32114 || TEST RMSE : 0.33086


[32m[I 2021-03-04 21:04:00,258][0m Trial 2 finished with value: 0.33115 and parameters: {'max_depth': 34, 'n_estimators': 174446, 'subsample': 0.3053473998791926, 'colsample_bytree': 0.7198785151042646, 'learning_rate': 0.05929153067392442, 'reg_lambda': 22.866042811162192, 'reg_alpha': 68.06082702897217, 'min_child_samples': 73, 'num_leaves': 992, 'max_bin': 8643, 'cat_smooth': 16, 'cat_l2': 0.008149730962108092}. Best is trial 0 with value: 0.32992.[0m


TRAIN RMSE : 0.31566 || TEST RMSE : 0.33115


[32m[I 2021-03-04 21:05:33,405][0m Trial 3 finished with value: 0.33177 and parameters: {'max_depth': 20, 'n_estimators': 133857, 'subsample': 0.2191358523211001, 'colsample_bytree': 0.37871476345615207, 'learning_rate': 0.006985998714364101, 'reg_lambda': 56.15419013975906, 'reg_alpha': 92.64646225227655, 'min_child_samples': 41, 'num_leaves': 1881, 'max_bin': 9817, 'cat_smooth': 70, 'cat_l2': 1.125918987341215}. Best is trial 0 with value: 0.32992.[0m


TRAIN RMSE : 0.32486 || TEST RMSE : 0.33177


[32m[I 2021-03-04 21:06:02,703][0m Trial 4 finished with value: 0.33046 and parameters: {'max_depth': 35, 'n_estimators': 280672, 'subsample': 0.2061661815232902, 'colsample_bytree': 0.6986671153751562, 'learning_rate': 0.058094922620627, 'reg_lambda': 19.185160044883723, 'reg_alpha': 52.69835248476301, 'min_child_samples': 62, 'num_leaves': 1947, 'max_bin': 8548, 'cat_smooth': 63, 'cat_l2': 0.008435150698300666}. Best is trial 0 with value: 0.32992.[0m


TRAIN RMSE : 0.31155 || TEST RMSE : 0.33046


In [None]:
# Repeated K-fold training with incremental refinement.
#
# For every fold count k in [2, fold_ittr_range) the training data is split into k
# folds. On each fold a base LightGBM model is fitted and then refined seven times
# by continuing training (init_model) at a lower learning rate. The test-set
# predictions of the last refinement of every fold are accumulated across all
# fold counts, and after each fold count the running mean is saved as a submission.
fold_ittr_range = 20
preds_list_final_iterations = []

# Base hyper-parameters; identical for every fold count, so built once up front.
lgbm_params = {'max_depth': 16,
               'subsample': 0.8032697250789377,
               'colsample_bytree': 0.21067140508531404,
               'learning_rate': 0.009867383057779643,
               'reg_lambda': 10.987474846877767,
               'reg_alpha': 17.335285595031994,
               'min_child_samples': 31,
               'num_leaves': 66,
               'max_bin': 522,
               'cat_smooth': 81,
               'cat_l2': 0.029690334194270022,
               'metric': 'rmse',
               'n_jobs': -1,
               'n_estimators': 20000,
               'verbose': -1}

for fold_index in range(2, fold_ittr_range):
    print(f'Current fold index {fold_index}')
    split = KFold(n_splits=fold_index)

    preds_list_base = []
    preds_list_final_iteration = []
    preds_list_all = []

    # split.split() is a generator, so tqdm needs the fold count to show progress.
    for train_idx, val_idx in tqdm(split.split(X_train), total=split.get_n_splits()):
        X_tr = X_train.iloc[train_idx]
        X_val = X_train.iloc[val_idx]
        y_tr = y_train.iloc[train_idx]
        y_val = y_train.iloc[val_idx]

        Model = LGBMRegressor(**lgbm_params).fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                      eval_metric=['rmse'],
                      early_stopping_rounds=250,
                      categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                      verbose=0)

        # Predict once and reuse; the copy keeps the two lists independent.
        base_test_preds = Model.predict(X_test)
        preds_list_base.append(base_test_preds)
        preds_list_all.append(base_test_preds.copy())

        first_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
        print(f'RMSE for Base model is {first_rmse}')

        # Work on a copy so the per-round tweaks below never leak into lgbm_params.
        params = lgbm_params.copy()
        # Every refinement round uses the same, lower learning rate.
        params['learning_rate'] = 0.003

        for i in range(1, 8):
            if i > 2:
                # From the third refinement round onward, relax the L1/L2
                # regularisation by 10% and allow 40 more leaves per tree.
                params['reg_lambda'] *= 0.9
                params['reg_alpha'] *= 0.9
                params['num_leaves'] += 40

            # Continue training from the previous model rather than from scratch.
            Model = LGBMRegressor(**params).fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                      eval_metric=['rmse'],
                      early_stopping_rounds=200,
                      categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                      verbose=0,
                      init_model=Model)

            refined_test_preds = Model.predict(X_test)
            preds_list_all.append(refined_test_preds)
            # After the last round this holds the final model's validation RMSE.
            last_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
            print(f'RMSE for Incremental trial {i} model is {last_rmse}')

        print('', end='\n\n')
        print(f'Improvement of : {first_rmse - last_rmse}')
        print('-' * 100)
        # The last round's test predictions are this fold's final output; reuse
        # them instead of predicting again with the same model.
        preds_list_final_iteration.append(refined_test_preds.copy())

    preds_list_final_iterations.extend(preds_list_final_iteration)

    # Persist the raw predictions and the running-mean submission for this fold
    # count. Forward slashes keep the paths valid on every operating system.
    dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    np.save(f'../kaggle_data/{dt_string}_FI{fold_index}_sub.npy', preds_list_final_iterations)
    y_preds_final_iteration = np.array(preds_list_final_iterations).mean(axis=0)
    submission = pd.DataFrame({'id': test.id,
                               'target': y_preds_final_iteration})
    submission.to_csv(f'../kaggle_data/{dt_string}_FI{fold_index}_sub.csv', index=False)