In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMRegressor
from tqdm import tqdm
import optuna
from functools import partial
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Scratch check: rows from two batches collected into one list survive a
# np.save / np.load round trip and can be averaged column-wise afterwards.
t1 = [[1, 1, 1], [3, 3, 3]]
t2 = [[1, 1, 1], [3, 3, 3]]
test_t = [*t1, *t2]
np.save('test.npy', test_t)
test_l = np.load('test.npy')
np.array(test_l).mean(axis=0)

array([2., 2., 2.])

In [3]:
# Load the competition data from the sibling kaggle_data directory.
# Forward slashes are used because Python accepts them on Windows as well as
# on POSIX systems; the previous '..\\kaggle_data\\...' spelling only resolved
# on Windows.
train = pd.read_csv('../kaggle_data/train.csv')
test = pd.read_csv('../kaggle_data/test.csv')

In [4]:
# Separate the model inputs from the identifier and the regression target.
X_train = train.drop(columns=['id', 'target'])
y_train = train['target']
X_test = test.drop(columns=['id'])

In [5]:
# Categorical columns: every column of `train` whose name contains 'cat'.
cat_cols = [name for name in train.columns if 'cat' in name]

def label_encoder(df):
    """Integer-encode every column of ``df`` that is listed in ``cat_cols``.

    A fresh ``LabelEncoder`` is fitted per column on the values found in
    ``df`` itself, so the resulting codes depend only on the frame passed in.
    The columns are overwritten in place and the same frame is returned.
    """
    for column in cat_cols:
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [6]:
# Encode train and test with ONE encoder per categorical column, fitted on the
# union of both frames' values. Fitting a separate encoder on each frame (as
# calling label_encoder() once per frame does) gives the same category
# different integer codes whenever the two frames do not contain exactly the
# same set of values, which silently corrupts the test-set predictions.
for feature in cat_cols:
    encoder = LabelEncoder().fit(
        pd.concat([X_train[feature], X_test[feature]], ignore_index=True)
    )
    X_train[feature] = encoder.transform(X_train[feature])
    X_test[feature] = encoder.transform(X_test[feature])

In [12]:
# K-fold sweep: for every fold count from 2 up to (but excluding)
# fold_ittr_range, train a base LightGBM model per fold, refine it for seven
# low-learning-rate continuation rounds, and keep the test prediction of the
# last round. After each fold count the running pool of final predictions is
# saved and averaged into a timestamped submission file.
fold_ittr_range = 10
preds_list_final_iterations = []

# The hyper-parameters do not change between fold counts, so the dict is built
# once; each fold works on its own copy.
# NOTE(review): 'lambda_l1'/'reg_alpha', 'lambda_l2'/'reg_lambda' and
# 'verbosity'/'verbose' are listed as aliases of each other in the LightGBM
# parameter docs, and both spellings are supplied here with different values.
# Confirm which spelling takes effect - the refinement rounds below only decay
# the reg_* spellings. 'early_stopping_round' (100) also differs from the
# early_stopping_rounds passed to fit() (250 / 200).
lgbm_params = {'metric': 'rmse',
               'verbosity': -1,
               'boosting_type': 'gbdt',
               'feature_pre_filter': False,
               'lambda_l1': 9.4016532152438,
               'lambda_l2': 0.006560267182579874,
               'num_leaves': 10,
               'feature_fraction': 0.4,
               'bagging_fraction': 0.7091814665169984,
               'bagging_freq': 7,
               'min_child_samples': 25,
               'num_iterations': 1000,
               'early_stopping_round': 100,
               'reg_lambda': 10.987474846877767,
               'reg_alpha': 17.335285595031994,
               'verbose': -1}

for fold_index in range(2, fold_ittr_range):
    print(f'Current fold index {fold_index}')
    split = KFold(n_splits=fold_index)

    preds_list_base = []
    preds_list_final_iteration = []
    preds_list_all = []

    # total= lets tqdm render a real progress bar instead of a bare counter.
    for train_idx, val_idx in tqdm(split.split(X_train), total=fold_index):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Base model. The first ten columns are declared categorical
        # (presumably the label-encoded cat columns - TODO confirm).
        Model = LGBMRegressor(**lgbm_params).fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric=['rmse'],
            early_stopping_rounds=250,
            categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            verbose=0)

        # Predict once and reuse the result instead of re-predicting per use.
        base_test_pred = Model.predict(X_test)
        preds_list_base.append(base_test_pred)
        preds_list_all.append(base_test_pred)
        first_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
        print(f'RMSE for Base model is {first_rmse}')

        # Every fold restarts the refinement schedule from the base settings.
        params = lgbm_params.copy()
        params['learning_rate'] = 0.003

        for i in range(1, 8):
            if i > 2:
                # From the third round on, shrink the reg_* penalties by 10%
                # and allow 40 more leaves per round.
                params['reg_lambda'] *= 0.9
                params['reg_alpha'] *= 0.9
                params['num_leaves'] += 40

            # Continue training from the previous round's model.
            Model = LGBMRegressor(**params).fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                eval_metric=['rmse'],
                early_stopping_rounds=200,
                categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                verbose=0,
                init_model=Model)

            test_pred = Model.predict(X_test)
            last_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
            preds_list_all.append(test_pred)
            print(f'RMSE for Incremental trial {i} model is {last_rmse}')

        # last_rmse / test_pred now belong to the final refinement round.
        print('', end='\n\n')
        print(f'Improvement of : {first_rmse - last_rmse}')
        print('-' * 100)
        preds_list_final_iteration.append(test_pred)

    # Pool this fold count's final predictions with those of earlier counts and
    # write both the raw pool and its mean. Forward slashes keep the paths
    # usable on Windows and POSIX alike.
    preds_list_final_iterations.extend(preds_list_final_iteration)
    dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    np.save(f'../kaggle_data/{dt_string}_FI{fold_index}_sub.npy', preds_list_final_iterations)
    y_preds_final_iteration = np.array(preds_list_final_iterations).mean(axis=0)
    submission = pd.DataFrame({'id': test.id,
                               'target': y_preds_final_iteration})
    submission.to_csv(f'../kaggle_data/{dt_string}_FI{fold_index}_sub.csv', index=False)

0it [00:00, ?it/s]

Current fold index 2
RMSE for Base model is 0.844066621029272
RMSE for Incremental trial 1 model is 0.8440215983638926
RMSE for Incremental trial 2 model is 0.8439865432619419
RMSE for Incremental trial 3 model is 0.8438800134974772
RMSE for Incremental trial 4 model is 0.8438307091606543
RMSE for Incremental trial 5 model is 0.8438080794612002
RMSE for Incremental trial 6 model is 0.84379593077556
RMSE for Incremental trial 7 model is 0.8437896456875694


Improvement of : 0.00027697534170256777
----------------------------------------------------------------------------------------------------


1it [01:26, 86.83s/it]

RMSE for Base model is 0.8444091042439591
RMSE for Incremental trial 1 model is 0.8443485924213023
RMSE for Incremental trial 2 model is 0.8443047074734082
RMSE for Incremental trial 3 model is 0.8442014004658082
RMSE for Incremental trial 4 model is 0.8441427147899908
RMSE for Incremental trial 5 model is 0.8441391336537261
RMSE for Incremental trial 6 model is 0.8441295350891671
RMSE for Incremental trial 7 model is 0.844127999166191


Improvement of : 0.000281105077768129
----------------------------------------------------------------------------------------------------


1it [03:02, 182.63s/it]


KeyboardInterrupt: 

In [None]:
# K-fold sweep for fold counts 20..29 with the tuned parameter set.
# This cell does not create preds_list_final_iterations: it keeps extending
# the list left in the kernel by an earlier cell, so the saved pool and the
# averaged submission also include whatever that list already held.

# The hyper-parameters do not change between fold counts, so the dict is built
# once; each fold works on its own copy.
lgbm_params = {'max_depth': 16,
               'subsample': 0.8032697250789377,
               'colsample_bytree': 0.21067140508531404,
               'learning_rate': 0.009867383057779643,
               'reg_lambda': 10.987474846877767,
               'reg_alpha': 17.335285595031994,
               'min_child_samples': 31,
               'num_leaves': 66,
               'max_bin': 522,
               'cat_smooth': 81,
               'cat_l2': 0.029690334194270022,
               'metric': 'rmse',
               'n_jobs': -1,
               'n_estimators': 20000,
               'boosting_type': 'gbdt'}

for fold_index in range(20, 30):
    print(f'Current fold index {fold_index}')
    split = KFold(n_splits=fold_index)

    preds_list_base = []
    preds_list_final_iteration = []
    preds_list_all = []

    # total= lets tqdm render a real progress bar instead of a bare counter.
    for train_idx, val_idx in tqdm(split.split(X_train), total=fold_index):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Base model. The first ten columns are declared categorical
        # (presumably the label-encoded cat columns - TODO confirm).
        Model = LGBMRegressor(**lgbm_params).fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric=['rmse'],
            early_stopping_rounds=250,
            categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            verbose=0)

        # Predict once and reuse the result instead of re-predicting per use.
        base_test_pred = Model.predict(X_test)
        preds_list_base.append(base_test_pred)
        preds_list_all.append(base_test_pred)
        first_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
        print(f'RMSE for Base model is {first_rmse}')

        # Every fold restarts the refinement schedule from the base settings.
        params = lgbm_params.copy()
        params['learning_rate'] = 0.003

        for i in range(1, 8):
            if i > 2:
                # From the third round on, shrink the regularisation by 10%
                # and allow 40 more leaves per round.
                params['reg_lambda'] *= 0.9
                params['reg_alpha'] *= 0.9
                params['num_leaves'] += 40

            # Continue training from the previous round's model.
            Model = LGBMRegressor(**params).fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                eval_metric=['rmse'],
                early_stopping_rounds=200,
                categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                verbose=0,
                init_model=Model)

            test_pred = Model.predict(X_test)
            last_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
            preds_list_all.append(test_pred)
            print(f'RMSE for Incremental trial {i} model is {last_rmse}')

        # last_rmse / test_pred now belong to the final refinement round.
        print('', end='\n\n')
        print(f'Improvement of : {first_rmse - last_rmse}')
        print('-' * 100)
        preds_list_final_iteration.append(test_pred)

    # Pool this fold count's final predictions with the existing ones and write
    # both the raw pool and its mean. Forward slashes keep the paths usable on
    # Windows and POSIX alike.
    preds_list_final_iterations.extend(preds_list_final_iteration)
    dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    np.save(f'../kaggle_data/{dt_string}_FI{fold_index}_sub.npy', preds_list_final_iterations)
    y_preds_final_iteration = np.array(preds_list_final_iterations).mean(axis=0)
    submission = pd.DataFrame({'id': test.id,
                               'target': y_preds_final_iteration})
    submission.to_csv(f'../kaggle_data/{dt_string}_FI{fold_index}_sub.csv', index=False)

In [None]:
# Single 30-fold run with the tuned parameter set. The pool of final
# predictions is reset here, so the saved files contain this run only.
fold_ittr_range = 10  # not read in this cell; kept so the kernel namespace is unchanged
preds_list_final_iterations = []

fold_index = 30
print(f'Current fold index {fold_index}')
split = KFold(n_splits=fold_index)
lgbm_params = {'max_depth': 16,
               'subsample': 0.8032697250789377,
               'colsample_bytree': 0.21067140508531404,
               'learning_rate': 0.009867383057779643,
               'reg_lambda': 10.987474846877767,
               'reg_alpha': 17.335285595031994,
               'min_child_samples': 31,
               'num_leaves': 66,
               'max_bin': 522,
               'cat_smooth': 81,
               'cat_l2': 0.029690334194270022,
               'metric': 'rmse',
               'n_jobs': -1,
               'n_estimators': 20000}

preds_list_base = []
preds_list_final_iteration = []
preds_list_all = []

# total= lets tqdm render a real progress bar instead of a bare counter.
for train_idx, val_idx in tqdm(split.split(X_train), total=fold_index):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Base model. The first ten columns are declared categorical
    # (presumably the label-encoded cat columns - TODO confirm).
    Model = LGBMRegressor(**lgbm_params).fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric=['rmse'],
        early_stopping_rounds=250,
        categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        verbose=0)

    # Predict once and reuse the result instead of re-predicting per use.
    base_test_pred = Model.predict(X_test)
    preds_list_base.append(base_test_pred)
    preds_list_all.append(base_test_pred)
    first_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
    print(f'RMSE for Base model is {first_rmse}')

    # Every fold restarts the refinement schedule from the base settings.
    params = lgbm_params.copy()
    params['learning_rate'] = 0.003

    for i in range(1, 8):
        if i > 2:
            # From the third round on, shrink the regularisation by 10% and
            # allow 40 more leaves per round.
            params['reg_lambda'] *= 0.9
            params['reg_alpha'] *= 0.9
            params['num_leaves'] += 40

        # Continue training from the previous round's model.
        Model = LGBMRegressor(**params).fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric=['rmse'],
            early_stopping_rounds=200,
            categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            verbose=0,
            init_model=Model)

        test_pred = Model.predict(X_test)
        last_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
        preds_list_all.append(test_pred)
        print(f'RMSE for Incremental trial {i} model is {last_rmse}')

    # last_rmse / test_pred now belong to the final refinement round.
    print('', end='\n\n')
    print(f'Improvement of : {first_rmse - last_rmse}')
    print('-' * 100)
    preds_list_final_iteration.append(test_pred)

# Save the per-fold final predictions and their mean. Forward slashes keep the
# paths usable on Windows and POSIX alike.
preds_list_final_iterations.extend(preds_list_final_iteration)
dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
np.save(f'../kaggle_data/{dt_string}_FI{fold_index}_sub.npy', preds_list_final_iterations)
y_preds_final_iteration = np.array(preds_list_final_iterations).mean(axis=0)
submission = pd.DataFrame({'id': test.id,
                           'target': y_preds_final_iteration})
submission.to_csv(f'../kaggle_data/{dt_string}_FI{fold_index}_sub.csv', index=False)

In [None]:
# How many stored predictions a "last 165" slice would actually contain.
min(165, len(preds_list_final_iterations))

In [None]:
# Alternative submission: average only the 20 most recently stored final
# predictions. Forward slashes keep the output path usable on Windows and
# POSIX alike (the former '..\\kaggle_data\\...' spelling was Windows-only).
dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
y_preds_final_iteration = np.array(preds_list_final_iterations[-20:]).mean(axis=0)
submission = pd.DataFrame({'id': test.id,
                           'target': y_preds_final_iteration})
submission.to_csv(f'../kaggle_data/{dt_string}_FI00_sub.csv', index=False)


In [None]:
# Scratch write: store a two-element array in '10_ittr_res.npy' through an
# explicitly opened binary file handle.
with open('10_ittr_res.npy', mode='wb') as f:
    np.save(file=f, arr=np.array([1, 2]))

In [None]:
# Element-wise mean over every stored final prediction vector.
y_preds_final_iteration = np.mean(preds_list_final_iterations, axis=0)
y_preds_final_iteration

In [None]:
# Pair each test id with its averaged prediction.
submission = pd.DataFrame({
    'id': test['id'],
    'target': y_preds_final_iteration,
})

In [None]:
# datetime is already imported in the notebook's first cell; the import is
# repeated here so this cell can be re-run on its own.
from datetime import datetime
dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

# Forward slashes keep the output path usable on Windows and POSIX alike
# (the former '..\\kaggle_data\\...' spelling was Windows-only).
submission.to_csv(f'../kaggle_data/{dt_string}_submission.csv', index=False)

In [None]:
# 5-fold splitter used by the training cells below; no shuffling or seed is
# requested here.
split = KFold(n_splits=5)

In [None]:
# Tuned LightGBM settings shared by the 5-fold training cells.
lgbm_params = dict(
    max_depth=16,
    subsample=0.8032697250789377,
    colsample_bytree=0.21067140508531404,
    learning_rate=0.009867383057779643,
    reg_lambda=10.987474846877767,
    reg_alpha=17.335285595031994,
    min_child_samples=31,
    num_leaves=66,
    max_bin=522,
    cat_smooth=81,
    cat_l2=0.029690334194270022,
    metric='rmse',
    n_jobs=-1,
    n_estimators=20000,
)

In [None]:
# 5-fold run: per fold, train a base model, refine it for seven
# low-learning-rate continuation rounds, and collect test predictions from the
# base model, from every round, and from the last round.
# Uses `split` and `lgbm_params` defined in the preceding cells.
preds_list_base = []
preds_list_final_iteration = []
preds_list_all = []

for train_idx, val_idx in split.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Base model. The first ten columns are declared categorical
    # (presumably the label-encoded cat columns - TODO confirm).
    Model = LGBMRegressor(**lgbm_params).fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric=['rmse'],
        early_stopping_rounds=250,
        categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        verbose=0)

    # Predict once and reuse the result instead of re-predicting per use.
    base_test_pred = Model.predict(X_test)
    preds_list_base.append(base_test_pred)
    preds_list_all.append(base_test_pred)
    first_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
    print(f'RMSE for Base model is {first_rmse}')

    # Every fold restarts the refinement schedule from the base settings.
    params = lgbm_params.copy()
    params['learning_rate'] = 0.003

    for i in range(1, 8):
        if i > 2:
            # From the third round on, shrink the regularisation by 10% and
            # allow 40 more leaves per round.
            params['reg_lambda'] *= 0.9
            params['reg_alpha'] *= 0.9
            params['num_leaves'] += 40

        # Continue training from the previous round's model.
        Model = LGBMRegressor(**params).fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric=['rmse'],
            early_stopping_rounds=200,
            categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            verbose=0,
            init_model=Model)

        test_pred = Model.predict(X_test)
        last_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
        preds_list_all.append(test_pred)
        print(f'RMSE for Incremental trial {i} model is {last_rmse}')

    # last_rmse / test_pred now belong to the final refinement round.
    print('', end='\n\n')
    print(f'Improvement of : {first_rmse - last_rmse}')
    print('-' * 100)
    preds_list_final_iteration.append(test_pred)

In [None]:
# Repeat of the 5-fold run. The preds_list_* lists are NOT reset in this cell,
# so its predictions are appended to whatever those lists already hold in the
# kernel. Uses `split` and `lgbm_params` defined in earlier cells.
for train_idx, val_idx in split.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Base model. The first ten columns are declared categorical
    # (presumably the label-encoded cat columns - TODO confirm).
    Model = LGBMRegressor(**lgbm_params).fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric=['rmse'],
        early_stopping_rounds=250,
        categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        verbose=0)

    # Predict once and reuse the result instead of re-predicting per use.
    base_test_pred = Model.predict(X_test)
    preds_list_base.append(base_test_pred)
    preds_list_all.append(base_test_pred)
    first_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
    print(f'RMSE for Base model is {first_rmse}')

    # Every fold restarts the refinement schedule from the base settings.
    params = lgbm_params.copy()
    params['learning_rate'] = 0.003

    for i in range(1, 8):
        if i > 2:
            # From the third round on, shrink the regularisation by 10% and
            # allow 40 more leaves per round.
            params['reg_lambda'] *= 0.9
            params['reg_alpha'] *= 0.9
            params['num_leaves'] += 40

        # Continue training from the previous round's model.
        Model = LGBMRegressor(**params).fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric=['rmse'],
            early_stopping_rounds=200,
            categorical_feature=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            verbose=0,
            init_model=Model)

        test_pred = Model.predict(X_test)
        last_rmse = np.sqrt(mean_squared_error(y_val, Model.predict(X_val)))
        preds_list_all.append(test_pred)
        print(f'RMSE for Incremental trial {i} model is {last_rmse}')

    # last_rmse / test_pred now belong to the final refinement round.
    print('', end='\n\n')
    print(f'Improvement of : {first_rmse - last_rmse}')
    print('-' * 100)
    preds_list_final_iteration.append(test_pred)

In [None]:
# Element-wise mean of the base-model test predictions.
y_preds_base = np.mean(preds_list_base, axis=0)
y_preds_base

In [None]:
# Element-wise mean over the base model and every refinement round.
y_preds_all = np.mean(preds_list_all, axis=0)
y_preds_all

In [None]:
# Element-wise mean of the last refinement round's test predictions.
y_preds_final_iteration = np.mean(preds_list_final_iteration, axis=0)
y_preds_final_iteration

In [None]:
# Pair each test id with its averaged final-round prediction.
submission = pd.DataFrame({
    'id': test['id'],
    'target': y_preds_final_iteration,
})

In [None]:
# Write the submission to 'submission.csv' (relative path), omitting the
# DataFrame index.
submission.to_csv('submission.csv', index=False)