# IMPORT DATASETS


In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns
from tqdm import tqdm 
import warnings
warnings.filterwarnings('ignore')
import time
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor , StackingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error,  make_scorer
from sklearn.model_selection import KFold,  cross_val_score,  RepeatedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer,  StandardScaler
from scipy.stats import kurtosis, skew
from scipy.special import boxcox, inv_boxcox
from xgboost import XGBRFRegressor, Booster
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from prettytable import PrettyTable 
import joblib
from sklearn.pipeline import Pipeline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def r2_score_transform(train_pred, test_pred, train_label, test_label):

  ''' 
  This function convert data into inverse yeo-johnson transformation and give a r2 score
  Input - train and test predication array, train and test label
  Output - inverse transform the array and compute the R2 score

   '''
  # Inverse transform the arrays
  a, b    = pt.inverse_transform(train_pred.reshape(-1,1)), pt.inverse_transform(test_pred.reshape(-1,1))
  x, y    = pt.inverse_transform(train_label.reshape(-1,1)), pt.inverse_transform(test_label.reshape(-1,1))
   
  # get r2 score of train and test data
  train = r2_score(x,a)
  test  = r2_score(y,b)
    
  return train, test


In [5]:
def kfold_grid_search(clf, params, train, label, fold, kfold, search = 'grid'):

  ''' 
  This function compute the grid or random search cross validation and taken best estimator compute 
  R^2 score with kfold validation.
  Input :
    clf = model
    params = dict of parameters for ranfom or grid search
    train = train dataset
    label = train data label
    fold = grid or random cross validation folds
    kfold = fold for kfold CV
    search  = grid or random

  '''

  start = time.time()

  # declare the grid search
  if search == 'grid' :
    clf_grid = GridSearchCV(estimator = clf,
                        param_grid = params,
                        n_jobs = -1,
                        scoring = 'r2',
                        cv = fold,
                        verbose = 1)
    # fit data into the grid model
    clf_grid.fit(train, label)

    # best estimator for the model
    print('\n')
    print(f'Best Estimator : {clf_grid.best_estimator_}')
    print('\n')

    # Model as best estimator
    clf = clf_grid.best_estimator_
    
  # declre the random search 
  elif search == 'random' :
    clf_grid = RandomizedSearchCV(estimator = clf,
                        param_distributions = params,
                        n_jobs = -1,
                        scoring = 'r2',
                        cv = fold,
                        verbose = 1,
                        )
    # fit data into the grid model
    clf_grid.fit(train, label)
    
    # best estimator for the model
    print('\n')
    print(f'Best Estimator : {clf_grid.best_estimator_}')
    print('\n')
    
    # Model as best estimator
    clf = clf_grid.best_estimator_

  else:  
    clf = clf


  # genrate folds 
  kfold = KFold(n_splits= kfold, random_state= 42)

  tr = [ ]
  te = [ ]
  i = 0
  print('R2 metric : ')
  for train_in, test_in in kfold.split(train):

    # set indices of train and test data
    xtrain, xtest = train.iloc[train_in], train.iloc[test_in]
    ytrain, ytest = label[train_in], label[test_in]

    # set best estimator as model
    model = clf

    # fit data into the best estimator
    model.fit(xtrain, ytrain)
    
    # predict in train and test data
    train_pred = model.predict(xtrain)
    test_pred = model.predict(xtest)

     # get r2 score of train and test data
    tr_pre  = r2_score(ytrain, train_pred)
    te_pre  = r2_score(ytest, test_pred)
      
    tr.append(tr_pre)
    te.append(te_pre)

    print(f'{i} r2 score of the train data {tr_pre} and test data {te_pre}')
    
    i += 1
  
  print(f'\nAvg r2_score of train {np.mean(tr)} and test {np.mean(te)}')
  

  end = time.time()
  print(f'Time taken - {(end-start)/60} min')
  

# Models with PCA + Synthetic and Binary features


*   This datasets contain PCA (5) features, Binary features, label encoded features and created synthetic features.






In [None]:
# Import the train and test csv files 

train = pd.read_csv(r'/content/drive/MyDrive/Datasets/pca+feature_train.csv')
test = pd.read_csv(r'/content/drive/MyDrive/Datasets/pca+feature_test.csv')

train = train.drop(['Unnamed: 0'], axis = 1)
test = test.drop(['Unnamed: 0'], axis = 1)
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,...,X371,X372,X373,X374,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385,X0_label_encode,X1_label_encode,X2_label_encode,X3_label_encode,X4_label_encode,X5_label_encode,X6_label_encode,X8_label_encode,pca_feature0,pca_feature1,pca_feature2,pca_feature3,pca_feature4,X315_314_51_299,X299_300_301_271,X50_88_51_31,X46_263_119_261,X136_118_136_60,qua_encode_1,qua_encode_2,qua_encode_3,qua_encode_4,cos_encode_1,cos_encode_2,cos_encode_3,cos_encode_4
0,0,130.81,k,v,at,a,d,u,j,o,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,32,23,17,0,3,24,9,14,1.064804,-0.230379,27.062359,6.387048,2.354851,0.0,0.0,0.0,1.0,1.0,6.901179,80.729628,14.457824,8.0,0.096738,0.236773,0.096738,0.30724
1,6,88.53,k,t,av,e,d,y,l,o,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,32,21,19,4,3,28,11,14,0.979313,2.702576,31.020305,-0.160911,2.661553,0.0,0.0,0.0,0.5,1.0,28.816796,7.221336,13.855619,8.0,0.091688,0.080591,0.091688,0.322075
2,7,76.26,az,w,n,c,d,x,j,x,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,20,24,34,2,3,27,9,23,28.062189,21.291834,30.542186,0.322695,20.526222,0.0,0.0,0.0,1.0,0.0,567.801379,9.717606,935.797419,8.0,0.001787,0.002932,0.001787,0.376125
3,9,80.62,az,t,n,f,d,x,l,e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,20,21,34,5,3,27,11,4,27.972617,23.444034,25.805297,-5.440229,-11.833706,0.0,0.0,0.0,1.0,0.0,674.842909,10.394944,930.330382,8.0,0.001798,0.00247,0.001798,0.376125
4,13,78.02,az,v,n,f,d,h,d,n,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,20,23,34,5,3,12,3,13,28.508003,22.819777,7.626386,13.286824,3.706443,0.0,0.0,0.0,1.0,0.0,642.8411,250.973811,963.246231,8.0,0.001735,0.002592,0.001735,0.376125


In [None]:
#Check the Shape of the test and train data
print('train data shape - ', train.shape)
print('test data shape  - ', test.shape)

train data shape -  (4209, 404)
test data shape  -  (4209, 403)


In [None]:
# declare the train set and label
y2 = train.y
x2 = train.iloc[:, 10:]

In [None]:
# Split train data into train-set and test-set
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2,y2, test_size = 0.33,random_state= 42)

print(x_train2.shape, y_train2.shape)
print(x_test2.shape, y_test2.shape)

(2820, 394) (2820,)
(1389, 394) (1389,)


### Decision Tree

In [None]:
clf = DecisionTreeRegressor()

params = {'max_depth' : [2,3,4,8,10,15],
          'max_features' : ['auto', 'sqrt', 'log2'],
          'random_state' : [5,10,20,30],
          }

kfold_grid_search(clf, params, x2, y2, 10, kfold = 15, search= 'random') #(0.5780)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.9s finished




Best Estimator : DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=8,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=10, splitter='best')


R2 metric : 
0 r2 score of the train data 0.6813107472320485 and test data 0.5767082371099517
1 r2 score of the train data 0.621586039510241 and test data 0.708513052109607
2 r2 score of the train data 0.630831290790349 and test data 0.5631604647843083
3 r2 score of the train data 0.6668606818390898 and test data 0.2787127830840411
4 r2 score of the train data 0.6380355155912681 and test data 0.06995160992884453
5 r2 score of the train data 0.6311910610649891 and test data 0.5245457377028628
6 r2 score of the train data 0.63061835338783 and test data 0.6205419720909853
7 r2 

In [None]:
dt = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                            max_features='auto',
                            min_impurity_decrease=0.0,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, presort='deprecated',
                            splitter='best')


a = cross_val_score(dt, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5734756571589295 data


### Random Forest

In [None]:
clf = RandomForestRegressor()

params = {'n_estimators':[40,50,60,70,100],
             'max_depth':[3,5,6,7,8],
             'min_samples_split':[2,3,4,5,6,7,8,9,10],
             'max_features': [0.80,.95, 1.0],
             'min_samples_leaf': [1, 2,3,4,5,6,7,8,9],
             'min_impurity_decrease':[1e-5,1e-4,1e-3,1e-2,1e-1,0,1,10]}

kfold_grid_search(clf, params, x2, y2, fold = 10, kfold=15, search= 'random' )

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.8min finished




Best Estimator : RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features=0.8, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=1e-05,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=9, min_weight_fraction_leaf=0.0,
                      n_estimators=40, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


R2 metric : 
0 r2 score of the train data 0.5715153440871907 and test data 0.6075821902602618
1 r2 score of the train data 0.5674883076551979 and test data 0.6511201254823615
2 r2 score of the train data 0.5730553993723171 and test data 0.6021445241149124
3 r2 score of the train data 0.60601763083342 and test data 0.30708047504635183
4 r2 score of the train data 0.5773519833442724 and test data 0.5444356633636224
5 r2 score of the train data 0.575621620235263 and test data 0.5670822

In [None]:
rf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features=0.8, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=1e-05,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=9, min_weight_fraction_leaf=0.0,
                      n_estimators=40, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

a = cross_val_score(rf, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5745664501511195 data


### XGBoost-Regressor

In [None]:
clf = XGBRFRegressor(silent=True)

xparams = {'learning_rate':[0.1,0.5,0.8,1],
             'n_estimators':[70,80,100],
             'max_depth':[2,3,4],
             'colsample_bytree':[0.1,0.5,0.7,0.9,1],
             'subsample':[0.2,0.3,0.5,1],
             'gamma':[0.0001,0.001,0,0.1,0.01,0.5,1],
             'reg_alpha':[0.00001,0.0001,0.001,0.01,0.1]}


kfold_grid_search(clf, params, x2, y2, fold = 10, kfold=15, search= 'random')

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.5min finished




Best Estimator : XGBRFRegressor(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
               colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
               max_depth=15, max_features='log2', min_child_weight=1,
               missing=None, n_estimators=100, n_jobs=1, nthread=None,
               objective='reg:linear', random_state=5, reg_alpha=0,
               reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
               subsample=0.8, verbosity=1)


R2 metric : 
0 r2 score of the train data 0.5591106706797955 and test data 0.5969348853865986
1 r2 score of the train data 0.5567761302990156 and test data 0.6392482807896993
2 r2 score of the train data 0.5589944292139439 and test data 0.6035569639748404
3 r2 score of the train data 0.5925375912978497 and test data 0.3071454665475717
4 r2 score of the train data 0.5640973355720406 and test data 0.5338131420792587
5 r2 score of the train data 0.5621640135494697 and test data 0.5595743834445313
6 

In [None]:
 xg = XGBRFRegressor(colsample_bylevel=1,
               colsample_bynode=0.8, colsample_bytree=1, gamma=0,
               learning_rate=1, max_delta_step=0, max_depth=5,
               max_features=0.95, min_child_weight=1, min_impurity_decrease=1,
               min_samples_leaf=1, min_samples_split=5, missing=None,
               n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=True, subsample=0.8, verbosity=1)

a = cross_val_score(xg, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5718237374738755 data


AdaBoost-Regressor

In [None]:
clf = AdaBoostRegressor()

params = {'n_estimators'  : [100, 150, 200, ],
          'learning_rate' :[0.0001,0.001,0.01, 0.1],
          'loss' : [ 'linear', 'square', 'exponential'],
          'random_state' : [10,20,30]
          }

kfold_grid_search(clf, params, x2, y2, fold = 10, kfold = 15, search = 'random') 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 13.6min finished




Best Estimator : AdaBoostRegressor(base_estimator=None, learning_rate=0.0001, loss='linear',
                  n_estimators=100, random_state=10)


R2 metric : 
0 r2 score of the train data 0.5720113699608151 and test data 0.6158808626834753
1 r2 score of the train data 0.5619763801409192 and test data 0.6355339181615801
2 r2 score of the train data 0.5707625343770122 and test data 0.606265905792721
3 r2 score of the train data 0.6058312584998451 and test data 0.3073106403664668
4 r2 score of the train data 0.5721886323759073 and test data 0.5351559426964696
5 r2 score of the train data 0.5744676443771899 and test data 0.5623683551296568
6 r2 score of the train data 0.5648681504239315 and test data 0.6552271007623669
7 r2 score of the train data 0.5655959785551095 and test data 0.7061852214705656
8 r2 score of the train data 0.580944156330872 and test data 0.4719427448074006
9 r2 score of the train data 0.5714446221740298 and test data 0.5538494689733773
10 r2 score of the train data

In [None]:
 ab =  AdaBoostRegressor(base_estimator=None, learning_rate=0.0001, loss='linear',
                  n_estimators=100, random_state=20)

a = cross_val_score(ab, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5703529803771688 data


### Gradient Boosting Regressor

In [None]:
clf = GradientBoostingRegressor())

params = {'n_estimators' : [800,1000, 1500, 2000, 2500],
          'loss' : [ 'huber', 'exponential'],
          'learning_rate' : [0.01, 0.01, 0.1],
          'max_depth' : [3,4,5,7]
          }


kfold_grid_search(clf, params, x2, y2, fold = 10, kfold = 15, search = 'random') #0.6900, 0.6677

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.2min finished




Best Estimator : GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2000,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


R2 metric : 
0 r2 score of the train data 0.5640270026713142 and test data 0.5844962160053929
1 r2 score of the train data 0.5545558017178335 and test data 0.6263858956242915
2 r2 score of the train data 0.5683858422857833 and test data 0.5882478867422942
3 r2 score of the train data 0.6032325417831674 and test

In [None]:
gb =  GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2000,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

a = cross_val_score(gb, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5696356812530452 data


## LGBM Regressor

In [None]:
clf =  LGBMRegressor()

params = {'min_child_samples' : [10, 20,50],
          'num_leaves' : [5,6],
          'max_depth' : [2, 3, 5],
          'n_estimators' : [1000,2000,4000,5000],
          'learning_rate' : [0.0001,0.001,0.01,0.1]
          }

kfold_grid_search(clf, params, x2, y2.ravel(), fold = 10, kfold = 15, search = 'random') #0.6920, 0.6622

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.9min finished




Best Estimator : LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=3,
              min_child_samples=50, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=4000, n_jobs=-1, num_leaves=5, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


R2 metric : 
0 r2 score of the train data 0.6539376054229651 and test data 0.5745765027557168
1 r2 score of the train data 0.6479659676917198 and test data 0.6541898772014111
2 r2 score of the train data 0.6553573129657073 and test data 0.5877012528118586
3 r2 score of the train data 0.6804781717894042 and test data 0.2926826947679162
4 r2 score of the train data 0.6577511726844416 and test data 0.5400561398326873
5 r2 score of the train data 0.6563520755321524 and test data 0.5388823520975055
6 r2 score of the train dat

In [None]:
lb =  LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=3,
              min_child_samples=50, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=5, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

a = cross_val_score(lb, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5555207657145106 data


In [None]:
estimators = [ ('rf', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features=0.95, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=8, min_weight_fraction_leaf=0.0,
                      n_estimators=70, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)),
              
                ('xg', XGBRFRegressor(colsample_bylevel=1,
               colsample_bynode=0.8, colsample_bytree=1, gamma=0,
               learning_rate=1, max_delta_step=0, max_depth=5,
               max_features=0.95, min_child_weight=1, min_impurity_decrease=1,
               min_samples_leaf=1, min_samples_split=5, missing=None,
               n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=True, subsample=0.8, verbosity=1)),
              

              ('gb', GradientBoostingRegressor(max_depth= 2, learning_rate= 0.1, random_state= 10,n_estimators=5000, 
                                n_iter_no_change = 11))
              
             ]

stack = StackingRegressor(estimators= estimators,
                          final_estimator= Ridge(), 
                          )
 
cv_score = cross_val_score(stack, x2, y2.ravel(), scoring='r2', cv=10, verbose=5, n_jobs=-1)
print('Mean Score:',cv_score.mean())
print('Standard Deviation:',cv_score.std())  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Mean Score: 0.5762229576080763
Standard Deviation: 0.08756604747415982


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 12.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 12.2min finished


## R^2 Score

In [None]:
col = ['MODELS', 'FEATURES', 'CROSS-VALIDATION' , 'PUBLIC SCORE', 'PRIVATE SCORE']

tb = PrettyTable() 

tb.add_column(col[0], ['DECISION TREE', 'RANDOM FOREST', 'XGBOOST', 'ADA-BOOST', 'GRADIENT BOOSTING', 'LIGHT GBM', 'STCKED-ENSEMBLE'])

tb.add_column(col[1], ['PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES', 'PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES','PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES',
                       'PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES','PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES','PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES',
                       'PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES'])

tb.add_column(col[2], ['0.5211', '0.5808', '0.5752', '0.5754', '0.5724', '0.5661', '0.5762'])

tb.add_column(col[3], ['0.55187', '0.55327', '0.54654', '0.54911', '0.54427', '0.54420', '0.55610'])

tb.add_column(col[4], ['0.54823', '0.54488', '0.54390', '0.54551', '0.53709', '0.52651', '0.54816'])

print(tb)

+-------------------+----------------------------------------------+------------------+--------------+---------------+
|       MODELS      |                   FEATURES                   | CROSS-VALIDATION | PUBLIC SCORE | PRIVATE SCORE |
+-------------------+----------------------------------------------+------------------+--------------+---------------+
|   DECISION TREE   | PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES |      0.5211      |   0.55187    |    0.54823    |
|   RANDOM FOREST   | PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES |      0.5808      |   0.55327    |    0.54488    |
|      XGBOOST      | PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES |      0.5752      |   0.54654    |    0.54390    |
|     ADA-BOOST     | PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES |      0.5754      |   0.54911    |    0.54551    |
| GRADIENT BOOSTING | PCA(5) + BINARY + LABEL + SYNTHETIC FEATURES |      0.5724      |   0.54427    |    0.53709    |
|     LIGHT GBM     | PCA(5) + BINARY + LABEL + 

## OBSERVATION


*   This data set created like using PCA method and synthetic features.

*   Using PCA method created 5 features and created 22 synthetic features like difference between twi feature and ratio of another features.

*   In these models use 10 k fold cross validation and grid or random search cross validation for the best parameters. 

*   We can see that above table random forest well performe in the cross validation and also good perform in kaggle public score. 

*   Stacked ensemble got higher score in public and private score.





# Models with PCA + synthetic + Binary features and y target value clip 150sec

In [9]:
# Import the train and test csv files                                                      
train = pd.read_csv('C:/Users/Dell/Python/Python/Python AAIC File/Assignments/CASE-STUDY ML/mercedes-benz-greener-manufacturing/Datasets/pca+feature_train.csv')
test = pd.read_csv('C:/Users/Dell/Python/Python/Python AAIC File/Assignments/CASE-STUDY ML/mercedes-benz-greener-manufacturing/Datasets/pca+feature_test.csv')

train = train.drop(['Unnamed: 0'], axis = 1)
test = test.drop(['Unnamed: 0'], axis = 1)
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X46_263_119_261,X136_118_136_60,qua_encode_1,qua_encode_2,qua_encode_3,qua_encode_4,cos_encode_1,cos_encode_2,cos_encode_3,cos_encode_4
0,0,130.81,k,v,at,a,d,u,j,o,...,1.0,1.0,6.901179,80.729628,14.457824,8.0,0.096738,0.236773,0.096738,0.30724
1,6,88.53,k,t,av,e,d,y,l,o,...,0.5,1.0,28.816796,7.221336,13.855619,8.0,0.091688,0.080591,0.091688,0.322075
2,7,76.26,az,w,n,c,d,x,j,x,...,1.0,0.0,567.801379,9.717606,935.797419,8.0,0.001787,0.002932,0.001787,0.376125
3,9,80.62,az,t,n,f,d,x,l,e,...,1.0,0.0,674.842909,10.394944,930.330382,8.0,0.001798,0.00247,0.001798,0.376125
4,13,78.02,az,v,n,f,d,h,d,n,...,1.0,0.0,642.8411,250.973811,963.246231,8.0,0.001735,0.002592,0.001735,0.376125


In [10]:
#Check the Shape of the test and train data
print('train data shape - ', train.shape)
print('test data shape  - ', test.shape)

train data shape -  (4209, 404)
test data shape  -  (4209, 403)


In [11]:
# cilp the the y label at 150 which value have above 150
train.loc[train['y'] > 150] = 150

In [12]:
y2 = train.y
x2 = train.iloc[:, 10:]

# Split train data into train-set and test-set
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2,y2, test_size = 0.33,random_state= 42)

print(x_train2.shape, y_train2.shape)
print(x_test2.shape, y_test2.shape)

(2820, 394) (2820,)
(1389, 394) (1389,)


### Decision Tree

In [None]:
clf = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                            max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, presort='deprecated',
                            random_state=None, splitter='best')

params = {'max_depth' : [2,3,4,8,10,15],
          'max_features' : ['auto', 'sqrt', 'log2'],
          'random_state' : [5,10,20,30],
          }

kfold_grid_search(clf, params, x2, y2, 10, kfold = 15, search= 'random')

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.6s finished




Best Estimator : DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=5, splitter='best')


R2 metric : 
0 r2 score of the train data 0.6480538816132388 and test data 0.6693714310012511
1 r2 score of the train data 0.642465953902116 and test data 0.7702358085844839
2 r2 score of the train data 0.6533336168366211 and test data 0.6034552624278658
3 r2 score of the train data 0.650727064754295 and test data 0.626995114554626
4 r2 score of the train data 0.6481815912325416 and test data 0.6753606349895027
5 r2 score of the train data 0.6495238656607814 and test data 0.6594321202166429
6 r2 score of the train data 0.6496479097970977 and test data 0.6594673475081894
7 r2 

In [None]:
dt = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=5, splitter='best')

a = cross_val_score(dt, x2, y2, scoring= 'r2', cv= 5)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6355296996079519 data


## AdaBoost-Regressor

In [None]:
clf = AdaBoostRegressor(base_estimator=None, learning_rate=0.0001,
                        n_estimators=300, random_state=None, loss= 'linear')

params = {'n_estimators'  : [100, 150, 200, ],
          'learning_rate' :[0.0001,0.001,0.01, 0.1],
          'loss' : [ 'linear', 'square', 'exponential'],
          'random_state' : [10,20,30]
          }

kfold_grid_search(clf, params, x2, y2, fold = 10, kfold = 15, search = 'random') 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 14.6min finished




Best Estimator : AdaBoostRegressor(base_estimator=None, learning_rate=0.0001, loss='exponential',
                  n_estimators=200, random_state=10)


R2 metric : 
0 r2 score of the train data 0.6477109410342181 and test data 0.6667582180352138
1 r2 score of the train data 0.6411183093664439 and test data 0.769834809588402
2 r2 score of the train data 0.6519445325653355 and test data 0.6035245529110724
3 r2 score of the train data 0.6501761962040864 and test data 0.6403407813302944
4 r2 score of the train data 0.6467394871648264 and test data 0.6752756035497736
5 r2 score of the train data 0.6482150084579972 and test data 0.6593962447784378
6 r2 score of the train data 0.6483409079900484 and test data 0.6595427982210769
7 r2 score of the train data 0.6427737983809947 and test data 0.7524045226988414
8 r2 score of the train data 0.6612296144447891 and test data 0.4693381380158933
9 r2 score of the train data 0.6503698925698844 and test data 0.6281263795271057
10 r2 score of the trai

In [None]:
 ab =  AdaBoostRegressor(base_estimator=None, learning_rate=0.0001, loss='exponential',
                  n_estimators=200, random_state=10)


a = cross_val_score(ab, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6424381050598855 data


### Gradient Boosting Regressor

In [None]:
clf = GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=800,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

params = {'n_estimators' : [500,800,1000, 1500, 2000],
          'loss' : [ 'huber', 'exponential'],
          'learning_rate' : [0.01, 0.01, 0.1],
          'max_depth' : [3,4,5,7]
          }


kfold_grid_search(clf, params, x2, y2, fold = 10, kfold = 15, search = 'random') 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 16.7min finished




Best Estimator : GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


R2 metric : 
0 r2 score of the train data 0.6440326836974183 and test data 0.6493814982039965
1 r2 score of the train data 0.6447564492798792 and test data 0.7714889267681285
2 r2 score of the train data 0.6556808940577739 and test data 0.5873595530545905
3 r2 score of the train data 0.6602751070405202 and test

In [None]:
gb =  GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

a = cross_val_score(gb, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.640877235785154 data


### Random Forest

In [None]:
clf =  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features=0.95, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=8, min_weight_fraction_leaf=0.0,
                      n_estimators=70, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

params = {'n_estimators':[40,50,60,70,100],
             'max_depth':[3,5,6,7,8],
             'min_samples_split':[2,3,4,5,6,7,8,9,10],
             'max_features': [0.80,.95, 1.0],
             'min_samples_leaf': [1, 2,3,4,5,6,7,8,9],
             'min_impurity_decrease':[1e-5,1e-4,1e-3,1e-2,1e-1,0,1,10]}

kfold_grid_search(clf, params, x2, y2, fold = 10, kfold=15, search= 'random' )

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.6min finished




Best Estimator : RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features=1.0, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.1,
                      min_impurity_split=None, min_samples_leaf=6,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=70, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


R2 metric : 
0 r2 score of the train data 0.6761806107128834 and test data 0.6573744354284433
1 r2 score of the train data 0.6690398809859387 and test data 0.7684369053571742
2 r2 score of the train data 0.6804516359221022 and test data 0.5979577019930118
3 r2 score of the train data 0.6738827442338753 and test data 0.6428903435953389
4 r2 score of the train data 0.6723676756508512 and test data 0.6868064575030692
5 r2 score of the train data 0.6752737867274246 and test data 0.6627912

In [None]:
rf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features=1.0, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.1,
                      min_impurity_split=None, min_samples_leaf=6,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=700, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


a = cross_val_score(rf, x2, y_trans, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6457039283917678 data


## LGBM Regressor

In [None]:
clf =  LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=5,
              min_child_samples=50, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1000, n_jobs=-1, num_leaves=5, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

params = {'min_child_samples' : [10, 20,50],
          'num_leaves' : [5,6],
          'max_depth' : [2, 3, 5],
          'n_estimators' : [1000,2000,4000,5000],
          'learning_rate' : [0.0001,0.001,0.01,0.1]
          }

kfold_grid_search(clf, params, x2, y2.ravel(), fold = 10, kfold = 15, search = 'random') 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.6min finished




Best Estimator : LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.001, max_depth=3,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=6, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


R2 metric : 
0 r2 score of the train data 0.6646104803919637 and test data 0.6630022586838633
1 r2 score of the train data 0.6583630541223624 and test data 0.7675817081799015
2 r2 score of the train data 0.6690928334118285 and test data 0.6058607904596616
3 r2 score of the train data 0.6649248016084909 and test data 0.6436142891192629
4 r2 score of the train data 0.6624097369466116 and test data 0.6936298300139243
5 r2 score of the train data 0.6646374197084737 and test data 0.665528759256929
6 r2 score of the train dat

In [None]:
lb =  LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.001, max_depth=3,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=6, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

a = cross_val_score(lb, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6468964578401072 data


### XGBoost-Regressor

In [None]:
clf = XGBRFRegressor(colsample_bylevel=1,colsample_bynode=0.8, colsample_bytree=1, gamma=0,
               learning_rate=1, max_delta_step=0, max_depth=5,
               max_features=0.95, min_child_weight=1, min_impurity_decrease=1,
               min_samples_leaf=1, min_samples_split=5, missing=None,
               n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=True, subsample=0.8, verbosity=1)

xparams = {'learning_rate':[0.1,0.5,0.8,1],
             'n_estimators':[70,80,100],
             'max_depth':[2,3,4],
             'colsample_bytree':[0.1,0.5,0.7,0.9,1],
             'subsample':[0.2,0.3,0.5,1],
             'gamma':[0.0001,0.001,0,0.1,0.01,0.5,1],
             'reg_alpha':[0.00001,0.0001,0.001,0.01,0.1]}


kfold_grid_search(clf, params, x2, y2, fold = 10, kfold=15, search= False )

R2 metric : 
0 r2 score of the train data 0.6066547132057518 and test data 0.63525224317498
1 r2 score of the train data 0.6034804178965002 and test data 0.6963600576690997
2 r2 score of the train data 0.6105771133880324 and test data 0.601203463972948
3 r2 score of the train data 0.6128226383721311 and test data 0.5564458465086671
4 r2 score of the train data 0.6111184302393099 and test data 0.5830068184166133
5 r2 score of the train data 0.6082304797363157 and test data 0.6281456387239466
6 r2 score of the train data 0.6089669154327122 and test data 0.6561857533878476
7 r2 score of the train data 0.6035336362444628 and test data 0.7099815395322746
8 r2 score of the train data 0.6238022287495313 and test data 0.4687577983273177
9 r2 score of the train data 0.6125290949699321 and test data 0.5628813136358713
10 r2 score of the train data 0.6200341125828022 and test data 0.48810031559351164
11 r2 score of the train data 0.6088743143552269 and test data 0.6168210561799822
12 r2 score of 

In [None]:
 xg = XGBRFRegressor(colsample_bylevel=1,
               colsample_bynode=0.8, colsample_bytree=1, gamma=0,
               learning_rate=1, max_delta_step=0, max_depth=5,
               max_features=0.95, min_child_weight=1, min_impurity_decrease=1,
               min_samples_leaf=1, min_samples_split=5, missing=None,
               n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=True, subsample=0.8, verbosity=1)

a = cross_val_score(xg, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6084653791278974 data


## Stack ensemble

In [14]:
estimators = [ ('rf', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features=0.95, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=8, min_weight_fraction_leaf=0.0,
                      n_estimators=70, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)),
              
                ('xg', XGBRFRegressor(colsample_bylevel=1,
               colsample_bynode=0.8, colsample_bytree=1, gamma=0,
               learning_rate=1, max_delta_step=0, max_depth=5,
               max_features=0.95, min_child_weight=1, min_impurity_decrease=1,
               min_samples_leaf=1, min_samples_split=5, missing=None,
               n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=True, subsample=0.8, verbosity=0)),
              

              ('lg', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.001, max_depth=3,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=6, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0))
              
             ]

stack = StackingRegressor(estimators= estimators,
                          final_estimator= Ridge(), 
                          )

In [None]:
cv_score = cross_val_score(stack, x2, y2.ravel(), scoring='r2', cv= 5, verbose=5, n_jobs=-1)
print('Mean Score:',cv_score.mean())
print('Standard Deviation:',cv_score.std())   #0.6404, 0.64200,  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Mean Score: 0.6412507934357022
Standard Deviation: 0.030613234929429335


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 22.7min finished


## R^2 Score

In [56]:
col = ['MODELS', 'FEATURES', 'CROSS-VALIDATION' , 'PUBLIC SCORE', 'PRIVATE SCORE']

tb = PrettyTable() 

tb.add_column(col[0], ['DECISION TREE', 'RANDOM FOREST', 'XGBOOST', 'ADA-BOOST', 'GRADIENT BOOSTING', 'LIGHT GBM', 'STCKED-ENSEMBLE'])

tb.add_column(col[1], ['PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES','PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES',
                       'PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES','PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES',
                       'PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES','PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES',
                       'PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES',])

tb.add_column(col[2], ['0.6451', '0.6488', '0.6102', '0.6431', '0.6430', '0.6500', '0.6412'])

tb.add_column(col[3], ['0.53938', '0.55102', '0.54616', '0.53998', '0.54294', '0.55541', '0.55514'])

tb.add_column(col[4], ['0.53968', '0.54857', '0.53822', '0.53963', '0.53637', '0.54895', '0.54951'])

print(tb)

+-------------------+----------------------------------------------------------+------------------+--------------+---------------+
|       MODELS      |                         FEATURES                         | CROSS-VALIDATION | PUBLIC SCORE | PRIVATE SCORE |
+-------------------+----------------------------------------------------------+------------------+--------------+---------------+
|   DECISION TREE   | PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |      0.6451      |   0.53938    |    0.53968    |
|   RANDOM FOREST   | PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |      0.6488      |   0.55102    |    0.54857    |
|      XGBOOST      | PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |      0.6102      |   0.54616    |    0.53822    |
|     ADA-BOOST     | PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |      0.6431      |   0.53998    |    0.53963    |
| GRADIENT BOOSTING | PCA(5) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |   

## OBSERVATION 


*   This dataset contain 5 PCA feature, binary features, label encoded features and synthetic featues.

*   This dataset with some experiment found that when we clip our y target with threshold 150secs then we got good score.

*   We got good score in this data higher than another datasets, we see that in above table stacked ensemble get best score in public 0.55514 and private 0.54951 score.

*   All models well perform in cross validation sets with k fold method.








# Model with important feature of models (synethic features + 100 PCA + Binary)

In [11]:
# Import the train and test csv files 

train = pd.read_csv(r'/content/drive/MyDrive/top_feature_train.csv')
test = pd.read_csv(r'/content/drive/MyDrive/top_feature_test.csv')

train = train.drop(['Unnamed: 0'], axis = 1)
test = test.drop(['Unnamed: 0'], axis = 1)
train.head()

Unnamed: 0,ID,X0,X1,X2,X3,X5,X6,X8,X10,X12,X13,X14,X19,X20,X22,X23,X27,X28,X29,X31,X32,X35,X37,X38,X41,X43,X44,X45,X46,X47,X48,X49,X50,X51,X52,X54,X56,X57,X58,X61,...,pca_feature74,pca_feature75,pca_feature76,pca_feature77,pca_feature78,pca_feature79,pca_feature80,pca_feature81,pca_feature82,pca_feature83,pca_feature84,pca_feature85,pca_feature86,pca_feature87,pca_feature88,pca_feature89,pca_feature90,pca_feature91,pca_feature92,pca_feature93,pca_feature94,pca_feature95,pca_feature96,pca_feature97,pca_feature98,pca_feature99,X315_314_51_340,X299_300_301_328,X50_100_51_31,X46_263_118_261,X136_118_136_355,qua_encode_1,qua_encode_2,qua_encode_3,qua_encode_4,cos_encode_1,cos_encode_2,cos_encode_3,cos_encode_4,y
0,0,k,v,at,a,u,j,o,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,...,-0.478423,-0.041581,-0.364478,-0.001854,-0.041399,0.309832,0.17707,0.251996,-0.036021,0.635873,-0.171752,-0.129971,0.459947,0.029937,0.123607,-0.56466,-0.216277,-0.038842,0.308547,-0.172807,0.095357,0.004402,-0.041449,0.003441,-0.084739,-0.078419,0.340805,1.524459,0.0,1.0,0.506038,2.181611,2.181611,3.024459,3.024459,0.282399,0.396161,0.050612,0.268648,130.81
1,6,k,t,av,e,y,l,o,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,...,-0.043473,0.101824,0.219871,-0.067341,-0.223638,-0.003935,-0.010555,-0.035781,0.408114,0.000368,-0.258432,-0.093938,-0.306398,-0.075213,0.265231,0.132231,0.123904,0.272324,-0.112198,0.10286,-0.181315,-0.064247,0.015819,-0.082439,-0.076255,-0.312864,-0.178265,0.810873,0.333333,1.0,0.402006,1.143471,0.786942,2.310873,2.97754,0.193438,0.460063,0.157992,0.381311,88.53
2,7,az,w,n,c,x,j,x,0,0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,1,1,...,0.256844,0.525369,0.039997,0.15171,0.104512,0.331503,-0.499652,-0.475288,-0.127968,0.362647,-0.09953,0.276298,-0.340234,-0.194046,-0.097575,0.358764,-0.071983,-0.431886,-0.301564,-0.23825,0.078197,0.372351,-0.350833,0.282364,1.032828,0.060364,1.902455,0.786635,0.0,0.79188,0.090326,5.304909,9.109818,2.286635,3.073269,0.615764,0.205873,0.18582,0.01391,76.26
3,9,az,t,n,f,x,l,e,0,0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,...,-0.026078,-0.088235,-0.227881,0.049265,-0.096558,0.515847,-0.261283,-0.269399,-0.039287,0.137951,0.136733,-0.247255,0.333867,0.048052,-0.315151,-0.162288,-0.026982,-0.007258,-0.010178,-0.174021,0.071059,0.1089,0.052355,0.308304,0.040069,-0.112119,3.725739,1.054499,0.0,0.788393,-0.281703,8.951479,8.951479,2.554499,3.608998,0.866039,0.118635,0.130835,0.075874,80.62
4,13,az,v,n,f,h,d,n,0,0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,...,0.124707,-0.199144,-0.051635,0.184691,0.038281,0.095479,-0.00122,-0.179807,-0.174086,0.135042,0.062843,-0.018008,0.26768,0.038113,0.112184,0.020562,-0.040897,-0.024733,-0.276257,-0.162307,0.024471,0.043678,-0.026407,0.075065,0.213295,0.062244,1.858257,1.055305,0.0,0.787979,0.043414,5.216515,8.93303,2.555305,3.610609,0.565009,0.21432,0.130311,0.020876,78.02


In [12]:
#Check the Shape of the test and train data
print('train data shape - ', train.shape)
print('test data shape  - ', test.shape)

train data shape -  (4209, 350)
test data shape  -  (4209, 349)


In [13]:
train.loc[train['y'] > 150] = 150

y2 = train.y
move = [ 	'ID' ,	'X0' ,	'X1' ,	'X2' 	,'X3' ,'X5' ,	'X6' ,	'X8', 'y']
x2 = train.drop(move, axis= 1)

In [14]:
# Split train data into train-set and test-set
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2,y2, test_size = 0.33,random_state= 42)

print(x_train2.shape, y_train2.shape)
print(x_test2.shape, y_test2.shape)

(2820, 341) (2820,)
(1389, 341) (1389,)


### Decision Tree

In [15]:
clf = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                            max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, presort='deprecated',
                            random_state=None, splitter='best')

params = {'max_depth' : [2,3,4,8,10,15],
          'max_features' : ['auto', 'sqrt', 'log2'],
          'random_state' : [5,10,20,30],
          }

kfold_grid_search(clf, params, x2, y2, 10, kfold = 15, search= 'grid')  

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 466 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 717 out of 720 | elapsed:  1.3min remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  1.3min finished




Best Estimator : DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=20, splitter='best')


R2 metric : 
0 r2 score of the train data 0.5727993448261308 and test data 0.5517333068615851
1 r2 score of the train data 0.5653722857340067 and test data 0.6649931070125705
2 r2 score of the train data 0.5734600812347554 and test data 0.6037041847200402
3 r2 score of the train data 0.6079480564620057 and test data 0.295841387440966
4 r2 score of the train data 0.5780829128488911 and test data 0.5094437738972812
5 r2 score of the train data 0.5770455134950616 and test data 0.5546179692984541
6 r2 score of the train data 0.5714621388760321 and test data 0.63931768707822
7 r2

In [22]:
dt =  DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=20, splitter='best')

a = cross_val_score(dt, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5645815440213546 data


### Random Forest

In [16]:
clf = RandomForestRegressor()

params = {'n_estimators':[40,50,60,70,100],
             'max_depth':[3,5,6,7,8],
             'min_samples_split':[2,3,4,5,6,7,8,9,10],
             'max_features': [0.80,.95, 1.0],
             'min_samples_leaf': [1, 2,3,4,5,6,7,8,9],
             'min_impurity_decrease':[1e-5,1e-4,1e-3,1e-2,1e-1,0,1,10]}

kfold_grid_search(clf, params, x2, y2, fold = 10, kfold=15, search= 'random') 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 13.0min finished




Best Estimator : RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features=0.95, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=1e-05,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=3, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


R2 metric : 
0 r2 score of the train data 0.577087291813353 and test data 0.6132711188897396
1 r2 score of the train data 0.5703996644280093 and test data 0.6609684185922715
2 r2 score of the train data 0.578452544680822 and test data 0.6028701420087133
3 r2 score of the train data 0.6125421985716001 and test data 0.30357217989243657
4 r2 score of the train data 0.582456005055181 and test data 0.5311574076974697
5 r2 score of the train data 0.581347147499635 and test data 0.558037

In [23]:
rf =  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features=0.95, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=1e-05,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=3, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

a = cross_val_score(rf, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5722703062634319 data


### XGBoost-Regressor

In [17]:
clf = XGBRFRegressor(silent=True)

xparams = {'learning_rate':[0.1,0.5,0.8,1],
             'n_estimators':[70,80,100],
             'max_depth':[2,3,4],
             'colsample_bytree':[0.1,0.5,0.7,0.9,1],
             'subsample':[0.2,0.3,0.5,1],
             'gamma':[0.0001,0.001,0,0.1,0.01,0.5,1],
             'reg_alpha':[0.00001,0.0001,0.001,0.01,0.1]}


kfold_grid_search(clf, params, x2, y2, fold = 10, kfold=15, search= 'random') 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.3min finished




Best Estimator : XGBRFRegressor(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
               colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
               max_depth=7, max_features=0.8, min_child_weight=1,
               min_impurity_decrease=0.001, min_samples_leaf=4,
               min_samples_split=2, missing=None, n_estimators=40, n_jobs=1,
               nthread=None, objective='reg:linear', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
               silent=True, subsample=0.8, verbosity=1)


R2 metric : 
0 r2 score of the train data 0.5607519129595249 and test data 0.612350904071567
1 r2 score of the train data 0.5577277386771264 and test data 0.6554340512207613
2 r2 score of the train data 0.5617422277259452 and test data 0.6032007707051259
3 r2 score of the train data 0.5954512272952628 and test data 0.306385454262185
4 r2 score of the train data 0.567338555840642 and test data 0.5264216134296651
5 r2 s

In [24]:
xg =  XGBRFRegressor(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
               colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
               max_depth=7, max_features=0.8, min_child_weight=1,
               min_impurity_decrease=0.001, min_samples_leaf=4,
               min_samples_split=2, missing=None, n_estimators=40, n_jobs=1,
               nthread=None, objective='reg:linear', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
               silent=True, subsample=0.8, verbosity=1)

a = cross_val_score(xg, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5730571735111031 data


## AdaBoost-Regressor

In [18]:
clf = AdaBoostRegressor(base_estimator=None, learning_rate=0.0001,
                        n_estimators=300, random_state=None, loss= 'linear')

params = {'n_estimators'  : [100, 150, 200, ],
          'learning_rate' :[0.0001,0.001,0.01, 0.1],
          'loss' : [ 'linear', 'square', 'exponential'],
          'random_state' : [10,20,30]
          }

kfold_grid_search(clf, params, x2, y2, fold = 10, kfold = 15, search = 'random') 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 32.0min finished




Best Estimator : AdaBoostRegressor(base_estimator=None, learning_rate=0.0001, loss='square',
                  n_estimators=150, random_state=30)


R2 metric : 
0 r2 score of the train data 0.5646822713230479 and test data 0.6066783130115726
1 r2 score of the train data 0.5577141547036 and test data 0.6649903462149656
2 r2 score of the train data 0.5651393370662766 and test data 0.6033862254239895
3 r2 score of the train data 0.5989253378984944 and test data 0.3064867419902134
4 r2 score of the train data 0.5703661713907457 and test data 0.532300026745163
5 r2 score of the train data 0.5687084265678528 and test data 0.5544782697611486
6 r2 score of the train data 0.5626966400650278 and test data 0.6543380452537592
7 r2 score of the train data 0.5566910900648376 and test data 0.7050220948487183
8 r2 score of the train data 0.5756621801071371 and test data 0.46433697036193844
9 r2 score of the train data 0.5682103360047548 and test data 0.5530743664423573
10 r2 score of the train data 

In [25]:
ab =   AdaBoostRegressor(base_estimator=None, learning_rate=0.0001, loss='square',
                  n_estimators=150, random_state=30)

a = cross_val_score(ab, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5743669401083786 data


### Gradient Boosting Regressor

In [19]:
clf = GradientBoostingRegressor(max_depth= 2, learning_rate= 0.1, random_state= 10,n_estimators=5000, 
                                n_iter_no_change = 11)

params = {'n_estimators' : [500,800,1000, 1500, 2000],
          'loss' : [ 'huber', 'exponential'],
          'learning_rate' : [0.01, 0.01, 0.1],
          'max_depth' : [3,4,5,7]
          }


kfold_grid_search(clf, params, x2, y2, fold = 10, kfold = 15, search = 'random')

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed: 26.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 48.8min finished




Best Estimator : GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1500,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


R2 metric : 
0 r2 score of the train data 0.572231628764966 and test data 0.6037034985933161
1 r2 score of the train data 0.5621641874751212 and test data 0.6414798160235033
2 r2 score of the train data 0.5867592358384793 and test data 0.5801527896688994
3 r2 score of the train data 0.6211634297112336 and test 

In [26]:
 gb = GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1500,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
 
 a = cross_val_score(gb, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5731033817060677 data


## LGBM Regressor

In [21]:
clf =   LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.001, max_depth=3,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=6, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

params = {'min_child_samples' : [10, 20,50],
          'num_leaves' : [5,6],
          'max_depth' : [2, 3, 5],
          'n_estimators' : [1000,2000,4000,5000],
          'learning_rate' : [0.0001,0.001,0.01,0.1]
          }

kfold_grid_search(clf, params, x2, y2.ravel(), fold = 10, kfold = 15, search = False) 

R2 metric : 
0 r2 score of the train data 0.5997441404083625 and test data 0.6096233813618482
1 r2 score of the train data 0.5958881929472761 and test data 0.6445588724260033
2 r2 score of the train data 0.602526807697845 and test data 0.6007657087156126
3 r2 score of the train data 0.6327607569536936 and test data 0.3070436943331817
4 r2 score of the train data 0.6063989312693985 and test data 0.5403439972577462
5 r2 score of the train data 0.6060046445533136 and test data 0.5558067093017696
6 r2 score of the train data 0.5993389114393196 and test data 0.6537402142783227
7 r2 score of the train data 0.5958897819905952 and test data 0.6964613240982211
8 r2 score of the train data 0.6112823739876911 and test data 0.48196247922911584
9 r2 score of the train data 0.6066215294634596 and test data 0.5536277630993041
10 r2 score of the train data 0.607835933029663 and test data 0.5027366167589261
11 r2 score of the train data 0.6068813272907176 and test data 0.5586564302402194
12 r2 score of

In [28]:
lg = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.001, max_depth=3,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=6, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


a = cross_val_score(lg, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.5736623038048128 data


In [29]:
estimators = [ ('rf', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features=1.0, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0001,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)),
              
              ('xgb', XGBRFRegressor(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
               colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
               max_depth=5, max_features=0.95, min_child_weight=1,
               min_impurity_decrease=0.0001, min_samples_leaf=2,
               min_samples_split=4, missing=None, n_estimators=50, n_jobs=1,
               nthread=None, objective='reg:linear', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
               silent=True, subsample=0.8, verbosity=1)),
              
              ('lb', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.001, max_depth=3,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=5000, n_jobs=-1, num_leaves=6, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0))
              ]

In [30]:
stack = StackingRegressor(estimators= estimators,
                          final_estimator= Ridge(), 
                          )


cv_score = cross_val_score(stack, x2, y2.ravel(), scoring='r2', cv= 10, verbose=5, n_jobs=-1)
print('Mean Score:',cv_score.mean())
print('Standard Deviation:',cv_score.std())  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Mean Score: 0.575714581019241
Standard Deviation: 0.08823219159340108


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 39.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 39.0min finished


In [57]:
col = ['MODELS', 'FEATURES', 'CROSS-VALIDATION' , 'PUBLIC SCORE', 'PRIVATE SCORE']

tb = PrettyTable() 

tb.add_column(col[0], ['DECISION TREE', 'RANDOM FOREST', 'XGBOOST', 'ADA-BOOST', 'GRADIENT BOOSTING', 'LIGHT GBM', 'STCKED-ENSEMBLE'])

tb.add_column(col[1], ['PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES','PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES',
                       'PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES','PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES',
                       'PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES','PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES',
                       'PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES',])

tb.add_column(col[2], ['0.5616', '0.5743', '0.5768', '0.5715', '0.5847', '0.5775', '0.5757'])

tb.add_column(col[3], ['0.54541', '0.54556', '0.54345', '0.54539', '0.54321', '0.55793', '0.55442'])

tb.add_column(col[4], ['0.52796', '0.53966', '0.53673', '0.53996', '0.53868', '0.54630', '0.54515'])

print(tb)

+-------------------+------------------------------------------------------------+------------------+--------------+---------------+
|       MODELS      |                          FEATURES                          | CROSS-VALIDATION | PUBLIC SCORE | PRIVATE SCORE |
+-------------------+------------------------------------------------------------+------------------+--------------+---------------+
|   DECISION TREE   | PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |      0.5616      |   0.54541    |    0.52796    |
|   RANDOM FOREST   | PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |      0.5743      |   0.54556    |    0.53966    |
|      XGBOOST      | PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |      0.5768      |   0.54345    |    0.53673    |
|     ADA-BOOST     | PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHETIC FEATURES |      0.5715      |   0.54539    |    0.53996    |
| GRADIENT BOOSTING | PCA(100) + BINARY + LABEL + Y 150 CLIP +SYNTHET

## OBSERVATION


*   This dataset contain 100 PCA features from binary and label encode features. With target y clip at 150secs

*   In this dataset light gradient boosting perform well than other. LGB got 0.55793 in public and 0.54630 private score.

# Models with selected top features using selectkbest method







In [37]:
# Import the train and test csv files 

train = pd.read_csv(r'/content/drive/MyDrive/Datasets/selectK_train_feature.csv')
test = pd.read_csv(r'/content/drive/MyDrive/Datasets/selectK_test_feature.csv')

train = train.drop(['Unnamed: 0'], axis = 1)
test = test.drop(['Unnamed: 0'], axis = 1)
train.head()

Unnamed: 0,X10,X12,X13,X14,X19,X20,X22,X23,X27,X28,X29,X31,X35,X37,X43,X44,X45,X46,X47,X48,X50,X51,X52,X54,X56,X57,X61,X63,X64,X66,X68,X69,X71,X73,X75,X76,X77,X79,X80,X81,...,pca_feature21,pca_feature22,pca_feature24,pca_feature25,pca_feature28,pca_feature30,pca_feature32,pca_feature39,pca_feature43,pca_feature44,pca_feature50,pca_feature52,pca_feature53,pca_feature59,pca_feature60,pca_feature63,pca_feature66,pca_feature70,pca_feature73,pca_feature74,pca_feature77,pca_feature78,pca_feature81,pca_feature83,pca_feature86,pca_feature93,pca_feature95,pca_feature97,pca_feature98,X315_314_51_340,X299_300_301_328,X46_263_119_261,X136_118_136_355,qua_encode_2,qua_encode_3,cos_encode_1,cos_encode_3,cos_encode_4,ID,y
0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,-0.182578,-0.128126,0.178685,-0.104251,-0.528341,-0.116039,0.209134,-0.354578,0.946379,0.641226,0.246514,-0.16198,0.548344,0.760651,-0.152505,0.962756,-0.365927,0.270162,0.173254,-0.457502,0.013302,-0.052985,0.247189,0.623784,0.438775,-0.233706,-0.040995,-0.003752,-0.11769,0.0,0.0,1.0,1.0,40.042486,11.450113,0.10866,0.10866,0.30724,0,130.81
1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,...,-0.99116,-0.545977,-0.276158,-0.656953,-0.734347,0.026911,0.658565,-0.255818,0.176963,0.152777,-0.171294,0.031739,-0.039456,0.214851,0.36482,-0.508838,-0.300376,0.015067,-0.104844,-0.064088,-0.061016,-0.220592,-0.045568,-0.006343,-0.320449,0.136784,-0.086269,0.005784,0.012419,0.0,0.0,0.5,1.0,7.555795,11.143521,0.094123,0.107239,0.386071,6,88.53
2,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,1,1,0,0,1,0,...,0.69029,-0.187279,1.330146,0.390737,-0.217453,-0.563563,0.301389,0.231312,0.251452,0.245857,-0.170609,-0.645932,0.023164,0.127338,0.26793,-0.058629,-0.354888,0.02513,-0.434715,0.197347,0.157569,0.100554,-0.488942,0.372205,-0.384254,-0.17661,-0.056234,0.395914,1.056965,0.0,0.5,1.0,0.0,8.964781,351.430285,0.004724,0.003774,0.376125,7,76.26
3,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,...,0.282964,-0.034721,1.51571,-0.722524,0.072271,-0.293604,0.068063,-0.439959,0.314597,-0.010401,0.024335,0.258586,-0.064303,-0.167383,0.260068,-0.130885,-0.0184,0.026913,-0.05476,-0.043794,0.043378,-0.100348,-0.297885,0.150184,0.336399,-0.064002,0.114576,0.360489,0.002459,0.0,0.5,1.0,0.0,2.157439,349.571432,0.004743,0.003789,0.376125,9,80.62
4,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,...,-0.022836,-0.041093,0.570398,-0.057357,0.012085,0.37907,-0.289574,-0.302103,-0.238878,0.122182,0.145761,0.102616,-0.039255,-0.088233,0.254461,0.207313,-0.278137,0.230387,-0.312594,0.083544,0.190114,0.042105,-0.180643,0.136394,0.268128,-0.121553,-0.047947,0.041592,0.187224,0.0,0.5,1.0,0.0,105.22869,361.198271,0.004612,0.003685,0.376125,13,78.02


In [38]:
#Check the Shape of the test and train data
print('train data shape - ', train.shape)
print('test data shape  - ', test.shape)

train data shape -  (4209, 252)
test data shape  -  (4209, 251)


In [39]:
# declare the train set and label
train.loc[train['y'] > 150] = 150
y2 = train.y
x2 = train.drop(['y', 'ID'], axis= 1)

In [40]:
# Split train data into train-set and test-set
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2,y2, test_size = 0.33,random_state= 42)

print(x_train2.shape, y_train2.shape)
print(x_test2.shape, y_test2.shape)

(2820, 250) (2820,)
(1389, 250) (1389,)


### Decision Tree

In [None]:
clf = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=8,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=10, splitter='best')

params = {'max_depth' : [2,3,4,8,10,15],
          'max_features' : ['auto', 'sqrt', 'log2'],
          'random_state' : [5,10,20,30],
          }

kfold_grid_search(clf, params, x2, y2, 10, kfold = 15, search= False) 

R2 metric : 
0 r2 score of the train data 0.7003943217596944 and test data 0.5912088567314289
1 r2 score of the train data 0.684772877128098 and test data 0.7461049970562634
2 r2 score of the train data 0.7024668433891952 and test data 0.5695775837295918
3 r2 score of the train data 0.6955558898354829 and test data 0.6247256843562159
4 r2 score of the train data 0.6943306686873321 and test data 0.6938806609949865
5 r2 score of the train data 0.6917795390935819 and test data 0.5715765496011846
6 r2 score of the train data 0.6950587650032459 and test data 0.6254903902405267
7 r2 score of the train data 0.6899252956182615 and test data 0.7482496095026618
8 r2 score of the train data 0.7066464250517441 and test data 0.4451958898852242
9 r2 score of the train data 0.699592124849862 and test data 0.5409245741315845
10 r2 score of the train data 0.7090500610772879 and test data 0.3688190526192887
11 r2 score of the train data 0.6980781796780453 and test data 0.632043078306469
12 r2 score of t

In [41]:
dt = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=3,
                            max_features='auto',
                            min_impurity_decrease=0.0,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, presort='deprecated',
                            splitter='best')


a = cross_val_score(dt, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6350922926910991 data


### Random Forest

In [None]:
clf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features=0.8, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=1e-05,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=9, min_weight_fraction_leaf=0.0,
                      n_estimators=40, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

params = {'n_estimators':[40,50,60,70,100],
             'max_depth':[3,5,6,7,8],
             'min_samples_split':[2,3,4,5,6,7,8,9,10],
             'max_features': [0.80,.95, 1.0],
             'min_samples_leaf': [1, 2,3,4,5,6,7,8,9],
             'min_impurity_decrease':[1e-5,1e-4,1e-3,1e-2,1e-1,0,1,10]}

kfold_grid_search(clf, params, x2, y2, fold = 10, kfold=15, search= False )

R2 metric : 
0 r2 score of the train data 0.6491851497180579 and test data 0.6722657818421716
1 r2 score of the train data 0.6428875293997556 and test data 0.7650168179513238
2 r2 score of the train data 0.6543022991441494 and test data 0.6009548522547989
3 r2 score of the train data 0.651093238820595 and test data 0.6413429640388058
4 r2 score of the train data 0.6486153400378359 and test data 0.6732278735764214
5 r2 score of the train data 0.6500914264009571 and test data 0.6524032274886278
6 r2 score of the train data 0.650940549896302 and test data 0.6542854706156918
7 r2 score of the train data 0.6447753756785972 and test data 0.7542422390060914
8 r2 score of the train data 0.6659204711262184 and test data 0.46907503683814333
9 r2 score of the train data 0.6517637548732664 and test data 0.6296769532934854
10 r2 score of the train data 0.6605605207274278 and test data 0.49104247104708254
11 r2 score of the train data 0.6507621664241039 and test data 0.6474150149129719
12 r2 score o

In [42]:
rf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features=0.8, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=1e-05,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=9, min_weight_fraction_leaf=0.0,
                      n_estimators=40, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

a = cross_val_score(rf, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6411188337826735 data


### XGBoost-Regressor

In [None]:
clf = XGBRFRegressor(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
               colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
               max_depth=15, max_features='log2', min_child_weight=1,
               missing=None, n_estimators=100, n_jobs=1, nthread=None,
               objective='reg:linear', random_state=5, reg_alpha=0,
               reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
               subsample=0.8, verbosity=1)

xparams = {'learning_rate':[0.1,0.5,0.8,1],
             'n_estimators':[70,80,100],
             'max_depth':[2,3,4],
             'colsample_bytree':[0.1,0.5,0.7,0.9,1],
             'subsample':[0.2,0.3,0.5,1],
             'gamma':[0.0001,0.001,0,0.1,0.01,0.5,1],
             'reg_alpha':[0.00001,0.0001,0.001,0.01,0.1]}


kfold_grid_search(clf, params, x2, y2, fold = 10, kfold=15, search= False)

R2 metric : 
0 r2 score of the train data 0.6067502601868848 and test data 0.6374083132665591
1 r2 score of the train data 0.6037579887656499 and test data 0.6953451163213371
2 r2 score of the train data 0.6121444549439352 and test data 0.6011016381585581
3 r2 score of the train data 0.6129187708599382 and test data 0.5565082186028582
4 r2 score of the train data 0.6113841767426726 and test data 0.582448928624506
5 r2 score of the train data 0.6078223907106232 and test data 0.6276658565426563
6 r2 score of the train data 0.6084467931525492 and test data 0.6559734999802659
7 r2 score of the train data 0.603205051745912 and test data 0.7092297719647569
8 r2 score of the train data 0.6226020717930794 and test data 0.46860069407268745
9 r2 score of the train data 0.6124662945950516 and test data 0.5626195379397068
10 r2 score of the train data 0.6185700634680764 and test data 0.48692268287664975
11 r2 score of the train data 0.608856439703718 and test data 0.6154646014250558
12 r2 score of

In [43]:
 xg = XGBRFRegressor(colsample_bylevel=1,
               colsample_bynode=0.8, colsample_bytree=1, gamma=0,
               learning_rate=1, max_delta_step=0, max_depth=5,
               max_features=0.95, min_child_weight=1, min_impurity_decrease=1,
               min_samples_leaf=1, min_samples_split=5, missing=None,
               n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=True, subsample=0.8, verbosity=1)

a = cross_val_score(xg, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6096710572369947 data


AdaBoost-Regressor

In [None]:
clf =  AdaBoostRegressor(base_estimator=None, learning_rate=0.0001, loss='linear',
                  n_estimators=100, random_state=10)


params = {'n_estimators'  : [100, 150, 200, ],
          'learning_rate' :[0.0001,0.001,0.01, 0.1],
          'loss' : [ 'linear', 'square', 'exponential'],
          'random_state' : [10,20,30]
          }

kfold_grid_search(clf, params, x2, y2, fold = 10, kfold = 15, search = False) 

R2 metric : 
0 r2 score of the train data 0.6406019024644523 and test data 0.6641913063532133
1 r2 score of the train data 0.6341117815098307 and test data 0.7675518739100321
2 r2 score of the train data 0.6448193857404521 and test data 0.601364590744293
3 r2 score of the train data 0.6420025644845055 and test data 0.6401058176407162
4 r2 score of the train data 0.6426822251515699 and test data 0.6709984599032441
5 r2 score of the train data 0.6412649710061986 and test data 0.6561873993797418
6 r2 score of the train data 0.6414199311868302 and test data 0.6550053083325136
7 r2 score of the train data 0.635692428613147 and test data 0.750110156147811
8 r2 score of the train data 0.6566658201904927 and test data 0.4667308312720433
9 r2 score of the train data 0.6433253839092598 and test data 0.6252818172649386
10 r2 score of the train data 0.6530673875304205 and test data 0.4887717640412784
11 r2 score of the train data 0.6424865119330938 and test data 0.6470462212100361
12 r2 score of t

In [44]:
 ab =   AdaBoostRegressor(base_estimator=None, learning_rate=0.0001, loss='linear',
                  n_estimators=100, random_state=10)


a = cross_val_score(ab, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6395565714731158 data


### Gradient Boosting Regressor

In [None]:
clf = GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2000,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

params = {'n_estimators' : [800,1000, 1500, 2000, 2500],
          'loss' : [ 'huber', 'exponential'],
          'learning_rate' : [0.01, 0.01, 0.1],
          'max_depth' : [3,4,5,7]
          }


kfold_grid_search(clf, params, x2, y2, fold = 10, kfold = 15, search = False) 

R2 metric : 
0 r2 score of the train data 0.646898352654415 and test data 0.6463232721630487
1 r2 score of the train data 0.6445815700448241 and test data 0.7630028652459149
2 r2 score of the train data 0.6638708224569014 and test data 0.5820050454376444
3 r2 score of the train data 0.6643729193689359 and test data 0.6298306073660422
4 r2 score of the train data 0.6452857437229575 and test data 0.6703858624196001
5 r2 score of the train data 0.6467163192346321 and test data 0.6466422501820477
6 r2 score of the train data 0.6520411010671734 and test data 0.6518687281616424
7 r2 score of the train data 0.6511587363040845 and test data 0.7573584179324647
8 r2 score of the train data 0.6688157545177016 and test data 0.453579856520536
9 r2 score of the train data 0.6498989292469194 and test data 0.6088874846193633
10 r2 score of the train data 0.6721223434084025 and test data 0.49579802946005624
11 r2 score of the train data 0.6556792100878011 and test data 0.6329712960936003
12 r2 score of

In [45]:
gb =  GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='huber',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2000,
                          n_iter_no_change=11, presort='deprecated',
                          random_state=10, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

a = cross_val_score(gb, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6401325221008258 data


## LGBM Regressor

In [None]:
clf = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=3,
              min_child_samples=50, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=4000, n_jobs=-1, num_leaves=5, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


params = {'min_child_samples' : [10, 20,50],
          'num_leaves' : [5,6],
          'max_depth' : [2, 3, 5],
          'n_estimators' : [1000,2000,4000,5000],
          'learning_rate' : [0.0001,0.001,0.01,0.1]
          }

kfold_grid_search(clf, params, x2, y2.ravel(), fold = 10, kfold = 15, search = False) 

R2 metric : 
0 r2 score of the train data 0.7572398922837325 and test data 0.648629927585285
1 r2 score of the train data 0.7559447382612352 and test data 0.7524994967038349
2 r2 score of the train data 0.7635946493631556 and test data 0.5796051680603062
3 r2 score of the train data 0.7616028837108777 and test data 0.6283320314911397
4 r2 score of the train data 0.756111951588603 and test data 0.6669960134479473
5 r2 score of the train data 0.7592501493147876 and test data 0.6344725589160572
6 r2 score of the train data 0.757831426306504 and test data 0.6637281324013
7 r2 score of the train data 0.7549205437978412 and test data 0.7264772198521539
8 r2 score of the train data 0.7704056362229155 and test data 0.47265909739244094
9 r2 score of the train data 0.7606748853035004 and test data 0.629765276411993
10 r2 score of the train data 0.7635879477252655 and test data 0.49182201718973395
11 r2 score of the train data 0.7608877785525503 and test data 0.6206099747367849
12 r2 score of the

In [46]:
lb =  LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=3,
              min_child_samples=50, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=4000, n_jobs=-1, num_leaves=5, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


a = cross_val_score(lb, x2, y2, scoring= 'r2', cv= 10)
print(f'R^2 Score {np.mean(a)} data' )

R^2 Score 0.6280864312506385 data


In [None]:
estimators = [ ('rf', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features=0.95, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.001,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=8, min_weight_fraction_leaf=0.0,
                      n_estimators=70, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)),
              
                ('xg', XGBRFRegressor(colsample_bylevel=1,
               colsample_bynode=0.8, colsample_bytree=1, gamma=0,
               learning_rate=1, max_delta_step=0, max_depth=5,
               max_features=0.95, min_child_weight=1, min_impurity_decrease=1,
               min_samples_leaf=1, min_samples_split=5, missing=None,
               n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=True, subsample=0.8, verbosity=1)),
              

              ('gb', GradientBoostingRegressor(max_depth= 2, learning_rate= 0.1, random_state= 10,n_estimators=5000, 
                                n_iter_no_change = 11))
              
             ]

stack = StackingRegressor(estimators= estimators,
                          final_estimator= Ridge(), 
                          )
 

In [None]:
cv_score = cross_val_score(stack, x2, y2.ravel(), scoring='r2', cv=10, verbose=5, n_jobs=-1)
print('Mean Score:',cv_score.mean())
print('Standard Deviation:',cv_score.std())  

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Mean Score: 0.5762229576080763
Standard Deviation: 0.08756604747415982


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 12.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 12.2min finished


## R^2 Score

In [55]:
col = ['MODELS', 'FEATURES', 'CROSS-VALIDATION' , 'PUBLIC SCORE', 'PRIVATE SCORE']

tb = PrettyTable() 

tb.add_column(col[0], ['DECISION TREE', 'RANDOM FOREST', 'XGBOOST', 'ADA-BOOST', 'GRADIENT BOOSTING', 'LIGHT GBM', 'STCKED-ENSEMBLE'])

tb.add_column(col[1], ['TOP FEATURES USING SELECT K BEST METHOD', 'TOP FEATURES USING SELECT K BEST METHOD', 'TOP FEATURES USING SELECT K BEST METHOD',
                       'TOP FEATURES USING SELECT K BEST METHOD', 'TOP FEATURES USING SELECT K BEST METHOD', 'TOP FEATURES USING SELECT K BEST METHOD',
                       'TOP FEATURES USING SELECT K BEST METHOD'])

tb.add_column(col[2], ['0.6018', '0.6435', '0.6100', '0.6420', '0.6409', '0.6331', '0.5762'])

tb.add_column(col[3], ['0.54021', '0.54543', '0.54561', '0.53577', '0.53822', '0.53822', '0.55039'])

tb.add_column(col[4], ['0.52598', '0.54326', '0.53864', '0.53955', '0.53663', '0.54449', '0.54710'])

print(tb)

+-------------------+-----------------------------------------+------------------+--------------+---------------+
|       MODELS      |                 FEATURES                | CROSS-VALIDATION | PUBLIC SCORE | PRIVATE SCORE |
+-------------------+-----------------------------------------+------------------+--------------+---------------+
|   DECISION TREE   | TOP FEATURES USING SELECT K BEST METHOD |      0.6018      |   0.54021    |    0.52598    |
|   RANDOM FOREST   | TOP FEATURES USING SELECT K BEST METHOD |      0.6435      |   0.54543    |    0.54326    |
|      XGBOOST      | TOP FEATURES USING SELECT K BEST METHOD |      0.6100      |   0.54561    |    0.53864    |
|     ADA-BOOST     | TOP FEATURES USING SELECT K BEST METHOD |      0.6420      |   0.53577    |    0.53955    |
| GRADIENT BOOSTING | TOP FEATURES USING SELECT K BEST METHOD |      0.6409      |   0.53822    |    0.53663    |
|     LIGHT GBM     | TOP FEATURES USING SELECT K BEST METHOD |      0.6331      |   0.5

## OBSERVATION



*   This dataset created with help of SelectKBest feature selection method, we selected top 250 most important features. But we can see that these features are not get good score than other. 
*   In this sets stacked ensembles perform best and also another datasets, stacked model got 0.55039 in public and 0.54710 in private leaderboard.

