# import packages

In [12]:
import rmsp
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

import sys
sys.path.append('../python_modules/')  # Defining the path to my modules
from hyperparameter_tunning import RandomSearch_KF, GridSearch_KF

# load data

In [16]:
# Loading data
data = pd.read_csv('wl_semisyn_bivariate.csv')
# data_orig['z'] = 0

# Converting Pandas Dataframe into a RMSP Point Data
data = rmsp.PointData(data, x='x', y="y")
data.drop('x1_shift', axis=1, inplace=True)
data.rename(columns={'x1':'var1', 'tg':'var2'}, inplace=True)

type(data)

rmsp.core.point.PointData

# sub-sampling and splitting data

In [17]:
# Define threshold on the reference variable
thr_var = 'var1'
threshold = data[thr_var].quantile(0.50)

# fractions to select below and above threshold
frac_above = 0.60
frac_below = 0.40

# Select samples above and below the median threshold
samples_above_threshold = data[data[thr_var] > threshold]
samples_below_threshold = data[(data[thr_var] < threshold)]

# Sample subsets
complete_set_above = samples_above_threshold.sample(frac=frac_above, random_state=42)
complete_set_below = samples_below_threshold.sample(frac=frac_below, random_state=42)

# Combine subsets
complete_set = pd.concat([complete_set_above, complete_set_below])

# Remaining samples (excluding complete set) go to missing set
missing_set = data.drop(complete_set.index)

# Split the "complete" subset into validation and modeling subsets
valid_set = complete_set.sample(frac=0.15, random_state=42)
model_set = complete_set.drop(valid_set.index)

# Create flags in the main dataframe
data['missing'] = 0
data['valid'] = 0
data['model'] = 0
data.loc[missing_set.index, 'missing'] = 1
data.loc[valid_set.index,   'valid']   = 1
data.loc[model_set.index,   'model']   = 1

# hyperparameter tunning using modeling subset

In [22]:
# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [3, 4, 5, 7],
    'min_samples_leaf': [3, 4, 5],
    'subsample': [0.7, 0.8, 0.9],
}


In [23]:
# Instantiating base model
base_model = GradientBoostingRegressor(random_state=42)

# Instantiating randomized search object
rand_search = RandomSearch_KF(estimator=base_model, param_grid=param_grid, n_folds=10, n_iter=2000, random_state=42)

# Running workflow with the modeling data
rand_search.fit(model_set, ['var1'], 'var2')

Initializing... there are 1728 possible combinations of hyperparameters
Running 2000 combinations randomly...

Iteration 1728/2000 - Avg test R²: 0.6783 | Avg train R²: 0.7907 - Time: 0.02 min 

Total time for 2000 iterations: 19.89 minutes


In [24]:
# Retrieve the best parameters
rand_search.print_best_params()


Best test R² score: 0.6966
Associated training R² score: 0.7619

Best hyperparameter combination:
  alpha: 0.9
  ccp_alpha: 0.0
  criterion: friedman_mse
  init: None
  learning_rate: 0.05
  loss: squared_error
  max_depth: 2
  max_features: None
  max_leaf_nodes: None
  min_impurity_decrease: 0.0
  min_samples_leaf: 4
  min_samples_split: 4
  min_weight_fraction_leaf: 0.0
  n_estimators: 100
  n_iter_no_change: None
  random_state: 42
  subsample: 0.8
  tol: 0.0001
  validation_fraction: 0.1
  verbose: 0
  warm_start: False


In [26]:
# Retrieve the best model and refit it on all data
final_model = rand_search.refit_best_model(model_set, ['var1'], 'var2')


Best model has been refitted using all data.
R2 from training data (just for reference): 0.7589


In [27]:
# Check final hyperparameters
final_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.05,
 'loss': 'squared_error',
 'max_depth': 2,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 4,
 'min_samples_split': 4,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 0.8,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}