In [1]:
import pickle

import os
from collections import OrderedDict

import hazelbean as hb
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import mpl_toolkits
from mpl_toolkits.basemap import Basemap
import seaborn as sns

import math
from scipy import stats

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
import sklearn.metrics

import xgboost as xgb

import gdal
import pygeoprocessing
# import taskgraph
import netCDF4

from modeling_utils import *
from viz_utils import *
from raster_calc_utils import *
from spatial_utils import * 

## XGBoost Regressor Tuning


In [4]:
from sklearn.model_selection import GridSearchCV

# Fixed seed so the 5% subsample drawn below is identical on every run;
# with random_state=None each Restart & Run All tuned on different rows.
SAMPLE_SEED = 42

# Load the baseline table and key it by pixel_id.
df = pd.read_csv('../Data/intermediate/baseline_df.csv')
df = df.set_index('pixel_id')

# Keep a 5% random subsample, drawn without replacement and with uniform
# weights (presumably to keep the grid searches below affordable).
df = df.sample(frac=0.05, replace=False, weights=None, random_state=SAMPLE_SEED, axis=0)

# Row count of the subsample (displayed as the cell output).
len(df)

45340

In [5]:
# Base estimator shared by every grid search; all hyperparameters come from
# the grids below.
xgb_regressor = xgb.XGBRegressor()

# NOTE(review): newer XGBoost releases rename 'reg:linear' to
# 'reg:squarederror', replace `silent` with `verbosity`, and prefer `n_jobs`
# over `nthread`. The names are left unchanged here because the installed
# XGBoost version is not visible from this notebook -- confirm before upgrading.
# NOTE(review): nthread=4 per model combined with n_jobs=3 in GridSearchCV can
# run up to 12 threads at once -- confirm this matches the machine.

# Pass 1: 3 x 3 x 3 x 3 x 3 = 243 candidate combinations.
parameters1 = {'nthread':[4], 
              'objective':['reg:linear'],
              'learning_rate': [.03, .07, .2], #so called `eta` value
              'max_depth': [3, 5, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.5,0.75,1],
              'colsample_bytree': [0.5,0.75,1],
              'n_estimators': [300,500,600]}

# Pass 2: 4 x 5 x 3 x 3 x 3 x 4 = 2160 candidate combinations.
parameters2 = {'nthread':[4], 
              'objective':['reg:linear'],
              'learning_rate': [.01,.03,.05,.07], #so called `eta` value
              'max_depth': [5,6,7,8,9],
              'min_child_weight': [3,4,5],
              'silent': [1],
              'subsample': [0.5,0.75,1],
              'colsample_bytree': [0.65,0.75,0.85],
              'n_estimators': [300,500,600,700]}

# Pass 3: 3 x 4 x 3 x 3 x 4 x 4 = 1728 candidate combinations.
parameters3 = {'nthread':[4], 
              'objective':['reg:linear'],
              'learning_rate': [.02,.03,.04], #so called `eta` value
              'max_depth': [7,8,9,10],
              'min_child_weight': [3,4,5],
              'silent': [1],
              'subsample': [0.65,0.75,0.85],
              'colsample_bytree': [0.55,0.65,0.75,0.85],
              'n_estimators': [300,500,600,700]}

# Exhaustive search over the first grid: 2-fold CV, 3 parallel workers.
xgb_grid = GridSearchCV(xgb_regressor,
                        parameters1,
                        cv = 2,
                        n_jobs = 3,
                        verbose=True)

# Features are every column except the target; the target is log_calories_per_ha.
x = df.drop(['log_calories_per_ha'], axis=1)
y = df['log_calories_per_ha']

# Fixed seed so the train/test partition (reused by the later search cells)
# is the same on every run; the split was previously unseeded.
SPLIT_SEED = 42
# X_test / y_test are not used by the cells shown here.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=SPLIT_SEED)
xgb_grid.fit(X_train,y_train)

# Best mean cross-validated score (GridSearchCV default scoring) and the
# parameter combination that achieved it.
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 2 folds for each of 243 candidates, totalling 486 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed: 11.2min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed: 35.7min
[Parallel(n_jobs=3)]: Done 486 out of 486 | elapsed: 41.1min finished


0.8499972337524272
{'colsample_bytree': 1, 'learning_rate': 0.07, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 600, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.75}


In [6]:
# Second search: same estimator and training split, now over the parameters2
# grid. Rebinding xgb_grid replaces the previous search object.
xgb_grid = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=parameters2,
    cv=2,
    n_jobs=3,
    verbose=True,
)
xgb_grid.fit(X_train, y_train)

# Report the best cross-validated score, then the winning parameters,
# one per line.
print(xgb_grid.best_score_, xgb_grid.best_params_, sep='\n')

Fitting 2 folds for each of 2160 candidates, totalling 4320 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed: 17.3min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed: 41.7min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed: 74.7min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed: 115.4min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed: 167.1min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed: 241.4min
[Parallel(n_jobs=3)]: Done 3194 tasks      | elapsed: 322.4min
[Parallel(n_jobs=3)]: Done 4044 tasks      | elapsed: 421.2min
[Parallel(n_jobs=3)]: Done 4320 out of 4320 | elapsed: 456.3min finished


0.8590652492103497
{'colsample_bytree': 0.85, 'learning_rate': 0.05, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 700, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.75}


In [7]:
# Third search: same estimator and training split, now over the parameters3
# grid. Rebinding xgb_grid replaces the previous search object.
xgb_grid = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=parameters3,
    cv=2,
    n_jobs=3,
    verbose=True,
)
xgb_grid.fit(X_train, y_train)

# Report the best cross-validated score, then the winning parameters,
# one per line.
print(xgb_grid.best_score_, xgb_grid.best_params_, sep='\n')

Fitting 2 folds for each of 1728 candidates, totalling 3456 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed: 15.6min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed: 37.6min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed: 70.6min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed: 115.7min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed: 174.8min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed: 249.8min
[Parallel(n_jobs=3)]: Done 3194 tasks      | elapsed: 345.0min
[Parallel(n_jobs=3)]: Done 3456 out of 3456 | elapsed: 379.9min finished


0.8607798491893492
{'colsample_bytree': 0.85, 'learning_rate': 0.04, 'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 700, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.75}
