In [1]:
import xgboost as xg 
import pandas as pd
from hydra import initialize, compose
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error, mean_absolute_percentage_error
import numpy as np

In [2]:

# Compose the Hydra config named "config" from the ./conf directory.
# `cfg` is read by later cells for the data directory and the input/output column names.
with initialize(config_path="./conf", version_base=None):
    cfg = compose(config_name="config")

In [3]:

# Model-simulation table; only rows with year < 2015 are kept for fitting.
data = pd.read_csv(f'{cfg.data}/cesm_data_variant.csv')
ds = data.loc[data['year'] < 2015]

# Observed inputs, restricted to the columns the model is trained on.
obs = pd.read_csv(f'{cfg.data}/cleaned_observed_ann_input.csv')[cfg.model.input]

# Feature matrix and target, both selected by the column names held in the config.
X, y = ds[cfg.model.input], ds[cfg.model.output]

In [7]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)

In [8]:
def print_errors(y_test,pred,num_features):
    """Print regression error metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, aligned element-wise with ``y_test``.
    num_features : int
        Number of predictors used by the model; only used for adjusted R2.

    Returns
    -------
    None
        RMSE, MSE, MAE, MAPE, R2 and adjusted R2 are written to stdout,
        each formatted with two decimals.

    Notes
    -----
    scikit-learn reports MAPE as a fraction (it is not multiplied by 100),
    and the value can become arbitrarily large when ``y_test`` contains
    entries at or near zero.
    """
    n_samples = len(y_test)

    # Each metric is computed once; RMSE is derived from the MSE.
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, pred)
    mape = mean_absolute_percentage_error(y_test, pred)
    r2 = r2_score(y_test, pred)

    # Adjusted R2 is only defined when there are more samples than
    # predictors + 1. Otherwise the denominator is zero (division error) or
    # negative (sign-flipped, meaningless value), so report NaN instead.
    dof = n_samples - num_features - 1
    if dof > 0:
        Adj_r2 = 1 - (1 - r2) * (n_samples - 1) / dof
    else:
        Adj_r2 = float('nan')

    print("Testing performance")
    print('RMSE: {:.2f}'.format(rmse))
    print('MSE: {:.2f}'.format(mse))
    print('MAE: {:.2f}'.format(mae))
    print('MAPE: {:.2f}'.format(mape))
    print('R2: {:.2f}'.format(r2))
    print('Adjusted R2: {:.2f}'.format(Adj_r2))

In [9]:
# Gradient-boosted tree regressor. Every constructor argument is spelled out,
# one per line and grouped by role.
# NOTE(review): `gpu_id` and `predictor` appear to be deprecated in XGBoost 2.x in
# favour of `device` — confirm against the installed version before upgrading.
model_feature = xg.XGBRegressor(
    # boosting schedule
    booster='gbtree',
    n_estimators=1000,
    learning_rate=0.02,
    base_score=0.5,
    num_parallel_tree=1,
    # tree shape
    max_depth=5,
    max_leaves=0,
    max_bin=256,
    grow_policy='depthwise',
    min_child_weight=6,
    max_delta_step=0,
    gamma=0,
    # column subsampling
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    # regularisation
    reg_alpha=0,
    reg_lambda=1,
    # constraints
    monotone_constraints='()',
    interaction_constraints='',
    # data handling
    missing=np.nan,
    enable_categorical=False,
    max_cat_to_onehot=4,
    # training control / reporting
    callbacks=None,
    early_stopping_rounds=None,
    eval_metric=None,
    importance_type=None,
    # runtime
    gpu_id=-1,
    predictor='auto',
    n_jobs=0,
    random_state=0,
)

model_feature.fit(X_train, y_train)

In [10]:
# Score the fitted model on the held-out split; the feature count feeds adjusted R2.
# NOTE(review): the recorded MAPE is astronomically large, which presumably means
# y_test contains zeros or near-zero values — confirm before relying on that metric.
pred = model_feature.predict(X_test)
print_errors(y_test, pred, num_features=X_train.shape[1])

Testing performance
RMSE: 34.21
MSE: 1170.25
MAE: 5.67
MAPE: 11998041729512.21
R2: 0.82
Adjusted R2: 0.82


In [None]:
# Apply the fitted model to the observed inputs held in `obs`
# (already restricted to the model's input columns when loaded).
obs_pred = model_feature.predict(obs)