# XGBoost

#### Import the libraries

In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [25]:
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from scipy import stats

#### Read the data

In [26]:
# Read the dataset; the first CSV column is used as the row index.
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning they trigger) at the cost of higher peak memory while parsing.
df = pd.read_csv("../data/total_dry.csv", index_col=0, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


#### Prepare the data

In [27]:
# Keep only the columns whose name does not start with "Unnamed".
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]
# Drop every row that contains at least one missing value.
df = df.dropna()
# Columns that pandas still stores with the generic 'object' dtype.
cols = df.columns[df.dtypes == 'object']
# Convert those columns to numbers; values that cannot be parsed become NaN.
# NOTE(review): this runs after dropna(), so NaNs produced by the coercion
# stay in the frame -- confirm that is intended.
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [28]:
# show the (rows, columns) shape of the prepared frame
df.shape

(601076, 92)

#### Create X features and Y target

In [29]:
# Split the frame into the feature matrix X and the target vector Y.
# The target is the last column and every column before it is a feature.
# Slicing relative to the end (rather than hard-coding column index 91)
# keeps the split correct if the number of columns changes.
array = df.values
X = array[:, :-1]
Y = array[:, -1]

In [30]:
# Hold out a fraction of the rows as the test set; the fixed seed makes the
# shuffle (and therefore the split) repeatable.
test_size = 0.2
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

#### Create the model

In [31]:
# Configure the gradient-boosted regressor; it is trained in the next cell.
# NOTE(review): early_stopping_rounds is given to the constructor here --
# confirm the installed xgboost version honours it there rather than
# silently ignoring it.
xgb_settings = dict(
    n_estimators=100,
    learning_rate=.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=10,
)
model = XGBRegressor(**xgb_settings)

In [32]:
# Train the regressor and report the mean absolute error on the evaluation
# set after every boosting round.
# The metric is set on the estimator instead of being passed to fit(): the
# `eval_metric` argument of fit() is deprecated in XGBoost 1.6+ and was
# removed in later 2.x releases, where passing it raises TypeError.
# NOTE(review): the evaluation set is the same X_test / y_test that is used
# afterwards to report RMSE, so the reported score is not from untouched
# data -- consider a separate validation split.
model.set_params(eval_metric="mae")
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=True,
)

[0]	validation_0-mae:28.2021
[1]	validation_0-mae:25.3987
[2]	validation_0-mae:22.879
[3]	validation_0-mae:20.6153
[4]	validation_0-mae:18.5834
[5]	validation_0-mae:16.7587
[6]	validation_0-mae:15.1192
[7]	validation_0-mae:13.6464
[8]	validation_0-mae:12.3295
[9]	validation_0-mae:11.1479
[10]	validation_0-mae:10.0893
[11]	validation_0-mae:9.13828
[12]	validation_0-mae:8.28901
[13]	validation_0-mae:7.52761
[14]	validation_0-mae:6.84866
[15]	validation_0-mae:6.24492
[16]	validation_0-mae:5.70648
[17]	validation_0-mae:5.231
[18]	validation_0-mae:4.80911
[19]	validation_0-mae:4.43748
[20]	validation_0-mae:4.10876
[21]	validation_0-mae:3.81828
[22]	validation_0-mae:3.57273
[23]	validation_0-mae:3.35675
[24]	validation_0-mae:3.16716
[25]	validation_0-mae:2.99472
[26]	validation_0-mae:2.84635
[27]	validation_0-mae:2.7197
[28]	validation_0-mae:2.60129
[29]	validation_0-mae:2.50086
[30]	validation_0-mae:2.41533
[31]	validation_0-mae:2.33053
[32]	validation_0-mae:2.24199
[33]	validation_0-mae:2.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=10,
       gamma=0, importance_type='gain', learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=-1, nthread=None, objective='reg:linear',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [33]:
# predict the target for the held-out test rows with the fitted model
preds = model.predict(X_test)

In [34]:
# Root-mean-squared error of the test-set predictions.
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print("RMSE: %f" % rmse)

RMSE: 3.243471


#### Cross-Validation

In [36]:
# Features are every column except the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
# Build the DMatrix that the xgb.cv call below receives as `dtrain`.
# It was not defined in any earlier cell, so a fresh
# "Restart & Run All" would otherwise stop with a NameError.
data_dmatrix = xgb.DMatrix(data=X, label=y)

In [None]:
# Booster parameters for the cross-validation run.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the previous alias "reg:linear" is deprecated.
params = {
    "objective": "reg:squarederror",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123