# XGBoost

#### Import the libraries

In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [9]:
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from scipy import stats

#### Read the data

In [10]:
# Load the dataset; the first CSV column becomes the row index.
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): presumably the warning this cell used to emit was the
# mixed-type DtypeWarning -- confirm against a fresh run.
df = pd.read_csv("../data/total_dry_no_labels.csv", index_col=0, low_memory=False)
# Remove spaces from the column names (literal replacement, not a regex).
df.columns = df.columns.str.replace(' ', '', regex=False)

  interactivity=interactivity, compiler=compiler, result=result)


#### Prepare the data

In [11]:
# drop the columns whose name starts with "Unnamed"; take a copy so the
# column assignment below writes into an independent frame, not a slice
df = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()
# find the object-typed columns
cols = df.columns[df.dtypes.eq('object')]
# convert these columns to numeric; values that cannot be parsed become NaN
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
# drop incomplete rows last, so that the NaNs introduced by the coercion
# above are removed together with the values missing in the file
# NOTE(review): a column that is entirely non-numeric would turn into all-NaN
# and empty the frame here -- confirm no such column exists
df = df.dropna()

In [13]:
# check the shape (rows, columns) of df after the preparation step
df.shape

(601076, 50)

#### Create X features and Y target

In [27]:
# split data into X (every column except the last) and Y (the last column);
# slicing relative to the end stays correct if the number of columns changes
array = df.values
X = array[:, :-1]
Y = array[:, -1]

In [28]:
# hold out a fraction of the rows as a test set; the fixed seed makes the
# shuffle repeatable
seed = 7
test_size = 0.2
split_options = {"test_size": test_size, "random_state": seed}
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_options)

#### Create the model

In [16]:
# configure the gradient-boosted regressor (it is fitted in the next cell)
# NOTE(review): whether `early_stopping_rounds` is honoured as a constructor
# argument depends on the installed xgboost version -- confirm
regressor_settings = {
    "n_estimators": 100,
    "learning_rate": .1,
    "max_depth": 6,
    "random_state": 42,
    "n_jobs": -1,
    "early_stopping_rounds": 10,
}
model = XGBRegressor(**regressor_settings)

In [17]:
# train on the training split and log the RMSE on the evaluation set after
# every boosting round
# NOTE(review): the evaluation set is the same test split used for the final
# RMSE below -- confirm this is intended
fit_options = {
    "eval_metric": "rmse",
    "eval_set": [(X_test, y_test)],
    "verbose": True,
}
model.fit(X_train, y_train, **fit_options)

[0]	validation_0-rmse:30.9202
[1]	validation_0-rmse:27.9275
[2]	validation_0-rmse:25.2414
[3]	validation_0-rmse:22.8352
[4]	validation_0-rmse:20.6752
[5]	validation_0-rmse:18.7406
[6]	validation_0-rmse:16.9884
[7]	validation_0-rmse:15.4115
[8]	validation_0-rmse:14.0161
[9]	validation_0-rmse:12.761
[10]	validation_0-rmse:11.6619
[11]	validation_0-rmse:10.6807
[12]	validation_0-rmse:9.79363
[13]	validation_0-rmse:9.0231
[14]	validation_0-rmse:8.32633
[15]	validation_0-rmse:7.72623
[16]	validation_0-rmse:7.20214
[17]	validation_0-rmse:6.72755
[18]	validation_0-rmse:6.32531
[19]	validation_0-rmse:5.96324
[20]	validation_0-rmse:5.65257
[21]	validation_0-rmse:5.38239
[22]	validation_0-rmse:5.15265
[23]	validation_0-rmse:4.96165
[24]	validation_0-rmse:4.78939
[25]	validation_0-rmse:4.6363
[26]	validation_0-rmse:4.50556
[27]	validation_0-rmse:4.40619
[28]	validation_0-rmse:4.28646
[29]	validation_0-rmse:4.19903
[30]	validation_0-rmse:4.11429
[31]	validation_0-rmse:4.03432
[32]	validation_0-rms

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=10,
       gamma=0, importance_type='gain', learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=-1, nthread=None, objective='reg:linear',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [18]:
# predict the target for the rows of the test split
preds = model.predict(X_test)

In [19]:
# root-mean-squared error of the predictions against the test targets
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print("RMSE: %f" % rmse)

RMSE: 3.120412


#### Cross-Validation

In [20]:
# features: every column but the last; target: the last column
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [21]:
# wrap the features and the target in an xgb.DMatrix, the input type passed
# to xgb.cv and xgb.train in the cells below
data_dmatrix = xgb.DMatrix(data=X,label=y)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [22]:
# hyper-parameters passed to xgb.cv and xgb.train below;
# "reg:squarederror" is the current name of the squared-error regression
# objective (the former "reg:linear" spelling is deprecated)
params = {
    "objective": "reg:squarederror",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
}

In [23]:
# 3-fold cross-validation of the booster: at most 50 rounds, stopping early
# after 10 rounds without improvement; per-round RMSE is returned as a DataFrame
cv_options = {
    "nfold": 3,
    "num_boost_round": 50,
    "early_stopping_rounds": 10,
    "metrics": "rmse",
    "as_pandas": True,
    "seed": seed,
}
cv_results = xgb.cv(params=params, dtrain=data_dmatrix, **cv_options)



In [24]:
# show the mean test RMSE of the last row of the cross-validation results
last_round_rmse = cv_results["test-rmse-mean"].tail(1)
print(last_round_rmse)

49    4.501214
Name: test-rmse-mean, dtype: float64


#### Feature Importance: visualize a boosted tree

In [25]:
# train a 10-round booster with the native API on the DMatrix built above
# NOTE(review): this rebinds `model`, which earlier held the XGBRegressor
model = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)



In [26]:
import matplotlib.pyplot as plt

# draw the tree with index 4 of the booster on an explicitly created figure
# instead of relying on whatever pyplot considers the current figure;
# the 100 x 100 inch canvas matches the original size
fig, ax = plt.subplots(figsize=(100, 100))
xgb.plot_tree(model, num_trees=4, ax=ax)
# write the figure to disk before displaying it
fig.savefig('../reports/tree.png')
plt.show()

<Figure size 10000x10000 with 1 Axes>