# XGBoost

We'll continue by evaluating a XGBoost model using AUC to get the best score. We'll also review the Confusion Matrix for the best model on the validation data as well as looking at the models precision and recall.

---

## Imports and import Data

In [64]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
)

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

from sklearn import tree

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split


cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)

from sklearn.metrics import accuracy_score

# import packages for hyperparameters tuning
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

### Import the data

In [65]:
df = pd.read_csv("./final_data.csv")

In [66]:
df.shape

(72063, 18)

In [67]:
# Some records are noy unique
df.drop_duplicates(inplace=True)

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71866 entries, 0 to 72062
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   block                 71866 non-null  object 
 1   iucr                  71866 non-null  object 
 2   primary_type          71866 non-null  object 
 3   description           71866 non-null  object 
 4   location_description  71866 non-null  object 
 5   arrest                71866 non-null  bool   
 6   domestic              71866 non-null  bool   
 7   beat                  71866 non-null  int64  
 8   district              71866 non-null  int64  
 9   ward                  71866 non-null  int64  
 10  community_area        71866 non-null  int64  
 11  fbi_code              71866 non-null  object 
 12  latitude              71866 non-null  float64
 13  longitude             71866 non-null  float64
 14  hour                  71866 non-null  int64  
 15  day                

### Set Target

The target feature is `arrest` which is a boolean feature. For the medel we need to change this to `0` and `1` values.

We will also change the `domestic` feature in the same manner

In [69]:
df.arrest = df.arrest.astype(int)
df.domestic = df.domestic.astype(int)

### Identify Catergorical and Numeric Columns

In [70]:
categorical_columns = [
    "iucr",
    "primary_type",
    "description",
    "location_description",
    "fbi_code",
    "zip",
    "street",
]

numerical_columns = [
    "domestic",
    "beat",
    "district",
    "ward",
    "community_area",
    "latitude",
    "longitude",
    "hour",
    "day",
]

features = categorical_columns + numerical_columns

---

# XGBoost

### Split the Data

The data will be split as follows:

#### Training

80% of the data will be used for Training. During training K-Fold cross validation will be used.

One the best model parameters are identified the full training dataset will be used to train the final model.

#### Test

20% of the data will be help back for final testing of the model.

In [71]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

dict_train = df_train[features].to_dict(orient="records")
dict_val = df_val[features].to_dict(orient="records")
dict_test = df_test[features].to_dict(orient="records")

dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)
X_test = dv.transform(dict_test)

y_train = df_train.arrest.values
y_val = df_val.arrest.values
y_test = df_test.arrest.values

In [72]:
dict_train[0]

{'iucr': '143A',
 'primary_type': 'WEAPONS VIOLATION',
 'description': 'UNLAWFUL POSSESSION - HANDGUN',
 'location_description': 'STREET',
 'fbi_code': '15',
 'zip': '001XX',
 'street': 'W, HUBBARD, ST',
 'domestic': 0,
 'beat': 1831,
 'district': 18,
 'ward': 42,
 'community_area': 8,
 'latitude': 41.890011565,
 'longitude': -87.631793561,
 'hour': 21,
 'day': 1}

---

### RMSE

In [73]:
from hyperopt import hp
import numpy as np
from sklearn.metrics import mean_squared_error


# XGB parameters
xgb_reg_params = {
    "learning_rate": hp.choice("learning_rate", np.arange(0.05, 0.31, 0.05)),
    "max_depth": hp.choice("max_depth", np.arange(5, 16, 1, dtype=int)),
    "min_child_weight": hp.choice("min_child_weight", np.arange(1, 8, 1, dtype=int)),
    "colsample_bytree": hp.choice("colsample_bytree", np.arange(0.3, 0.8, 0.1)),
    "subsample": hp.uniform("subsample", 0.8, 1),
    "n_estimators": 100,
    "eval_metric": "rmse",
    "early_stopping_rounds": 10,
}
xgb_para = dict()
xgb_para["reg_params"] = xgb_reg_params
xgb_para["loss_func"] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))

In [74]:
import xgboost as xgb
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials


class HPOpt(object):
    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(
                fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials
            )
        except Exception as e:
            return {"status": STATUS_FAIL, "exception": str(e)}
        return result, trials

    def xgb_reg(self, para):
        reg = xgb.XGBRegressor(**para["reg_params"])
        return self.train_reg(reg, para)

    def train_reg(self, reg, para):
        reg.fit(
            self.x_train,
            self.y_train,
            eval_set=[(self.x_train, self.y_train), (self.x_test, self.y_test)],
        )
        pred = reg.predict(self.x_test)
        loss = para["loss_func"](self.y_test, pred)
        return {"loss": loss, "status": STATUS_OK}

In [75]:
obj = HPOpt(X_train, X_val, y_train, y_val)

xgb_opt = obj.process(
    fn_name="xgb_reg", space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=10
)

[0]	validation_0-rmse:0.41451	validation_1-rmse:0.41612
[1]	validation_0-rmse:0.36855	validation_1-rmse:0.37202
[2]	validation_0-rmse:0.33716	validation_1-rmse:0.34365
[3]	validation_0-rmse:0.31730	validation_1-rmse:0.32487
[4]	validation_0-rmse:0.30605	validation_1-rmse:0.31552
[5]	validation_0-rmse:0.29948	validation_1-rmse:0.31004
[6]	validation_0-rmse:0.29535	validation_1-rmse:0.30708
[7]	validation_0-rmse:0.29312	validation_1-rmse:0.30565
[8]	validation_0-rmse:0.29102	validation_1-rmse:0.30401
[9]	validation_0-rmse:0.29017	validation_1-rmse:0.30348
[10]	validation_0-rmse:0.28882	validation_1-rmse:0.30302
[11]	validation_0-rmse:0.28708	validation_1-rmse:0.30207
[12]	validation_0-rmse:0.28632	validation_1-rmse:0.30163
[13]	validation_0-rmse:0.28588	validation_1-rmse:0.30136
[14]	validation_0-rmse:0.28521	validation_1-rmse:0.30108
[15]	validation_0-rmse:0.28492	validation_1-rmse:0.30099
[16]	validation_0-rmse:0.28420	validation_1-rmse:0.30091
[17]	validation_0-rmse:0.28392	validation

In [76]:
# Best parameters
xgb_opt

({'colsample_bytree': 1,
  'learning_rate': 3,
  'max_depth': 3,
  'min_child_weight': 5,
  'subsample': 0.8899372509684718},
 <hyperopt.base.Trials at 0x2904f2da0>)

In [77]:
learning_rate = np.arange(0.05, 0.31, 0.05)
max_depth = np.arange(5, 16, 1, dtype=int)
min_child_weight = np.arange(1, 8, 1, dtype=int)
colsample_bytree = np.arange(0.3, 0.8, 0.1)

In [82]:
cs = colsample_bytree[1]
lr = learning_rate[3]
md = max_depth[3]
mcw = min_child_weight[5]

In [83]:
print(f"colsample_bytree: {cs}")
print(f"learning_rate: {lr}")
print(f"max_depth: {md}")
print(f"min_child_weight: {mcw}")
print(f"subsample: 0.8899372509684718")

colsample_bytree: 0.4
learning_rate: 0.2
max_depth: 8
min_child_weight: 6
subsample: 0.8899372509684718


In [84]:
xg = xgb.XGBClassifier(
    colsample_bytree=cs,
    learning_rate=lr,
    max_depth=md,
    min_child_weight=mcw,
    subsample=0.900961518576018,
    eval_metric="rmse",
    early_stopping_rounds=10,
)

evaluation = [(X_train, y_train), (X_val, y_val)]

xg.fit(
    X_train,
    y_train,
    eval_set=evaluation,
    verbose=False,
)

y_pred = xg.predict(X_test)

In [85]:
print("Confusion Matrix Tree : \n", confusion_matrix(y_test, y_pred), "\n")
print("The precision for Tree is ", precision_score(y_test, y_pred))
print("The recall for Tree is ", recall_score(y_test, y_pred), "\n")

Confusion Matrix Tree : 
 [[11113   172]
 [ 1355  1734]] 

The precision for Tree is  0.9097586568730325
The recall for Tree is  0.5613467141469731 

