# Machine Learning - XGBoost CV

## Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt  
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import cv

## Data Import

In [2]:
data_path = './'
# List every file under the data directory so the expected CSV can be confirmed.
for dirname, _, filenames in os.walk(data_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Build the path with os.path.join rather than string concatenation so it
# stays valid if data_path is changed to a directory without a trailing slash.
data = os.path.join(data_path, 'machine-learning-xgboost-cv-dataset.csv')
df = pd.read_csv(data)
display(df)

./requirements.txt
./machine-learning-xgboost-cv-dataset.csv
./machine-learning-xgboost-cv.ipynb
./.ipynb_checkpoints/machine-learning-xgboost-cv-checkpoint.ipynb


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
...,...,...,...,...,...,...,...,...
435,1,3,29703,12051,16027,13135,182,2204
436,1,3,39228,1431,764,4510,93,2346
437,2,3,14531,15488,30243,437,14841,1867
438,1,3,10290,1981,2232,1038,168,2125


In [3]:
# Print the frame's index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


In [4]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
count,440.0,440.0,440.0,440.0,440.0,440.0,440.0,440.0
mean,1.322727,2.543182,12000.297727,5796.265909,7951.277273,3071.931818,2881.493182,1524.870455
std,0.468052,0.774272,12647.328865,7380.377175,9503.162829,4854.673333,4767.854448,2820.105937
min,1.0,1.0,3.0,55.0,3.0,25.0,3.0,3.0
25%,1.0,2.0,3127.75,1533.0,2153.0,742.25,256.75,408.25
50%,1.0,3.0,8504.0,3627.0,4755.5,1526.0,816.5,965.5
75%,2.0,3.0,16933.75,7190.25,10655.75,3554.25,3922.0,1820.25
max,2.0,3.0,112151.0,73498.0,92780.0,60869.0,40827.0,47943.0


In [5]:
# Count missing values in each column.
df.isnull().sum()

Channel             0
Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
dtype: int64

In [6]:
# Target: the Channel column.
y = df['Channel']

# Features: every remaining column.
X = df.drop(labels='Channel', axis='columns')

In [7]:
# Preview the first rows of the feature matrix (Channel has been dropped).
X.head()

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,3,12669,9656,7561,214,2674,1338
1,3,7057,9810,9568,1762,3293,1776
2,3,6353,8808,7684,2405,3516,7844
3,3,13265,1196,4221,6404,507,1788
4,3,22615,5410,7198,3915,1777,5185


In [8]:
# Preview the first rows of the target (the Channel column).
y.head()

0    2
1    2
2    2
3    1
4    2
Name: Channel, dtype: int64

In [9]:
# Convert labels into binary values: Channel 1 -> 1, Channel 2 -> 0.
# A new Series is built instead of assigning into `y` in place: `y` was taken
# from `df`, so masked assignment can raise SettingWithCopyWarning and,
# depending on pandas' copy semantics, modify `df` as well.
# This form is also safe to re-run: already-converted labels map to themselves.
# NOTE: any value other than 1 becomes 0; the data only contains 1 and 2.
y = (y == 1).astype('int64')
y.head()

0    0
1    0
2    0
3    1
4    0
Name: Channel, dtype: int64

## Model Training 

In [10]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=37
)

# DMatrix over the full dataset (all of X and y); it is the input to the
# cross-validation run later in the notebook.
data_dmatrix = xgb.DMatrix(data=X, label=y)

In [11]:
%%time

# declare parameters
params = {
    'objective':'binary:logistic',
    'max_depth': 4,
    'alpha': 10,
    'learning_rate': 1.0,
    'n_estimators':100
}         
       
# instantiate the classifier 
xgb_clf = XGBClassifier(**params)

# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
print(xgb_clf)

XGBClassifier(alpha=10, base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1.0, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, ...)
CPU times: user 99.3 ms, sys: 59.9 ms, total: 159 ms
Wall time: 123 ms


## Model Prediction

In [12]:
# Predict class labels for the held-out test features with the fitted classifier.
y_pred = xgb_clf.predict(X_test)

In [13]:
# Compare the predictions against the true test labels and report accuracy to 4 decimals.
accuracy = accuracy_score(y_test, y_pred)
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy))

XGBoost model accuracy score: 0.8864


## Cross Validation

In [14]:
# Booster settings for the cross-validation run.
params = dict(
    objective="binary:logistic",
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
)

# 3-fold cross-validation over the full DMatrix, up to 50 boosting rounds,
# scored by AUC with early_stopping_rounds=10; results come back as a DataFrame.
xgb_cv = cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=3,
    metrics="auc",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=37,
)

## Results

In [15]:
# First rows of the CV results: mean and std of train/test AUC per boosting round.
xgb_cv.head()

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.943976,0.006322,0.932113,0.014852
1,0.959946,0.005627,0.951679,0.011266
2,0.961044,0.004379,0.951592,0.015347
3,0.962835,0.005306,0.95251,0.013141
4,0.965025,0.004129,0.950808,0.01489
