# Machine Learning - XGBoost CV

## Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt  
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import cv

## Data Import

In [None]:
data_path = './'
# List every file found under the data directory.
for dirname, _, filenames in os.walk(data_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Build the CSV path with os.path.join so it stays valid whether or not
# data_path ends with a path separator.
data = os.path.join(data_path, 'machine-learning-xgboost-cv-dataset.csv')
df = pd.read_csv(data)
display(df)

In [None]:
# Print a concise summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Target vector: the 'Channel' column.
y = df['Channel']

# Feature matrix: every remaining column.
X = df.drop(columns='Channel')

In [None]:
# Preview the first five rows of the feature matrix.
X.head()

In [None]:
# Preview the first five target values.
y.head()

In [None]:
# Recode the target to 0/1 labels: class 2 becomes 0, class 1 is already 1.
# replace() returns a new Series, so the column inside df is left untouched
# and no masked in-place assignment is performed on a slice of the frame
# (which can raise SettingWithCopyWarning, depending on the pandas version).
y = y.replace(2, 0)
y.head()

## Model Training 

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=37,
)

# Wrap the complete feature/label set in a DMatrix for the cross-validation step.
data_dmatrix = xgb.DMatrix(X, label=y)

In [None]:
%%time

# Hyperparameters for the scikit-learn style classifier.
params = dict(
    objective='binary:logistic',
    max_depth=4,
    alpha=10,
    learning_rate=1.0,
    n_estimators=100,
)

# Build the classifier from the settings above and train it on the training
# split; fit() returns the estimator itself, so the call can be chained.
xgb_clf = XGBClassifier(**params).fit(X_train, y_train)

# Echo the fitted estimator and its configuration.
print(xgb_clf)

## Model Prediction

In [None]:
# Predict class labels for the held-out test features.
y_pred = xgb_clf.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy))

## Cross Validation

In [None]:
# Booster settings used for k-fold cross-validation.
params = dict(
    objective="binary:logistic",
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
)

# 3-fold CV over the full DMatrix, scored by AUC, with up to 50 boosting
# rounds and early stopping after 10 rounds; the result comes back as a
# pandas DataFrame, and the seed fixes the fold assignment.
xgb_cv = cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=3,
    metrics="auc",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=37,
)

## Results

In [None]:
# Show the first five rows of the cross-validation results.
xgb_cv.head()