# XGBoost

### Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size as [width, height] for every figure created after this cell.
plt.rcParams['figure.figsize'] = [14, 8]

In [2]:
# Load the churn data; the CSV is expected in the notebook's working directory
csv_path = 'Churn_Modelling.csv'
dataset = pd.read_csv(csv_path)

In [3]:
# Show only the first row to check the column names and their order
dataset.head(1)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1


In [4]:
# Feature columns by position, per the head() output above:
#   3-4  -> CreditScore, Geography
#   6-12 -> Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary
# NOTE(review): position 5 (Gender) is not selected — confirm this is intentional.
feature_positions = [3, 4] + list(range(6, 13))
X = dataset.iloc[:, feature_positions].values

In [5]:
# Target vector: the column at position 13 (Exited in the head() output above)
target_position = 13
y = dataset.iloc[:, target_position].values

In [6]:
# First feature row, before any encoding (Geography is still a string)
X[0]

array([619, 'France', 42, 2, 0.0, 1, 1, 1, 101348.88], dtype=object)

In [7]:
# Target value of the first row
y[0]

1

In [8]:
# Encoding categorical data
#
# Column 1 of X (Geography) is label-encoded to integers and then one-hot
# encoded; every other column is passed through unchanged.
#
# The one-hot step uses ColumnTransformer because the `categorical_features`
# argument of OneHotEncoder was deprecated in scikit-learn 0.20 and removed in
# 0.22, so the previous call fails on current versions. The output layout is
# the same as before: dummy columns first, then the remaining columns in
# their original order.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 1] = labelencoder_X.fit_transform(X[:, 1])
onehotencoder = ColumnTransformer(
    transformers = [('geography', OneHotEncoder(), [1])],
    remainder = 'passthrough',  # keep the non-categorical columns after the dummies
    sparse_threshold = 0)       # always return a dense array (replaces .toarray())
# X is an object array, so the stacked result is object-typed too; cast it to
# float so the downstream estimator receives a numeric matrix.
X = np.asarray(onehotencoder.fit_transform(X), dtype = float)
# Drop the first dummy column to avoid the dummy variable trap.
X = X[:, 1:]

In [9]:
# First two encoded rows: two Geography dummies followed by the numeric features
X[:2]

array([[  0.00000000e+00,   0.00000000e+00,   6.19000000e+02,
          4.20000000e+01,   2.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          1.01348880e+05],
       [  0.00000000e+00,   1.00000000e+00,   6.08000000e+02,
          4.10000000e+01,   1.00000000e+00,   8.38078600e+04,
          1.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.12542580e+05]])

In [10]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_test, y_train, y_test = split_parts

### Fitting XGBoost to the training set

In [11]:
from xgboost import XGBClassifier

In [12]:
# XGBoost classifier with the library's default hyperparameters (no arguments passed)
classifier = XGBClassifier()

In [13]:
# Train on the training split; the value returned by fit() is shown as the cell output
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### Predicting the Test set results

In [14]:
# Predictions of the fitted classifier for the held-out test rows
y_pred = classifier.predict(X_test)

In [15]:
# First ten predictions, to compare by eye with y_test below
y_pred[:10]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [16]:
# First ten true labels of the test set
y_test[:10]

array([0, 1, 0, 0, 0, 1, 0, 0, 1, 1])

### Making the Confusion Matrix

In [17]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns are the predicted classes
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
cm

array([[1532,   63],
       [ 203,  202]])

### Calculating Accuracy

In [18]:
# Accuracy = correctly classified samples (diagonal of cm) / all test samples
correct = cm[0, 0] + cm[1, 1]
total = cm.sum()
correct / total

0.86699999999999999

### Applying k-Fold Cross Validation

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
# 10-fold cross-validation on the training split only; yields one score per fold
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accuracies

array([ 0.87640449,  0.8639201 ,  0.88125   ,  0.86625   ,  0.86375   ,
        0.855     ,  0.865     ,  0.8575    ,  0.8485607 ,  0.87359199])

In [21]:
# Average score across the folds
accuracies.mean()

0.86512272851207572

In [22]:
# Standard deviation of the fold scores (spread between folds)
accuracies.std()

0.0094793902817781814

### Applying Grid Search to find the best model and the best parameters (Optional)

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Print the XGBClassifier documentation to see which hyperparameters can be tuned.
# NOTE(review): exploratory cell that builds a throwaway instance just to read
# its docs — consider removing it from the final notebook.
help(XGBClassifier())

Help on XGBClassifier in module xgboost.sklearn object:

class XGBClassifier(XGBModel, sklearn.base.ClassifierMixin)
 |  Implementation of the scikit-learn API for XGBoost classification.
 |  
 |      Parameters
 |  ----------
 |  max_depth : int
 |      Maximum tree depth for base learners.
 |  learning_rate : float
 |      Boosting learning rate (xgb's "eta")
 |  n_estimators : int
 |      Number of boosted trees to fit.
 |  silent : boolean
 |      Whether to print messages while running boosting.
 |  objective : string or callable
 |      Specify the learning task and the corresponding learning objective or
 |      a custom objective function to be used (see note below).
 |  booster: string
 |      Specify which booster to use: gbtree, gblinear or dart.
 |  nthread : int
 |      Number of parallel threads used to run xgboost.  (Deprecated, please use n_jobs)
 |  n_jobs : int
 |      Number of parallel threads used to run xgboost.  (replaces nthread)
 |  gamma : float
 |      Minimu

In [40]:
# Several parameter combinations were tried; this grid is the best found so far.
# Only the booster type has more than one candidate value.
parameters = [{
    'max_depth': [3],
    'learning_rate': [0.1],
    'n_estimators': [250],
    'booster': ['gbtree', 'gblinear', 'dart'],
}]

# Search over the grid, scored by accuracy with 10-fold cross-validation;
# n_jobs = -1 runs the fits in parallel. fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator = classifier,
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1,
).fit(X_train, y_train)

In [41]:
# Best cross-validated score found by the grid search (scoring = 'accuracy')
best_accuracy = grid_search.best_score_
best_accuracy

0.86550000000000005

In [42]:
# Parameter combination that achieved the best score
best_parameters = grid_search.best_params_
best_parameters

{'booster': 'gbtree',
 'learning_rate': 0.1,
 'max_depth': 3,
 'n_estimators': 250}