# Boosting - Python - XGBoost

In [1]:
import multiprocessing

num_cpu = multiprocessing.cpu_count()
num_cpu

16

In [2]:
import pandas as pd

dataset = pd.read_csv('data/churn_modelling.csv')
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,2978,15655123,Dumetolisa,505,Spain,Female,45,9,131355.3,3,1,0,195395.33,1
1,3716,15640409,Carpenter,817,Germany,Female,46,0,89087.89,1,0,1,87941.85,1
2,2644,15586870,Ni,632,France,Male,27,4,193125.85,1,1,1,152665.85,0
3,8748,15667216,Chung,579,France,Female,29,10,73194.52,2,1,1,129209.09,0
4,3384,15665766,T'ang,698,Germany,Male,39,9,133191.19,2,0,1,53289.49,0


In [3]:
dataset = pd.get_dummies(dataset, columns=["Geography", "Gender"], sparse=False)
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,2978,15655123,Dumetolisa,505,45,9,131355.3,3,1,0,195395.33,1,0,0,1,1,0
1,3716,15640409,Carpenter,817,46,0,89087.89,1,0,1,87941.85,1,0,1,0,1,0
2,2644,15586870,Ni,632,27,4,193125.85,1,1,1,152665.85,0,1,0,0,0,1
3,8748,15667216,Chung,579,29,10,73194.52,2,1,1,129209.09,0,1,0,0,1,0
4,3384,15665766,T'ang,698,39,9,133191.19,2,0,1,53289.49,0,0,1,0,0,1


In [4]:
dataset.shape

(10000, 17)

In [5]:
col = dataset.pop("Exited")
dataset.insert(16, col.name, col)

In [6]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Exited
0,2978,15655123,Dumetolisa,505,45,9,131355.3,3,1,0,195395.33,0,0,1,1,0,1
1,3716,15640409,Carpenter,817,46,0,89087.89,1,0,1,87941.85,0,1,0,1,0,1
2,2644,15586870,Ni,632,27,4,193125.85,1,1,1,152665.85,1,0,0,0,1,0
3,8748,15667216,Chung,579,29,10,73194.52,2,1,1,129209.09,1,0,0,1,0,0
4,3384,15665766,T'ang,698,39,9,133191.19,2,0,1,53289.49,0,1,0,0,1,0


In [7]:
dataset = dataset.drop(columns=["RowNumber", "CustomerId", "Surname"])
dataset.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Exited
0,505,45,9,131355.3,3,1,0,195395.33,0,0,1,1,0,1
1,817,46,0,89087.89,1,0,1,87941.85,0,1,0,1,0,1
2,632,27,4,193125.85,1,1,1,152665.85,1,0,0,0,1,0
3,579,29,10,73194.52,2,1,1,129209.09,1,0,0,1,0,0
4,698,39,9,133191.19,2,0,1,53289.49,0,1,0,0,1,0


In [8]:
dataset.shape

(10000, 14)

In [9]:
X = dataset.iloc[:, :13].values
X[0, :]

array([5.0500000e+02, 4.5000000e+01, 9.0000000e+00, 1.3135530e+05,
       3.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.9539533e+05,
       0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       0.0000000e+00])

In [10]:
y = dataset.iloc[:, 13].values
y[:5]

array([1, 1, 0, 0, 0])

In [11]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [12]:
# Fitting XGBoost to the Training set
from xgboost import XGBClassifier

# can specify learning rate, number of estimators as decision trees, reg alpha, reg lambda, ...
classifier = XGBClassifier(n_jobs=num_cpu)
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [13]:
# Predicting the Test set results

# classifier returns 0 or 1, no need to handle probabilities
y_pred = classifier.predict(X_test)
y_pred[:5]

array([0, 1, 0, 1, 0])

In [14]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm

array([[1487,   90],
       [ 216,  207]])

In [15]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10, n_jobs=num_cpu)

In [16]:
accuracies.mean()

0.8568749999999999

In [17]:
accuracies.std()

0.010019512213675861