# **XGBoost Decision Tree - Classification (Sklearn)**

**Mount Drive**

In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Libraries**

In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.datasets import load_diabetes, load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb

**Code**

In [3]:
# Load Dataset
# Read the churn CSV from the mounted Drive, then show its dimensions
# followed by the first five rows (one item per line).
data = pd.read_csv('/content/drive/MyDrive/Practice/Churn_Modelling.csv')
print(data.shape, data.head(), sep='\n')

(10000, 14)
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4

In [4]:
# Prepare features and target variable
# Per the data.head() preview, columns 0-2 (RowNumber, CustomerId, Surname) are
# left out; columns 3-12 (CreditScore, Geography, Gender, Age, Tenure, Balance,
# NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary) become the features.
X = data.iloc[:, 3:13].values
# Column 13 ("Exited") is the target label.
y = data.iloc[:, 13].values

In [5]:
# Encoding categorical data
# X was sliced from data columns 3..12, so X[:, 1] is "Geography" and X[:, 2] is
# "Gender" (see the data.head() preview). Label encode both string columns in place.
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])  # Geography column
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])  # Gender column

# One-hot encode the "Geography" column (column index 1). The one-hot columns come
# first in the output; the remaining columns pass through unchanged after them.
ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(categories='auto'), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=np.float64)

# Avoiding the dummy variable trap: drop the first Geography indicator column.
# Resulting layout: 2 Geography indicators, then CreditScore, Gender, Age, Tenure,
# Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary.
# NOTE(review): this cell overwrites X, so it is not idempotent -- re-run the
# feature-preparation cell before re-running this one.
X = X[:, 1:]

In [6]:
# Splitting the dataset into the Training set and Test set
# test_size=0.2 holds out 20% of the rows for evaluation; random_state=0 fixes the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
# Initialize XGBoost Classifier
# n_trees is passed as n_estimators (the number of boosting rounds);
# random_state=0 fixes the model's random seed.
n_trees = 5
model = xgb.XGBClassifier(n_estimators=n_trees, random_state=0)

In [8]:
# Fit the model
# Train the classifier on the training split only; the test split stays unseen.
model.fit(X_train, y_train)

In [9]:
# Predicting the test set results
# y_pred holds the model's predicted class label for each row of X_test.
y_pred = model.predict(X_test)

In [10]:
# Displaying the number of trees and their structures
# Dump the booster once instead of on every loop iteration (get_dump() rebuilds
# the text for every tree on each call), and iterate over the dumps themselves so
# the reported count and the loop always match the trees actually stored in the
# model rather than the configured n_trees value.
tree_dumps = model.get_booster().get_dump()
print(f"\nTotal Number of Trees Used: {len(tree_dumps)}\n")

for i, tree_str in enumerate(tree_dumps):
    print(f"Tree {i + 1}:")
    print(tree_str)
    print()  # Adding a newline for better readability


Total Number of Trees Used: 5

Tree 1:
0:[f4<43] yes=1,no=2,missing=2
	1:[f7<3] yes=3,no=4,missing=4
		3:[f7<2] yes=7,no=8,missing=8
			7:[f0<1] yes=15,no=16,missing=16
				15:[f6<75395.5312] yes=29,no=30,missing=30
					29:[f3<1] yes=53,no=54,missing=54
						53:leaf=0.106711827
						54:leaf=-0.111228764
					30:[f6<177069.234] yes=55,no=56,missing=56
						55:leaf=-0.244059637
						56:leaf=0.143301338
				16:[f4<34] yes=31,no=32,missing=32
					31:[f9<1] yes=57,no=58,missing=58
						57:leaf=-0.0023466323
						58:leaf=-0.213118836
					32:[f6<152310.547] yes=59,no=60,missing=60
						59:leaf=0.273364156
						60:leaf=-0.255572557
			8:[f6<68432.4531] yes=17,no=18,missing=18
				17:[f5<1] yes=33,no=34,missing=34
					33:[f2<727] yes=61,no=62,missing=62
						61:leaf=-0.27052018
						62:leaf=-0.00417154795
					34:[f3<1] yes=63,no=64,missing=64
						63:leaf=-0.332662374
						64:leaf=-0.374820709
				18:[f4<37] yes=35,no=36,missing=36
					35:[f1<1] yes=65,no=66,missing=66
						

In [11]:
# Evaluating the model
# Accuracy = fraction of test rows whose predicted label matches y_test.
accuracy = accuracy_score(y_test, y_pred)
# NOTE(review): the test set is imbalanced (1595 vs 405 rows per class in the
# classification report output), so read accuracy together with per-class recall.
print(f'XGBoost Accuracy: {accuracy:.2f}')

XGBoost Accuracy: 0.87


In [12]:
# Classification report
# Print a heading line followed by the per-class precision / recall / F1 table
# computed on the test split.
print(
    "Classification Report for XGBoost:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1595
           1       0.78      0.48      0.60       405

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000

