In [1]:
!pip install xgboost



In [2]:
import numpy as np
from xgboost import XGBClassifier 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import warnings

In [3]:
# Load the bundled iris dataset and pull out the feature matrix and the target vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Display names used later when reporting per-feature results.
feat_labels = [
    'Sepal Length',
    'Sepal Width',
    'Petal Length',
    'Petal Width',
]

In [4]:
# Peek at the first five rows of the feature matrix.
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [5]:
# Show the complete target array that accompanies X.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
# Hold out 40% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [7]:
# Baseline: an XGBoost classifier with default hyperparameters, trained on every feature.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [8]:
# Print each feature's display name next to its importance score.
# The scores come from the fitted XGBoost model's feature_importances_; the model
# repr shown after fitting reports importance_type='gain', so these are gain-based
# importances rather than Gini importances.
for name_and_score in zip(feat_labels, model.feature_importances_):
    print(name_and_score)

('Sepal Length', 0.009823686)
('Sepal Width', 0.015306727)
('Petal Length', 0.8427093)
('Petal Width', 0.13216025)


In [9]:
# Register the filter BEFORE fitting: a filter installed after the fit cannot
# suppress DeprecationWarnings that the fit itself emits.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Selector that keeps the features whose importance reaches the 0.25 threshold.
sfm = SelectFromModel(model, threshold=0.25)
# The trailing ';' keeps the cell silent, as it was when the filter call came last.
sfm.fit(X_train, y_train);

In [10]:
# List the display names of the features the selector kept.
selected_indices = sfm.get_support(indices=True)
for idx in selected_indices:
    print(feat_labels[idx])

Petal Length


In [11]:
# Reduce both splits to the columns chosen by the selector.
X_important_train, X_important_test = (
    sfm.transform(split) for split in (X_train, X_test)
)

In [12]:
# Second classifier, again with default hyperparameters, trained on the reduced feature set.
clf_important = XGBClassifier()
clf_important.fit(X=X_important_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
# Accuracy of the reduced-feature classifier on the held-out split.
y_important_pred = clf_important.predict(X_important_test)
accuracy_score(y_true=y_test, y_pred=y_important_pred)


0.9

In [14]:

# k-fold cross-validation (currently disabled: only the import below is executed).
from sklearn.model_selection import cross_val_score
# NOTE(review): the disabled call passes X_train, whereas clf_important was fit on
# X_important_train — confirm which feature set is intended before re-enabling.
#accuracies = cross_val_score(estimator = clf_important, X = X_train, y = y_train, cv = 10)
#print(accuracies.mean())
#print(accuracies.std())

In [15]:
# Accuracy of the full-feature baseline on the same held-out split.
y_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.95

In [16]:
# Confusion matrix for the full-feature baseline's predictions on the held-out split.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[16  0  0]
 [ 0 22  1]
 [ 0  2 19]]


In [18]:
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter; the randomized search below samples
# combinations from these lists rather than trying every combination.
# NOTE(review): 'n_estimators' holds the three literal values 100, 1400 and 200.
# If a start/stop/step range such as range(100, 1400, 200) was intended, confirm
# and change it.
param_grid = {
    'n_estimators': [100, 1400, 200],
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7, 10, 13, 15],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}

# Randomized hyperparameter search (the `grid_search` name is kept because later
# cells read it) using 5-fold cross-validation on all available cores.
# random_state pins which candidates get sampled so a re-run repeats the same
# search; 0 matches the seed already used for the train/test split.
# NOTE(review): the estimator is clf_important, which was fit on the reduced
# feature set, yet the search is fit on X_train — confirm this is intended.
grid_search = RandomizedSearchCV(
    estimator=clf_important,
    param_distributions=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=0,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.8s finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',
                                           n_estimators=100, n_jobs=0,
                                           num_pa...
                                           reg_lambda=1, scale_pos_weight=None,
                                           subsample=1, 

In [19]:
# Report the best score found by the search and the hyperparameters that produced it.
best_score = grid_search.best_score_
best_params = grid_search.best_params_
print('We can get accuracy of', best_score, 'using', best_params)

We can get accuracy of 0.9666666666666666 using {'n_estimators': 1400, 'min_child_weight': 5, 'max_depth': 12, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.7}


In [20]:
# Classifier configured with hand-entered hyperparameters; the values match the
# best_params_ shown in the stored output of the search cell.
xgb = XGBClassifier(
    n_estimators=1400,
    min_child_weight=5,
    max_depth=12,
    learning_rate=0.1,
    gamma=0.1,
    colsample_bytree=0.7,
)

In [21]:
# Train the tuned classifier on the full-feature training split.
xgb.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=12,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=1400, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [22]:
# Predict labels for the held-out split with the tuned classifier.

predictions = xgb.predict(X_test)

In [23]:
# evaluation metrics

from sklearn.metrics import classification_report,confusion_matrix

In [24]:
# Text report of per-class metrics for the tuned classifier on the held-out split.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.85      0.96      0.90        23
           2       0.94      0.81      0.87        21

    accuracy                           0.92        60
   macro avg       0.93      0.92      0.92        60
weighted avg       0.92      0.92      0.92        60



In [25]:
# Confusion matrix of the tuned classifier on the held-out split.
tuned_cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(tuned_cm)

[[16  0  0]
 [ 0 22  1]
 [ 0  4 17]]
