# Building XGBoost Models: The Iris Dataset Example

---

**Author:** Dr. Saad Laouadi  
**Copyright:** Dr. Saad Laouadi  

---

## License

**This material is intended for educational purposes only and may not be used directly in courses, video recordings, or similar without prior consent from the author. When using or referencing this material, proper credit must be attributed to the author.**

```text
#**************************************************************************
#* (C) Copyright 2024 by Dr. Saad Laouadi. All Rights Reserved.           *
#**************************************************************************                                                                    
#* DISCLAIMER: The author has used their best efforts in preparing        *
#* this content. These efforts include development, research,             *
#* and testing of the theories and programs to determine their            *
#* effectiveness. The author makes no warranty of any kind,               *
#* expressed or implied, with regard to these programs or                 *
#* to the documentation contained within. The author shall not            *
#* be liable in any event for incidental or consequential damages         *
#* in connection with, or arising out of, the furnishing,                 *
#* performance, or use of these programs.                                 *
#*                                                                        *
#* This content is intended for tutorials, online articles,               *
#* and other educational purposes.                                        *
#**************************************************************************
```



In [8]:
## ======================================================================
#            Importing Necessary Modules and Tools for This Notebook
## ======================================================================

# Standard library imports
import time

# Data manipulation and analysis
import pandas as pd
import numpy as np 

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn: Machine learning and model evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Preprocessing tools
from sklearn.preprocessing import MinMaxScaler

# Import XGBoost classifier
from xgboost import XGBClassifier

# Notebook configuration
pd.options.display.float_format = '{:,.3f}'.format
%matplotlib inline

# Silence warnings
import warnings
warnings.filterwarnings('ignore')

from sklearn import datasets

# Configuration
IRIS_DATASET = datasets.load_iris()
# The following cells read the dataset through the name `iris`; bind it here
# so they do not fail with a NameError.
iris = IRIS_DATASET

In [9]:
# Build one DataFrame from the scikit-learn dataset: the feature matrix and
# the target vector are placed side by side, with 'target' as the last column.
df = pd.DataFrame(
    np.c_[iris['data'], iris['target']],
    columns=[*iris['feature_names'], 'target'],
)

# Preview the first rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [10]:
# Hold out part of the data for testing. No test_size is passed, so the
# library default applies; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    iris['data'],
    iris['target'],
    random_state=0,
)

# Report the shape of every split, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(112, 4)
(38, 4)
(112,)
(38,)


In [12]:
# Instantiate the XGBoost classifier with tree boosters ("gbtree") as the
# base learner.
# NOTE: the objective must be spelled "multi:softprob"; the previous value
# "mulit:softprob" is not a valid XGBoost objective name.
# NOTE(review): `use_label_encoder` is kept as in the original; it is
# presumably deprecated in recent XGBoost releases -- confirm before removing.
xgb_clf = XGBClassifier(booster="gbtree",
                        objective="multi:softprob",
                        max_depth=6,
                        learning_rate=0.1,
                        n_estimators=100,
                        random_state=2,
                        n_jobs=-1,
                        use_label_encoder=False,
                        eval_metric='mlogloss')

# Fit the classifier on the training split
xgb_clf.fit(X_train, y_train)

In [13]:
# Generate class predictions for the held-out test features using the
# classifier fitted in the previous cell
y_pred = xgb_clf.predict(X_test)

In [14]:
# Evaluate the model: share of test samples whose predicted class equals the
# true label. accuracy_score takes the ground truth first, then predictions.
score = accuracy_score(y_test, y_pred)

print(score)

0.9736842105263158


---
Retrain the Project
---

In [24]:
import time
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn import datasets
from sklearn.pipeline import Pipeline

def setup_logger(log_time: bool = True):
    """Configure the root logger to emit INFO-level messages.

    Any handlers already attached to the root logger are dropped first so
    that calling this function repeatedly does not produce duplicate lines.

    Parameters
    ----------
    log_time : bool, optional
        When True (the default) each message is prefixed with a
        ``YYYY-MM-DD HH:MM:SS`` timestamp; when False only the message text
        is written.
    """
    root = logging.getLogger()

    # Start from a clean slate: basicConfig does nothing if handlers exist
    if root.hasHandlers():
        root.handlers.clear()

    # Pick the message layout according to the timestamp flag
    layout = "%(asctime)s - %(message)s" if log_time else "%(message)s"

    logging.basicConfig(
        level=logging.INFO,
        format=layout,
        datefmt="%Y-%m-%d %H:%M:%S",
    )

# Configure logging once when this cell runs; timestamps are switched off so
# each log line contains only the message text.
setup_logger(log_time=False)
def load_data():
    """Return the Iris dataset as a single pandas DataFrame.

    Returns
    -------
    pandas.DataFrame
        One column per entry of the dataset's ``feature_names`` followed by a
        final ``target`` column holding the class labels.
    """
    bunch = datasets.load_iris()
    header = [*bunch['feature_names'], 'target']
    # Place the feature matrix and the label vector side by side
    table = np.c_[bunch['data'], bunch['target']]
    return pd.DataFrame(data=table, columns=header)

def preprocess_data(df, test_size=0.2, random_state=0):
    """Split a feature/target DataFrame into training and testing sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column is the target; all preceding columns are
        treated as features.
    test_size : float, optional
        Forwarded to ``train_test_split`` as ``test_size`` (default 0.2,
        the value that used to be hard-coded).
    random_state : int, optional
        Forwarded to ``train_test_split`` as ``random_state`` so the split is
        reproducible (default 0, the value that used to be hard-coded).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Features are every column but the last; the last column is the label
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def build_model():
    """Create an unfitted XGBoost classifier with this project's settings.

    Returns
    -------
    XGBClassifier
        A tree-booster classifier using the ``multi:softprob`` objective and
        the ``mlogloss`` evaluation metric, seeded with ``random_state=2``.
    """
    settings = {
        "booster": "gbtree",
        "objective": "multi:softprob",
        "random_state": 2,
        "n_jobs": -1,
        "eval_metric": 'mlogloss',
    }
    return XGBClassifier(**settings)

def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and hand the same object back.

    Parameters
    ----------
    model : object
        Any estimator exposing a ``fit(X, y)`` method.
    X_train, y_train
        Training features and labels, passed to ``fit`` unchanged.

    Returns
    -------
    object
        The ``model`` argument itself (not the return value of ``fit``).
    """
    fit = model.fit
    fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on the test set and log the accuracy.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test
        Test features passed to ``model.predict``.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    float
        The value returned by ``accuracy_score`` for the predictions.
    """
    y_pred = model.predict(X_test)
    # accuracy_score takes the ground truth first, then the predictions
    score = accuracy_score(y_test, y_pred)
    logging.info(f"Model Accuracy: {score:.4f}")
    return score

def hyperparameter_tuning(model, X_train, y_train, param_grid=None, cv=5):
    """Search a hyperparameter grid with GridSearchCV and return the winner.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV`` as the base model.
    X_train, y_train
        Training features and labels used for the cross-validated search.
    param_grid : dict, optional
        Mapping of parameter names to candidate values. When omitted, the
        built-in grid over ``max_depth``, ``learning_rate`` and
        ``n_estimators`` (4 x 3 x 3 = 36 candidates) is used.
    cv : int, optional
        Value forwarded to ``GridSearchCV`` as ``cv`` (default 5).

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted grid search.
    """
    # Build the default grid per call so no mutable default is shared
    if param_grid is None:
        param_grid = {
            'max_depth': [3, 4, 5, 6],
            'learning_rate': [0.01, 0.1, 0.2],
            'n_estimators': [50, 100, 200]
        }
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    logging.info(f"Best Parameters: {grid_search.best_params_}")
    return grid_search.best_estimator_

def main():
    """Run the whole pipeline: load, split, train, evaluate, tune, re-evaluate.

    The accuracy of the baseline model and of the tuned model are both
    logged by ``evaluate_model``; the total elapsed time is logged at the end.
    """
    logging.info("Starting the pipeline...")

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() can jump if the system clock changes.
    start_time = time.perf_counter()

    # Load and preprocess data
    df = load_data()
    X_train, X_test, y_train, y_test = preprocess_data(df)

    # Build and train the baseline model
    model = build_model()
    model = train_model(model, X_train, y_train)

    # Evaluate the baseline model
    evaluate_model(model, X_test, y_test)

    # Hyperparameter tuning; the returned estimator replaces the baseline
    model = hyperparameter_tuning(model, X_train, y_train)

    # Final evaluation after tuning
    evaluate_model(model, X_test, y_test)

    end_time = time.perf_counter()
    logging.info(f"Pipeline completed in {end_time - start_time:.2f} seconds.")

# Entry point: run the pipeline only when this code is executed directly
# (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Starting the pipeline...
Model Accuracy: 1.0000


Fitting 5 folds for each of 36 candidates, totalling 180 fits


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Model Accuracy: 1.0000
Pipeline completed in 0.60 seconds.
