In [None]:
#Import all necessary libraries

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import kagglehub # Import kagglehub to download datasets from Kaggle.

print("Libraries imported successfully!") # Print confirmation message.

Libraries imported successfully!


In [None]:
#Data Loading and Preparation - Iowa Housing Dataset
#Login to Kaggle Hub to access datasets.
print("\nLogging into Kaggle Hub...")
kagglehub.login() # Authenticate with Kaggle.
print("Kaggle Hub login successful.")


Logging into Kaggle Hub...


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle Hub login successful.
Kaggle credentials set.
Kaggle credentials successfully validated.


In [None]:
# Download Iowa Housing dataset from Kaggle.
print("\nDownloading Iowa Housing dataset from Kaggle...")
dansbecker_home_data_for_ml_course_path = kagglehub.dataset_download('dansbecker/home-data-for-ml-course')

# Load Iowa training data.
print("\nLoading Iowa Housing training data...")
iowa_file_path = dansbecker_home_data_for_ml_course_path + '/train.csv' # Path to training data.
iowa_data = pd.read_csv(iowa_file_path)
print("Iowa dataset loaded.")
print("Iowa dataset loaded. Preview of first 5 rows:")
print(iowa_data.head())



Downloading Iowa Housing dataset from Kaggle...

Loading Iowa Housing training data...
Iowa dataset loaded.
Iowa dataset loaded. Preview of first 5 rows:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      

In [None]:
# Define features to use for Iowa dataset
features_to_use_iowa = ['LotArea', 'OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea',
                        'GarageArea', 'SalePrice', 'Neighborhood', 'BldgType', 'HouseStyle',
                        'ExterQual', 'KitchenQual', 'FireplaceQu']
iowa_data_subset = iowa_data[features_to_use_iowa].copy()
print(f"Selected {len(features_to_use_iowa)} features for Iowa dataset")

Selected 13 features for Iowa dataset


In [None]:
# Define target and features
y = iowa_data_subset['SalePrice']  # Target variable
X = iowa_data_subset.drop('SalePrice', axis=1)  # Features
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (1460, 12), y shape: (1460,)


In [None]:
# Identify numerical and categorical
numerical_cols_iowa = X.select_dtypes(exclude='object').columns
categorical_cols_iowa = X.select_dtypes(include='object').columns
print(f"Numerical columns ({len(numerical_cols_iowa)}): {list(numerical_cols_iowa)}")
print(f"Categorical columns ({len(categorical_cols_iowa)}): {list(categorical_cols_iowa)}")

Numerical columns (6): ['LotArea', 'OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea', 'GarageArea']
Categorical columns (6): ['Neighborhood', 'BldgType', 'HouseStyle', 'ExterQual', 'KitchenQual', 'FireplaceQu']


In [None]:
# Data Preprocessing for Iowa Housing Dataset

# Create preprocessor for Iowa dataset

# Create pipeline for numerical features
numerical_transformer_iowa = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer_iowa = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine transformers using ColumnTransformer
preprocessor_iowa = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_iowa, numerical_cols_iowa),
        ('cat', categorical_transformer_iowa, categorical_cols_iowa)
    ])

print("Iowa dataset preprocessing pipeline created successfully.")

Iowa dataset preprocessing pipeline created successfully.


**1.1: Gradient Boosting**

1.1.a Creating pipeline with GradientBoostingRegressor

In [None]:
iowa_pipeline_gbr = Pipeline(steps=[
    ('preprocessor', preprocessor_iowa),
    ('model', GradientBoostingRegressor(random_state=0))
])
print("Pipeline created with GradientBoostingRegressor")

Pipeline created with GradientBoostingRegressor


1.1.b: Use GridSearchCV to find best hyperparameters

In [None]:
print("Defining hyperparameter grid...")
param_grid_gbr = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

print("Creating GridSearchCV object...")
grid_search_gbr = GridSearchCV(
    iowa_pipeline_gbr,
    param_grid_gbr,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

# Fit the GridSearchCV object to the data
grid_search_gbr.fit(X, y)
print("GridSearchCV fitting completed!")

Defining hyperparameter grid...
Creating GridSearchCV object...
GridSearchCV fitting completed!


1.1.c: Print best hyperparameters and MAE score

In [None]:
print("Best GradientBoostingRegressor results:")
print(f"Best parameters: {grid_search_gbr.best_params_}")
print(f"Best negative MAE score: {grid_search_gbr.best_score_}")
print(f"Best MAE score: {-grid_search_gbr.best_score_}")
best_gbr_mae = -grid_search_gbr.best_score_

Best GradientBoostingRegressor results:
Best parameters: {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 100}
Best negative MAE score: -17899.668973366846
Best MAE score: 17899.668973366846


**1.2: XGBoost**

1.2.a: Create pipeline with XGBRegressor

In [None]:
iowa_pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor_iowa),
    ('model', XGBRegressor(random_state=0))
])
print("Pipeline created with XGBRegressor")

Pipeline created with XGBRegressor


1.2.b: Use GridSearchCV to find best hyperparameters

In [None]:
print("Defining hyperparameter grid...")
param_grid_xgb = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

print("Creating GridSearchCV object...")
grid_search_xgb = GridSearchCV(
    iowa_pipeline_xgb,
    param_grid_xgb,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

# Fit the GridSearchCV object to the data
grid_search_xgb.fit(X, y)
print("GridSearchCV fitting completed!")

Defining hyperparameter grid...
Creating GridSearchCV object...
GridSearchCV fitting completed!


1.2.c: Print best hyperparameters and MAE score

In [None]:
print("Best XGBRegressor results:")
print(f"Best parameters: {grid_search_xgb.best_params_}")
print(f"Best negative MAE score: {grid_search_xgb.best_score_}")
print(f"Best MAE score: {-grid_search_xgb.best_score_}")
best_xgb_mae = -grid_search_xgb.best_score_

Best XGBRegressor results:
Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}
Best negative MAE score: -18310.4111328125
Best MAE score: 18310.4111328125


**1.3: Comparison**

In [None]:
print(f"GradientBoostingRegressor MAE: {best_gbr_mae}")
print(f"XGBRegressor MAE: {best_xgb_mae}")

GradientBoostingRegressor MAE: 17899.668973366846
XGBRegressor MAE: 18310.4111328125


GradientBoostingRegressor performed better (lower MAE).

**Gradient Boosting**

Gradient Boosting generally makes better predictions than a single decision tree, so it’s less likely to overfit. It works well with mixed data types (numbers and categories) and can handle missing values. You can also see which features matter most, which helps interpret the results. On the downside, it can be slow on large datasets, needs careful tuning of its settings, and isn’t as fast as XGBoost.

**XGBoost**

XGBoost is a faster, more scalable version of gradient boosting. It has built-in regularization to reduce overfitting, handles sparse data efficiently, and uses multiple cores to speed up training. However, if you don’t tune it properly, it can still overfit. It also may use more memory on big datasets, and its many parameters can feel overwhelming at first.

# Exercise 2: Adult Dataset - Gradient Boosting and XGBoost

In [12]:
# Load Adult Dataset (UCI Adult Income dataset)

# Download the Adult dataset
adult_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names_adult = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                      "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                      "hours-per-week", "native-country", "income"]

In [13]:
# Load the Adult dataset
adult_data = pd.read_csv(adult_data_url, names=column_names_adult, na_values="?", skipinitialspace=True)
print("Adult dataset loaded. Preview of first 5 rows:")
print(adult_data.head())

Adult dataset loaded. Preview of first 5 rows:
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=5

In [14]:
# Define features and target for Adult dataset
X_adult = adult_data.drop('income', axis=1)  # Features
y_adult = adult_data['income']  # Target variable
print(f"X_adult shape: {X_adult.shape}, y_adult shape: {y_adult.shape}")

X_adult shape: (32561, 14), y_adult shape: (32561,)


In [15]:
# Identify numerical and categorical columns
numerical_cols_adult = X_adult.select_dtypes(exclude='object').columns
categorical_cols_adult = X_adult.select_dtypes(include='object').columns
print(f"Numerical columns ({len(numerical_cols_adult)}): {list(numerical_cols_adult)}")
print(f"Categorical columns ({len(categorical_cols_adult)}): {list(categorical_cols_adult)}")

Numerical columns (6): ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical columns (8): ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


In [16]:
# Data Preprocessing for Adult Dataset

# Create preprocessor for Adult dataset

# Create pipeline for numerical features
numerical_transformer_adult = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer_adult = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine transformers using ColumnTransformer
preprocessor_adult = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_adult, numerical_cols_adult),
        ('cat', categorical_transformer_adult, categorical_cols_adult)
    ])

print("Adult dataset preprocessing pipeline created successfully.")

Adult dataset preprocessing pipeline created successfully.


In [17]:
print("Checking and preparing the target variable for Adult dataset...")
print(f"Original target values: {y_adult.unique()}")
y_adult_encoded = y_adult.map({'<=50K': 0, '>50K': 1})
print(f"Encoded target values: {y_adult_encoded.unique()}")

Checking and preparing the target variable for Adult dataset...
Original target values: ['<=50K' '>50K']
Encoded target values: [0 1]


**2.1: Gradient Boosting**

2.1.a: Create pipeline with GradientBoostingClassifier

In [18]:
adult_pipeline_gbc = Pipeline(steps=[
    ('preprocessor', preprocessor_adult),
    ('model', GradientBoostingClassifier(random_state=0))
])
print("Pipeline created with GradientBoostingClassifier")

Pipeline created with GradientBoostingClassifier


2.1.b: Use GridSearchCV to find best hyperparameters

In [None]:
print("Defining hyperparameter grid...")
param_grid_gbc = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

print("Creating GridSearchCV object...")
grid_search_gbc = GridSearchCV(
    adult_pipeline_gbc,
    param_grid_gbc,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Fit the GridSearchCV object to the data
grid_search_gbc.fit(X_adult, y_adult)
print("GridSearchCV fitting completed!")

Defining hyperparameter grid...
Creating GridSearchCV object...


2.1.c: Print best hyperparameters and accuracy score

In [None]:
print(f"Best parameters: {grid_search_gbc.best_params_}")
print(f"Best accuracy score: {grid_search_gbc.best_score_}")
best_gbc_accuracy = grid_search_gbc.best_score_

**2.2: XGBoost**

2.2.a: Create pipeline with XGBClassifier

In [None]:
adult_pipeline_xgbc = Pipeline(steps=[
    ('preprocessor', preprocessor_adult),
    ('model', XGBClassifier(random_state=0))
])
print("Pipeline created with XGBClassifier")

2.2.b: Use GridSearchCV to find best hyperparameters

In [None]:
print("Defining hyperparameter grid...")
param_grid_xgbc = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7]
}

print("Creating GridSearchCV object...")
grid_search_xgbc = GridSearchCV(
    adult_pipeline_xgbc,
    param_grid_xgbc,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# Fit the GridSearchCV object to the data
# XGBoost requires numerical targets
grid_search_xgbc.fit(X_adult, y_adult_encoded)
print("GridSearchCV fitting completed!")

2.2.c: Print best hyperparameters and accuracy score

In [None]:
print(f"Best parameters: {grid_search_xgbc.best_params_}")
print(f"Best accuracy score: {grid_search_xgbc.best_score_}")
best_xgbc_accuracy = grid_search_xgbc.best_score_

**2.3: Comparison**

2.3.a: Compare performance

In [None]:
print("Comparing performance of GradientBoostingClassifier and XGBClassifier:")
print(f"GradientBoostingClassifier accuracy: {best_gbc_accuracy}")
print(f"XGBClassifier accuracy: {best_xgbc_accuracy}")

XGBClassifier performed better (higher accuracy).

2.3.b: Discuss advantages and disadvantages


The GradientBoostingClassifier demonstrates several positive aspects. The model shows strong resistance to overfitting while  performing effectively across various data types. The model demonstrates strong performance when applied to datasets of moderate size.  The scikit-learn framework facilitates convenient tool integration for this classifier.

The algorithm comes with its own set of  limitations. The execution time of this algorithm exceeds XGBoost especially when handling extensive datasets. The implementation of  GradientBoostingClassifier does not optimize memory resources or processing time efficiency. The algorithm operates sequentially which makes parallel  execution more difficult.

XGBoost stands out for its quicker training and prediction speed because of its parallel  processing features. The model shows better efficiency in memory usage which enables it to work with massive datasets. The  model includes default regularization settings which help avoid overfitting. The algorithm frequently outperforms others while  requiring only limited parameter adjustments.

XGBoost demonstrates its own set of limitations during implementation. The algorithm  presents higher complexity levels for beginners in terms of both learning and practical implementation. The absence of scikit- learn framework results in creating an additional software dependency. The tool might prove excessive when used with datasets that  are relatively small.

Both algorithms work effectively for classification tasks yet XGBoost demonstrates superior performance and efficiency  at the expense of added complexity. GradientBoosting shows better tool integration with scikit-learn and tends  to be more user-friendly for those who already know the framework.