# XGBoost Tutorial

XGBoost is a common tree-based ensemble machine learning algorithm that uses a gradient boosting framework.
It is known for its performance and speed in various machine learning tasks, especially on structured/tabular data.

## Attempt 1

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

### Import the data

In [None]:
# Read the Stata file into a DataFrame.
data = pd.read_stata("data/raw/usa_00005.dta")

# Feature matrix: every column except the target.
X = data.drop('inctot', axis=1)

# Target: the 'inctot' column.
y = data['inctot']


### Prep Data for XGBoost

In [None]:


# Shape before encoding.
print(X.shape)

# Expand categorical columns into indicator columns, keeping every level.
X_encoded = pd.get_dummies(X, drop_first=False)
print(X_encoded.shape)

# Keep only columns where more than 1% of the rows are non-zero.
cols_to_keep = [
    name
    for name, values in X_encoded.items()
    if (values != 0).mean() > 0.01
]
X_encoded = X_encoded[cols_to_keep]

# Shape after dropping the near-empty columns.
print(X_encoded.shape)

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap both splits in XGBoost's DMatrix container.
dtest = xgb.DMatrix(X_test, label=y_test)
dtrain = xgb.DMatrix(X_train, label=y_train)



### Define XGBoost Model Hyperparameters


In [None]:
# Upper bound on boosting rounds, and how many consecutive rounds without
# improvement are tolerated before training stops early.
num_round = 10_000
early_stopping_rounds = 50

# Datasets whose metric is reported after every boosting round.
evallist = [
    (dtrain, 'train'),
    (dtest, 'eval'),
]

# Booster settings.
params = {
    'eta': 0.1,                        # learning rate (step-size shrinkage)
    'max_depth': 10,                   # maximum depth of each tree
    'min_child_weight': 100,           # minimum summed instance weight in a child
    'objective': 'reg:squarederror',   # regression with squared-error loss
    'eval_metric': 'rmse',             # metric reported on each evaluation set
    'verbosity': 2,                    # info-level logging
}



### Train the Model

In [None]:
# Train the booster. `evals` (and everything after `num_boost_round`) is
# keyword-only in xgboost.train, so those arguments are passed by name rather
# than by position.
# With several evaluation sets, early stopping follows the last one listed.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=evallist,
    early_stopping_rounds=early_stopping_rounds,
)

In [None]:
# Keep only rows with a usable target: drop the 9999999 missing-value code
# and any non-positive values.
# NOTE: only X and y are filtered here; X_encoded, dtrain and dtest are not
# rebuilt by this cell.
mask = (y > 0) & (y != 9999999)
X, y = X[mask], y[mask]


In [None]:
# Display how many target values y currently holds.
len(y)


In [None]:
# Display the (rows, columns) shape of the encoded feature matrix.
X_encoded.shape



## Attempt 2

### Import Data

In [None]:
# Read the Stata file, keep a reproducible 10% random sample of its rows,
# and renumber the index from zero.
data = (
    pd.read_stata("data/raw/usa_00005.dta")
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

# Feature matrix: every column except the target.
X = data.drop('inctot', axis=1)

# Target: the 'inctot' column.
y = data['inctot']


### Prep Data for XGBoost

In [None]:


# Shape before encoding.
print(X.shape)

# Expand categorical columns into indicator columns, keeping every level.
X_encoded = pd.get_dummies(X, drop_first=False)
print(X_encoded.shape)

# Keep only columns where more than 1% of the rows are non-zero.
cols_to_keep = [
    name
    for name, values in X_encoded.items()
    if (values != 0).mean() > 0.01
]
X_encoded = X_encoded[cols_to_keep]

# Shape after dropping the near-empty columns.
print(X_encoded.shape)

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap both splits in XGBoost's DMatrix container.
dtest = xgb.DMatrix(X_test, label=y_test)
dtrain = xgb.DMatrix(X_train, label=y_train)



### Define XGBoost Model Hyperparameters


In [None]:
# Upper bound on boosting rounds, and how many consecutive rounds without
# improvement are tolerated before training stops early.
num_round = 10_000
early_stopping_rounds = 50

# Datasets whose metric is reported after every boosting round.
evallist = [
    (dtrain, 'train'),
    (dtest, 'eval'),
]

# Booster settings.
params = {
    'eta': 1,                          # learning rate (step-size shrinkage)
    'max_depth': 10,                   # maximum depth of each tree
    'min_child_weight': 100,           # minimum summed instance weight in a child
    'objective': 'reg:squarederror',   # regression with squared-error loss
    'eval_metric': 'rmse',             # metric reported on each evaluation set
    'verbosity': 2,                    # info-level logging
}



### Train the Model

In [None]:
# Train the booster. `evals` (and everything after `num_boost_round`) is
# keyword-only in xgboost.train, so those arguments are passed by name rather
# than by position.
# With several evaluation sets, early stopping follows the last one listed.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=evallist,
    early_stopping_rounds=early_stopping_rounds,
)

### Visualize the Results

In [None]:
# Predict on the held-out set using only the trees up to and including the
# best early-stopping iteration.
best_round_count = bst.best_iteration + 1
y_pred_plt = bst.predict(dtest, iteration_range=(0, best_round_count))
y_test_plt = y_test

# (A log-transformed version of this plot was sketched here and left disabled.)

# Predicted vs. actual, with a dashed red y = x line for reference.
axis_range = [y_test_plt.min(), y_test_plt.max()]
plt.scatter(y_test_plt, y_pred_plt, alpha=0.1)
plt.plot(axis_range, axis_range, 'r--')

plt.title("XGBoost: Predicted vs Actual Income")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()



In [None]:
# Distribution of the held-out target values, in 50 bins.
plt.hist(y_test, bins=50, color='blue', alpha=0.7)

plt.title("Histogram of Actual Income Values")
plt.xlabel("Income")
plt.ylabel("Frequency")
plt.show()

## Attempt 3

### Import Data

In [None]:
# Value of 'inctot' that is treated as missing and filtered out below.
missing_val = 9999999

# Read the Stata file, keep a reproducible 10% random sample of its rows,
# and renumber the index from zero.
data = (
    pd.read_stata("data/raw/usa_00005.dta")
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

# Feature matrix: every column except the target.
X = data.drop('inctot', axis=1)

# Target: the 'inctot' column.
y = data['inctot']

# Drop the rows whose target equals the missing-value code, from both the
# features and the target so they stay aligned.
mask = y.ne(missing_val)
X, y = X[mask], y[mask]


### Prep Data for XGBoost

In [None]:


# Shape before encoding.
print(X.shape)

# Expand categorical columns into indicator columns, keeping every level.
X_encoded = pd.get_dummies(X, drop_first=False)
print(X_encoded.shape)

# Keep only columns where more than 1% of the rows are non-zero.
cols_to_keep = [
    name
    for name, values in X_encoded.items()
    if (values != 0).mean() > 0.01
]
X_encoded = X_encoded[cols_to_keep]

# Shape after dropping the near-empty columns.
print(X_encoded.shape)

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap both splits in XGBoost's DMatrix container.
dtest = xgb.DMatrix(X_test, label=y_test)
dtrain = xgb.DMatrix(X_train, label=y_train)



### Define XGBoost Model Hyperparameters


In [None]:
# Upper bound on boosting rounds, and how many consecutive rounds without
# improvement are tolerated before training stops early.
num_round = 10_000
early_stopping_rounds = 50

# Datasets whose metric is reported after every boosting round.
evallist = [
    (dtrain, 'train'),
    (dtest, 'eval'),
]

# Booster settings.
params = {
    'eta': 1,                          # learning rate (step-size shrinkage)
    'max_depth': 10,                   # maximum depth of each tree
    'min_child_weight': 100,           # minimum summed instance weight in a child
    'objective': 'reg:squarederror',   # regression with squared-error loss
    'eval_metric': 'rmse',             # metric reported on each evaluation set
    'verbosity': 2,                    # info-level logging
}



### Train the Model

In [None]:
# Train the booster. `evals` (and everything after `num_boost_round`) is
# keyword-only in xgboost.train, so those arguments are passed by name rather
# than by position.
# With several evaluation sets, early stopping follows the last one listed.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=evallist,
    early_stopping_rounds=early_stopping_rounds,
)

### Visualize the Results

In [None]:
# Predict on the held-out set using only the trees up to and including the
# best early-stopping iteration.
best_round_count = bst.best_iteration + 1
y_pred_plt = bst.predict(dtest, iteration_range=(0, best_round_count))
y_test_plt = y_test

# (A log-transformed version of this plot was sketched here and left disabled.)

# Predicted vs. actual, with a dashed red y = x line for reference.
axis_range = [y_test_plt.min(), y_test_plt.max()]
plt.scatter(y_test_plt, y_pred_plt, alpha=0.1)
plt.plot(axis_range, axis_range, 'r--')

plt.title("XGBoost: Predicted vs Actual Income")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()



## Hyperparameter Tuning

In [None]:
# Learning rates to compare; every other booster setting is held fixed.
etas = [0.01, 0.1, 0.5, 1]

# None of these depend on eta, so they are set once instead of on every
# loop iteration.
evallist = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 10000
early_stopping_rounds = 50

# Trained booster for each learning rate, keyed by eta.
models = {}

for eta in etas:
    params = {
        'eta': eta,
        'max_depth': 10,
        'min_child_weight': 100,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'verbosity': 1
    }

    print(f"Training with eta={eta}")
    # `evals` is keyword-only in xgboost.train, so everything after the first
    # two arguments is passed by name.
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_round,
        evals=evallist,
        early_stopping_rounds=early_stopping_rounds,
    )
    models[eta] = bst

# best_score is the early-stopping metric (RMSE here) at each model's best
# iteration.
# NOTE(review): the set used for early stopping is dtest, so these scores are
# measured on the same data that picked the stopping point; confirm that is
# intended.
scores = [models[eta].best_score for eta in etas]

# Log-scaled x axis because the eta values span two orders of magnitude.
plt.plot(etas, scores, marker='o')
plt.xscale('log')
plt.xlabel('Eta')
plt.ylabel('Best RMSE Score')
plt.title('Effect of Eta on Model Performance')
plt.show()
