In [3]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pd.read_csv(url, names=names)

X_train, X_test, y_train, y_test = train_test_split(df.drop('class', axis=1), df['class'], test_size=0.2, random_state=42)

# 2. Implement XGBoost
# eta (learning_rate) is small so we don't overshoot the minimum loss
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)

# 3. Predict
print(f"XGBoost Test Accuracy: {accuracy_score(y_test, xgb_model.predict(X_test)):.2f}")

XGBoost Test Accuracy: 0.72


In [2]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.3-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   - -------------------------------------- 2.1/72.0 MB 10.6 MB/s eta 0:00:07
   -- ------------------------------------- 4.5/72.0 MB 10.7 MB/s eta 0:00:07
   --- ------------------------------------ 6.3/72.0 MB 9.8 MB/s eta 0:00:07
   ---- ----------------------------------- 7.9/72.0 MB 9.4 MB/s eta 0:00:07
   ----- ---------------------------------- 9.4/72.0 MB 9.1 MB/s eta 0:00:07
   ------ --------------------------------- 11.3/72.0 MB 9.0 MB/s eta 0:00:07
   ------- -------------------------------- 13.1/72.0 MB 8.9 MB/s eta 0:00:07
   -------- ------------------------------- 14.7/72.0 MB 8.8 MB/s eta 0:00:07
   --------- ------------------------------ 16.5/72.0 MB 8.9 MB/s eta 0:00:07
   ---------- 

In [4]:
import numpy as np

# 1. Create a "Messy" version of the training data
X_train_messy = X_train.copy()
# Randomly replace 20% of the 'plas' (Glucose) column with NaN
ix = X_train_messy.sample(frac=0.2).index
X_train_messy.loc[ix, 'plas'] = np.nan

# 2. Train on the messy data
# XGBoost automatically learns the "Default Direction" for these NaNs
xgb_messy = XGBClassifier(n_estimators=50)
xgb_messy.fit(X_train_messy, y_train)

# 3. Predict on data that also has missing values
X_test_messy = X_test.copy()
X_test_messy.iloc[0:5, 1] = np.nan # Hide some glucose values in the test set

preds = xgb_messy.predict(X_test_messy)
print("XGBoost handled the missing values without needing Imputation!")

XGBoost handled the missing values without needing Imputation!


In [7]:
# 'reg_lambda' is L2 regularization (Hessian-based)
# 'reg_alpha' is L1 regularization (Lasso-based)
# 'gamma' makes the tree "conservative" (it only splits if gain > gamma)

xgb_tuned = XGBClassifier(
    n_estimators=1000, 
    learning_rate=0.01, 
    max_depth=3,
    gamma=0.5,           # Minimum loss reduction to split
    reg_lambda=2,        # L2 Regularization
    early_stopping_rounds=10 # Stop if error doesn't improve for 10 trees
)

# Use a validation set to monitor early stopping
xgb_tuned.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

print(f"Tuned XGBoost Accuracy: {accuracy_score(y_test, xgb_tuned.predict(X_test)):.2f}")
print(f"Trees used before stopping: {xgb_tuned.best_iteration}")

Tuned XGBoost Accuracy: 0.77
Trees used before stopping: 264
