# Imports

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier



# Load and split the data

## Classification - breast cancer

In [12]:
# Load dataset
dataset = load_breast_cancer(return_X_y=True)
Xc, yc = load_breast_cancer(return_X_y=True)
feature_names = load_breast_cancer().feature_names

Xc = pd.DataFrame(Xc, columns=feature_names)
yc = pd.Series(yc)

# Split into train/val/test
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, random_state=42, stratify=yc)

## Regression - cars prices

In [11]:
data = "https://storage.googleapis.com/edulabs-public-datasets/CAR%20DETAILS%20FROM%20CAR%20DEKHO.csv"
df = pd.read_csv(data)
df['manufacturer'] = df['name'].str.split(' ').str[0]

Xr = df.drop(['name', 'selling_price'], axis=1)
yr = df['selling_price']

# One-hot encode categorical features
Xr = pd.get_dummies(Xr, drop_first=True)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)

# AdaBoost

In [15]:
ada_classifier = AdaBoostClassifier(n_estimators=50, random_state=432)
ada_classifier.fit(Xc_train, yc_train)
ada_predictions = ada_classifier.predict(Xc_test)

In [16]:
print("\nClassification Report:\n", classification_report(yc_test, ada_predictions))
print("\nConfusion Matrix:\n", confusion_matrix(yc_test, ada_predictions))


Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.94        42
           1       0.95      0.99      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


Confusion Matrix:
 [[38  4]
 [ 1 71]]


In [None]:
ada_classifier.estimators_

In [None]:
ada_classifier.estimator_weights_

# XGBOOST

https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn

Lower learning rate = More trees, slower training, better generalization

Higher learning rate = Fewer trees, faster training, higher risk of overfitting

In [60]:
import xgboost as xgb
from sklearn import metrics

## Regression task

In [65]:
xgb_model = xgb.XGBRegressor(learning_rate=0.1, n_estimators=100)
xgb_model.fit(Xr_train, yr_train)
print(f"R2 score (test): {xgb_model.score(Xr_test, yr_test)}")
print(f"R2 score (train): {xgb_model.score(Xr_train, yr_train)}")
print(f"MAPE (test): {metrics.mean_absolute_percentage_error(yr_test, xgb_model.predict(Xr_test))}")
print(f"MAPE (train): {metrics.mean_absolute_percentage_error(yr_train, xgb_model.predict(Xr_train))}")

R2 score (test): 0.7358630299568176
R2 score (train): 0.9388269186019897
MAPE (test): 0.31680622696876526
MAPE (train): 0.279675155878067


### Tackling overfit



*   max_depth=3 → shallow trees generalize better
*   min_child_weight=5 → controls complexity in leaf splits
* subsample & colsample_bytree → introduce randomness











In [73]:
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    # learning_rate=0.05,        # reduce learning rate
    # max_depth=3,              # reduce from default (6)
    # min_child_weight=15,       # higher = less likely to overfit
    # subsample=0.8,            # row sampling
    # colsample_bytree=0.8,     # feature sampling
    # reg_alpha=100,   # L1 (sparse model)
    # reg_lambda=100,   # L2 (penalizes large leaf weights)
    random_state=42
)
xgb_model.fit(Xr_train, yr_train)
y_pred = xgb_model.predict(Xr_test)
print(f"R2 score (test): {xgb_model.score(Xr_test, yr_test)}")
print(f"R2 score (train): {xgb_model.score(Xr_train, yr_train)}")
print(f"MAPE (test): {metrics.mean_absolute_percentage_error(yr_test, xgb_model.predict(Xr_test))}")
print(f"MAPE (train): {metrics.mean_absolute_percentage_error(yr_train, xgb_model.predict(Xr_train))}")

R2 score (test): 0.723900318145752
R2 score (train): 0.908577561378479
MAPE (test): 0.3264654576778412
MAPE (train): 0.30861762166023254


### With early stopping

In [69]:
xgb_model = xgb.XGBRegressor(
    learning_rate=0.05,
    n_estimators=100,
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    early_stopping_rounds=50,
    random_state=42
)

xgb_model.fit(
    Xr_train, yr_train,
    eval_set=[(Xr_test, yr_test)],
    verbose=False
)

print(f"R2 score (test): {xgb_model.score(Xr_test, yr_test)}")
print(f"R2 score (train): {xgb_model.score(Xr_train, yr_train)}")
print(f"MAPE (test): {metrics.mean_absolute_percentage_error(yr_test, xgb_model.predict(Xr_test))}")
print(f"MAPE (train): {metrics.mean_absolute_percentage_error(yr_train, xgb_model.predict(Xr_train))}")


R2 score (test): 0.6361368894577026
R2 score (train): 0.8137874007225037
MAPE (test): 0.43956294655799866
MAPE (train): 0.4219423234462738


## Classification

Your task: train classification model on breast cancer dataset

In [None]:
xgb_model = xgb.XGBClassifier()