# From SHAP to EBM

# Setup

In [None]:
import pandas as pd
import plotly.express as px
from xgboost import XGBRegressor

In [None]:
# Read the diamonds CSV (its first column becomes the index), then keep a
# reproducible 5000-row random sample and renumber the rows from 0.
df = pd.read_csv("../data/diamonds.csv", index_col=0)
df = df.sample(n=5000, random_state=42).reset_index(drop=True)
df.head()

# Data Processing

In [None]:
# Keep only rows whose x, y and z are all strictly positive and whose z is
# below 30 (presumably removes unmeasured and outlier dimensions -- confirm
# against the dataset description).
# .copy() makes the filtered frame an independent object, so the column
# assignments below do not raise SettingWithCopyWarning on pandas versions
# that run without copy-on-write.
df = df[(df.x > 0) & (df.y > 0) & (df.z > 0) & (df.z < 30)].copy()

# Convert the three grade columns to ordered categoricals. The category lists
# define the ordering used for comparisons and sorting (presumably lowest to
# highest grade -- confirm). Values not in a list become NaN.
df["cut"] = pd.Categorical(
    df["cut"],
    categories=["Fair", "Good", "Very Good", "Premium", "Ideal"],
    ordered=True,
)
df["color"] = pd.Categorical(
    df["color"], categories=["J", "I", "H", "G", "F", "E", "D"], ordered=True
)
df["clarity"] = pd.Categorical(
    df["clarity"],
    categories=["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"],
    ordered=True,
)
# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

In [None]:
# Print a concise summary of df: column dtypes, non-null counts, memory usage.
df.info()

# Data Exploration

In [None]:
# Pairwise scatter plots of the numeric columns, drawn with small
# half-transparent markers.
fig = px.scatter_matrix(
    data_frame=df,
    dimensions=["carat", "depth", "table", "price", "x", "y", "z"],
).update_traces(marker={"size": 3, "opacity": 0.5})
fig.show()

In [None]:
# Price distribution for each cut category.
px.violin(
    data_frame=df,
    x="cut",
    y="price",
    color="cut",
    title="Price by Cut",
)

In [None]:
# Price distribution for each color category.
px.violin(
    data_frame=df,
    x="color",
    y="price",
    color="color",
    title="Price by Color",
)

In [None]:
# Price distribution for each clarity category.
px.violin(
    data_frame=df,
    x="clarity",
    y="price",
    color="clarity",
    title="Price by Clarity",
)

In [None]:
# Price against carat, with points colored by cut.
px.scatter(
    data_frame=df,
    x="carat",
    y="price",
    color="cut",
)

# Modelling with XGBoost

In [None]:
# 80/20 split: draw a reproducible random 80% of the rows for training and
# use every remaining row for testing. "price" is the target, so it is
# removed from the feature frames and looked up by row label for the targets.
train_x = df.sample(frac=0.8, random_state=42).drop(columns="price")
test_x = df.drop(index=train_x.index, columns="price")
train_y = df.loc[train_x.index, "price"]
test_y = df.loc[test_x.index, "price"]

In [None]:
# Gradient-boosted regressor configured through the scikit-learn wrapper's own
# parameter names (learning_rate, random_state) rather than the native booster
# aliases (eta, seed), which the wrapper only forwards through **kwargs.
model = XGBRegressor(
    objective="reg:squarederror",
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_estimators=100,
    # Lets XGBoost take pandas categorical columns without manual encoding.
    enable_categorical=True,
)
model.fit(train_x, train_y)
predicted_y = model.predict(test_x)
# test_y is a Series, so the resulting frame carries the test rows' labels;
# predicted_y is a plain array in the same row order as test_x.
prediction_df = pd.DataFrame({"actual": test_y, "predicted": predicted_y})

In [None]:
def plot_gof(prediction_df: pd.DataFrame) -> None:
    """Show goodness-of-fit diagnostics for a set of predictions.

    Two Plotly figures are displayed: a predicted-vs-actual scatter plot with
    a y = x reference line, and a histogram of the errors
    (actual - predicted).

    Args:
        prediction_df: Frame with numeric "actual" and "predicted" columns.
    """
    scatter_gof_fig = px.scatter(
        prediction_df, x="predicted", y="actual", title="Goodness of Fit"
    )
    # Span the reference line over the full range of BOTH columns. Ending it
    # at the largest prediction cut the line short whenever an actual value
    # was larger, and starting at 0 missed any negative predictions.
    value_columns = prediction_df[["actual", "predicted"]]
    line_start = min(0, value_columns.min().min())
    line_end = value_columns.max().max()
    scatter_gof_fig.add_shape(
        type="line",
        x0=line_start,
        y0=line_start,
        x1=line_end,
        y1=line_end,
    )
    # Fixed, square canvas so the y = x line is drawn at 45 degrees when both
    # axes cover similar ranges.
    scatter_gof_fig.update_layout(autosize=False, width=600, height=600)
    scatter_gof_fig.show()

    errors = prediction_df["actual"] - prediction_df["predicted"]
    px.histogram(errors, title="Error Distribution", nbins=500).update_layout(
        showlegend=False
    ).show()


def compute_metrics(prediction_df: pd.DataFrame) -> dict[str, float]:
    """Return the mean absolute error and root-mean-square error.

    Args:
        prediction_df: Frame with numeric "actual" and "predicted" columns.

    Returns:
        A dict with keys "mae" and "rmse", both computed from the residuals
        actual - predicted.
    """
    residuals = prediction_df["actual"].sub(prediction_df["predicted"])
    mean_abs_error = residuals.abs().mean()
    root_mean_sq_error = residuals.pow(2).mean() ** 0.5
    return {"mae": mean_abs_error, "rmse": root_mean_sq_error}


# Display the goodness-of-fit plots, then compute MAE and RMSE for the same
# predictions; as the last expression, the metrics dict is the cell output.
plot_gof(prediction_df)
compute_metrics(prediction_df)