In [1]:
%%javascript
// Ask the notebook front-end to load the ExecuteTime extension through its `utils` helper.
utils.load_extension('execute_time/ExecuteTime')

<IPython.core.display.Javascript object>

In [2]:
# Import candidate models
from doubt import Boot, QuantileRegressor, QuantileRegressionForest
from sklearn.linear_model import (LinearRegression, PoissonRegressor, 
                                  GammaRegressor, HuberRegressor)
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Import datasets
from doubt.datasets import (Airfoil, Blog, Concrete, CPU, 
                            FacebookComments, FishBioconcentration,
                            FishToxicity, ForestFire, NewTaipeiHousing,
                            PowerPlant, Protein, Servo,
                            SpaceShuttle, Yacht)

# Import external libraries
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange
from scipy.stats import ks_2samp, entropy, kruskal
import matplotlib.pyplot as plt; plt.style.use('ggplot')
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
from collections import defaultdict
import seaborn as sns
sns.set_theme(style="whitegrid")
import shap

# Import internal classes
from distributions import DistributionShift
from src.psi import psi

In [4]:
pwd

'/Users/cmougan/Desktop/MonitoringUncertainty'

In [3]:
# Read only the first 10,000 rows of each split. Paths are relative to the
# kernel's working directory.
# NOTE(review): the recorded run failed with FileNotFoundError on
# 'data/dev_out.csv' - confirm the file exists before running the cells below.
read_opts = dict(nrows=10_000)
train = pd.read_csv('data/train.csv', **read_opts)
test = pd.read_csv('data/dev_out.csv', **read_opts)

FileNotFoundError: [Errno 2] No such file or directory: 'data/dev_out.csv'

In [None]:
# Columns removed from the feature matrices built from `train` and `test`;
# "fact_temperature" is the regression target.
cols = [
    "fact_time", "fact_latitude", "fact_longitude",
    "fact_temperature", "fact_cwsm_class", "climate",
]


In [None]:
# Training split: features are every column not listed in `cols`, with missing
# values replaced by the sentinel -1; the target is the observed temperature.
X_tr = train.drop(cols, axis=1).fillna(-1)
y_tr = train["fact_temperature"]

# Test split, prepared the same way.
X_te = test.drop(cols, axis=1).fillna(-1)
y_te = test["fact_temperature"]

In [None]:
# Seed shared by the bootstrap resampling and XGBoost so that a fresh
# "Restart & Run All" reproduces the same predictions and intervals.
SEED = 42

# Scaler kept for the optional feature scaling below (currently disabled).
standard_scaler = StandardScaler()

# Scale the dataset
#X_tr = standard_scaler.fit_transform(X_tr)
#X_te = standard_scaler.transform(X_te)


# Fit the regressor: a bootstrap wrapper (20 resamples) around a small XGBoost
# model; the wrapper later provides prediction intervals via `predict`.
regressor = Boot(
    XGBRegressor(max_depth=5, n_estimators=50, random_state=SEED),
    random_seed=SEED,
)
regressor.fit(X_tr, y_tr, n_boots=20)


In [None]:
# Point predictions and bootstrap intervals on the training split.
# NOTE(review): uncertainty=0.05 presumably requests 95% intervals - confirm
# against the doubt documentation.
train_output = regressor.predict(X_tr, uncertainty=0.05)
predictions, intervals = train_output


In [None]:
# Per-sample quantities on the training split:
#   - uncertainty: second interval column minus the first (interval width)
#   - mse: squared error of each prediction (no averaging is applied here)
interval_width = intervals[:, 1] - intervals[:, 0]
squared_error = (predictions - y_tr) ** 2
values = {
    "uncertainty_train": interval_width,
    "mse_train": squared_error,
}


In [None]:
# Same quantities on the test split; `predictions` and `intervals` are rebound
# to the test-set outputs.
predictions, intervals = regressor.predict(X_te, uncertainty=0.05)
values.update(
    uncertainty_test=intervals[:, 1] - intervals[:, 0],
    mse_test=(predictions - y_te) ** 2,
)

# One column per metric/split.
d = pd.DataFrame(values)

In [None]:
# Standardise each metric with statistics estimated on the training column
# only, then apply the same transform to the matching test column.

# Interval widths
sc = StandardScaler()
train_width = d['uncertainty_train'].values.reshape(-1, 1)
d['uncertainty_train'] = sc.fit_transform(train_width)
test_width = d['uncertainty_test'].values.reshape(-1, 1)
d['uncertainty_test'] = sc.transform(test_width)

# Squared errors (a fresh scaler, so `sc` ends up holding this one)
sc = StandardScaler()
train_sq_err = d['mse_train'].values.reshape(-1, 1)
d['mse_train'] = sc.fit_transform(train_sq_err)
test_sq_err = d['mse_test'].values.reshape(-1, 1)
d['mse_test'] = sc.transform(test_sq_err)



In [None]:
# Distribution of the standardised interval widths, train vs. test.
uncertainty_cols = d.drop(['mse_train', 'mse_test'], axis=1)
sns.violinplot(data=uncertainty_cols);

In [None]:
# Distribution of the standardised squared errors, train vs. test.
error_cols = d.drop(['uncertainty_train', 'uncertainty_test'], axis=1)
sns.violinplot(data=error_cols);

In [None]:
# Summary statistics of the metric columns in `d`.
d.describe()

## Detecting the source of uncertainty

In [None]:
# Surrogate model: regress the standardised test-set uncertainty on the test
# features, so the features driving the uncertainty can be inspected afterwards.
uncertainty_target = d['uncertainty_test']
reg = XGBRegressor()
reg.fit(X_te, uncertainty_target)

In [None]:
# In-sample error of the surrogate model (evaluated on the data it was fit on).
# Arguments follow sklearn's (y_true, y_pred) order; the value is the same as
# with the order swapped, since squared error is symmetric.
mean_squared_error(d['uncertainty_test'], reg.predict(X_te))

In [None]:
# Inspect the raw array of standardised test-set uncertainties.
d['uncertainty_test'].values

In [None]:

# Baseline for the surrogate model: the error of always predicting the mean
# uncertainty. mean_squared_error needs two equal-length arrays, so the scalar
# mean is broadcast to the length of the target before scoring (passing the
# bare scalar is rejected by sklearn's input validation).
uncertainty_test = d['uncertainty_test']
baseline_pred = np.full(len(uncertainty_test), np.mean(uncertainty_test))
mean_squared_error(uncertainty_test, baseline_pred)

In [None]:
# Explain the surrogate regressor `reg` on the test features.
explainer = shap.Explainer(reg)
shap_values = explainer(X_te)

# Alternative view (disabled): explanation of the first prediction only
# shap.plots.waterfall(shap_values[0])
# Bar plot summarising the SHAP values computed over X_te
shap.plots.bar(shap_values)

In [None]:
# Per-feature shift between train and test: the project's `psi` score
# (presumably the population stability index - confirm in src/psi.py) and the
# two-sample Kolmogorov-Smirnov statistic.
psi_dic = []
for c in X_tr.columns:
    psi_score = psi(X_tr[c], X_te[c])
    ks_stat = ks_2samp(X_tr[c], X_te[c]).statistic
    psi_dic.append([c, psi_score, ks_stat])

psi_dic = pd.DataFrame(psi_dic, columns=["column", "psi", 'ks'])

# Ten features with the largest PSI.
# NOTE: `values` is rebound here; it no longer refers to the metrics dict
# built earlier in the notebook.
top_by_psi = psi_dic.sort_values("psi", ascending=False).head(10)
keys = top_by_psi["column"].values
values = top_by_psi["psi"].values

plt.figure()
plt.bar(keys, values)
plt.xticks(rotation=45)
plt.show()



In [None]:
# Ten features with the largest KS statistic between train and test.
# NOTE: `keys` and `values` are rebound to the KS ranking here.
top_by_ks = psi_dic.sort_values("ks", ascending=False).head(10)
keys = top_by_ks["column"].values
values = top_by_ks["ks"].values

plt.figure()
plt.bar(keys, values)
plt.xticks(rotation=45)
plt.show()