In [1]:
import numpy as np
import pandas as pd
import altair as alt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression, load_boston
from sklearn.model_selection import train_test_split, cross_validate, GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import check_cv
import duplicate as dp
from functools import partial


# https://altair-viz.github.io/user_guide/display_frontends.html#displaying-in-the-jupyter-notebook
# alt.renderers.enable('mimetype') # works off-line
# alt.renderers.enable('notebook') 
alt.renderers.enable('default') # recommended

RendererRegistry.enable('default')

In [2]:
# get bootstrap.py from parent directory
%cd ..
import bootstrap as bs
import reed as reed
%cd -

/Users/jack/Documents/GitHub/re-education
/Users/jack/Documents/GitHub/re-education/duplication


In [3]:
X, y = load_boston(return_X_y=True)#make_regression(n_samples=1000,n_features=4, noise=0, random_state=0)

feature_names = [f'X_{i}' for i in range(X.shape[1])]
data = pd.DataFrame(data=X, columns=feature_names)
data['y'] = y
data.head()

Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,y
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
n_duplicates = 4
n_splits = 5
base_groups = np.arange(y.size)
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_dup, y_dup, weights_dup, groups_dup = dp.simple_duplicate(X_train, y_train, n_duplicates)

scoring = {
    "MSE": make_scorer(metrics.mean_squared_error),
    "R2": make_scorer(metrics.r2_score),
}


In [5]:
# use custom CV with groups
boost = GridSearchCV(
    # LinearRegression(),
    GradientBoostingRegressor(random_state=0,),
    # reed.StatsmodelsOLS(),
    param_grid={},
    cv=GroupKFold(n_splits=n_splits),
    
    
)
boost_dup = GridSearchCV(
    # LinearRegression(),
    GradientBoostingRegressor(random_state=0),
    # reed.StatsmodelsOLS(),
    param_grid={}, 
    cv=GroupKFold(n_splits=n_splits),


)


# output = cross_validate(
#     boost,
#     X,
#     y,
#     groups=base_groups,
#     scoring=scoring,
#     cv=dp.group_k_fold_unique_test_groups(X, y, base_groups, n_splits),
# )

# output_dup = cross_validate(
#     boost_dup,
#     X_dup,
#     y_dup,
#     groups=groups_dup,
#     fit_params={"sample_weight": weights_dup},
#     scoring=scoring,
#     cv=dp.group_k_fold_unique_test_groups(X_dup, y_dup, groups_dup, n_splits),
# )


def mean_var_cv_out(cv_out):
    means_vars = {}
    for k, v in cv_out.items():
        means_vars[k + "_mean"] = np.mean(v)
        means_vars[k + "_var"] = np.var(v)
    return means_vars


# mean_var_cv_out(output), mean_var_cv_out(output_dup)

# Train models, using bootstrapping

In [6]:
# use bootstrapping
def boost_param_extractor(estimator):
    return estimator.get_params()

sample_weight= weights_dup


results = bs.bootstrap(
    estimator=boost,
    X=X_train,
    y=y_train,
    error_score="raise",
    # groups=base_groups,
    parameter_extractor=boost_param_extractor,
    return_estimator=True,
    groups=True,
    n_jobs=1,
    sample_weight=sample_weight
)
print("Done un-duplicated")

results_dup = bs.bootstrap(
    estimator=boost_dup,
    X=X_dup,
    y=y_dup,
    # groups=groups_dup,
    parameter_extractor=boost_param_extractor,
    return_estimator=True,
    n_jobs=1,
    groups=True,
    sample_weight=sample_weight
)

# print(results['parameters'])

# print(scores)
# for score_dict in scores:
#     estimator = score_dict["estimator"]

Done un-duplicated


## Evaluate, compare models
Evaluate models on the (unduplicated) test data we set aside.

In [7]:
models = results['estimator']
models_dup = results_dup['estimator']

y_preds, y_preds_dup = ([model.predict(X_test) for model in ms] for ms in (models, models_dup))
y_preds, y_preds_dup = np.array(y_preds), np.array(y_preds_dup)




In [8]:

scores, scores_dup = ([metrics.mean_squared_error(y_pred, y_test) for y_pred in preds] for preds in (y_preds, y_preds_dup))

print(f"mean mse    : {np.mean(scores)}; mse var    : {np.var(scores)}")
print(f"mean dup mse: {np.mean(scores_dup)}; mse var dup: {np.var(scores_dup)}")

mean mse    : 12.670221765225381; mse var    : 2.779869526632825
mean dup mse: 10.82670118574453; mse var dup: 0.4808177083378803


In [16]:
# take an arbitrary point in the test set; let's see how its performance changes:
index = 0
point_y_hat = y_preds[:,index]
point_y_hat_dup = y_preds_dup[:, index]
point_y = y_test[index] * np.ones_like(point_y_hat)

point_data = pd.DataFrame(dict(
    point_y=point_y,
    point_y_hat=point_y_hat,
    point_y_hat_dup=point_y_hat_dup,
))




In [15]:
%matplotlib agg # remove to show figure
import matplotlib.pyplot as plt

plt.scatter(point_y_hat, point_y_hat_dup)
plt.gca().set_aspect('equal')
plt.xlabel("$\hat{y}$")
plt.ylabel("$\hat{y}$ dup")

plt.savefig('scatter.png')

- we see slightly more accurate estimates with substantially less variance in the errors when using the duplicated data
- TODO (after introducing econml model example so as to not prematurely harden the code?): wrap this up into a function that we can pass a model into.

# Visualise data

In [11]:
chart = alt.Chart(data).mark_point().encode(
    x='X_0',
    y='y',
    color='X_1:Q'
).properties(
    width=200,
    height=200
)

# un-comment to view
# chart

In [12]:
correlations = data.corr().stack().reset_index().rename(columns={0: 'correlation', 'level_0': 'variable 0', 'level_1': 'variable 1'})
# print(correlations.head())
corr_mat = alt.Chart(correlations).mark_rect().encode(
    x='variable 0:N',
    y='variable 1:N',
    color='correlation:Q'
).properties(
    width=200,
    height=200
)

combined_fig = alt.concat(
chart,
corr_mat
).resolve_scale(
    color='independent'
)

# un-comment to view
# combined_fig