In [1]:
import numpy as np
import pandas as pd
import altair as alt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression, load_boston
from sklearn.model_selection import train_test_split, cross_validate, GroupKFold
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import check_cv
import duplicate as dp
from functools import partial

# Select the 'mimetype' renderer for displaying Altair charts in this notebook.
# The available display front-ends are described at:
# https://altair-viz.github.io/user_guide/display_frontends.html#displaying-in-the-jupyter-notebook
alt.renderers.enable('mimetype')

RendererRegistry.enable('mimetype')

In [2]:
# Load the Boston housing data as plain arrays: feature matrix X and target y.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably needs an older scikit-learn release -- confirm the pinned version.
X, y = load_boston(return_X_y=True)

# The arrays come without column labels, so use positional names X_0, X_1, ...
feature_names = ['X_' + str(col) for col in range(X.shape[1])]

# One tidy frame holding every feature plus the target as the last column 'y'.
data = pd.DataFrame(X, columns=feature_names).assign(y=y)
data.head()

Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,y
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
# Experiment settings.
n_duplicates = 4  # passed to dp.simple_duplicate below
n_splits = 5      # number of cross-validation folds requested from the dp splitter

# Every original sample forms its own group: ids 0 .. len(y)-1.
base_groups = np.arange(y.size)

# Build the duplicated data set together with its per-row weights and group ids.
# NOTE(review): presumably each row is replicated n_duplicates times and the group id
# ties copies back to their source row -- verify against the `duplicate` module.
X_dup, y_dup, weights_dup, groups_dup = dp.simple_duplicate(X, y, n_duplicates)

# Metrics reported by cross_validate, keyed by the label used in its output ("test_<label>").
scoring = {
    label: make_scorer(metric)
    for label, metric in (
        ("MSE", metrics.mean_squared_error),
        ("R2", metrics.r2_score),
    )
}


In [4]:
# Scatter plot specification built on `data`: 'y_test' on the x axis, 'y' on the y axis,
# points coloured by the quantitative field 'X_1'.
# NOTE(review): `data` is created in this notebook with columns X_0..X_12 and 'y' only;
# no 'y_test' column is added anywhere in the visible cells, so rendering this chart
# will likely fail -- confirm where 'y_test' is supposed to come from.
# The chart object is only assigned here, not displayed.
predictions = alt.Chart(data).mark_point().encode(
    x='y_test',
    y='y',
    color='X_1:Q'
)

In [5]:

# Two identically configured regressors: one for the original data, one for the duplicated data.
boost = GradientBoostingRegressor(random_state=0)
boost_dup = GradientBoostingRegressor(random_state=0)

# --- Run 1: original samples, one group per sample -------------------------------------
# The splitter is created immediately before use so evaluation order matches a direct
# inline call.
splits = dp.group_k_fold_unique_test_groups(X, y, base_groups, n_splits)
output = cross_validate(
    boost,
    X,
    y,
    groups=base_groups,
    scoring=scoring,
    cv=splits,
)

# --- Run 2: duplicated samples -----------------------------------------------------------
# sample_weight is handed to the estimator's fit through fit_params; the scorers defined
# in `scoring` are called without weights.
splits_dup = dp.group_k_fold_unique_test_groups(X_dup, y_dup, groups_dup, n_splits)
output_dup = cross_validate(
    boost_dup,
    X_dup,
    y_dup,
    groups=groups_dup,
    fit_params={"sample_weight": weights_dup},
    scoring=scoring,
    cv=splits_dup,
)
    
def mean_var_cv_out(cv_out):
    """Collapse per-fold results into their mean and variance.

    Parameters
    ----------
    cv_out : dict
        Mapping from a string label to a sequence of per-fold values
        (for example the dictionary returned by ``cross_validate``).

    Returns
    -------
    dict
        For every label ``k`` in ``cv_out`` (in the same order) two entries:
        ``k + "_mean"`` holding ``np.mean`` of the values and ``k + "_var"``
        holding ``np.var`` of the values (numpy's default, i.e. ddof=0).
    """
    # Suffix/reducer pairs, applied in this order for every label.
    reducers = (("_mean", np.mean), ("_var", np.var))
    return {
        label + suffix: reduce_fn(values)
        for label, values in cv_out.items()
        for suffix, reduce_fn in reducers
    }

# Show the fold summaries side by side: (original data, duplicated data).
tuple(map(mean_var_cv_out, (output, output_dup)))

404 102
405 101
405 101
405 101
405 101
1616 102
1620 101
1620 101
1620 101
1620 101


({'fit_time_mean': 0.10366778373718262,
  'fit_time_var': 0.00023631606773960812,
  'score_time_mean': 0.0009811878204345702,
  'score_time_var': 5.193172910367139e-08,
  'test_MSE_mean': 8.695767071959676,
  'test_MSE_var': 0.725570080644878,
  'test_R2_mean': 0.8960373538263603,
  'test_R2_var': 0.00016759188747205692},
 {'fit_time_mean': 0.2801429271697998,
  'fit_time_var': 0.004177174901737998,
  'score_time_mean': 0.0015359878540039062,
  'score_time_var': 4.216538536638837e-07,
  'test_MSE_mean': 2.371482543289606,
  'test_MSE_var': 0.1169386026330567,
  'test_R2_mean': 0.9719874815876552,
  'test_R2_var': 4.069538408459185e-05})

In [None]:
# Scatter plot of the target 'y' against feature 'X_0', coloured by the quantitative
# feature 'X_1', drawn as a 200x200 panel.
point_marks = alt.Chart(data).mark_point()
chart = point_marks.encode(x='X_0', y='y', color='X_1:Q').properties(width=200, height=200)

# un-comment to view
# chart

In [None]:
# Pairwise correlations of every column in `data` (features and 'y'), reshaped from the
# square matrix into long form: one row per (variable 0, variable 1, correlation).
correlations = (
    data.corr()
    .stack()
    .reset_index()
    .rename(columns={0: 'correlation', 'level_0': 'variable 0', 'level_1': 'variable 1'})
)

# Heat-map of the correlation matrix: one rectangle per variable pair, 200x200 panel.
heatmap_marks = alt.Chart(correlations).mark_rect()
corr_mat = heatmap_marks.encode(
    x='variable 0:N',
    y='variable 1:N',
    color='correlation:Q',
).properties(width=200, height=200)

# Place the scatter plot and the heat-map in one figure. Each panel keeps its own
# colour scale because the two colour encodings represent different quantities.
combined_fig = alt.concat(chart, corr_mat).resolve_scale(color='independent')

# un-comment to view
# combined_fig