In [None]:
from copy import deepcopy
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.linear_model import ElasticNetCV, LinearRegression, RidgeCV, LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y
from sklearn.utils.validation import _check_sample_weight
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm
from collections import defaultdict
import dvu
import pandas as pd
import matplotlib.pyplot as plt
import json
from matplotlib.colors import TwoSlopeNorm
from matplotlib.colors import Normalize
import joblib
import viz
from interpret import show

import imodels
from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.base import RegressorMixin, ClassifierMixin
from imodels.algebraic.gam_multitask import MultiTaskGAMRegressor

# Fit some simple GAMs

In [None]:
# Fit one multi-task GAM per dataset and pickle it under ../figs/ so that the
# visualization cell can reload it without refitting.
#
# The datasets / settings to run are controlled solely by the two loops below;
# there is intentionally no separate scalar `dset` / `fit_target_curves`
# configuration, because the loop variables would overwrite it anyway.

# NOTE(review): neither `kwargs` nor `results` is read anywhere in this cell.
# They are built once here (instead of on every iteration) and kept only in
# case another cell relies on them -- confirm and delete if not.
kwargs = dict(
    random_state=42,
    n_jobs=-2,
)
results = defaultdict(list)

for dset in ['bike_sharing', 'california_housing', 'diabetes_regr', 'heart']:
    for fit_target_curves in [True]:
        X, y, feature_names = imodels.get_clean_dataset(dset)

        # NOTE(review): both scalers are fit on the full dataset *before* the
        # train/test split, so test-set statistics leak into the
        # standardization. Left unchanged because visualize_dset repeats the
        # exact same preprocessing to rebuild the training matrix.
        X = StandardScaler().fit_transform(X)
        y = StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()

        # From here on `X` holds the *training* rows only.
        X, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

        gam = MultiTaskGAMRegressor(
            multitask=True, interactions=False, fit_target_curves=fit_target_curves)

        # Seed numpy's global RNG right before fitting so each fit is repeatable.
        np.random.seed(42)
        gam.fit(X, y_train)

        # Held-out metrics: Pearson correlation and the estimator's own score.
        print('test_corr', np.corrcoef(
            y_test, gam.predict(X_test))[0, 1].round(3))
        print('test r2', gam.score(X_test, y_test).round(3))

        # File name must stay in sync with the joblib.load call in visualize_dset.
        joblib.dump(
            gam, f'../figs/{dset}_gam_fit_target_curves={fit_target_curves}.pkl')

# Visualize the GAMs

In [None]:
def visualize_dset(dset, fit_target_curves=True):
    """Plot the per-feature shape curves of a previously fitted multi-task GAM.

    Loads the model pickled by the fitting cell, evaluates every EBM curve on
    an evenly spaced grid spanning each feature's training range, and draws one
    subplot per feature containing:

    * one translucent line per plotted EBM, colored by its linear-model
      coefficient for that feature;
    * (only if ``fit_target_curves``) the last EBM's curve as a dotted
      'initial' line;
    * the coefficient-weighted sum of all curves as a thick black
      'multi-task' line.

    When ``fit_target_curves`` is true, an extra panel in the right-most
    column shows histograms of the last ``num_features`` coefficients versus
    all the others.

    The figure is saved to
    ``../figs/gam_curves/{dset}_fit_target_curves={fit_target_curves}.pdf``,
    shown, and then closed.

    Parameters
    ----------
    dset : str
        Dataset name understood by ``imodels.get_clean_dataset``; also used to
        locate the pickled model and to name the output PDF.
    fit_target_curves : bool, default=True
        Must match the setting the model was fitted with; selects both the
        pickle to load and whether the last EBM is drawn as the 'initial' curve.

    Returns
    -------
    None
    """
    # NOTE(security): joblib.load unpickles arbitrary objects -- only load
    # files written by this notebook's fitting cell.
    gam = joblib.load(
        f'../figs/{dset}_gam_fit_target_curves={fit_target_curves}.pkl')

    # Rebuild the training design matrix exactly as the fitting cell did
    # (scale on the full data, then split with the same seed). `y` is passed
    # only so the split call mirrors the fitting cell; without stratification
    # its values do not influence which rows are selected, so it is not scaled.
    X, y, feature_names = imodels.get_clean_dataset(dset)
    X = StandardScaler().fit_transform(X)
    X_train, _, _, _ = train_test_split(X, y, random_state=42)

    num_features = X_train.shape[1]
    num_ebms = len(gam.ebms_)

    # Column j of `linspaces` sweeps feature j from its training min to max.
    n = 100
    linspaces = np.linspace(X_train.min(axis=0), X_train.max(axis=0), n)
    evals = gam._extract_ebm_features(linspaces)

    # Grid: C columns of feature panels plus one extra column for the histogram.
    C = 3
    R = int(np.ceil(num_features / C))
    fig = plt.figure(figsize=((C + 1) * 5.5,  R * 3))
    grid = plt.GridSpec(R, C + 1)
    for feat_num in tqdm(range(num_features)):
        plt.subplot(grid[feat_num // C, feat_num % C])

        # Columns of `evals` (and entries of coef_) belonging to this feature:
        # one per EBM, spaced `num_features` apart.
        idxs_feat_num = np.arange(
            feat_num, num_ebms * num_features, num_features)
        coefs = gam.lin_model.coef_[idxs_feat_num]

        # With target curves, the last EBM is drawn separately as 'initial';
        # otherwise every EBM is drawn as a colored line.
        if fit_target_curves:
            idxs, coefs_plotted = idxs_feat_num[:-1], coefs[:-1]
        else:
            idxs, coefs_plotted = idxs_feat_num, coefs

        # One color per plotted curve.
        # NOTE(review): assumes the viz helper returns one color per input
        # coefficient -- confirm against viz._get_diverging_colors_centered_at_zero.
        colors = viz._get_diverging_colors_centered_at_zero(coefs_plotted)

        # Each curve is drawn exactly once, so alpha and legend entries are
        # not multiplied by the number of EBMs.
        for ebm_num, idx in enumerate(idxs):
            plt.plot(linspaces[:, feat_num],
                     evals[:, idx],
                     alpha=0.5,
                     # presumably EBM k is the one predicting feature k -- verify
                     label=str(feature_names[ebm_num]),
                     color=colors[ebm_num])
        if fit_target_curves:
            plt.plot(linspaces[:, feat_num],
                     evals[:, idxs_feat_num[-1]], color='#444', linestyle=':', label='initial', lw=4)

        # Coefficient-weighted combination of all of this feature's curves.
        plt.plot(linspaces[:, feat_num], np.dot(
            evals[:, idxs_feat_num], coefs), color='black', lw=3, label='multi-task')
        plt.xlabel(feature_names[feat_num])

        dvu.line_legend()

    # Histogram panel in the spare right-most column (top rows).
    if fit_target_curves:
        plt.subplot(grid[0:2, -1])
        coefs_final = gam.lin_model.coef_[-num_features:]
        coefs_feat = gam.lin_model.coef_[:-num_features]
        plt.hist(coefs_final, bins=20, density=True,
                 color='black', label='Target curve coefs')
        plt.hist(coefs_feat, bins=20, density=True,
                 label='Feature curve coefs', alpha=0.5)
        plt.xlabel('Coefficient')
        plt.ylabel('Density')
        plt.legend()

    plt.tight_layout()
    plt.savefig(
        f'../figs/gam_curves/{dset}_fit_target_curves={fit_target_curves}.pdf', bbox_inches='tight')
    plt.show()
    # Release the figure explicitly: this function is called once per dataset
    # and non-inline backends would otherwise keep every figure alive.
    plt.close(fig)


# Render and save the curve figure for every dataset, in the same order used
# by the fitting cell; each call relies on the default fit_target_curves=True.
for dset in (
    'bike_sharing',
    'california_housing',
    'diabetes_regr',
    'heart',
):
    visualize_dset(dset)