In [None]:
%load_ext autoreload
%autoreload 2
import os
import sys
from collections import defaultdict
from copy import deepcopy
from os.path import join

import imodels
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge, Lasso, RidgeCV, ElasticNetCV, LinearRegression, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Set up a linear regression problem
The ground-truth model is fit to the entire dataset (with the model selected via CV), whereas the other models are fit to only a small subset.

In [None]:
# Candidate linear models for the grid search: each estimator is a value for
# the "est" parameter that fit_and_get_feats passes to the auto-regressor.
PARAM_GRID_LINEAR_REGRESSION = [
    {
        "est": [
            RidgeCV(), ElasticNetCV(), LinearRegression(), LassoCV()
        ],
    },
]

# Split configuration, named so it can be tuned in one place.
SPLIT_SEED = 42
TEST_FRACTION = 0.99  # only ~1% of the rows end up in the "small subset" train split

X, y, feature_names = imodels.get_clean_dataset("california_housing")
# Split the *raw* data first; the two views below are standardized independently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SPLIT_SEED, test_size=TEST_FRACTION
)

# preprocess gt data: the ground-truth view standardizes the full dataset
X = StandardScaler().fit_transform(X)
y = StandardScaler().fit_transform(y.reshape(-1, 1)).flatten()

# preprocess split data: scalers are fit on the train split only and then
# applied to the test split, so no test statistics leak into training
trans = StandardScaler()
X_train = trans.fit_transform(X_train)
X_test = trans.transform(X_test)
transy = StandardScaler()
y_train = transy.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test = transy.transform(y_test.reshape(-1, 1)).flatten()
print("shapes", X.shape, y.shape, "nunique",
      np.unique(y).size, '-> train', X_train.shape)


def fit_and_get_feats(X_train, y_train, topk=2, X_eval=None, y_eval=None):
    """Select linear models by cross-validation and refit the top-ranked ones.

    Runs ``imodels.AutoInterpretableRegressor`` over
    ``PARAM_GRID_LINEAR_REGRESSION`` with 3-fold CV, then refits the ``topk``
    best-ranked candidates on all of ``(X_train, y_train)``.

    Parameters
    ----------
    X_train, y_train
        Data used both for the model search and for the final refits.
    topk : int, default 2
        Number of top-ranked candidates to refit. Clamped to the number of
        candidates actually evaluated by the search.
    X_eval, y_eval : optional
        Held-out data used for the ``test_score`` column. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per refit model, best-ranked first, with columns ``model``,
        ``train_score``, ``test_score``, ``coef`` and ``intercept``.
    """
    # Fall back to the notebook's global test split when no explicit
    # evaluation data is supplied (keeps existing call sites working).
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    m = imodels.AutoInterpretableRegressor(
        param_grid=PARAM_GRID_LINEAR_REGRESSION, refit=True)
    m.fit(X_train, y_train, cv=3)

    # Candidates ordered by CV rank; only the 'params' column is needed below.
    ranked = pd.DataFrame(m.est_.cv_results_).sort_values(
        "rank_test_score").reset_index(drop=True)

    # Refit top models with best params
    d = defaultdict(list)
    for i in range(min(topk, len(ranked))):
        params = ranked.loc[i, 'params']
        # Work on copies so neither the search's best_estimator_ nor the
        # estimator instances referenced by the parameter grid are mutated
        # or fitted as a side effect of this loop.
        pipe = deepcopy(m.est_.best_estimator_).set_params(**deepcopy(params))
        pipe.fit(X_train, y_train)
        # best_estimator_ is indexed like a pipeline here; its first step is
        # the fitted linear model.
        est = pipe.steps[0][1]
        d['model'].append(est)
        d['train_score'].append(est.score(X_train, y_train))
        d['test_score'].append(est.score(X_eval, y_eval))
        d['coef'].append(est.coef_)
        d['intercept'].append(est.intercept_)
    return pd.DataFrame(d)


# Ground truth: the single best CV-selected model, fit on the full standardized data.
d_gt = fit_and_get_feats(X_train=X, y_train=y, topk=1)
# Comparison: the two best CV-selected models, fit on the small train split only.
d_small = fit_and_get_feats(X_train=X_train, y_train=y_train, topk=2)
# Stack the results with the ground-truth row first.
d = pd.concat([d_gt, d_small])

### Interpretation
Note: these are the coefficients after standardizing the inputs (and the target).

In [None]:
# Build one column of coefficients per fitted model, with the intercept as the
# last row. Row 0 of `d` is the ground-truth fit on the full dataset.
# list(...) keeps this a plain list concatenation even if feature_names is an
# array-like rather than a list.
out = {'feature_names': list(feature_names) + ['Intercept']}
for i in range(len(d)):
    row = d.iloc[i]
    coef = row['coef'].tolist() + [float(row['intercept'])]
    if i == 0:
        out['GT'] = coef
    else:
        # Label by class name, dropping a trailing "CV" (RidgeCV -> Ridge).
        # Slicing the repr by a fixed width mangled names without that suffix
        # (e.g. "LinearRegression()" became "LinearRegressi").
        model_name = type(row['model']).__name__
        if model_name.endswith('CV'):
            model_name = model_name[:-2]
        out[f'{model_name} ({i})'] = coef

coefs = pd.DataFrame.from_dict(out)
# Sort features by the ground-truth coefficient and use its largest magnitude
# as a symmetric color scale so that zero sits at the palette's center.
col1 = coefs.columns[1]
coefs = coefs.sort_values(by=col1)
vabs = np.max(np.abs(coefs[col1]))

# Model scores (coefficient arrays are shown in the styled table instead)
display(d.round(3).drop(
    columns=['coef', 'intercept']))
display(
    coefs
    .style.background_gradient(
        cmap=sns.diverging_palette(
            20, 220, as_cmap=True, center='dark'),
        vmin=-vabs, vmax=vabs
    )
    .format(precision=2)
)

# Let's ask GPT some questions about the models

In [None]:
import guidance

In [None]:
# connect to a chat model like GPT-4 or Vicuna
# NOTE(review): presumably needs OpenAI credentials configured in the environment — confirm
gpt4 = guidance.llms.OpenAI("gpt-4-0314")
# vicuna = guidance.llms.transformers.Vicuna("your_path/vicuna_13B", device_map="auto")

# Two-turn chat template: the model first names three experts for {{query}}
# (captured as 'expert_names'), then writes a joint answer (captured as 'answer').
EXPERTS_TEMPLATE = '''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}

{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}

{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}

{{#user~}}
Great, now please answer the question as if these experts had collaborated in writing a joint anonymous answer.
{{~/user}}

{{#assistant~}}
{{gen 'answer' temperature=0 max_tokens=500}}
{{~/assistant}}
'''

experts = guidance(EXPERTS_TEMPLATE, llm=gpt4)

# Run the program; as the cell's last expression the result is displayed.
experts(query='How can I be more productive?')