In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy
import imodels
import eval_feat_select
from collections import defaultdict
from typing import List
import dvu
import imodelsx
import imodelsx.viz
import imodelsx.llm
from pprint import pprint
import feat_select
import viz_feat_select

import openai
# Point the OpenAI client at a file containing the API key.
# NOTE(review): this is an absolute path inside one user's home directory, so the
# notebook is not portable as written — consider taking the location from an
# environment variable or a configurable path instead.
openai.api_key_path = '/home/chansingh/.OPENAI_KEY'
# Apply the plotting style provided by the `dvu` package for the figures below.
dvu.set_style()


In [None]:
# Select the outcome to study and load its data. The loader is chosen by a
# substring match on the outcome name ('iai' or 'tbi').
outcome = 'tbi_young'
if 'iai' in outcome:
    X, y, feats_raw, feats_abbrev_unique = feat_select.get_iai_data(outcome)
elif 'tbi' in outcome:
    X, y, feats_raw, feats_abbrev_unique = feat_select.get_tbi_data(outcome)
else:
    # Without this branch an unrecognized outcome would either hit a NameError
    # on `X` below or, worse, silently reuse X/y left over from a previous run
    # of this cell in the same kernel.
    raise ValueError(
        f"Unrecognized outcome {outcome!r}: expected a name containing 'iai' or 'tbi'"
    )
print(f"Unique features: {len(feats_abbrev_unique)}", 'X shape', X.shape)


# # plt.figure(figsize=(8, 12))
# # n = df_full.shape[1] - 1
# # plt.barh(y=np.arange(n), width=df_full.corr()["outcome"][:-1])
# # plt.yticks(np.arange(n), pd.Series(df_full.columns[:-1]).apply(lambda x: x[:20]))
# # plt.grid()
# # plt.show()

In [None]:
# Evaluate a single, fixed feature subset: the PECARN feature ordering registered
# for the current `outcome` in feat_select.FEATS.
# NOTE(review): `rng` is only referenced by the commented-out random-subset
# alternative below; it is unused by the active code in this cell.
rng = np.random.default_rng(42)
# feats_select = rng.choice(list(feats_abbrev_unique), size=5, replace=False)
# feats_select = ['AbdTrauma']
feats_select = feat_select.FEATS[outcome]['pecarn_feats_ordered']
# Translate the abbreviated feature names into column indexes of X.
idxs_raw = feat_select.abbrevs_to_idxs_raw(feats_select, feats_raw)

# Score the selected columns; the result is kept in `mets`.
mets = eval_feat_select.evaluate_features(
    X[:, idxs_raw], y, seed=42, class_weight=2, return_pr_curve=True
)
# prec, rec, thresh = mets['roc_auc_curve']
# plt.plot(rec, prec, '.')
# plt.xlabel('Recall')
# plt.ylabel('Precision')

In [None]:
def compute_mets_avg(strategies, feats_abbrev_unique, X, y, n_seeds=2, base_seed=42):
    """Evaluate feature-ordering strategies and average the metrics over seeds.

    For every strategy and every seed, the features are ordered with
    ``feat_select.get_feats_ordered`` and each prefix of that ordering
    (first 1 feature, first 2 features, ...) is scored with
    ``eval_feat_select.evaluate_features``. The per-seed metric tables are then
    averaged element-wise.

    Note: this function reads ``feats_raw`` from the enclosing (notebook-global)
    scope; it must be defined before the function is called.

    Params
    ------
    strategies
        Iterable of strategy names passed to ``feat_select.get_feats_ordered``.
    feats_abbrev_unique
        Candidate abbreviated feature names to be ordered.
    X, y
        Data and labels; columns of ``X`` are selected via ``X[:, idxs_raw]``.
    n_seeds
        Number of seeds (0 .. n_seeds - 1) to run per strategy. Defaults to 2,
        the value that was previously hard-coded.
    base_seed
        Offset added to the seed passed to ``evaluate_features``. Defaults to
        42, the value that was previously hard-coded.

    Returns
    -------
    mets_avg
        ``defaultdict`` mapping each strategy to a DataFrame with one row per
        prefix length. It holds the mean of every metric column (including
        ``n_feats``) and a matching ``<column>_sem`` column.

    Raises
    ------
    ValueError
        If ``n_seeds`` is smaller than 1 (there would be nothing to average).
    """
    if n_seeds < 1:
        raise ValueError(f"n_seeds must be >= 1, got {n_seeds}")

    mets_list_strategies = defaultdict(list)
    for strategy in tqdm(strategies):
        for seed in range(n_seeds):
            mets_seed = defaultdict(list)
            feats_ordered = feat_select.get_feats_ordered(
                feats_abbrev_unique, strategy=strategy, seed=seed
            )
            # Score every prefix of the ordering: top-1, top-2, ..., all features.
            for n_feats in range(1, len(feats_ordered) + 1):
                feats_select = feats_ordered[:n_feats]
                idxs_raw = feat_select.abbrevs_to_idxs_raw(feats_select, feats_raw)
                met_scores = eval_feat_select.evaluate_features(
                    X[:, idxs_raw], y, seed=base_seed + seed
                )
                for k in met_scores:
                    mets_seed[k].append(met_scores[k])
                mets_seed["n_feats"].append(n_feats)
            mets_list_strategies[strategy].append(pd.DataFrame(mets_seed))

    # convert mets_list_strategies to mets_avg
    mets_avg = defaultdict(list)
    for strategy in strategies:
        runs = mets_list_strategies[strategy]
        cols = runs[0].columns
        # Stack the per-seed tables once: shape (n_runs, n_prefixes, n_columns).
        stacked = np.asarray([run.values for run in runs])
        mets_mean = pd.DataFrame(data=np.mean(stacked, axis=0), columns=cols)
        # SEM uses np.std's default ddof=0 (population std), as before.
        mets_sem = pd.DataFrame(
            data=np.std(stacked, axis=0) / np.sqrt(len(runs)),
            columns=[k + "_sem" for k in cols],
        )
        mets_avg[strategy] = pd.concat([mets_mean, mets_sem], axis=1)

    return mets_avg


# mets_avg = compute_mets_avg(["gpt-4-0314", "pecarn", "random", "pecarn___gpt-4-0314"], feats_abbrev_unique, X, y)
# Rank only the features that are NOT part of the PECARN rule for the current
# outcome. The PECARN list is looked up per outcome (the same lookup used when
# evaluating the PECARN baseline) rather than always using the IAI list, which
# would not match the features of a TBI outcome.
pecarn_feats_outcome = feat_select.FEATS[outcome]['pecarn_feats_ordered']
mets_avg = compute_mets_avg(
    ["gpt-4-0314", "random"],
    [feat for feat in feats_abbrev_unique if feat not in pecarn_feats_outcome],
    X,
    y,
)

### Possibly-memorized plot

In [None]:
# viz_feat_select.viz_curves(mets_avg, strategies=['pecarn', 'gpt-4-0314', 'random'], outcome=outcome, n_end=8)
# Draw the curves for the GPT-4 ordering next to the random ordering.
# n_end=8 is forwarded to viz_curves (presumably the last feature count shown — TODO confirm).
viz_feat_select.viz_curves(
    mets_avg,
    strategies=['gpt-4-0314', 'random'],
    outcome=outcome,
    n_end=8,
)
plt.show()

### Extrapolating to new features plot

In [None]:
# NOTE(review): the strategies requested here ('pecarn', 'pecarn___gpt-4-0314') are not
# among those passed to compute_mets_avg earlier in this notebook (only "gpt-4-0314" and
# "random" are), so `mets_avg` holds no results for them. Because `mets_avg` is a
# defaultdict(list), indexing it with a missing strategy yields an empty list instead of
# raising. Recompute `mets_avg` with these strategies before relying on this figure.
# n_start=8-1 is forwarded to viz_curves (presumably the first feature count shown — TODO confirm).
viz_feat_select.viz_curves(mets_avg, strategies=['pecarn', 'pecarn___gpt-4-0314'], outcome=outcome, n_start=8-1)
plt.show()