In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy
import imodels
import eval_feat_select
from collections import defaultdict
from typing import List
import dvu
import imodelsx
import imodelsx.viz
import imodelsx.llm
from pprint import pprint
import feat_select
import viz_feat_select

import os

import openai

# Path of the file holding the OpenAI API key.
# Resolved from the OPENAI_API_KEY_PATH environment variable when set, otherwise
# from `.OPENAI_KEY` in the current user's home directory, so the notebook no
# longer depends on one specific user's absolute home path.
openai.api_key_path = os.environ.get(
    "OPENAI_API_KEY_PATH", os.path.expanduser("~/.OPENAI_KEY")
)

# Apply the plotting style provided by `dvu`.
dvu.set_style()

# Outcome analysed by every cell below; it selects the dataset configuration.
outcome = 'csi' # 'iai-i', 'tbi_young', 'tbi_old'
dset_dict = feat_select.DSET_DICTS[outcome]

In [41]:
# Exploratory look at the raw feature names of the "csi_pecarn_prop" dataset.
X, y, feats_raw = imodels.get_clean_dataset(
    "csi_pecarn_prop", data_source="imodels"
)
feats_raw = pd.Series(feats_raw)

# Mask of feature names to drop: every name ending in "_nan".
idxs = feats_raw.str.endswith("_nan")
# Further exclusions that were tried and are currently disabled:
#   idxs |= feats_raw.isin(['AgeTwoPlus', 'AgeInMonth'])
#   idxs |= feats_raw.str.startswith(k)  for k in ['LtCostalTender', 'RtCostalTender']

# Only the name list is filtered. The matching column drop on X
# (`X = X[:, ~idxs]`) is disabled, so after this line the columns of X no longer
# line up one-to-one with the entries of feats_raw.
feats_raw = feats_raw.loc[~idxs]

# Disabled follow-ups from an earlier function-style version of this cell:
#   feats_abbrev_unique = set(feats_raw.apply(raw_to_abbrev))
#   return X, y, feats_raw, feats_abbrev_unique
feats_raw.to_numpy()

array(['SITE', 'Predisposed', 'AxialLoadAnyDoc', 'axialloadtop', 'IsEms',
       'Position_IDEMS', 'Position_L', 'Position_PA', 'Position_S',
       'Position_W', 'Immobilization2', 'MedsRecd2', 'ArrPtIntub2',
       'AgeInYears', 'gender_F', 'LOC_0.0', 'LOC_1.0', 'ambulatory_0.0',
       'ambulatory_1.0', 'HighriskDiving_0.0', 'HighriskDiving_1.0',
       'HighriskFall_0.0', 'HighriskFall_1.0', 'HighriskHanging_0.0',
       'HighriskHanging_1.0', 'HighriskHitByCar_0.0',
       'HighriskHitByCar_1.0', 'HighriskMVC_0.0', 'HighriskMVC_1.0',
       'HighriskOtherMV_0.0', 'HighriskOtherMV_1.0', 'Clotheslining_0.0',
       'Clotheslining_1.0', 'AlteredMentalStatus2_0.0',
       'AlteredMentalStatus2_1.0', 'FocalNeuroFindings2_0.0',
       'FocalNeuroFindings2_1.0', 'PainNeck2_0.0', 'PainNeck2_1.0',
       'PosMidNeckTenderness2_0.0', 'PosMidNeckTenderness2_1.0',
       'TenderNeck2_0.0', 'TenderNeck2_1.0', 'Torticollis2_0.0',
       'Torticollis2_1.0', 'SubInjHead2_0.0', 'SubInjHead2_1.0',


In [4]:
# Display the current contents of `feats_raw`.
# NOTE(review): the saved output of this cell (an empty Series, execution
# count 4) is older than the cell above (execution count 41); re-run the
# notebook top to bottom to refresh it.
feats_raw

Series([], dtype: object)

In [None]:
# Load the dataset for the configured outcome through the loader stored in
# `dset_dict`. This rebinds X, y and feats_raw, replacing any values left over
# from earlier exploratory cells.
X, y, feats_raw, feats_abbrev_unique = dset_dict["get_data"](outcome=outcome)

# Quick sanity summary of what was loaded.
print(f"Unique features: {len(feats_abbrev_unique)}", "X shape", X.shape)
print("Positive outcomes", y.sum())

# Disabled: horizontal bar plot of each column's correlation with "outcome",
# computed from a `df_full` frame that is not defined in this notebook:
#   plt.barh(y=np.arange(n), width=df_full.corr()["outcome"][:-1])

In [None]:
# Generator kept for the (disabled) random-subset alternative below.
rng = np.random.default_rng(42)

# Features to evaluate: the ordered PECARN feature list for this outcome.
# Disabled alternatives:
#   feats_select = rng.choice(list(feats_abbrev_unique), size=5, replace=False)
#   feats_select = ['AbdTrauma']
feats_select = dset_dict["pecarn_feats_ordered"]
idxs_raw = feat_select.abbrevs_to_idxs_raw(feats_select, feats_raw)

# Fit and score a classifier on just the selected columns.
mets = eval_feat_select.evaluate_features(
    X[:, idxs_raw],
    y,
    seed=42,
    class_weight=2,
    return_pr_curve=True,
)

# NOTE(review): the curve is requested with `return_pr_curve=True` but read from
# the key 'roc_auc_curve' and plotted as precision vs. recall -- confirm the key
# name against eval_feat_select.
prec, rec, thresh = mets["roc_auc_curve"]

ax = plt.gca()
ax.plot(rec, prec, ".")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
plt.show()

In [None]:
def compute_mets_avg(
    strategies, feats_abbrev_unique, X, y, dset_dict, num_seeds=2, feats_raw=None
):
    """Compute seed-averaged metrics for growing prefixes of an ordered feature list.

    For every strategy and every seed in ``range(num_seeds)`` the candidate
    features are ordered with ``feat_select.get_feats_ordered``. A classifier is
    then evaluated (``eval_feat_select.evaluate_features``) on the first
    1, 2, ..., len(ordering) features, and the resulting curves are averaged
    over seeds.

    Params
    ------
    strategies
        Iterable of strategy names passed on to ``feat_select.get_feats_ordered``.
    feats_abbrev_unique
        Candidate abbreviated feature names to order and select from.
    X
        Feature matrix, indexed as ``X[:, column_indices]``.
    y
        Outcome vector passed on to ``evaluate_features``.
    dset_dict
        Dataset configuration passed on to ``feat_select.get_feats_ordered``.
    num_seeds
        Number of seeds to average over; must be at least 1.
    feats_raw
        Raw feature names used to map abbreviations to columns of ``X``.
        When omitted, the module/notebook-level global ``feats_raw`` is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    defaultdict mapping each strategy to a DataFrame with one row per prefix
    length. It holds the seed-mean of every metric (plus ``n_feats``) and,
    in ``<name>_sem`` columns, the standard error over seeds. The standard
    error uses ``np.std`` with its default ``ddof=0``.

    Raises
    ------
    ValueError
        If ``num_seeds`` is smaller than 1.
    NameError
        If ``feats_raw`` is neither passed nor defined as a global.
    """
    if num_seeds < 1:
        raise ValueError("num_seeds must be at least 1")
    if feats_raw is None:
        # Backward-compatible fallback: older callers rely on the global.
        try:
            feats_raw = globals()["feats_raw"]
        except KeyError:
            raise NameError(
                "feats_raw was not passed and is not defined as a global"
            ) from None

    mets_list_strategies = defaultdict(list)
    for strategy in tqdm(strategies):
        for seed in range(num_seeds):
            mets_seed = defaultdict(list)
            feats_ordered = feat_select.get_feats_ordered(
                feats_abbrev_unique, dset_dict, strategy=strategy, seed=seed
            )
            # Evaluate every prefix of the ordering: top-1, top-2, ...
            for n_feats in range(1, len(feats_ordered) + 1):
                idxs_raw = feat_select.abbrevs_to_idxs_raw(
                    feats_ordered[:n_feats], feats_raw
                )
                met_scores = eval_feat_select.evaluate_features(
                    X[:, idxs_raw], y, seed=42 + seed
                )
                for k in met_scores:
                    mets_seed[k].append(met_scores[k])
                mets_seed["n_feats"].append(n_feats)
            mets_list_strategies[strategy].append(pd.DataFrame(mets_seed))

    # average over seed: convert mets_list_strategies to mets_avg
    mets_avg = defaultdict(list)
    for strategy in strategies:
        per_seed = mets_list_strategies[strategy]
        cols = per_seed[0].columns
        # Shape (n_seeds, n_prefixes, n_columns); built once, reused for both stats.
        stacked = np.stack([df.values for df in per_seed])
        mets_mean = pd.DataFrame(data=stacked.mean(axis=0), columns=cols)
        mets_sem = pd.DataFrame(
            data=stacked.std(axis=0) / np.sqrt(len(per_seed)),
            columns=[k + "_sem" for k in cols],
        )
        mets_avg[strategy] = pd.concat([mets_mean, mets_sem], axis=1)

    return mets_avg

### Possibly-memorized plot

In [None]:
# Compare feature orderings over the full candidate feature set.
# (The combined "pecarn___gpt-4-0314" strategy is currently left out.)
mets_avg = compute_mets_avg(
    strategies=["gpt-4-0314", "pecarn", "random"],
    feats_abbrev_unique=feats_abbrev_unique,
    X=X,
    y=y,
    dset_dict=dset_dict,
)

In [None]:
# Plot the metric curves, limited to as many features as the PECARN list holds,
# and save the figure before showing it.
viz_feat_select.viz_curves(
    mets_avg,
    outcome=outcome,
    strategies=["pecarn", "gpt-4-0314", "random"],
    n_pecarn=len(dset_dict["pecarn_feats_ordered"]),
    n_end=len(dset_dict["pecarn_feats_ordered"]),
)
plt.savefig(f"../results_llm/{outcome}_reselect_original.pdf")
# Disabled variant without the "pecarn" curve:
#   viz_feat_select.viz_curves(mets_avg, strategies=['gpt-4-0314', 'random'], outcome=outcome, n_end=8)
plt.show()

### Extrapolating to new features plot

In [None]:
# Repeat the comparison using only candidate features that are NOT part of the
# ordered PECARN feature list.
strategies = ["gpt-4-0314", "random"]
mets_avg = compute_mets_avg(
    strategies=strategies,
    feats_abbrev_unique=[
        feat
        for feat in feats_abbrev_unique
        if feat not in dset_dict["pecarn_feats_ordered"]
    ],
    X=X,
    y=y,
    dset_dict=dset_dict,
)

In [None]:
# Plot the curves for the non-PECARN features and save the figure.
# NOTE(review): n_start was written as `8-1`; the 8 looks like a hard-coded
# feature count tied to the current outcome -- confirm, and consider deriving it
# from dset_dict if `outcome` is changed.
viz_feat_select.viz_curves(
    mets_avg,
    strategies=strategies,
    outcome=outcome,
    n_start=7,
)
plt.savefig(f'../results_llm/{outcome}_reselect_unused.pdf')
plt.show()