In [2]:
%load_ext autoreload
%autoreload 2
import os
from os.path import join, expanduser
from tqdm import tqdm
import pandas as pd
import joblib
import sys
import numpy as np
from copy import deepcopy
from neuro.features.feat_select import get_alphas
from neuro.features.qa_questions import get_questions, get_merged_questions_v3_boostexamples

In [3]:
# Root folder that holds the shared sparse feature-selection outputs.
sparse_feats_dir = expanduser('~/mntv1/deep-fMRI/qa/sparse_feats_shared/')
# The listing itself is discarded (it is not the cell's last expression);
# the call still raises right away if the folder cannot be read.
os.listdir(sparse_feats_dir)

# Sub-folder of the run inspected in this notebook. Other runs used previously:
#   'qa_embedder___qa_questions_version=v3_boostexamples___ensemble1'
#   'eng1000___qa_questions_version=v1___mistralai-Mistral-7B-Instruct-v0.2'
qa_sparse_feats_dir = os.path.join(
    sparse_feats_dir,
    'qa_embedder___qa_questions_version=v3_boostexamples_merged___ensemble2',
)

# Grid of feature-selection alpha values; these appear in the saved filenames.
# Alternatives: get_alphas('eng1000'), np.logspace(0, -3, 20)
alphas = get_alphas('qa_embedder')

In [None]:
# Load the saved coefficient array for every (seed, alpha) pair found on disk.
#   enets    : one boolean mask per cell, True where a feature has at least one
#              non-zero coefficient along axis 0 of the saved array
#   nonzeros : number of True entries in that mask
# Cells whose file does not exist keep their initial NaN.
seeds = range(5)
nonzeros = pd.DataFrame(index=seeds, columns=alphas, dtype=float)
# Explicit object dtype: every cell is meant to hold a whole ndarray.
enets = pd.DataFrame(index=seeds, columns=alphas, dtype=object)
fnames = os.listdir(qa_sparse_feats_dir)
# Set lookup instead of scanning the directory listing for every (seed, alpha).
available_fnames = set(fnames)
for seed in tqdm(seeds):
    for alpha in tqdm(alphas):
        template = f'seed={seed}___feature_selection_frac=0.50___feature_selection_alpha={alpha:.2e}.joblib'
        if template not in available_fnames:
            continue
        coef_enet = joblib.load(join(qa_sparse_feats_dir, template))
        # np.any already returns a fresh array, so no defensive copy is needed.
        coef_enet_selected = np.any(np.abs(coef_enet) > 0, axis=0).squeeze()
        # `.at` is the scalar accessor: it stores the array as a single cell
        # value instead of trying to broadcast it as a list-like.
        enets.at[seed, alpha] = coef_enet_selected
        nonzeros.at[seed, alpha] = coef_enet_selected.sum()
# template = f'seed={seed}___feature_selection_frac=0.50___feature_selection_alpha={feature_selection_alpha:.2e}.joblib'
# os.listdir(qa_sparse_feats_dir)

In [None]:
# v3_boost_examples has 674, eng1000 has 985
nonzeros.columns.name = 'alpha'
nonzeros.index.name = 'seed'
# nonzeros.columns = np.arange(len(nonzeros.columns))
# enets.columns = nonzeros.columns
# nonzeros.columns = nonzeros.columns.round(4)

# For each alpha, combine the per-seed boolean masks stored in `enets`:
#   coefs_stable_dict[alpha] : features selected by every loaded seed
#   coefs_all_dict[alpha]    : features selected by at least one loaded seed
# The count of stable features is appended to `nonzeros` as a 'stable' row.
coefs_stable_dict = {}
coefs_all_dict = {}
for col in nonzeros.columns:
    coefs_list = enets[col]
    coefs_list = coefs_list[coefs_list.notna()]

    if len(coefs_list) == 0:
        # Nothing was loaded for this alpha. Record NaN and do NOT add dict
        # entries: previously this path reused the masks left over from the
        # preceding alpha (or raised NameError when it was the first column).
        # Note that positions in the dicts therefore index alphas with data.
        nonzeros.loc['stable', col] = np.nan
        continue

    # Rows are seeds, columns are features.
    seed_masks = np.vstack(coefs_list.values)
    coefs_all = seed_masks.max(axis=0)     # True if any seed selected the feature
    coefs_stable = seed_masks.min(axis=0)  # True only if all seeds selected it
    nonzeros.loc['stable', col] = np.sum(coefs_stable)
    # max/min return new arrays, so they can be stored without copying.
    coefs_all_dict[col] = coefs_all
    coefs_stable_dict[col] = coefs_stable

display(
    nonzeros
    .style
    .background_gradient(cmap='viridis', axis=None)
    .format('{:.0f}')
)

### Detailed breakdown

In [None]:
# questions = np.array(get_questions('v3_boostexamples', full=True))
questions = np.array(get_merged_questions_v3_boostexamples())
# print(len(questions))

# Walk through the first few stability masks (in dict insertion order) and show,
# for each one, which questions are newly selected and which ones were selected
# at the previous step but no longer are.
stable_masks = list(coefs_stable_dict.values())
qs_prev = []
# At most 5 levels, but never more than the number of masks actually available
# (indexing a missing level would raise IndexError).
for i in range(min(5, len(stable_masks))):
    qs = sorted(questions[stable_masks[i]].tolist())
    print(i, 'num questions:', len(qs))

    print('----STABLE---')
    # Sets make the membership checks cheap; display order stays sorted.
    prev_set = set(qs_prev)
    cur_set = set(qs)
    display([q for q in qs if q not in prev_set])
    for q in qs_prev:
        if q not in cur_set:
            print('[DROPPED]', q)

    # To inspect questions picked by only some seeds, index `questions` with the
    # matching entry of coefs_all_dict and drop the ones already in `qs`.

    qs_prev = qs

### Inspect top questions

In [None]:
questions = np.array(get_merged_questions_v3_boostexamples())
i_best = 3
# qs_best = sorted(questions[list(coefs_stable_dict.values())[i_best]].tolist())
# len(qs_best), qs_best

coefs_stable_vals = list(coefs_stable_dict.values())
# Marker for questions that are not selected by any stability mask. It must be
# larger than every real mask position and than i_best; the floor of 10 keeps
# the value used previously whenever there are at most 10 masks.
never_stable_idx = max(len(coefs_stable_vals), i_best + 1, 10)

df = pd.DataFrame(questions).rename(columns={0: 'question'})
df.index.name = 'question_num'
df['stable_idx'] = never_stable_idx
# Go from the last mask to the first so that each question ends up with the
# smallest mask position at which it is selected.
for i in range(len(coefs_stable_vals) - 1, -1, -1):
    df.loc[coefs_stable_vals[i], 'stable_idx'] = i
# Stable sort: questions with the same stable_idx keep their question_num order.
df = df.sort_values(by='stable_idx', kind='mergesort')
d_stable = df[df.stable_idx <= i_best]
print('n_stable', len(d_stable))

# display full questions no truncation
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
    display(d_stable)

d_stable_list = d_stable.question.values.tolist()
# print as numbered list
for i, q in enumerate(d_stable_list):
    print(f'{i + 1}. {q}')

In [None]:
# export as string
# One "<position>. <question>" line per question, numbered from 0 in the
# original order of `questions`; `df` only supplies the number of entries.
s = ''.join(f'{idx}. {questions[idx]}\n' for idx in range(len(df)))
s