In [1]:
%load_ext autoreload
%autoreload 2
import os
from os.path import join, expanduser
from tqdm import tqdm
import pandas as pd
import joblib
import sys
import numpy as np
from copy import deepcopy
from neuro.features.feat_select import get_alphas
from neuro.features.qa_questions import get_questions, get_merged_questions_v3_boostexamples

In [2]:
# Root directory that holds the shared sparse-feature outputs.
sparse_feats_dir = expanduser('~/mntv1/deep-fMRI/qa/sparse_feats_shared/')
os.listdir(sparse_feats_dir)  # result is discarded; not the last expression, so nothing is shown

# Sub-directory of the run inspected in this notebook.
# Alternatives that were used previously:
#   'qa_embedder___qa_questions_version=v3_boostexamples___ensemble1'
#   'eng1000___qa_questions_version=v1___mistralai-Mistral-7B-Instruct-v0.2'
qa_sparse_feats_dir = os.path.join(
    sparse_feats_dir,
    'qa_embedder___qa_questions_version=v3_boostexamples_merged___ensemble2',
)

# Feature-selection alphas to scan (they become the table columns and part of
# the result file names below).
# Alternatives: np.logspace(0, -3, 20), get_alphas('eng1000')
alphas = get_alphas('qa_embedder')

In [3]:
seeds = range(5)

# nonzeros[seed, alpha]: number of selected features (float so missing runs stay NaN)
# enets[seed, alpha]:    boolean selection mask stored as an object cell
nonzeros = pd.DataFrame(index=seeds, columns=alphas).astype(float)
enets = pd.DataFrame(index=seeds, columns=alphas)

fnames = os.listdir(qa_sparse_feats_dir)
for seed in tqdm(seeds):
    for alpha in tqdm(alphas):
        template = (
            f'seed={seed}___feature_selection_frac=0.50'
            f'___feature_selection_alpha={alpha:.2e}.joblib'
        )
        if template not in fnames:
            # No saved result for this (seed, alpha); leave both cells as NaN.
            continue

        coef_enet = joblib.load(join(qa_sparse_feats_dir, template))
        # A feature counts as selected if any entry along axis 0 is non-zero.
        is_nonzero = np.abs(coef_enet) > 0
        coef_enet_selected = deepcopy(np.any(is_nonzero, axis=0).squeeze())
        enets.loc[seed, alpha] = coef_enet_selected
        nonzeros.loc[seed, alpha] = coef_enet_selected.sum()

100%|██████████| 9/9 [00:02<00:00,  4.24it/s]
100%|██████████| 9/9 [00:01<00:00,  8.14it/s]
100%|██████████| 9/9 [00:01<00:00,  8.83it/s]
100%|██████████| 9/9 [00:01<00:00,  8.74it/s]
100%|██████████| 9/9 [00:01<00:00,  8.21it/s]
100%|██████████| 5/5 [00:06<00:00,  1.28s/it]


In [4]:
# v3_boost_examples has 674, eng1000 has 985
nonzeros.columns.name = 'alpha'
nonzeros.index.name = 'seed'

# Per-alpha boolean masks over features, aggregated across the seeds that
# actually have a loaded result:
#   coefs_stable_dict[alpha] -> True where every available seed selected the feature
#   coefs_all_dict[alpha]    -> True where at least one available seed selected it
# An alpha with no loaded runs gets NaN in the 'stable' row and NO entry in
# either dict. Previously the masks of the preceding alpha were stored under
# that alpha (or a NameError was raised if it was the first column).
coefs_stable_dict = {}
coefs_all_dict = {}
for col in nonzeros.columns:
    coefs_list = enets[col]
    coefs_list = coefs_list[coefs_list.notna()]

    if len(coefs_list) == 0:
        nonzeros.loc['stable', col] = np.nan
        continue

    # One row per available seed; min/max over rows of a boolean matrix act
    # as logical AND / OR across seeds.
    coefs_arr = np.vstack(coefs_list.values)
    coefs_all = coefs_arr.max(axis=0)
    coefs_stable = coefs_arr.min(axis=0)

    # Adds the 'stable' summary row: number of features selected by all seeds.
    nonzeros.loc['stable', col] = np.sum(coefs_stable)
    coefs_all_dict[col] = coefs_all
    coefs_stable_dict[col] = coefs_stable

display(
    nonzeros
    .style
    .background_gradient(cmap='viridis', axis=None)
    .format('{:.0f}')
)

alpha,0.483293,0.400000,0.335982,0.280000,0.233572,0.162378,0.112884,0.078476,0.054556
seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,8,19,29,50,75,164,308,433,541
1,7,17,32,49,74,168,299,438,553
2,8,20,25,47,74,163,312,443,548
3,8,18,28,48,75,172,313,432,547
4,9,18,26,48,75,162,308,435,543
stable,7,15,22,35,54,117,246,380,515


### Detailed breakdown

In [6]:
# questions = np.array(get_questions('v3_boostexamples', full=True))
questions = np.array(get_merged_questions_v3_boostexamples())

# Masks in dict insertion order, i.e. the order in which alphas were processed.
stable_masks = list(coefs_stable_dict.values())
all_masks = list(coefs_all_dict.values())

# For the first five alphas: show the questions that are newly stable compared
# with the previous alpha, and flag any that were stable before but no longer are.
qs_prev = []
for i in range(5):
    stable_questions = questions[stable_masks[i]]
    print(i, 'num questions:', len(stable_questions))

    print('----STABLE---')
    qs = sorted(stable_questions.tolist())
    # Selected by at least one seed; only used by the optional display below.
    qs_unstable = sorted(questions[all_masks[i]].tolist())

    previously_stable = set(qs_prev)
    display([q for q in qs if q not in previously_stable])

    currently_stable = set(qs)
    for q in qs_prev:
        if q not in currently_stable:
            print('[DROPPED]', q)

    # print('----UNSTABLE---')
    # display([q for q in qs_unstable if not q in qs])

    qs_prev = qs

0 num questions: 7
----STABLE---


['Does the sentence contain a proper noun?',
 'Does the sentence describe a personal or social interaction that leads to a change or revelation?',
 'Does the sentence describe a personal reflection or thought?',
 'Does the sentence describe a physical action?',
 'Does the sentence describe a relationship between people?',
 'Does the sentence involve a description of physical environment or setting?',
 'Does the sentence involve the mention of a specific object or item?']

1 num questions: 15
----STABLE---


['Does the input involve planning or organizing?',
 'Does the sentence describe a visual experience or scene?',
 "Does the sentence express the narrator's opinion or judgment about an event or character?",
 'Does the sentence include dialogue?',
 'Does the sentence mention a specific location?',
 'Is the input related to a specific industry or profession?',
 'Is the sentence abstract rather than concrete?',
 'Is time mentioned in the input?']

2 num questions: 22
----STABLE---


['Does the input contain a number?',
 'Does the sentence contain a cultural reference?',
 'Does the sentence contain a negation?',
 'Does the sentence describe a sensory experience?',
 'Does the sentence include technical or specialized terminology?',
 'Does the sentence involve an expression of personal values or beliefs?',
 'Does the sentence involve spatial reasoning?']

3 num questions: 35
----STABLE---


['Does the input contain a measurement?',
 'Does the input describe a specific texture or sensation?',
 'Does the input include a comparison or metaphor?',
 'Does the sentence describe a physical sensation?',
 'Does the sentence describe a specific sensation or feeling?',
 'Does the sentence express a sense of belonging or connection to a place or community?',
 'Does the sentence include a direct speech quotation?',
 'Does the sentence include a personal anecdote or story?',
 'Does the sentence involve a discussion about personal or social values?',
 'Does the text describe a journey?',
 'Does the text describe a mode of communication?',
 'Does the text include a planning or decision-making process?',
 'Is the sentence reflective, involving self-analysis or introspection?']

4 num questions: 54
----STABLE---


['Does the input discuss a societal issue or social justice topic?',
 'Does the sentence convey a decision or choice made by the narrator?',
 'Does the sentence convey a sense of urgency or haste?',
 'Does the sentence describe a change in a physical or emotional state?',
 'Does the sentence describe a moment of relief or resolution of tension?',
 'Does the sentence describe an activity related to daily life or routine?',
 'Does the sentence express a philosophical or existential query or observation?',
 'Does the sentence include a conditional clause?',
 'Does the sentence include a specific sound or auditory description?',
 'Does the sentence include an account of a miscommunication or misunderstanding?',
 'Does the sentence involve a recount of a social or community event?',
 'Does the sentence involve an unexpected incident or accident?',
 'Does the sentence reference a specific time or date?',
 'Does the sentence use a unique or unusual word?',
 'Does the story involve a personal 

[DROPPED] Does the text include a planning or decision-making process?
[DROPPED] Is the sentence abstract rather than concrete?


### Inspect top questions

In [7]:
questions = np.array(get_merged_questions_v3_boostexamples())
i_best = 4

# One row per question, indexed by its position in `questions`.
df = (
    pd.DataFrame(questions)
    .rename(columns={0: 'question'})
    .rename_axis('question_num')
)

# stable_idx = smallest alpha index at which the question is stable.
# 10 is the placeholder for questions that are never stable; walking the masks
# from last to first lets the smaller indices overwrite the larger ones.
df['stable_idx'] = 10
coefs_stable_vals = list(coefs_stable_dict.values())
for i in reversed(range(len(coefs_stable_vals))):
    df.loc[coefs_stable_vals[i], 'stable_idx'] = i
df = df.sort_values(by='stable_idx')

# Note: this keeps every question that is stable at ANY index <= i_best, so it
# can contain more rows than the stable count at i_best itself (questions that
# were stable at an earlier index and dropped later are still included).
d_stable = df[df.stable_idx <= i_best]
print('n_stable', len(d_stable))

# display full questions no truncation
full_display_opts = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', None,
)
with pd.option_context(*full_display_opts):
    display(d_stable)

# print as numbered list (1-based)
d_stable_list = d_stable.question.values.tolist()
for i, q in enumerate(d_stable_list):
    print(f'{i + 1}. {q}')

n_stable 56


Unnamed: 0_level_0,question,stable_idx
question_num,Unnamed: 1_level_1,Unnamed: 2_level_1
496,Does the sentence describe a personal reflection or thought?,0
337,Does the sentence contain a proper noun?,0
91,Does the sentence describe a physical action?,0
495,Does the sentence describe a personal or social interaction that leads to a change or revelation?,0
601,Does the sentence involve the mention of a specific object or item?,0
566,Does the sentence involve a description of physical environment or setting?,0
359,Does the sentence describe a relationship between people?,0
107,Does the sentence mention a specific location?,1
333,Is time mentioned in the input?,1
415,Is the sentence abstract rather than concrete?,1


1. Does the sentence describe a personal reflection or thought?
2. Does the sentence contain a proper noun?
3. Does the sentence describe a physical action?
4. Does the sentence describe a personal or social interaction that leads to a change or revelation?
5. Does the sentence involve the mention of a specific object or item?
6. Does the sentence involve a description of physical environment or setting?
7. Does the sentence describe a relationship between people?
8. Does the sentence mention a specific location?
9. Is time mentioned in the input?
10. Is the sentence abstract rather than concrete?
11. Does the sentence express the narrator's opinion or judgment about an event or character?
12. Is the input related to a specific industry or profession?
13. Does the sentence include dialogue?
14. Does the sentence describe a visual experience or scene?
15. Does the input involve planning or organizing?
16. Does the sentence involve spatial reasoning?
17. Does the sentence involve an expr

In [None]:
# Export every question as one newline-terminated, numbered string.
# Numbering is 0-based and follows the original order of `questions`
# (`df` is only used for its length, not for its sorted order).
s = ''.join(f'{i}. {questions[i]}\n' for i in range(len(df)))
s