In [21]:
%load_ext autoreload
%autoreload 2
import os
from os.path import join, expanduser
from tqdm import tqdm
import pandas as pd
import joblib
import sys
import numpy as np
from copy import deepcopy
from neuro.features.feat_select import get_alphas
from neuro.features.qa_questions import get_questions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Root directory that holds the shared sparse-feature selection outputs.
sparse_feats_dir = expanduser('~/mntv1/deep-fMRI/qa/sparse_feats_shared/')
# The listing itself is discarded; the call still raises if the directory is missing.
os.listdir(sparse_feats_dir)

# Sub-directory of the feature-selection run analysed below.
# Alternative run:
#   'eng1000___qa_questions_version=v1___mistralai-Mistral-7B-Instruct-v0.2'
qa_sparse_feats_dir = join(
    sparse_feats_dir,
    'qa_embedder___qa_questions_version=v3_boostexamples___ensemble1',
)

# Alpha grid matching the run above.
# Alternatives: get_alphas('eng1000') or np.logspace(0, -3, 20)
alphas = get_alphas('qa_embedder')

In [16]:
# Two (seed x alpha) tables filled from the saved feature-selection results:
#   nonzeros - number of selected features (float, NaN where no file exists)
#   enets    - the boolean selection mask itself (object cells)
seeds = range(5)
enets = pd.DataFrame(index=seeds, columns=alphas)
nonzeros = pd.DataFrame(index=seeds, columns=alphas).astype(float)
fnames = os.listdir(qa_sparse_feats_dir)
for seed in tqdm(seeds):
    for alpha in tqdm(alphas):
        template = (
            f'seed={seed}___feature_selection_frac=0.50'
            f'___feature_selection_alpha={alpha:.2e}.joblib'
        )
        # Combinations without a saved file keep their NaN placeholders.
        if template not in fnames:
            continue
        coef_enet = joblib.load(join(qa_sparse_feats_dir, template))
        # A feature counts as selected when any entry along axis 0 is nonzero.
        nonzero_entries = np.abs(coef_enet) > 0
        coef_enet_selected = deepcopy(np.any(nonzero_entries, axis=0).squeeze())
        enets.loc[seed, alpha] = coef_enet_selected
        nonzeros.loc[seed, alpha] = coef_enet_selected.sum()

100%|██████████| 9/9 [00:00<00:00, 11.92it/s]
100%|██████████| 9/9 [00:00<00:00, 12.78it/s]
100%|██████████| 9/9 [00:00<00:00, 12.14it/s]
100%|██████████| 9/9 [00:01<00:00,  8.75it/s]
100%|██████████| 9/9 [00:00<00:00, 12.47it/s]
100%|██████████| 5/5 [00:03<00:00,  1.26it/s]


In [29]:
# Summarise, for every alpha, how many features are selected per seed and how
# many are "stable", i.e. selected by every seed that has a saved result.
# Reference counts noted by the author: v3_boost_examples has 674, eng1000 has 985
# (presumably the total number of candidate features - TODO confirm).
nonzeros.columns.name = 'alpha'
nonzeros.index.name = 'seed'

# alpha -> boolean mask of the features selected by all available seeds.
# Alphas with no loaded result are left out of the dict on purpose, so an
# entry can never hold the mask computed for a different alpha.
coefs_stable_dict = {}
for col in nonzeros.columns:
    coefs_list = enets[col]
    coefs_list = coefs_list[coefs_list.notna()]

    if len(coefs_list) > 0:
        # Stack the per-seed masks (one row per seed) and keep only the
        # features that are True in every row.
        coefs_arr = np.vstack(coefs_list.values).all(axis=0)
        nonzeros_stable = np.sum(coefs_arr)
        coefs_stable_dict[col] = coefs_arr
    else:
        nonzeros_stable = np.nan
    # Extra 'stable' row: number of features shared by all seeds at this alpha.
    nonzeros.loc['stable', col] = nonzeros_stable

display(
    nonzeros
    .style
    .background_gradient(cmap='viridis', axis=None)
    .format('{:.0f}')
)

alpha,0.483293,0.400000,0.335982,0.280000,0.233572,0.162378,0.112884,0.078476,0.054556
seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,15,25,38,67,105,218,380,533,626
1,15,26,36,65,101,214,370,525,626
2,15,24,35,63,101,218,392,524,630
3,12,25,37,63,103,214,383,518,623
4,13,27,35,68,97,217,378,520,615
stable,10,21,28,44,71,147,299,468,595


### Top questions

In [30]:
# Convert the question list to an array so it can be indexed with the
# boolean masks stored in coefs_stable_dict.
question_list = get_questions('v3_boostexamples', full=True)
questions = np.array(question_list)

In [37]:
# Questions kept by the first mask in coefs_stable_dict (insertion order),
# sorted alphabetically for display.
stable_masks = list(coefs_stable_dict.values())
q0 = sorted(questions[stable_masks[0]].tolist())
q0

['Does the input include a philosophical or reflective thought?',
 'Does the sentence contain a proper noun?',
 'Does the sentence describe a physical action?',
 'Does the sentence describe a relationship between people?',
 'Does the sentence include dialogue or thoughts directed towards another character?',
 'Does the sentence involve a description of an interpersonal misunderstanding or dispute?',
 'Does the sentence involve a description of physical environment or setting?',
 'Does the sentence involve the mention of a specific object or item?',
 'Does the sentence reference a specific location or place?',
 'Is time mentioned in the input?']

In [45]:
# Questions kept by the second mask in coefs_stable_dict, compared with q0.
stable_masks = list(coefs_stable_dict.values())
q1 = sorted(questions[stable_masks[1]].tolist())

# Present in q0 but no longer in q1.
for q in q0:
    if q in q1:
        continue
    print('[DROPPED]', q)

# Present in q1 but not in q0.
for q in q1:
    if q in q0:
        continue
    print(q)

Does the input involve planning or organizing?
Does the sentence describe a personal or social interaction that leads to a change or revelation?
Does the sentence describe a specific sensation or feeling?
Does the sentence describe a visual experience or scene?
Does the sentence include dialogue?
Does the sentence include technical or specialized terminology?
Does the sentence involve a discussion about personal or social values?
Does the sentence involve an expression of personal values or beliefs?
Does the sentence mention a specific location or place?
Is the input related to a specific industry or profession?
Is the sentence conveying the narrator's physical movement or action in detail?


In [46]:
# Questions kept by the third mask in coefs_stable_dict, compared with q1.
stable_masks = list(coefs_stable_dict.values())
q2 = sorted(questions[stable_masks[2]].tolist())

# Present in q1 but no longer in q2.
for q in q1:
    if q in q2:
        continue
    print('[DROPPED]', q)

# Present in q2 but not in q1.
for q in q2:
    if q in q1:
        continue
    print(q)

[DROPPED] Does the sentence involve a description of an interpersonal misunderstanding or dispute?
Does the input describe a specific texture or sensation?
Does the sentence contain a cultural reference?
Does the sentence contain a negation?
Does the sentence express a sense of belonging or connection to a place or community?
Does the sentence express the narrator's opinion or judgment about an event or character?
Does the sentence include numerical information?
Does the sentence reference a specific time or date?
Is the sentence in the passive voice?
