In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import seaborn as sns
import os
import pandas as pd
from copy import deepcopy
from matplotlib import pyplot as plt
from os.path import join
import numpy as np
import imodelsx.process_results
import qa_questions
import random
import json
import joblib
from tqdm import tqdm
from collections import defaultdict
import feature_spaces
fit_encoding = __import__('01_fit_encoding')
import encoding_utils

### Look at examples

In [None]:
class A:
    """Bare settings holder passed to ``fit_encoding.get_story_names``.

    The values are plain class attributes, so ``A()`` needs no arguments.
    """

    # subject identifier whose stories are loaded
    subject = 'UTS03'
    # presumably -1 means "do not limit the number of stories" -- TODO confirm
    num_stories = -1
    # presumably toggles a reduced test configuration -- TODO confirm
    use_test_setup = False


# Build the settings object, look up the train/test story names for it, and
# load the word sequences of the training stories (keyed by story name).
args = A()
story_names_split = fit_encoding.get_story_names(args)
story_names_train, story_names_test = story_names_split
wordseqs = feature_spaces.get_story_wordseqs(story_names_train)

### Generate random examples for prompting new questions (v3)

In [None]:
# Sampling settings
seed = 43  # 42, 43
ngram_size = 10
num_examples_per_story = 1

# Seed both the stdlib and the numpy generators so the sample is repeatable
random.seed(seed)
np.random.seed(seed)

# Draw `num_examples_per_story` random ngrams from every training story
ngrams_examples = []
for story_name in story_names_train:
    story_words = wordseqs[story_name].data
    story_ngrams = feature_spaces._get_ngrams_list_from_words_list(
        story_words, ngram_size=ngram_size)
    # the first ngram_size + 2 ngrams of each story are never sampled
    candidate_ngrams = story_ngrams[ngram_size + 2:]
    sampled_ngrams = np.random.choice(candidate_ngrams, num_examples_per_story)
    ngrams_examples.extend(sampled_ngrams.tolist())

# Show the samples as a bulleted list
print('\n'.join('- ' + ngram for ngram in ngrams_examples))

### Generate examples for boosted questions based on model errors (v4, v5)
- note: v4 wasn't actually boosted because the model we used was basically random
- v5 settings were:
  - args_top.feature_space='qa_embedder-10' args_top.ndelays=4
  - args_top.corrs_test_mean=0.126 args_top.corrs_tune_pc_mean=0.134110

In [None]:
# Load the results table and pick the best-tuned QA-embedder run to boost.
# results_dir = '/home/chansingh/mntv1/deep-fMRI/encoding/results_apr1'
results_dir = '/home/chansingh/mntv1/deep-fMRI/encoding/results_apr7'
r = imodelsx.process_results.get_results_df(results_dir)

# Saved paths that do not already start with '/home' get their '/mntv1'
# segment rewritten to '/home/chansingh/mntv1'.
for k in ['save_dir', 'save_dir_unique']:
    r[k] = r[k].map(lambda x: x if x.startswith('/home')
                    else x.replace('/mntv1', '/home/chansingh/mntv1'))

# Candidate runs: qa_embedder feature space, 100 PC components, 4 delays,
# questions version v4. `&` is the element-wise AND for boolean masks.
is_candidate = (
    r.feature_space.str.contains('qa_embedder')
    & (r.pc_components == 100)
    & (r.ndelays == 4)
    & (r.qa_questions_version == 'v4')
)

# Highest tuning correlation first; the top row is the model to boost.
args_top = r[is_candidate].sort_values(
    by='corrs_tune_pc_mean',
    ascending=False).iloc[0]
print(f'{args_top.feature_space=} {args_top.ndelays=}')
# Both values use precision 3 (`.3f`); the former `:3f` spec requested a
# minimum width of 3 and therefore printed six decimals.
print(f'{args_top.corrs_test_mean=:.3f} {args_top.corrs_tune_pc_mean=:.3f}')

In [None]:
# Read the pickled parameters stored in the selected run's output directory
model_params_path = join(args_top.save_dir_unique, 'model_params.pkl')
model_params_to_save = joblib.load(model_params_path)

In [None]:
# For every training story, record the three timepoints where the selected
# model's predictions correlate worst with the response, together with the
# ngram at the same index.
r = defaultdict(list)
for story_name in tqdm(story_names_train):
    # ngram for 3 trs preceding the current TR, trimmed by 10 entries at the
    # start and 5 at the end
    # NOTE(review): presumably this trimming matches the rows returned by
    # fit_encoding.get_data -- confirm
    ngrams_list = np.array(
        feature_spaces._get_ngrams_list_from_chunks(
            wordseqs[story_name].chunks(), num_trs=3)[10:-5])

    stim_train_delayed, resp_train = fit_encoding.get_data(
        args_top, [story_name])

    # linear prediction from the stored weights and bias
    weights = model_params_to_save['weights']
    bias = model_params_to_save['bias']
    preds = stim_train_delayed @ weights + bias

    # one correlation per row (timepoint) between response and prediction
    num_timepoints = resp_train.shape[0]
    corrs_time = np.array([
        np.corrcoef(resp_train[tr, :], preds[tr, :])[0, 1]
        for tr in range(num_timepoints)
    ])
    # 100 exceeds any correlation, so the first 10 TRs sort last and are
    # never selected
    corrs_time[:10] = 100
    # ascending sort: the first three indexes are the lowest correlations
    corrs_worst_idxs = np.argsort(corrs_time)[:3]

    for rank in range(3):
        worst_tr = corrs_worst_idxs[rank]
        r['story_name'].append(story_name)
        r['corrs'].append(corrs_time[worst_tr])
        r['ngram'].append(ngrams_list[worst_tr])
        r['tr'].append(worst_tr)

    # rewritten after every story, so a partial file exists if the loop stops
    # early (original note: saved as 04_ngrams_boost.pkl)
    joblib.dump(r, '04_ngrams_boost_v5.pkl')

In [None]:
# Reload the saved records as a table and list every other ngram,
# skipping entries that are at most one character long once stripped
ngrams_boost = pd.DataFrame(joblib.load('04_ngrams_boost_v5.pkl'))
ngrams_to_show = [
    ngram for ngram in ngrams_boost.iloc[::2]['ngram'].values
    if len(ngram.strip()) > 1
]
print('\n'.join('- ' + ngram for ngram in ngrams_to_show))

In [None]:
# Load the questions from the earlier versions; the `with` block closes the
# file handle as soon as the JSON has been parsed.
with open('../all_questions_v1-v4.json') as questions_file:
    questions_prev = json.load(questions_file)
# Print every second question (starting from index 1) as a bulleted list
print('\n'.join(['- ' + x for x in questions_prev[1::2]]))