In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../experiments')

import logging
import os
import random
import re
from copy import deepcopy
from os.path import join

import dvu
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm

from ridge_utils.features import qa_questions, feature_spaces
from ridge_utils.data import response_utils
# Root logger (getLogger() without a name), with its level set to INFO.
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)
# Call dvu's style helper (presumably sets plotting defaults -- see the dvu docs).
dvu.set_style()
# The module name starts with a digit, so it cannot appear in an `import`
# statement; load it through __import__ instead.
# NOTE(review): presumably resolved via the '../experiments' entry appended to
# sys.path above -- confirm.
fit_encoding = __import__('02_fit_encoding')

In [None]:
# read and export some questions
# Render the question table as LaTeX and grey out the boilerplate wording so
# the distinctive part of each question stands out.
questions_df = pd.read_csv('../qa_results/v3_boostexamples_num=29/questions.csv')
questions_latex = questions_df.rename(
    columns={'question': 'Question', 'avg_abs_coef_normalized': 'Importance'}
).to_latex(escape=False, column_format='lrrrr', float_format='%.3f', index=False)

# All phrases are wrapped in ONE pass. Replacing them one after another wraps
# the '?' of an already-wrapped 'in the input?' a second time, which produces
# nested \textcolor commands.
boilerplate_phrases = ['Does the sentence', 'Is the sentence',
                       'Does the input', 'in the input?', '?']
boilerplate_pattern = re.compile(
    '|'.join(re.escape(phrase) for phrase in boilerplate_phrases))
# A function is used as the replacement so the backslash in '\textcolor' is
# emitted literally (a replacement *string* would turn '\t' into a tab).
qs = boilerplate_pattern.sub(
    lambda match: r'\textcolor{gray}{' + match.group(0) + '}',
    questions_latex,
)
print(qs)

In [None]:
class A:
    """Plain bag of settings used by the cells of this notebook.

    An instance is passed to ``fit_encoding.get_story_names`` and its
    ``feature_space`` / ``qa_embedding_model`` attributes are forwarded to
    ``feature_spaces.get_features``.
    """

    # story / subject selection
    subject = 'UTS03'
    use_test_setup = False
    seed_stories = 1
    num_stories = -1
    # num_stories = 2

    # feature selection
    feature_space = 'qa_embedder-10'
    qa_embedding_model = 'ensemble1'
    # qa_embedding_model = 'mistralai/Mistral-7B-Instruct-v0.2'
    # qa_embedding_model = 'meta-llama/Meta-Llama-3-8B-Instruct'

    # only referenced by commented-out code in the cells visible here
    trim = 5


args = A()

In [None]:
# Ask the encoding script for the story names selected by `args`; the two
# returned values are kept as the train and test story names.
story_names_train, story_names_test = fit_encoding.get_story_names(args)

In [None]:
# # get downsampled features
# features_downsampled_list = []
# for qa_questions_version in ['v1']:
#     # Features
#     features_downsampled_dict = feature_spaces.get_features(
#         args.feature_space,
#         allstories=story_names_train,
#         qa_embedding_model=args.qa_embedding_model,
#         qa_questions_version=qa_questions_version,
#     )
#     # n_time_points x n_features
#     features_downsampled = encoding_utils.trim_and_normalize_features(
#         features_downsampled_dict, args.trim, normalize=True
#     )
#     features_downsampled_list.append(deepcopy(features_downsampled))
# features_downsampled_list = np.hstack(features_downsampled_list)

# # transform so feats is (features x n_time_points)
# feats = features_downsampled_list.T

# get non-downsampled features
# One feature matrix is loaded per question version; the matrices are then
# placed side by side so every question ends up as one row of `feats`.
# NOTE(review): `args.qa_embedding_model` is 'ensemble1', yet the comment on the
# call below says ensemble1 is unsupported -- confirm this combination works.
feature_blocks = []
ngrams_list = []
for qa_questions_version in ['v1', 'v2', 'v3_boostexamples']:
    # Features (this doesn't support ensemble1!)
    allstories, vectors, wordseqs, ngrams_list_dict = feature_spaces.get_features(
        args.feature_space,
        allstories=story_names_train,
        # allstories=story_names_test,
        qa_embedding_model=args.qa_embedding_model,
        qa_questions_version=qa_questions_version,
        downsample=False,
    )
    # Stack the per-story arrays on top of each other. np.vstack always
    # allocates a new array, so no extra copy is needed before storing it.
    features = np.vstack(list(vectors.values()))
    feature_blocks.append(features)

    # Flatten the per-story ngram lists in linear time (sum(lists, []) re-copies
    # the accumulated list on every addition). This is overwritten on each pass,
    # so after the loop it holds the ngrams returned for the last version.
    # NOTE(review): that ngrams_list lines up with the rows of `features` is
    # assumed, not checked.
    ngrams_list = [
        ngram
        for story_ngrams in ngrams_list_dict.values()
        for ngram in story_ngrams
    ]
features_downsampled_list = np.hstack(feature_blocks)

# transform so feats is (features x n_time_points)
feats = features_downsampled_list.T

In [None]:
# # export to csv
# qa_questions_version = 'v3_boostexamples'
# qs = qa_questions.get_questions(qa_questions_version, full=True)

# # save compressed
# np.savez_compressed(f'../data/{qa_questions_version}_answers_test_numpy',
#                     feats.astype(bool).T)
# joblib.dump({'columns': qs, 'index': ngrams_list},
#             f'../data/{qa_questions_version}_test_metadata.pkl')

In [None]:
# Scatter the first 1000 columns of (at most) the first 300 rows of `feats`.
# Slicing instead of indexing with range(300) keeps the cell working when
# fewer than 300 features were loaded.
for feat_values in feats[:300]:
    plt.plot(feat_values[:1000], '.')
plt.xlabel('Time')
plt.ylabel('Feature value')
plt.show()

In [None]:
# Pairwise correlation between the rows of `feats`.
corrs = np.corrcoef(feats)
# set diagonal to nan
# np.fill_diagonal(corrs, np.nan)

# Every row of `feats` needs its own question label. `feats` is built by
# stacking the features of several question versions, so the labels are
# gathered from the same versions in the same order (labelling with the v1
# questions alone cannot match the row count).
# NOTE(review): assumes get_questions(version) returns only the questions of
# that version, in feature order -- confirm; the check below fails loudly if not.
questions_by_version = {
    version: qa_questions.get_questions(version)
    for version in ['v1', 'v2', 'v3_boostexamples']
}
qs_1 = questions_by_version['v1']
qs = [
    question
    for version_questions in questions_by_version.values()
    for question in version_questions
]
if len(qs) != feats.shape[0]:
    raise ValueError(
        f'{len(qs)} question labels for {feats.shape[0]} feature rows; the '
        'question versions listed here must match the ones stacked into feats'
    )
corrs = pd.DataFrame(corrs, columns=qs, index=qs)

In [None]:
# Cluster the correlation matrix only to obtain the dendrogram leaf order; the
# figure itself is discarded right away.
clustermap = sns.clustermap(corrs)
plt.close()

# Reorder rows and columns of `corrs` to follow the dendrogram leaves.
row_order = clustermap.dendrogram_row.reordered_ind
col_order = clustermap.dendrogram_col.reordered_ind
corrs = corrs.iloc[row_order, col_order]

In [None]:
# Heatmap of the correlation frame on a fixed [-1, 1] colour scale.
# (A sns.clustermap variant with the cbar in the bottom right was tried with
#  cbar_pos=(0.85, 0.03, 0.03, 0.2) and figsize=(20, 20).)
heatmap_kwargs = dict(
    vmin=-1,
    vmax=1,
    cmap='RdBu',
    cbar_kws={'label': 'Correlation Coefficient'},
)
sns.heatmap(corrs, **heatmap_kwargs)
plt.show()

In [None]:
# Keep only the entries strictly above the diagonal (everything else becomes
# NaN) and histogram them.
upper_mask = np.triu(np.ones(corrs.shape, dtype=bool), k=1)
corrs_triu = corrs.where(upper_mask)

plt.hist(corrs_triu.to_numpy().ravel(), bins=100)
plt.ylabel('Count')
plt.xlabel('Pairwise correlation')
plt.show()

In [None]:
# get indexes/columns of high correlations
# Mask everything that is not above 0.6, go to long format and drop the masked
# (NaN) cells explicitly: the legacy stack() drops them implicitly, the newer
# pandas implementation keeps them.
high_corr_pairs = corrs_triu[corrs_triu > 0.6].stack().dropna()

# MultiIndex of (row label, column label) for every surviving pair.
high_corr = high_corr_pairs.index
high_corr_idx = list(high_corr)
# Read the values straight from the stacked Series instead of doing one .loc
# lookup per pair.
high_corr_vals = list(high_corr_pairs.to_numpy())

In [None]:
# Print each highly correlated pair: the value, then both labels tab-indented.
for corr_val, (label_a, label_b) in zip(high_corr_vals, high_corr_idx):
    print(corr_val)
    print('\t', label_a)
    print('\t', label_b)

In [None]:
# Mean of every row of `feats`, paired with its question and sorted ascending.
# `qs` is the label list attached to the rows of `feats` when the correlation
# frame was built; using it here (rather than a separately hard-coded question
# list) keeps the labels in step with the features.
yes_fracs = pd.DataFrame({
    'yes_frac': feats.mean(axis=1),
    'question': qs,
}).sort_values(by='yes_frac')

In [None]:
# Show the 30 lowest and the 30 highest rows of `yes_fracs` with all pandas
# truncation limits lifted, so the question strings are printed in full.
untruncated_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*untruncated_display):
    for frame_slice in (yes_fracs.head(30), yes_fracs.tail(30)):
        display(frame_slice.round(3))

In [None]:
# exact sparsity doesn't work that well because of Lanczos sampling
# feat_mins = np.zeros(feats.shape[0])
# for i in range(feats.shape[0]):
# feat_mins[i] = (feats[i] == np.min(feats[i])).sum()