In [1]:
import pandas as pd
import numpy as np
import os
import h5py
from sklearn import linear_model
import matplotlib.pyplot as plt

import deepdish as dd
import string
try:
    # Best-effort: switch to the project's scripts directory when it exists so
    # that the relative paths used later in this notebook resolve from there.
    os.chdir('/data/MoL_clean/scripts')
except OSError:
    # Directory missing or not accessible on this machine: keep the current
    # working directory. Only filesystem errors are tolerated here.
    pass
import util
# util has some variables in them
# import GLM_helper as gh

import scipy.stats as stats
import glob

import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sentence_transformers import SentenceTransformer
from scipy.stats import ttest_1samp
import seaborn as sns
from scipy import stats, linalg
import warnings
from joblib import Parallel, delayed
import multiprocessing
warnings.filterwarnings('ignore')

from sentence_transformers import CrossEncoder
import random
# Module-level cross-encoder used by getSentenceSimilarity to score sentence pairs.
# NOTE(review): instantiated when this cell runs; presumably loads/downloads the
# pretrained 'stsb-roberta-large' weights at that point - confirm.
model = CrossEncoder('cross-encoder/stsb-roberta-large')


def getSentenceSimilarity(sentence1, sentence2):
    """Score a single sentence pair with the module-level cross-encoder ``model``.

    The pair is passed to ``model.predict`` as a one-element batch and the
    first (only) element of the prediction is returned.
    """
    pair_scores = model.predict([(sentence1, sentence2)])
    return pair_scores[0]

# Maps the 'sub-XX' identifiers used in the beta file names to the 'subjXXX'
# identifiers used in the behavioral folders and recall spreadsheets.
# The numbering is not contiguous, so the mapping is spelled out explicitly.
sub2subj = {
    "sub-01": "subj001",
    "sub-02": "subj002",
    "sub-03": "subj003",
    "sub-04": "subj005",
    "sub-05": "subj006",
    "sub-06": "subj007",
    "sub-07": "subj008",
    "sub-08": "subj009",
    "sub-09": "subj010",
    "sub-10": "subj011",
    "sub-11": "subj013",
    "sub-12": "subj014",
    "sub-13": "subj017",
    "sub-14": "subj018",
    "sub-15": "subj019",
    "sub-16": "subj020",
    "sub-17": "subj021",
    "sub-18": "subj022",
    "sub-19": "subj023",
    "sub-20": "subj024",
    "sub-21": "subj025",
    "sub-22": "subj026",
    "sub-23": "subj027",
    "sub-24": "subj029",
    "sub-25": "subj031",
    "sub-101": "subj101",
    "sub-102": "subj102",
    "sub-103": "subj103",
    "sub-105": "subj105",
    "sub-107": "subj107",
    "sub-108": "subj108",
}

# Maps 'ses-XX' identifiers to the session labels used in folder and sheet names.
ses2w = {
    "ses-01": "W2",
    "ses-02": "W4D1",
    "ses-03": "W4D2",
}


# NOTE(review): presumably the number of surface vertices per hemisphere - confirm.
nv = 40962

# Zero-padded identifiers: sub-01..sub-25, ses-01..ses-03, run-01..run-02.
subjects = [f'sub-{number:02d}' for number in range(1, 26)]
sessions = [f'ses-{number:02d}' for number in range(1, 4)]
runs = [f'run-{number:02d}' for number in range(1, 3)]

# NOTE(review): presumably the repetition time in seconds - confirm.
TR = 1.5

# Per-task counts keyed by task name; the *_w4d2 variant holds separate values
# (presumably for the shorter W4D2 session - confirm).
nTRs = dict(Item=302, Loci=302, Encode=355)
nTRs_w4d2 = dict(Item=156, Loci=156, Encode=182)
def _load_h5_values(path):
    """Load ``path`` with deepdish and return the values of the loaded mapping as a list."""
    return list(dd.io.load(path).values())


# Searchlight definitions, one list per hemisphere (paths are relative to the
# current working directory).
SL_lh = _load_h5_values('SLlist_verydense.lh.h5')
SL_rh = _load_h5_values('SLlist_verydense.rh.h5')
SLlist = {'L': SL_lh, "R": SL_rh}
nSL_L = len(SLlist['L'])

# Cortical ROI definitions. Downstream code indexes each ROI as [0] and [1]
# (used with the 'lh' and 'rh' patterns respectively).
ag = _load_h5_values('ROIs/Ang_verts.h5')
pmc = _load_h5_values('ROIs/PMC_verts.h5')
mPFC = _load_h5_values('ROIs/mPFC_verts.h5')
ROIs = {'ag': ag, 'pmc': pmc, 'mpfc': mPFC}

# Names of the hippocampal ROIs; these match the suffixes of the beta files.
hippo_ROIs = ['anterior_hipp', 'posterior_hipp', 'hippo']


In [2]:
def remove_xs(string_list):
    """Return the entries of ``string_list`` that are not the placeholder ``'x'``.

    Order is preserved and the comparison is case-sensitive (``'X'`` is kept).
    """
    kept = []
    for entry in string_list:
        if entry != 'x':
            kept.append(entry)
    return kept


def add_numbers_to_duplicates(string_list):
    """Make repeated labels unique by suffixing their occurrence number.

    The first occurrence of a label is kept as is; the n-th occurrence (n >= 2)
    becomes ``"<label> <n>"``. The placeholder ``'x'`` is never numbered.
    Occurrences are counted on the original (case-sensitive) labels, and every
    returned label is lower-cased.
    """
    occurrences = {}
    labelled = []

    for entry in string_list:
        if entry == 'x' or entry not in occurrences:
            # First sighting (or the 'x' placeholder): keep the label unchanged.
            occurrences[entry] = 1
            label = entry
        else:
            occurrences[entry] += 1
            label = f"{entry} {occurrences[entry]}"

        labelled.append(label.lower())

    return labelled


def _partial_corr_pair(C, i, j):
    """Correlate columns ``i`` and ``j`` of ``C`` after regressing both on all other columns."""
    # Plain ``bool`` is used because the ``np.bool`` alias was removed from NumPy.
    covariate_mask = np.ones(C.shape[1], dtype=bool)
    covariate_mask[i] = False
    covariate_mask[j] = False
    covariates = C[:, covariate_mask]

    # Least-squares fit of each target column on the covariates (no intercept
    # column is added here; the Pearson correlation below centers the residuals).
    res_i = C[:, i] - covariates.dot(linalg.lstsq(covariates, C[:, i])[0])
    res_j = C[:, j] - covariates.dot(linalg.lstsq(covariates, C[:, j])[0])

    return stats.pearsonr(res_i, res_j)[0]


def partial_corr(C, desired_i= None, desired_j=None):
    """
    Returns the sample linear partial correlation coefficients between pairs of variables in C, controlling
    for the remaining variables in C.

    Parameters
    ----------
    C : array-like, shape (n, p)
        Array with the different variables. Each column of C is taken as a variable
    desired_i, desired_j : int, optional
        If both are given, only the partial correlation between these two columns is computed.
        If either one is None, the full matrix is computed.

    Returns
    -------
    P : array-like, shape (p, p), or scalar
        When the full matrix is computed, P[i, j] contains the partial correlation of
        C[:, i] and C[:, j] controlling for the remaining variables in C (diagonal is 1).
        When desired_i and desired_j are both given, a single scalar coefficient is returned.
    """
    C = np.asarray(C)

    if desired_i is not None and desired_j is not None:
        return _partial_corr_pair(C, desired_i, desired_j)

    p = C.shape[1]
    # Plain ``float`` is used because the ``np.float`` alias was removed from NumPy.
    P_corr = np.zeros((p, p), dtype=float)
    for i in range(p):
        P_corr[i, i] = 1
        for j in range(i + 1, p):
            corr = _partial_corr_pair(C, i, j)
            P_corr[i, j] = corr
            P_corr[j, i] = corr

    return P_corr

def SLtoVox(D, SLlist, nv, zeronan=True):
    """Average searchlight-level values back onto vertices.

    Parameters
    ----------
    D : dict with keys 'L' and 'R'
        Per-hemisphere arrays whose leading axis indexes searchlights; any
        trailing dimensions are carried through unchanged.
    SLlist : dict with keys 'L' and 'R'
        Per-hemisphere lists; entry ``i`` holds the vertex indices of searchlight ``i``.
    nv : int
        Length of the vertex axis of each output array.
    zeronan : bool
        If True, NaNs in the result (e.g. vertices covered by no searchlight)
        are replaced by 0.

    Returns
    -------
    dict with keys 'L' and 'R', each an array of shape ``(nv,) + D[hem].shape[1:]``
    holding, per vertex, the mean over all searchlights that contain it.
    """
    per_vertex = {}
    for hem in ('L', 'R'):
        trailing_shape = D[hem].shape[1:]
        totals = np.zeros((nv,) + trailing_shape)
        # Singleton trailing axes so the counts broadcast against ``totals``.
        hits = np.zeros((nv,) + (1,) * len(trailing_shape))

        for sl_index, vertices in enumerate(SLlist[hem]):
            totals[vertices] += D[hem][sl_index]
            hits[vertices] += 1

        # Uncovered vertices become NaN rather than dividing by zero.
        hits[hits == 0] = np.nan
        averaged = totals / hits

        if zeronan:
            averaged[np.isnan(averaged)] = 0

        per_vertex[hem] = averaged

    return per_vertex

    
def get_beta_dicts(sub,ses,hippo=False, expert = False):
    """Load the beta estimates of one subject/session and index them by label.

    For each of 'lh', 'rh', 'anterior_hipp', 'posterior_hipp' and 'hippo', the
    beta text files under ``../outputs/betas/<task>/`` are read and their
    columns are keyed by the labels taken from the behavioral CSVs / recall
    spreadsheet of the subject.

    Parameters
    ----------
    sub : str
        Key of ``sub2subj`` (e.g. 'sub-01'); also used in the beta file names.
    ses : str
        Key of ``ses2w`` (e.g. 'ses-01'); also used in the beta file names.
    hippo, expert :
        Not used anywhere in this function body.

    Returns
    -------
    (loci_beta_dict, item_beta_dict, encode_beta_dict, retrieve_beta_dict)
        Each is a dict keyed by hemisphere/ROI name, whose values are dicts
        mapping a label (locus, item word, or lower-cased 'locus-item' pair)
        to one column of the corresponding beta matrix (or a mean of columns).
    """
    task = 'Item'
    # Item word lists: one list per run, taken from the 'Word' column of the
    # first two (sorted) item CSVs; NaN cells are dropped via an identity check.
    item_filenames = sorted(glob.glob(f'../behavioral/{sub2subj[sub]}/{ses2w[ses]}/*{task.lower()}*.csv'))
    item_words_lists = [[w for w in pd.read_csv(item_filenames[0])['Word'] if w is not np.nan],[w for w in pd.read_csv(item_filenames[1])['Word'] if w is not np.nan]]
    # NOTE(review): the per-ROI dictionaries are keyed by the run-1 word list only,
    # so this presumably assumes both runs present the same set of words - confirm.
    item_beta_dict = {'lh':{w:[] for w in item_words_lists[0]}, 'rh':{w:[] for w in item_words_lists[0]},'anterior_hipp':{w:[] for w in item_words_lists[0]},'posterior_hipp':{w:[] for w in item_words_lists[0]},'hippo':{w:[] for w in item_words_lists[0]}}
    task = 'Loci'
    # Load the spoken loci of the two runs (sheets '<session>loci1' / '<session>loci2');
    # repeated names within a run are numbered by add_numbers_to_duplicates.
    # The dictionaries below are keyed by the union of the two lists.
    loci_lists = [add_numbers_to_duplicates(list(pd.read_excel('../updated_sheets/%s_recallperformance.xlsx'%sub2subj[sub], sheet_name='%sloci1'%ses2w[ses].lower())['spoken_loci'])),
                        add_numbers_to_duplicates(list(pd.read_excel('../updated_sheets/%s_recallperformance.xlsx'%sub2subj[sub], sheet_name='%sloci2'%ses2w[ses].lower())['spoken_loci'])),]
    loci_beta_dict = {"lh":{l:[] for l in list(set(loci_lists[0]+loci_lists[1]))}, "rh":{l:[] for l in list(set(loci_lists[0]+loci_lists[1]))},"anterior_hipp":{l:[] for l in list(set(loci_lists[0]+loci_lists[1]))},"posterior_hipp":{l:[] for l in list(set(loci_lists[0]+loci_lists[1]))},'hippo':{l:[] for l in list(set(loci_lists[0]+loci_lists[1]))}}
    
    # Locus betas: column i of each run's beta file is assigned to the i-th
    # spoken locus of that run.
    for hem in ['lh','rh','anterior_hipp','posterior_hipp','hippo']:
        run = 'run-01'
        loci_fmri = np.loadtxt('../outputs/betas/%s/%s_%s_%s_%s_beta.txt'%(task.lower(),sub,ses,run,hem))
        for i, l in enumerate(loci_lists[0]):
            loci_beta_dict[hem][l].append(loci_fmri[:,i])
        run = 'run-02'
        loci_fmri = np.loadtxt('../outputs/betas/%s/%s_%s_%s_%s_beta.txt'%(task.lower(),sub,ses,run,hem))
        for i, l in enumerate(loci_lists[1]):
            loci_beta_dict[hem][l].append(loci_fmri[:,i])
        # Average the locus pattern when exactly two columns were collected;
        # otherwise keep only the first collected column.
        # NOTE(review): labels collected more than twice (e.g. a repeated 'x'
        # placeholder) also fall into the "first column only" branch - confirm intended.
        for l in loci_beta_dict[hem]:
            if len(loci_beta_dict[hem][l]) == 2:
                loci_beta_dict[hem][l] = np.mean(loci_beta_dict[hem][l],axis=0)
            else:
                loci_beta_dict[hem][l] = loci_beta_dict[hem][l][0]
    
    # Item betas: same column-to-label assignment, then the mean over every
    # column collected for a word.
    for hem in ['lh','rh','anterior_hipp','posterior_hipp','hippo']: 
        run = 'run-01'
        task = 'Item'
        item_fmri = np.loadtxt('../outputs/betas/%s/%s_%s_%s_%s_beta.txt'%(task.lower(),sub,ses,run,hem))
        for i, w in enumerate(item_words_lists[0]):
            item_beta_dict[hem][w].append(item_fmri[:,i])
        run = 'run-02'
        item_fmri = np.loadtxt('../outputs/betas/%s/%s_%s_%s_%s_beta.txt'%(task.lower(),sub,ses,run,hem))
        for i, w in enumerate(item_words_lists[1]):
            item_beta_dict[hem][w].append(item_fmri[:,i])

        for w in item_beta_dict[hem]:
            item_beta_dict[hem][w] = np.mean(item_beta_dict[hem][w],axis=0)

    # Encoding / retrieval betas: only 'run-01' files are read for these tasks.
    retrieve_beta_dict = {'lh':{},'rh':{},'anterior_hipp':{},'posterior_hipp':{},'hippo':{}}
    encode_beta_dict = {'lh':{},'rh':{},'anterior_hipp':{},'posterior_hipp':{},'hippo':{}}
    run = 'run-01'
    for hem in ['lh','rh','anterior_hipp','posterior_hipp','hippo']:
        # The recall sheet is re-read on every iteration (it does not depend on hem).
        recall_sheet = pd.read_excel('../updated_sheets/%s_recallperformance.xlsx'%sub2subj[sub], sheet_name=ses2w[ses].lower())
        # Keep only the first space-separated token of each retrieval entry.
        recall_sheet['retrieval'] = recall_sheet['retrieval'].apply(lambda x: x.split(' ')[0])
        recall_fmri = np.loadtxt('../outputs/betas/%s/%s_%s_%s_%s_beta.txt'%('retrieve',sub,ses,run,hem))
        encode_fmri = np.loadtxt('../outputs/betas/%s/%s_%s_%s_%s_beta.txt'%('encode',sub,ses,run,hem))
        # 'locus-item' encoding pairs from the first 40 sheet rows (first 20 for ses-03).
        if ses != 'ses-03':
            encode_pairs = [recall_sheet['loci'][r]+'-'+recall_sheet['encode'][r] for r in range(len(recall_sheet))][:40]
        else:
            encode_pairs = [recall_sheet['loci'][r]+'-'+recall_sheet['encode'][r] for r in range(len(recall_sheet))][:20]
        # NOTE(review): this tests the first *character* of the pair string, so any
        # locus whose name starts with 'x' is dropped, not only the 'x' placeholder - confirm.
        valid_encode_pairs = [p for p in encode_pairs if p[0]!='x']
        # Column r of the encode betas is assigned to the r-th valid pair.
        for r, pair in enumerate(valid_encode_pairs):
            encode_beta_dict[hem][pair.lower()] = encode_fmri[:,r]
        # Retrieval beta columns are consumed in order, skipping rows whose
        # spoken locus is the 'x' placeholder.
        ret_idx = 0
        for r in range(len(recall_sheet)):
            if recall_sheet['spoken_loci'][r]!='x':
                retrieve_beta_dict[hem][(recall_sheet['spoken_loci'][r]+'-'+recall_sheet['retrieval'][r]).lower()] = recall_fmri[:,ret_idx]
                ret_idx+=1
    return loci_beta_dict, item_beta_dict, encode_beta_dict, retrieve_beta_dict


def get_residuals(C, dv):
    """
    Returns the residuals of the dv(th) column of matrix C after a least-squares
    fit on all other columns of C (no intercept column is added).

    Parameters
    ----------
    C : array-like, shape (n, p)
        Array with the different variables. Each column of C is taken as a variable
    dv : int
        Index of the column to be residualized.

    Returns
    -------
    res : array-like, shape (n,)
        Residual of dv
    """
    C = np.asarray(C)
    # Plain ``bool`` is used because the ``np.bool`` alias was removed from NumPy.
    predictor_mask = np.ones(C.shape[1], dtype=bool)
    predictor_mask[dv] = False
    predictors = C[:, predictor_mask]

    beta = linalg.lstsq(predictors, C[:, dv])[0]

    return C[:, dv] - predictors.dot(beta)


def get_residual(target, var1, var2):
    """Return what is left of ``target`` after a linear fit on ``var1`` and ``var2``.

    ``var1`` and ``var2`` are stacked as the two predictor columns of a
    ``LinearRegression`` (default settings) fitted to ``target``; the result is
    ``target`` minus the fitted prediction.
    """
    # Two-column design matrix: one column per predictor.
    design = np.column_stack((var1, var2))

    # Fit and predict on the same data to obtain the explained part of target.
    fitted = LinearRegression().fit(design, target)
    explained = fitted.predict(design)

    return target - explained


def calculate_weights(x, y, z, A):
    """
    Calculate the weights of variables x, y, z in predicting each timepoint of matrix A.

    Parameters:
    x, y, z: Arrays of shape (nv,) representing the variables in each vertices in the ROI.
    A: Either a matrix of shape (nv, nTR) where each column is a timepoint, or a
       single pattern of shape (nv,).

    Returns:
    weights: For a 2-D A, a matrix of shape (nTR, 3) containing the weights for each
             variable for each timepoint. For any other A (in practice a single 1-D
             pattern), the ``coef_`` of one regression on A, i.e. shape (3,) for 1-D A.
    """
    # Combine x, y, z into a single (nv, 3) predictor matrix
    predictors = np.column_stack((x, y, z))
    A = np.asarray(A)

    # Dispatch on dimensionality explicitly instead of relying on a bare
    # ``except`` around ``A.shape[1]``, which also hid genuine fitting errors.
    if A.ndim != 2:
        return LinearRegression().fit(predictors, A).coef_

    num_TRs = A.shape[1]
    weights = np.zeros((num_TRs, 3))

    # One independent regression per timepoint (column of A)
    for i in range(num_TRs):
        weights[i, :] = LinearRegression().fit(predictors, A[:, i]).coef_

    return weights

In [3]:
def generate_shuffled_indexes(indexes_to_shuffle, num_permutation, seed=None):
    """Return ``num_permutation`` shuffled copies of ``indexes_to_shuffle``.

    Parameters
    ----------
    indexes_to_shuffle : iterable
        Items to permute. The input itself is never modified; every
        permutation is a fresh list.
    num_permutation : int
        Number of shuffled copies to generate.
    seed : str or other printable value, optional
        Base seed. Permutation ``p`` is drawn from a generator seeded with the
        string ``f"{seed}{p}"`` (for a string seed this is ``seed + str(p)``),
        so results are reproducible. With ``seed=None`` the permutations are
        not reproducible.

    Returns
    -------
    list of lists
        One shuffled list per permutation.
    """
    random_index_container = []
    for p in range(num_permutation):
        # A private generator gives the same sequence as ``random.seed(...)``
        # followed by ``random.shuffle`` but leaves the global RNG state alone.
        if seed is None:
            rng = random.Random()
        else:
            rng = random.Random(f"{seed}{p}")
        shuffled_list = list(indexes_to_shuffle)
        rng.shuffle(shuffled_list)
        random_index_container.append(shuffled_list)
    return random_index_container

In [None]:
def _roi_pattern(beta_dict, key, roi):
    """Return the beta pattern stored under ``key`` for ``roi``.

    For a cortical ROI (a key of the module-level ``ROIs``) the 'lh' and 'rh'
    patterns are indexed with the ROI's first and second vertex list and
    concatenated. Any other ROI name is looked up directly in ``beta_dict``.
    """
    if roi in ROIs:
        verts_l, verts_r = ROIs[roi][0], ROIs[roi][1]
        return np.concatenate((beta_dict['lh'][key][verts_l], beta_dict['rh'][key][verts_r]))
    return beta_dict[roi][key]


def _permutation_z_scores(observed_weights, locus_rep, item_rep, e_residual, shuffled_retrieve_reps):
    """Z-score the observed (locus, item, encode-residual) weights against a null distribution.

    The null distribution is built by re-running ``calculate_weights`` with each
    pattern in ``shuffled_retrieve_reps`` in place of the true retrieval pattern.
    Returns the three z-scores in (locus, item, encode-residual) order.
    """
    null_l, null_i, null_e = [], [], []
    for shuffled_rep in shuffled_retrieve_reps:
        l_null, i_null, e_null = calculate_weights(locus_rep, item_rep, e_residual, shuffled_rep)
        null_l.append(l_null)
        null_i.append(i_null)
        null_e.append(e_null)

    l_obs, i_obs, e_obs = observed_weights
    return ((l_obs - np.mean(null_l)) / np.std(null_l),
            (i_obs - np.mean(null_i)) / np.std(null_i),
            (e_obs - np.mean(null_e)) / np.std(null_e))


def get_story_matrix(sub, seses = ['ses-01','ses-02','ses-03'],expert = False):
    """Build and save, per session, a table with one row per usable recall trial.

    For every session in ``seses`` the subject's recall sheet is read, the beta
    dictionaries are loaded with ``get_beta_dicts`` and, for each trial that can
    be matched to an encoding pair, the following are collected:

    * the encode/retrieve pair labels, whether recall was correct, the story
      text, the speaking duration and cross-encoder scores
      (``getSentenceSimilarity``) between the story and the pair/locus/item;
    * for the cortical ROIs (``ROIs``): pattern correlations between the encode,
      retrieve, locus and item patterns;
    * for all ROIs (cortical and ``hippo_ROIs``): mean activity of each
      pattern, the regression weight of the encoding residual on the retrieval
      pattern, and permutation z-scores of the regression weights.

    The table is written to
    ``../outputs/story_matrix/{sub}_{ses}_story_matrix_withHipp.csv`` (the
    directory must already exist) for use in subsequent analyses in R.

    Parameters
    ----------
    sub : str
        Subject identifier, a key of ``sub2subj``.
    seses : list of str
        Session identifiers (keys of ``ses2w``) to process.
    expert : bool
        Forwarded unchanged to ``get_beta_dicts``.

    Returns
    -------
    None
    """
    n_permutations = 1000

    for ses in seses:
        recall_sheet = pd.read_excel('../updated_sheets/%s_recallperformance.xlsx'%sub2subj[sub], sheet_name=ses2w[ses].lower())

        # Key order defines the column order of the CSV. Every list must end
        # up with exactly one entry per retained trial.
        all_dist = {"encode_pair":[],"retrieve_pair":[], "story":[], "story_deviation":[],"story_deviation_locus":[],"story_deviation_item":[],'speak_duration':[],'sub':[],'ses':[],
                    'e_i_sim_ag':[],'e_i_sim_pmc':[],'e_i_sim_mpfc':[],
                    'e_l_sim_ag':[],'e_l_sim_pmc':[],'e_l_sim_mpfc':[],
                    'e_r_sim_ag':[],'e_r_sim_pmc':[],'e_r_sim_mpfc':[],
                    'e_li_sim_ag':[],'e_li_sim_pmc':[],'e_li_sim_mpfc':[],
                    'encode_ag_univariate':[],'encode_pmc_univariate':[],'encode_mpfc_univariate':[],'retrieve_ag_univariate':[],'retrieve_pmc_univariate':[],'retrieve_mpfc_univariate':[],
                    'locus_ag_univariate':[],'locus_pmc_univariate':[],'locus_mpfc_univariate':[],'item_ag_univariate':[],'item_pmc_univariate':[],'item_mpfc_univariate':[],
                    "encode_anterior_hipp_univariate":[], "encode_posterior_hipp_univariate":[], "encode_hippo_univariate":[],
                    'locus_anterior_hipp_univariate':[], 'locus_posterior_hipp_univariate':[], 'locus_hippo_univariate':[],
                    # This key previously carried leading spaces, which made the
                    # posterior-hippocampus append below fail with a KeyError.
                    'item_anterior_hipp_univariate':[], 'item_posterior_hipp_univariate':[], 'item_hippo_univariate':[],
                    'retrieve_anterior_hipp_univariate':[], 'retrieve_posterior_hipp_univariate':[], 'retrieve_hippo_univariate':[],
                    'e_r_sim_residual_ag':[],'e_r_sim_residual_pmc':[],'e_r_sim_residual_mpfc':[],
                    'beta_e_residual_ag':[],'beta_e_residual_pmc':[],'beta_e_residual_mpfc':[],
                    'beta_e_residual_z_ag':[], 'beta_e_residual_z_pmc':[], 'beta_e_residual_z_mpfc':[],
                    'beta_i_z_ag':[], 'beta_i_z_pmc':[], 'beta_i_z_mpfc':[],
                    'beta_l_z_ag':[], 'beta_l_z_pmc':[], 'beta_l_z_mpfc':[],
                    'beta_e_residual_posterior_hipp':[], 'beta_e_residual_anterior_hipp':[], 'beta_e_residual_hippo':[],
                    'beta_e_residual_z_posterior_hipp':[], 'beta_e_residual_z_anterior_hipp':[], 'beta_e_residual_z_hippo':[],
                    'beta_i_z_posterior_hipp':[], 'beta_i_z_anterior_hipp':[], 'beta_i_z_hippo':[],
                    'beta_l_z_posterior_hipp':[], 'beta_l_z_anterior_hipp':[], 'beta_l_z_hippo':[],
                    'r_i_sim_ag':[],'r_i_sim_pmc':[],'r_i_sim_mpfc':[],
                    'r_l_sim_ag':[],'r_l_sim_pmc':[],'r_l_sim_mpfc':[],
                    'correct':[]}

        loci_beta_dict, item_beta_dict, encode_beta_dict, retrieve_beta_dict = get_beta_dicts(sub,ses, expert = expert)
        encode_loci_list = [l.lower() for l in list(recall_sheet['loci'])]

        # First pass: collect the correctly recalled pairs (retrieved pair equals
        # an encoded pair) and their sheet rows; these form the permutation pool.
        valid_pairs = []
        idxes_in_valid = []
        for i in range(len(recall_sheet)):
            retrieve_pair = recall_sheet['spoken_loci'][i].lower()+'-'+recall_sheet['retrieval'][i].lower()
            locus, item = retrieve_pair.split('-')
            if retrieve_pair in encode_beta_dict['anterior_hipp'] and locus in loci_beta_dict['anterior_hipp']:
                valid_pairs.append(retrieve_pair)
                idxes_in_valid.append(i)

        # Reproducible permutations of the pool, seeded per subject and session.
        shuffled_keys = generate_shuffled_indexes(valid_pairs, n_permutations, sub+ses)

        # Second pass: one output row per usable trial.
        for i in range(len(recall_sheet)):
            retrieve_pair = recall_sheet['spoken_loci'][i].lower()+'-'+recall_sheet['retrieval'][i].lower()
            locus, item = retrieve_pair.split('-')

            if retrieve_pair in encode_beta_dict['anterior_hipp'] and locus in loci_beta_dict['anterior_hipp']:
                # Correct recall: the retrieved pair is itself an encoded pair.
                encode_pair = retrieve_pair
                correct = True
            elif item == 'x' and locus in encode_loci_list and locus in loci_beta_dict['anterior_hipp']:
                # Item marked 'x' at retrieval: use the item that was encoded
                # with this locus according to the recall sheet.
                item = recall_sheet['encode'][encode_loci_list.index(locus)]
                encode_pair = locus.lower()+'-'+item.lower()
                correct = False
            else:
                continue

            all_dist['retrieve_pair'].append(retrieve_pair.lower())
            all_dist['encode_pair'].append(encode_pair.lower())
            all_dist['correct'].append(correct)

            # Compute every story measure before appending anything, so that a
            # failure half-way through cannot leave columns of unequal length.
            try:
                # NOTE(review): column 7 of the sheet is presumably the story text - confirm.
                story = recall_sheet.iloc[i,7]
                speak_duration = recall_sheet['elapsed'][i]
                story_deviation = getSentenceSimilarity(encode_pair,story)
                story_deviation_locus = getSentenceSimilarity(locus,story)
                story_deviation_item = getSentenceSimilarity(item,story)
            except Exception:
                story = speak_duration = np.nan
                story_deviation = story_deviation_locus = story_deviation_item = np.nan
            all_dist['story'].append(story)
            all_dist['speak_duration'].append(speak_duration)
            all_dist['story_deviation'].append(story_deviation)
            all_dist['story_deviation_locus'].append(story_deviation_locus)
            all_dist['story_deviation_item'].append(story_deviation_item)

            # Position of this trial inside each permutation. Only correctly
            # recalled trials are part of the pool; the others get NaN z-scores.
            perm_position = idxes_in_valid.index(i) if correct else None

            for roi in list(ROIs) + hippo_ROIs:
                encode_rep = _roi_pattern(encode_beta_dict, encode_pair, roi)
                locus_rep = _roi_pattern(loci_beta_dict, locus, roi)
                item_rep = _roi_pattern(item_beta_dict, item, roi)
                retrieve_rep = _roi_pattern(retrieve_beta_dict, retrieve_pair, roi)

                # Part of the encoding pattern not explained by locus and item.
                e_residual = get_residual(encode_rep,locus_rep,item_rep)
                l_weight,i_weight,e_weight = calculate_weights(locus_rep, item_rep, e_residual, retrieve_rep)

                z_l = z_i = z_e = np.nan
                if perm_position is not None:
                    try:
                        # Null distribution: retrieval patterns of other pool
                        # members placed at this trial's position.
                        shuffled_reps = (_roi_pattern(retrieve_beta_dict, perm[perm_position], roi)
                                         for perm in shuffled_keys)
                        z_l, z_i, z_e = _permutation_z_scores((l_weight, i_weight, e_weight),
                                                              locus_rep, item_rep, e_residual, shuffled_reps)
                    except Exception:
                        # Best effort: a failed permutation test yields NaN z-scores.
                        z_l = z_i = z_e = np.nan
                all_dist[f'beta_e_residual_z_{roi}'].append(z_e)
                all_dist[f'beta_l_z_{roi}'].append(z_l)
                all_dist[f'beta_i_z_{roi}'].append(z_i)

                all_dist[f'beta_e_residual_{roi}'].append(e_weight)

                all_dist[f'encode_{roi}_univariate'].append(np.mean(encode_rep))
                all_dist[f'retrieve_{roi}_univariate'].append(np.mean(retrieve_rep))
                all_dist[f'item_{roi}_univariate'].append(np.mean(item_rep))
                all_dist[f'locus_{roi}_univariate'].append(np.mean(locus_rep))

                if roi in ROIs:
                    # Pattern-similarity measures are only reported for cortical ROIs.
                    r_residual = get_residual(retrieve_rep,locus_rep,item_rep)
                    locus_item_average = (locus_rep+item_rep)/2

                    all_dist[f'e_i_sim_{roi}'].append(stats.pearsonr(encode_rep,item_rep)[0])
                    all_dist[f'e_l_sim_{roi}'].append(stats.pearsonr(encode_rep,locus_rep)[0])
                    all_dist[f'e_r_sim_{roi}'].append(stats.pearsonr(encode_rep,retrieve_rep)[0])
                    all_dist[f'e_li_sim_{roi}'].append(stats.pearsonr(encode_rep,locus_item_average)[0])
                    all_dist[f'e_r_sim_residual_{roi}'].append(stats.pearsonr(e_residual,r_residual)[0])
                    all_dist[f'r_i_sim_{roi}'].append(stats.pearsonr(retrieve_rep,item_rep)[0])
                    all_dist[f'r_l_sim_{roi}'].append(stats.pearsonr(retrieve_rep,locus_rep)[0])

        # Scalars are broadcast to every row when the DataFrame is built.
        all_dist['sub'] = sub
        all_dist['ses'] = ses
        story_df = pd.DataFrame(all_dist)
        story_df.to_csv(f'../outputs/story_matrix/{sub}_{ses}_story_matrix_withHipp.csv',index=False)

In [5]:
# Build the story matrices of all subjects in parallel using 13 joblib workers.
# get_story_matrix has no return statement, so x is a list of None values;
# the useful output is the set of per-session CSV files it writes.
x = Parallel(n_jobs=13)(delayed(get_story_matrix)(sub) for sub in subjects)

In [None]:
# Combine the per-subject, per-session matrices into a single table
# (subjects in the outer order, sessions in the inner order) and save it.
story_matrices = [
    pd.read_csv(f'../outputs/story_matrix/{sub}_{ses}_story_matrix_withHipp.csv')
    for sub in subjects
    for ses in sessions
]
story_matrix = pd.concat(story_matrices)
story_matrix.to_csv('../outputs/story_matrix_withHipp.csv',index=False)