# code for constructing the ENGLISH 2022 fMRI regressors
- donald dunagan


### Imports

In [23]:
import math

import pandas as pd

import matplotlib.pyplot as plt

import numpy as np
import numpy.linalg as npl

from nilearn.glm.first_level import compute_regressor

from scipy.stats import pearsonr

import seaborn as sns

%matplotlib inline

In [24]:
# Set the default figure size used by the plots drawn later in this notebook.
sns.set(rc={'figure.figsize':(17,8)})

# Calculate Regressors

In [25]:
'''
This code is from Christophe Pallier:
https://github.com/chrplr/lpp-scripts3/blob/master/models/en/bottomup-topdown-ortho/orthonormalize.py
'''
def ortho_proj(Y, M):
    """Return the part of Y that is orthogonal to the columns of M and to a constant.

    Y is regressed (ordinary least squares) on a design made of an intercept
    column followed by M; the residual of that fit is returned. M may be a
    1-D vector or a 2-D matrix whose rows line up with the rows of Y.
    """
    # a bare vector is promoted to a single-column matrix
    predictors = M[:, np.newaxis] if M.ndim == 1 else M
    # design matrix: intercept column first, then the predictors
    design = np.column_stack((np.ones(len(predictors)), predictors))
    coefficients = npl.lstsq(design, Y, rcond=None)[0]
    # component of Y explained by the design (the fitted values)
    fitted = design @ coefficients
    return Y - fitted

## 10 January Traversals
- This gives us the bottom-up complexity metric.

In [27]:
# Read in the traversal spreadsheet (one row per word); its 'proj_new_bu'
# column is the source of the bottom-up regressor built in convolve_regressors.
traversal_spreadsheet = pd.read_csv('disc_proj_en_10Jan2022.csv')
traversal_spreadsheet

Unnamed: 0,praatword,word,chapter,section,section_time_onset,section_time_offset,whole_time_onset,whole_time_offset,subtl freq,disc_new_td,disc_new_bu,disc_new_lc,disc_new_rev,proj_new_td,proj_new_bu,proj_new_lc,proj_new_rev
0,once,once,1,1,0.113,0.728,0.113,0.728,344.88,5,1,2,2,5,1,2,2
1,when,when,1,1,0.728,0.919,0.728,0.919,2034.10,5,1,2,2,2,1,2,2
2,i,i,1,1,0.919,1.025,0.919,1.025,39971.16,1,1,2,2,2,1,2,2
3,was,was,1,1,1.025,1.158,1.025,1.158,5654.73,0,1,1,1,1,0,1,1
4,six,six,1,1,1.158,1.464,1.158,1.464,199.53,2,0,1,1,2,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15383,tell,tell,27,9,725.985,726.295,5621.980,5622.300,1724.49,2,0,1,1,2,0,1,1
15384,me,me,27,9,726.295,726.435,5622.300,5622.430,9241.94,1,2,2,2,1,2,2,2
15385,he,he,27,9,726.435,726.525,5622.430,5622.520,7637.20,3,1,2,2,3,1,2,2
15386,s,is,27,9,726.525,726.615,5622.520,5622.610,20731.39,1,0,1,1,1,0,1,1


## 9 April Complexities
- Use the rule system to pull out the WH-question and object-relative-clause items
- Object relatives are Rule 1
- WH questions are Rule 2

In [None]:
# Read in the 9 April complexity spreadsheet; its
# 'new-disc-filler-gap-only-rule-1' and 'new-disc-filler-gap-only-rule-2'
# columns are inspected below and used in convolve_regressors.
april9_complexity_spreadsheet = pd.read_csv('COMPLEXITIES__en_9apr2022.csv')
april9_complexity_spreadsheet

## Check out Object relatives

In [None]:
# rows whose rule-1 (object relative) value is at least 1
april9_complexity_spreadsheet.loc[
    april9_complexity_spreadsheet['new-disc-filler-gap-only-rule-1'].ge(1)
]

## Check out WH

In [None]:
# rows whose rule-2 (WH question) value is at least 1
april9_complexity_spreadsheet.loc[
    april9_complexity_spreadsheet['new-disc-filler-gap-only-rule-2'].ge(1)
]

## GPT2 surprisal

In [9]:
# Read in the tab-separated GPT-2 spreadsheet (one row per token); its
# 'surprisal' column is the source of the gpt2_surprisal regressor.
gpt2_spreadsheet = pd.read_csv('Prince_gpt2.tsv',sep='\t')
gpt2_spreadsheet

Unnamed: 0,chapter,section,section_time_onset,section_time_offset,whole_time_onset,whole_time_offset,sentence_id,token_id,token,surprisal,maybe_mismatch
0,1,1,0.113,0.728,0.113,0.728,1,1,Once,0.000000,False
1,1,1,0.728,0.919,0.728,0.919,1,2,when,8.462531,False
2,1,1,0.919,1.025,0.919,1.025,1,3,I,2.585655,False
3,1,1,1.025,1.158,1.025,1.158,1,4,was,2.780438,False
4,1,1,1.158,1.464,1.158,1.464,1,5,six,7.878005,False
...,...,...,...,...,...,...,...,...,...,...,...
15383,27,9,725.985,726.295,5621.980,5622.300,1602,11,tell,4.710349,False
15384,27,9,726.295,726.435,5622.300,5622.430,1602,12,me,3.268568,False
15385,27,9,726.435,726.525,5622.430,5622.520,1602,13,he,9.189348,False
15386,27,9,726.525,726.615,5622.520,5622.610,1602,14,is,2.041129,False


In [12]:
# Number of volumes in each of the nine sections, in section order;
# convolve_regressors looks a section up with n_scans[section_num-1].
n_scans = [282,298,340,303,265,343,325,292,368]

In [13]:
# total number of volumes across all sections
sum(n_scans)

2816

## Regressors

In [14]:
def _hrf_convolve(onsets, amplitudes, frame_times):
    """Convolve zero-duration events with the SPM HRF, sampled at frame_times.

    compute_regressor() takes exp_condition as a 3 x num_events matrix of
    (onsets, durations, amplitudes); every duration is set to zero here.
    Only the first element of its return value (the regressor values) is kept.
    """
    exp_condition = np.vstack((onsets, np.zeros(len(onsets)), amplitudes))
    return compute_regressor(exp_condition=exp_condition,
                             hrf_model="spm",
                             frame_times=frame_times)[0]


def _regressor_from_csv(name, section_num, frame_times):
    """Convolve the onset/amplitude events stored in '<section_num>_<name>.csv'."""
    events = pd.read_csv('{}_{}.csv'.format(section_num, name))
    return _hrf_convolve(events.onset, events.amplitude, frame_times)


def convolve_regressors(section_num):
    """Build the HRF-convolved regressors for one section.

    Parameters
    ----------
    section_num : int
        1-based section number. It selects the rows of the module-level
        spreadsheets whose 'section' column equals it, the per-section
        '<section_num>_rms.csv', '<section_num>_freq.csv' and
        '<section_num>_f0.csv' files, and the entry n_scans[section_num-1].

    Returns
    -------
    pandas.DataFrame
        One row per sampling time, with columns word_rate, rms, freq, f0,
        obj_relatives, WH, projective_bottomup, gpt2_surprisal and section.
        The regressors of interest (obj_relatives, WH, projective_bottomup,
        gpt2_surprisal) are orthogonalized against word_rate with ortho_proj.
    """
    n_volumes = n_scans[section_num-1]
    # the sampling times: one every 2.0 starting at 0.0, shared by all regressors
    frame_times = np.arange(0.0, n_volumes * 2.0, 2.0)

    # select this section's rows once per spreadsheet instead of once per use
    complexity_rows = april9_complexity_spreadsheet[april9_complexity_spreadsheet['section']==section_num]
    traversal_rows = traversal_spreadsheet[traversal_spreadsheet['section']==section_num]
    gpt2_rows = gpt2_spreadsheet[gpt2_spreadsheet['section']==section_num]

    #########
    #regressors of non-interest
    #########

    # word rate: a unit-amplitude event at every word's section_time_offset
    word_rate = _hrf_convolve(complexity_rows.section_time_offset,
                              np.ones(len(complexity_rows)),
                              frame_times)
    rms = _regressor_from_csv('rms', section_num, frame_times)
    freq = _regressor_from_csv('freq', section_num, frame_times)
    f0 = _regressor_from_csv('f0', section_num, frame_times)

    ############################
    #regressors of interest
    ############################

    def _of_interest(rows, amplitudes):
        # events placed at word offsets, then orthogonalized against word rate
        convolved = _hrf_convolve(rows.section_time_offset, amplitudes, frame_times)
        return ortho_proj(convolved, word_rate)

    #object relative clauses (rule 1)
    obj_relatives = _of_interest(complexity_rows,
                                 complexity_rows['new-disc-filler-gap-only-rule-1'])
    #WH questions (rule 2)
    WH = _of_interest(complexity_rows,
                      complexity_rows['new-disc-filler-gap-only-rule-2'])
    #proj_new_bu
    projective_bottomup = _of_interest(traversal_rows, traversal_rows['proj_new_bu'])
    #GPT-2 surprisal
    gpt2_surprisal = _of_interest(gpt2_rows, gpt2_rows.surprisal)

    #store all of the regressors, in the same column order as before
    regressors_df = pd.DataFrame({
        'word_rate': word_rate.flatten(),
        'rms': rms.flatten(),
        'freq': freq.flatten(),
        'f0': f0.flatten(),
        'obj_relatives': obj_relatives.flatten(),
        'WH': WH.flatten(),
        'projective_bottomup': projective_bottomup.flatten(),
        'gpt2_surprisal': gpt2_surprisal.flatten(),
        'section': [section_num]*n_volumes,
    })

    return regressors_df

In [15]:
# Build the regressors for every section and stack them into one table.
# DataFrame.append was removed in pandas 2.0 (and re-copied the growing frame
# on every iteration), so the per-section frames are collected in a list and
# concatenated once. Row indices are kept as-is, matching the old append calls.
sections=[1,2,3,4,5,6,7,8,9]
section_frames = []
for i in sections:
    data = convolve_regressors(i)
    section_frames.append(data)
big_data = pd.concat(section_frames)

In [None]:
# display the combined regressor table for all sections
big_data

In [17]:
# write the combined regressors as a tab-separated file, without the row index
big_data.to_csv(
    'lpp_en_regressors_19Apr2022.tsv',
    sep='\t',
    index=False,
)

## Correlations

In [None]:
# pairwise correlations between the regressors (the section label is excluded)
corr = big_data.drop(columns=['section']).corr()
# mask the upper triangle, diagonal included, so each pair is shown only once
mask = np.triu(np.ones_like(corr))
picture = sns.heatmap(corr, mask=mask, annot=True, center=0, linewidth=.5)