In [79]:
import pickle
import os

from neural_nlp.benchmarks.neural import *
from neural_nlp.models import *
import neural_nlp
from neural_nlp.models.implementations import *
from neural_nlp.stimuli import StimulusSet
import xarray as xr

# (identifier, constructor) pairs for every benchmark used in this notebook.
benchmark_pool = [
    # primary benchmarks
    ('Pereira2018-encoding', PereiraEncoding),
    ('Fedorenko2016v3-encoding', Fedorenko2016V3Encoding),
    ('Blank2014fROI-encoding', Blank2014fROIEncoding),
    # secondary benchmarks
    ('Pereira2018-rdm', PereiraRDM),
    ('Fedorenko2016v3-rdm', Fedorenko2016V3RDM),
    ('Fedorenko2016v3nonlang-encoding', Fedorenko2016V3NonLangEncoding),
    ('Blank2014fROI-rdm', Blank2014fROIRDM),
]
# Rebind the name to an identifier -> LazyLoad mapping. The lambda's default
# arguments freeze the identifier/constructor pair of each entry.
# NOTE(review): LazyLoad presumably defers building the benchmark until first use -- confirm.
benchmark_pool = dict(
    (identifier, LazyLoad(lambda identifier=identifier, ctr=ctr: ctr(identifier=identifier)))
    for identifier, ctr in benchmark_pool
)

# Fetch the stimulus set stored in the attrs of the Pereira2018 encoding benchmark's target assembly.
benchmark_test = benchmark_pool['Pereira2018-encoding']
stimuli_df = benchmark_test._target_assembly.attrs['stimulus_set']
stimuli_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [43]:
# Inspect the target assembly. Only the last expression of a cell is rendered,
# so the shape on the first line is evaluated but not displayed.
benchmark_test._target_assembly.values.shape
benchmark_test._target_assembly.coords

Coordinates:
  * presentation      (presentation) MultiIndex
  - stimulus_num      (presentation) int64 0 0 1 1 2 2 3 ... 12 12 13 13 14 14
  - passage_index     (presentation) int64 1 1 1 1 1 1 1 1 2 ... 3 4 4 4 4 4 4 4
  - passage_label     (presentation) object 'Accordion' ... 'dreams'
  - passage_category  (presentation) object 'music' 'beekeeping' ... 'dreams'
  - stimulus_id       (presentation) object '384sentences.0' ... '243sentences.14'
  - story             (presentation) object '384sentences.music' ... '243sentences.dreams'
  - experiment        (presentation) object '384sentences' ... '243sentences'
  * neuroid           (neuroid) MultiIndex
  - subject           (neuroid) object '018' '018' '018' ... '018' '018' '018'
  - voxel_num         (neuroid) int64 28 29 31 32 38 42 ... 152 153 154 159 160
  - atlas             (neuroid) object 'language' 'language' ... 'language'
  - filter_strategy   (neuroid) object '' '' '' '' '' '' ... '' '' '' '' '' ''
  - atlas_selection   (

# Create ablated datasets

#### template: local /Users/carinakauf/repos/6.884-final-project/package-data-mollica.ipynb

In [70]:
import random

# Set the seed for reproducibility (note: PassageShuffle was run without this seed!)
random.seed(10)

In [80]:
# Plain Python list holding the unmodified sentences in stimulus-set order.
original_stimuli = stimuli_df['sentence'].tolist()
original_stimuli

['Beekeeping encourages the conservation of local habitats.',
 "It is in every beekeeper's interest to conserve local plants that produce pollen.",
 'As a passive form of agriculture, it does not require that native vegetation be cleared to make way for crops.',
 'Beekeepers also discourage the use of pesticides on crops, because they could kill the honeybees.',
 'Artisanal beekeepers go to extremes for their craft, but their product is worth the effort.',
 'Artisanal honey-making emphasizes quality and character over quantity and consistency.',
 'To produce the finest honey, beekeepers become micromanagers of their honeybees.',
 'They scout the fields, know when nectar flows, and select the best ways to extract honey.',
 'As the beekeeper opens the hive, the deep hum of 40,000 bees fills the air.',
 'The beekeeper checks honey stores, pollen supplies, and the bee nursery.',
 "Bees crawl across his bare arms and hands, but they don't sting, because they're gentle.",
 "I have a recurrin

In [46]:
#!pip3 install spacy

In [47]:
import spacy
import spacy.cli

# Load the small English pipeline. Download it only when it is not installed
# yet, so that re-running the notebook does not hit the network every time.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:  # spacy.load raises OSError when the model package is missing
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


    # POS tags
    # Universal POS Tags
    # http://universaldependencies.org/u/pos/
    "ADJ": "adjective",
    "ADP": "adposition",
    "ADV": "adverb",
    "AUX": "auxiliary",
    "CONJ": "conjunction",
    "CCONJ": "coordinating conjunction",
    "DET": "determiner",
    "INTJ": "interjection",
    "NOUN": "noun",
    "NUM": "numeral",
    "PART": "particle",
    "PRON": "pronoun",
    "PROPN": "proper noun",
    "PUNCT": "punctuation",
    "SCONJ": "subordinating conjunction",
    "SYM": "symbol",
    "VERB": "verb",
    "X": "other",
    "EOL": "end of line",
    "SPACE": "space",

In [49]:
# Quick check: print the (token, POS tag) pairs of the first sentence only.
for sent in original_stimuli[:1]:
    tokens = nlp(sent)
    print([(elm, elm.pos_) for elm in tokens])

[(Beekeeping, 'NOUN'), (encourages, 'VERB'), (the, 'DET'), (conservation, 'NOUN'), (of, 'ADP'), (local, 'ADJ'), (habitats, 'NOUN'), (., 'PUNCT')]


In [50]:
#helper function to randomly delete 50% of POS tags from list w/o shuffling the order
def delete_random_elems(input_list):
    """Return a copy of ``input_list`` with half of its elements removed.

    ``int(len(input_list) / 2)`` positions are drawn at random and dropped;
    the surviving elements keep their relative order. The global ``random``
    generator is re-seeded with 10 on every call, so lists of equal length
    always lose the same positions.
    """
    random.seed(10)
    total = len(input_list)
    # Positions (not values) are sampled so that duplicates are handled correctly.
    doomed = set(random.sample(range(total), int(total / 2)))
    survivors = []
    for position, item in enumerate(input_list):
        if position in doomed:
            continue
        survivors.append(item)
    return survivors

In [51]:
def get_ablated_datasets(sentences,ablation_type,delete50percent=False):
    """Reduce every sentence to the words of the requested part-of-speech group.

    Each sentence is tagged with the module-level spaCy pipeline ``nlp``; the
    selected tokens are lower-cased and joined with single spaces.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to ablate.
    ablation_type : str
        Which words to keep: 'nouns', 'verbs', 'nounsverbs', 'nounsverbsadj',
        'contentwords' (nouns, verbs, adjectives, adverbs) or 'functionwords'
        (every token that is NOT a content word, punctuation included).
    delete50percent : bool, optional
        If true, additionally drop half (rounded down) of the kept words of
        each sentence via ``delete_random_elems``. Ignored for 'functionwords'.

    Returns
    -------
    list of str
        One ablated string per input sentence, in input order.

    Raises
    ------
    ValueError
        If ``ablation_type`` is not one of the supported conditions.
    """
    nouns = ['NOUN', 'PROPN', 'PRON'] #same as in O'Connor & Andreas (2021)
    verbs = ['VERB']
    adjectives = ['ADJ']
    adverbs = ['ADV']
    content = nouns + verbs + adjectives + adverbs

    pos_by_condition = {
        'nouns': nouns,
        'verbs': verbs,
        'nounsverbs': nouns + verbs,
        'nounsverbsadj': nouns + verbs + adjectives,
        'contentwords': content,
        'functionwords': content, #these tags are excluded rather than kept
    }
    if ablation_type not in pos_by_condition:
        # Fail early with a clear message instead of printing and then
        # hitting an unbound `pos_list` further down.
        raise ValueError(
            f"Unknown condition {ablation_type!r}; expected one of {sorted(pos_by_condition)}"
        )
    pos_list = pos_by_condition[ablation_type]
    keep_listed_tags = ablation_type != "functionwords"

    ablated_sentences = []
    for sent in sentences:
        # Keep tokens whose tag is in pos_list, or the complement for function words.
        words = [str(elm).lower() for elm in nlp(sent)
                 if (elm.pos_ in pos_list) == keep_listed_tags]
        if keep_listed_tags and delete50percent:
            words = delete_random_elems(words)
        ablated_sentences.append(' '.join(words))

    return ablated_sentences

### function test

In [52]:
# Function test: build the 'nouns' condition for all stimuli and preview the first ten results.
noun_sentences = get_ablated_datasets(original_stimuli,ablation_type='nouns')
noun_sentences[:10]

['beekeeping conservation habitats',
 'it beekeeper interest plants pollen',
 'form agriculture it vegetation way crops',
 'beekeepers use pesticides crops they honeybees',
 'beekeepers extremes their craft their product effort',
 'honey making quality character quantity consistency',
 'honey beekeepers micromanagers their honeybees',
 'they fields nectar ways honey',
 'beekeeper hum bees air',
 'beekeeper honey stores pollen supplies bee nursery']

In [53]:
# Same 'nouns' condition, but with delete50percent=True; preview the first ten results.
noun_50percent_sentences = get_ablated_datasets(original_stimuli,ablation_type='nouns',delete50percent=True)
noun_50percent_sentences[:10]

['beekeeping conservation',
 'beekeeper interest plants',
 'agriculture it crops',
 'use pesticides honeybees',
 'extremes their product effort',
 'making quality consistency',
 'beekeepers micromanagers their',
 'fields nectar ways',
 'bees air',
 'honey stores bee nursery']

In [57]:
# Print the index of every sentence that ended up as an empty string
# (no output means every sentence kept at least one word).
for ind, elm in enumerate(noun_50percent_sentences):
    if elm != "":
        continue
    print(ind)

In [55]:
def get_datsaset(stimuli_df, original_stimuli, ablation_type,delete50percent=False,
                 savedir='/om/user/ckauf/neural-nlp/ressources/scrambled-stimuli-dfs'):
    """Build an ablated copy of ``stimuli_df``, pickle it, and return it.

    Parameters
    ----------
    stimuli_df : pandas.DataFrame
        Stimulus table; it is copied, not modified.
    original_stimuli : list of str
        Sentences to ablate, one per row of ``stimuli_df``.
    ablation_type : str
        Condition name passed on to ``get_ablated_datasets``; also used in
        the output file name.
    delete50percent : bool, optional
        Passed on to ``get_ablated_datasets``; when true the file name gets
        the ``_delete50percent`` suffix.
    savedir : str, optional
        Existing directory the pickle is written to.

    Returns
    -------
    pandas.DataFrame
        Copy of ``stimuli_df`` whose "sentence" column holds the ablated text.
    """
    # Forward the flag so the stored data matches the `_delete50percent` file name.
    ablated_sentences = get_ablated_datasets(original_stimuli, ablation_type,
                                             delete50percent=delete50percent)

    ablated_df = stimuli_df.copy()
    ablated_df["sentence"] = ablated_sentences

    suffix = '_delete50percent' if delete50percent else ''
    fname = f'stimuli_{ablation_type}{suffix}.pkl'
    with open(os.path.join(savedir, fname), 'wb') as fout:
        pickle.dump(ablated_df, fout)
    return ablated_df

In [56]:
# Create the 50%-deleted noun dataset (get_datsaset also pickles the result to disk).
abl_df = get_datsaset(stimuli_df, original_stimuli, ablation_type="nouns", delete50percent=True)

# Loop over the ablation types; every call writes one pickle file.
ablation_types = ["contentwords", "nouns", "verbs", "nounsverbs", "nounsverbsadj", "functionwords"]

# abl_df is overwritten on each iteration, so the frame displayed at the end
# of the cell belongs to the last entry of ablation_types.
for abl in ablation_types:
    abl_df = get_datsaset(stimuli_df, original_stimuli, ablation_type=abl)
abl_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,the of .,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,is in every 's to that .,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"as a of , does not that be to for .",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,"the of on , because could the .",3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,"to for , but the .",4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,some to .,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,a is a .,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,a is as a .,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,a can and .,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


# Create random noun replacement condition

In [89]:
def get_random_noun_dataset(sentences):
    """Replace the nouns of every sentence by random nouns from the whole set.

    Every sentence is tagged with the module-level spaCy pipeline ``nlp``.
    Its NOUN/PROPN/PRON tokens are counted, and the same number of
    lower-cased nouns is sampled (without replacement within one sentence)
    from the pool of all nouns of all sentences.

    Parameters
    ----------
    sentences : iterable of str
        Sentences whose nouns should be replaced.

    Returns
    -------
    list of str
        One space-joined string of random nouns per sentence; the empty
        string for a sentence without nouns.

    Raises
    ------
    ValueError
        If no noun set different from the sentence's own nouns could be drawn.
    """
    # Set the random seed for reproducibility.
    random.seed(10)

    pos_list = ['NOUN', 'PROPN', 'PRON'] #same as in O'Connor & Andreas (2021)

    # Nouns of each sentence, kept per sentence to compare against the draw later.
    nouns_in_sentences = [
        [str(elm).lower() for elm in nlp(sent) if elm.pos_ in pos_list]
        for sent in sentences
    ]
    # Pool of all nouns (duplicates kept, so frequent nouns are drawn more often).
    all_nouns = [noun for nouns in nouns_in_sentences for noun in nouns]

    max_attempts = 100
    ablated_sentences = []
    for original_nouns in nouns_in_sentences:
        random_nouns = random.sample(all_nouns, len(original_nouns))
        # A sentence without nouns has nothing to replace; comparing two empty
        # sets would always flag a (meaningless) match.
        if original_nouns:
            attempts = 1
            # Redraw when the sample happens to reproduce the sentence's own nouns.
            while set(random_nouns) == set(original_nouns):
                if attempts >= max_attempts:
                    raise ValueError(
                        "Could not draw a noun set that differs from the original sentence"
                    )
                random_nouns = random.sample(all_nouns, len(original_nouns))
                attempts += 1
        ablated_sentences.append(' '.join(random_nouns))

    return ablated_sentences
# Build the random-noun condition for all stimuli.
random_nouns = get_random_noun_dataset(original_stimuli)

# assign() works on a copy, so stimuli_df keeps its original sentences.
ablated_df = stimuli_df.assign(sentence=random_nouns)

# Pickle the new stimulus table and display it.
savedir = '/om/user/ckauf/neural-nlp/ressources/scrambled-stimuli-dfs'
fname = 'stimuli_randomnouns.pkl'
with open(os.path.join(savedir, fname), mode='wb') as fout:
    pickle.dump(ablated_df, fout)
ablated_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,forestry horror noise,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,edge handles they elements gas,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,leaves skis family molecule studies mouth,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,dishes anvil toaster station clarinet foundations,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,nail he end people dress overflow designs,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,mediterranean she plant,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,insulation role,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,cod pulp,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,owl driving,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


In [95]:
# Check that the noun and random-noun datasets contain the same number of
# words in every sentence.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(os.path.join(savedir, 'stimuli_randomnouns.pkl'), mode="rb") as f:
    check_randomnouns_df = pickle.load(f)
with open(os.path.join(savedir, 'stimuli_nouns.pkl'), mode="rb") as f:
    check_nouns_df = pickle.load(f)

# Word count per sentence (whitespace split) for both datasets.
sent_len_randomnouns = list(map(lambda text: len(text.split()), check_randomnouns_df["sentence"]))
sent_len_nouns = list(map(lambda text: len(text.split()), check_nouns_df["sentence"]))
assert sent_len_randomnouns == sent_len_nouns
print("Done")

Done


# Create sentence shuffling condition (random shuffling that ignores passage boundaries, sentence length, etc.)

In [82]:
from random import shuffle
# Set the seed for reproducibility (note: PassageShuffle was run without this seed!)
random.seed(9)

# Shuffle a copy so that original_stimuli keeps its order.
shuffled_sentences = list(original_stimuli)
shuffle(shuffled_sentences)

# Quick test: print every position whose sentence stayed in place
# (no index printed means every sentence moved).
for ind in range(len(original_stimuli)):
    if original_stimuli[ind] == shuffled_sentences[ind]:
        print(ind)
print("Done")

shuffled_sentences

Done


['Violin is the smallest and highest pitched instrument in its family.',
 'The man wore a jet black tuxedo with satin stripes.',
 'Scissors are a two-bladed instrument used for cutting.',
 'A tomato softens and changes color from green to red as it gets more ripe.',
 'A refrigerator is generally white and people often put magnets on its surface.',
 'Black tie trousers should be a perfect match to the tuxedo jacket.',
 'People set cutlery, plates and food on a table to eat a meal together.',
 'A mountain is a landform that rises steeply above surrounding land.',
 'Some computer science conferences have art exhibitions that showcase the latest graphics technology.',
 'Female mosquitoes bite people and animals and suck their blood for protein.',
 'This owl hunts at night, flying swiftly and silently through the darkness.',
 'The elephant flaps its large ears to cool the blood in them and its body.',
 'A floor is a horizontal surface inside a building, used for walking on.',
 'A tropical c

In [84]:
# assign() works on a copy, so stimuli_df keeps its original sentences.
ablated_df = stimuli_df.assign(sentence=shuffled_sentences)

# Pickle the shuffled stimulus table and display it.
savedir = '/om/user/ckauf/neural-nlp/ressources/scrambled-stimuli-dfs'
fname = 'stimuli_randomsentenceshuffle.pkl'
with open(os.path.join(savedir, fname), mode='wb') as fout:
    pickle.dump(ablated_df, fout)
ablated_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,Violin is the smallest and highest pitched ins...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,The man wore a jet black tuxedo with satin str...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,Scissors are a two-bladed instrument used for ...,2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,A tomato softens and changes color from green ...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,A refrigerator is generally white and people o...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,She pressed her wings to her sides and settled...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,Patients are faced with the pain of the break ...,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,In waterbeds the mattress is filled with water...,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,In buildings the floor often has pipes and ele...,382,384sentences.382,384sentences,384sentences.human,96,Woman,human


# Create passage shuffling Stimuli_XArray

In [13]:
#index by experiment and passage_label/passage_index >> shuffle within those.
##NOTE: Haven't lowercased here!

from random import shuffle

def get_shuffled_within_passage(stimuli_df):
    """Shuffle the sentences of every passage without crossing passage borders.

    Passages are identified by the ("experiment", "passage_index") pair.
    Experiments and, within each experiment, passage indices are visited in
    sorted order and the shuffled sentences are concatenated in that order.
    Only passage indices that actually occur in an experiment are visited.

    Parameters
    ----------
    stimuli_df : pandas.DataFrame
        Table with "experiment", "passage_index" and "sentence" columns.

    Returns
    -------
    list of str
        All sentences, grouped by experiment and passage. Each passage with
        at least two distinct sentences is guaranteed to come out in an order
        different from the original; other passages are returned unchanged.
    """
    shuffled_sentences = []
    for exp in sorted(stimuli_df["experiment"].unique()):
        exp_df = stimuli_df.loc[stimuli_df["experiment"] == exp]
        # Take the passage indices from this experiment only, so experiments
        # with fewer passages never yield an empty selection.
        for ind in sorted(exp_df["passage_index"].unique()):
            original_order = list(exp_df.loc[exp_df["passage_index"] == ind, "sentence"])
            curr_sent = original_order.copy()
            # A different order only exists with at least two distinct
            # sentences; otherwise the retry loop below could never finish.
            if len(set(curr_sent)) > 1:
                while True:
                    shuffle(curr_sent)
                    if curr_sent != original_order:
                        break
            shuffled_sentences += curr_sent
    return shuffled_sentences
# Shuffle the sentence order inside every passage of the original stimulus set.
shuffled_sentences = get_shuffled_within_passage(stimuli_df)

In [14]:
# Display the passage-shuffled sentences for a visual check.
shuffled_sentences

["It is in every beekeeper's interest to conserve local plants that produce pollen.",
 'Beekeeping encourages the conservation of local habitats.',
 'As a passive form of agriculture, it does not require that native vegetation be cleared to make way for crops.',
 'Beekeepers also discourage the use of pesticides on crops, because they could kill the honeybees.',
 'Artisanal honey-making emphasizes quality and character over quantity and consistency.',
 'To produce the finest honey, beekeepers become micromanagers of their honeybees.',
 'They scout the fields, know when nectar flows, and select the best ways to extract honey.',
 'Artisanal beekeepers go to extremes for their craft, but their product is worth the effort.',
 'As the beekeeper opens the hive, the deep hum of 40,000 bees fills the air.',
 "Bees crawl across his bare arms and hands, but they don't sting, because they're gentle.",
 'The beekeeper checks honey stores, pollen supplies, and the bee nursery.',
 "I have a recurrin

In [15]:
# assign() works on a copy, so stimuli_df keeps its original sentences.
ablated_df = stimuli_df.assign(sentence=shuffled_sentences)

# Pickle the passage-shuffled stimulus table and display it.
savedir = '/om/user/ckauf/neural-nlp/ressources/scrambled-stimuli-dfs'
fname = 'stimuli_passageshuffle.pkl'
with open(os.path.join(savedir, fname), mode='wb') as fout:
    pickle.dump(ablated_df, fout)
ablated_df

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category
0,It is in every beekeeper's interest to conserv...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
1,Beekeeping encourages the conservation of loca...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping
4,Artisanal honey-making emphasizes quality and ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping
...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part
623,A woman can become pregnant and bear children.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human
625,A woman is a female human adult.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human
