In [1]:
import os
from os import path as osp
import pandas as pd
import spacy
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
# --- Configuration: which experiment slice to process, and where to read/write ---
input_slice = '0_0'

# Input side: the collected annotations for the chosen slice.
input_base = '../generated_data/experiment_slices/results'
input_dir = osp.join(input_base, input_slice)
input_file = osp.join(input_dir, 'collected_annotations.csv')

# Output side: one sub-directory per slice.
output_base = '../collected_data/intermediate/head_noun_extraction'
output_dir = osp.join(output_base, input_slice)

# Create the output directory (including parents) if it is not there yet.
output_dir_exists = osp.isdir(output_dir)
if not output_dir_exists:
    print(f'create dir {output_dir}')
    os.makedirs(output_dir)

In [None]:
def cut_conjunctions(s, spacy_model):
    """Truncate ``s`` right before its first ``cc`` (conjunction) token.

    ``s`` is parsed with ``spacy_model``. The first token whose dependency
    label is ``'cc'`` marks the cut point, and the text of all tokens before
    it is returned. If no such token exists, ``s`` is returned unchanged.
    """
    parsed = spacy_model(s)

    # First token labelled 'cc', or None when the parse contains none.
    first_cc = next((tok for tok in parsed if tok.dep_ == 'cc'), None)
    if first_cc is None:
        return s

    return parsed[:first_cc.i].text

def clip_initial_article(s, spacy_model, normalize=True):
    """Remove a leading determiner from ``s``.

    Args:
        s: The annotation string.
        spacy_model: Callable that turns a string into a spaCy ``Doc``; its
            part-of-speech tags decide whether the first token is clipped.
        normalize: If True, lower-case ``s`` and strip surrounding whitespace
            before parsing.

    Returns:
        The (optionally normalized) string starting at its second token when
        the first token is tagged ``DET``. Otherwise the (optionally
        normalized) string unchanged. This includes empty input and input
        that consists of a single determiner, where nothing would be left
        after clipping.
    """
    if normalize:
        s = s.lower().strip()

    doc = spacy_model(s)

    # Fewer than two tokens (empty string, or a lone word that may itself be
    # a determiner): there is nothing sensible to clip, so keep the string.
    if len(doc) < 2 or doc[0].pos_ != 'DET':
        return s

    # ``idx`` is the character offset of the token in the parsed string, so
    # slicing there drops the determiner and the whitespace following it.
    return s[doc[1].idx:]

def get_compounds(token):
    """Collect ``token`` together with its (transitive) compound modifiers.

    Starting at ``token``, every child whose dependency label is
    ``'compound'`` is followed, and so are the compound children of those
    children. Children with any other label are ignored, along with
    everything below them.

    Returns:
        A list beginning with ``token`` itself, followed by the compound
        tokens in depth-first, children-in-order sequence.
    """
    collected = []
    pending = [token]  # explicit stack instead of recursion

    while pending:
        current = pending.pop()
        collected.append(current)

        compound_children = [
            child for child in current.children if child.dep_ == 'compound'
        ]
        # Push in reverse so that the first child is popped (visited) first.
        pending.extend(reversed(compound_children))

    return collected

def get_compound_str(token):
    """Return ``token`` and its compound modifiers as one space-joined string.

    The tokens gathered by ``get_compounds`` are put into document order
    (ascending token index ``i``) and their texts are joined with single
    spaces.
    """
    parts = get_compounds(token)
    parts.sort(key=lambda tok: tok.i)
    return ' '.join(tok.text for tok in parts)

# adapted from kilogram code (+ compounds)

def get_np_head(s, spacy_model, normalize=True):
    """Heuristically select the head token of an annotation string.

    Args:
        s: The annotation string.
        spacy_model: Callable that turns a string into a parsed spaCy ``Doc``.
        normalize: If True, lower-case ``s`` and strip surrounding whitespace
            before parsing.

    Returns:
        The selected token, or ``None`` if no rule matched (this includes an
        empty parse). The rules are:

        * a single-token parse returns that token;
        * a ROOT tagged NOUN, INTJ, PROPN, PRON, ADJ or ADV is the head;
        * for a VERB root, the head is the root's first child, unless that
          child is a ``prep`` or the root has no children, in which case the
          root itself is the head;
        * for an ADP root, the head is the root's last child (the root itself
          if it has no children);
        * a ROOT whose text is ``'can'`` is always the head.

        If the parse contains several ROOT tokens, the last matching one wins.
    """
    if normalize:
        s = s.lower().strip()

    # Hard-coded typo fix: a doubled leading article ("aa dog" -> "a dog").
    # Only the prefix is changed; str.replace would also rewrite every later
    # occurrence of "aa " (e.g. "aa baa sheep" -> "a ba sheep").
    if s.startswith('aa '):
        s = s[1:]

    # get tree
    doc = spacy_model(s)

    # single word
    if len(doc) == 1:
        return doc[0]

    nominal_like = ('NOUN', 'INTJ', 'PROPN', 'PRON', 'ADJ', 'ADV')

    np_head = None
    for token in doc:
        if token.dep_ != 'ROOT':
            continue

        root_pos = token.head.pos_
        if root_pos in nominal_like:
            np_head = token
        elif root_pos == 'VERB':
            children = list(token.children)
            # A childless root (e.g. a one-word second sentence) has no child
            # to promote, so fall back to the root instead of raising.
            if not children or children[0].dep_ == 'prep':
                np_head = token
            else:
                np_head = children[0]
        elif root_pos == 'ADP':
            children = list(token.children)
            np_head = children[-1] if children else token

        # hard code "xx can" utterances
        if token.text == 'can':
            np_head = token

    return np_head

def get_head_string(s, spacy_model, normalize=True):
    """Return the text of the head token of ``s`` (without compounds).

    The head is determined by ``get_np_head``. ``None`` is returned when no
    head is found.
    """
    head_token = get_np_head(s, spacy_model, normalize=normalize)
    return head_token.text if head_token else None

def get_head_compound_string(s, spacy_model, normalize=True):
    """Return the head of ``s`` together with its compound modifiers.

    The head is determined by ``get_np_head`` and expanded to a string by
    ``get_compound_str``. ``None`` is returned when no head is found.
    """
    head_token = get_np_head(s, spacy_model, normalize=normalize)
    return get_compound_str(head_token) if head_token else None

In [4]:
# Load the collected annotations for the configured slice. The first CSV
# column is used as the index, and only the item identifier and the raw
# annotation text are kept.
ann_df = pd.read_csv(input_file, index_col=0)[
    ['item_identifyer', 'raw_annotation']
]

# spaCy pipeline handed to the helper functions as `spacy_model`.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Show the loaded annotations (identifier + raw annotation per row).
ann_df

Unnamed: 0,item_identifyer,raw_annotation
0,3-none-bws0_0,A PRIEST
1,3-none-bws0_1,praying person
2,3-none-bws0_10,a person reading a book
3,3-none-bws0_2,human in robe
4,3-none-bws0_3,Pulpit
...,...,...
2547,69-sea_bottom-sws16_15,sea otter
2548,70-bathroom-sws16_15,chair
2549,70-bedroom-sws12_13,TENTS
2550,71-office-sws16_15,desk lamp


In [6]:
# Step 1: flatten line breaks to spaces and trim the raw annotation strings
# (overwrites the `raw_annotation` column).
print('remove newline characters and trailing whitespace...')
ann_df['raw_annotation'] = ann_df['raw_annotation'].progress_map(
    lambda text: text.replace('\n', ' ').strip()
)

# Step 2: normalized annotation with a leading determiner removed.
print('clipping articles...')
ann_df['clean_annotation'] = ann_df['raw_annotation'].progress_map(
    lambda text: clip_initial_article(text, nlp)
)

# Step 3: head noun including compound modifiers, computed from the raw
# (not the clipped) annotation.
# Alternative without compounds: get_head_string(text, nlp).
print('extracting head nouns (with compounds)...')
ann_df['head_noun'] = ann_df['raw_annotation'].progress_map(
    lambda text: get_head_compound_string(text, nlp)
)

# Comment column, initialised with empty strings.
ann_df['comments'] = ''

remove newline characters and trailing whitespace...


  0%|          | 0/2552 [00:00<?, ?it/s]

clipping articles...


  0%|          | 0/2552 [00:00<?, ?it/s]

extracting head nouns (with compounds)...


  0%|          | 0/2552 [00:00<?, ?it/s]

In [7]:
# Split the annotations into two disjoint halves: a seeded random 50% sample
# and everything that is not part of that sample.
ann_df_0 = ann_df.sample(frac=0.5, random_state=123).sort_index()

in_first_half = ann_df.index.isin(ann_df_0.index)
ann_df_1 = ann_df.loc[~in_first_half].sort_index()

# Sanity check: both halves together must reproduce the full frame.
recombined = pd.concat([ann_df_0, ann_df_1]).sort_index()
pd.testing.assert_frame_equal(ann_df, recombined)

# Write each half to its own numbered CSV file in the output directory.
for part_no, part_df in enumerate((ann_df_0, ann_df_1)):
    part_path = osp.join(output_dir, f'collected_annotations_clean_auto_{part_no}.csv')
    print(f'write df {part_no} to {part_path}')
    part_df.to_csv(part_path)

write df 0 to ../collected_data/intermediate/head_noun_extraction/0_0/collected_annotations_clean_auto_0.csv
write df 1 to ../collected_data/intermediate/head_noun_extraction/0_0/collected_annotations_clean_auto_1.csv


In [8]:
# Number of rows in each of the two halves.
len(ann_df_0), len(ann_df_1)

(1276, 1276)

In [9]:
# Preview the first rows of the sampled half.
ann_df_0.head()

Unnamed: 0,item_identifyer,raw_annotation,clean_annotation,head_noun,comments
0,3-none-bws0_0,A PRIEST,priest,priest,
5,3-none-bws0_4,person praying,person praying,person,
7,3-kitchen-sws6_10,table,table,table,
12,3-kitchen-sws6_4,knight,knight,knight,
13,3-bathroom-sws2_0,bathroom attendant,bathroom attendant,bathroom attendant,


In [10]:
# Preview the first rows of the complementary half.
ann_df_1.head()

Unnamed: 0,item_identifyer,raw_annotation,clean_annotation,head_noun,comments
1,3-none-bws0_1,praying person,praying person,person,
2,3-none-bws0_10,a person reading a book,person reading a book,person,
3,3-none-bws0_2,human in robe,human in robe,human,
4,3-none-bws0_3,Pulpit,pulpit,pulpit,
6,3-kitchen-sws6_0,rooms,rooms,rooms,
