In [1]:
import pandas as pd
import numpy as np
import childespy
import math

In [2]:
# set parameters

# input CSV of sampled childes tokens; read into sampled_full_tokens further down
FULL_SAMPLED_TOKENS_CSV_NAME = 'sampled_full.csv'
# output CSV; the filtered, tagger-annotated samples are written here in the last cell
TAGGED_SAMPLED_TOKENS_CSV_NAME = 'sampled_tagged.csv'

In [3]:
### set up get_nlp_tokenizer such that it returns a tagger's parse of an utterance without '_' or '+' ###
"""
the return value of get_nlp_tokenizer should be in the form
[{'text': text, 'pos': pos, 'morph': morph} for token in utterance]
"""

# spacy

import spacy
# module-level spaCy pipeline; get_nlp_tokenizer below calls it once per utterance
nlp = spacy.load("en_core_web_sm")

def get_nlp_tokenizer(utterance):
    """
    Run the module-level spaCy pipeline `nlp` over an utterance.

    '_' and '+' are replaced by spaces before parsing, so the tagger sees them
    as token boundaries.

    returns: one dict per tagger token with the keys 'text', 'pos' and 'morph'
    ('morph' is the token's morphological features joined with '|')
    """
    cleaned = utterance.replace("_", " ").replace("+", " ")
    parsed = []
    for tagger_token in nlp(cleaned):
        parsed.append({
            'text': tagger_token.text,
            'pos': tagger_token.pos_,
            'morph': '|'.join(tagger_token.morph),
        })
    return parsed


# stanza
''' 
import stanza

stanza.download('en')
nlp = stanza.Pipeline('en')

def get_nlp_tokenizer(utt):
    info = nlp(utt.replace("_", " ").replace("+", " "))
    all_tokens = []
    for sent in info.sentences:
        all_tokens.extend([{'text':word.text, 'pos':word.upos, 'morph':word.feats} for word in sent.words])
    return all_tokens
'''

# corenlp
'''
import stanza

corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = corenlp_dir

from stanza.server import CoreNLPClient

# https://stanfordnlp.github.io/stanza/client_properties.html

client = CoreNLPClient(
    annotators=['tokenize','ssplit', 'pos', 'depparse'], 
    memory='4G', 
    endpoint='http://localhost:9001',
    be_quiet=True)
print(client)

def get_nlp_tokenizer(utt):
    # nlp(utt.replace("_", " ").replace("+", " "))._sentences[0]
    info = client.annotate(utt.replace("_", " ").replace("+", " "))
    return [{'text':token.word, 'pos':token.pos, 'morph':''} for token in info.sentence[0].token]
'''


'\nimport stanza\n\ncorenlp_dir = \'./corenlp\'\nstanza.install_corenlp(dir=corenlp_dir)\n\n# Set the CORENLP_HOME environment variable to point to the installation location\nimport os\nos.environ["CORENLP_HOME"] = corenlp_dir\n\nfrom stanza.server import CoreNLPClient\n\n# https://stanfordnlp.github.io/stanza/client_properties.html\n\nclient = CoreNLPClient(\n    annotators=[\'tokenize\',\'ssplit\', \'pos\', \'depparse\'], \n    memory=\'4G\', \n    endpoint=\'http://localhost:9001\',\n    be_quiet=True)\nprint(client)\n\ndef get_nlp_tokenizer(utt):\n    # nlp(utt.replace("_", " ").replace("+", " "))._sentences[0]\n    info = client.annotate(utt.replace("_", " ").replace("+", " "))\n    return [{\'text\':token.word, \'pos\':token.pos, \'morph\':\'\'} for token in info.sentence[0].token]\n'

In [None]:
### PROCESSING THE SAMPLED TOKENS (FULL_SAMPLED_TOKENS_CSV_NAME) BY SPLITTING CONTRACTIONS INTO SEPARATE TOKENS ###

In [4]:
# keep_default_na=False: empty cells are read as '' rather than NaN, which the later
# string tests on columns such as 'suffix' and 'clitic' rely on
# index_col=0: the first CSV column is used as the row index
sampled_full_tokens = pd.read_csv(FULL_SAMPLED_TOKENS_CSV_NAME, keep_default_na=False, index_col=0)
sampled_full_tokens

Unnamed: 0,token_id,gloss,language,token_order,prefix,stem,actual_phonology,model_phonology,suffix,num_morphemes,...,corpus_id,speaker_id,target_child_id,transcript_id,utterance_id,frequency,log_frequency,log_frequency_bin,utterance_gloss,part_of_speech
0,62438075,this,eng,1,,this,,,,1,...,328,22744,22743,42448,17084600,10570,9.265775,4,this one looks like she's driving a,pro:dem
1,61402650,need,eng,3,,need,,,,1,...,328,22707,22704,42252,16839687,2591,7.859799,4,now I need,v
2,62051390,anything,eng,4,,anything,,,,1,...,328,22729,22728,42378,16986849,407,6.008813,2,did we see anything else when we were there,pro:indef
3,61453077,merrily,eng,4,,merry,,,dadj LY,3,...,328,22721,22720,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv
4,61453075,merrily,eng,2,,merry,,,dadj LY,3,...,328,22721,22720,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,62816244,largest,eng,4,,large,,,SP,2,...,328,22756,22755,42518,17177704,18,2.890372,1,one of the largest snakes is the giant anacond...,adj
996,61971711,picture's,eng,17,,picture,,,,2,...,328,22729,22728,42372,16979363,3,1.098612,0,okay let's put them all take them out and put ...,n
997,61474947,onto,eng,9,,onto,,,,1,...,328,22721,22720,42286,16859093,111,4.709530,2,it's a whole stack so Rusty's pushing Percy on...,prep
998,62551990,dumping,eng,2,,dump,dʌmpiŋ,dʌmpɪŋ,PRESP,2,...,328,22743,22743,42433,17094010,17,2.833213,1,it's dumping some dirt,part


In [5]:
# alignment function

def childes_to_tagger_mapping(utterance, tokenizer_info):
    '''
    Align the tokens of a childesdb utterance with the tokens produced by the tagger.

    Both token sequences are laid out over one shared "alignment string": the utterance
    with all whitespace, '_' and '+' removed. A childes token maps to every tagger token
    that covers at least one of its characters in that string.

    utterance: utterance as recorded in childesdb (contains spaces, underscores, plus signs, apostrophes, etc.)
    tokenizer_info: list of dicts with (at least) a 'text' key, one per tagger token,
        e.g. the return value of get_nlp_tokenizer
    returns: dictionary mapping the (zero-based) token index in the utterance to the sorted
        list of tagger token indices that the token maps to
    raises: KeyError if the tagger tokens cover fewer characters than the utterance
    '''

    # map each character index of the alignment string to the tagger token covering it
    char_index_to_tokenizer = dict()
    char_index = 0
    for tagger_index, tagger_token in enumerate(tokenizer_info):
        text = tagger_token['text']
        # whitespace never occurs in the alignment string, so it must not be counted here
        # either: a tagger can emit whitespace-only tokens (e.g. for the double space left
        # behind when "a+ b" becomes "a  b"), which would otherwise shift every later token
        visible_length = sum(1 for character in text if not character.isspace())
        for i in range(char_index, char_index + visible_length):
            char_index_to_tokenizer[i] = tagger_index
        char_index += visible_length

    # walk the childes tokens over the same alignment string and collect the tagger tokens they touch
    utterance_to_tokenizer_index = dict()
    char_index = 0
    for index, token in enumerate(utterance.split()):
        # '_' and '+' are replaced by spaces before tagging, so they take up no alignment characters
        token_char_length = len(token) - token.count("_") - token.count("+")
        tokenizer_tokens = {
            char_index_to_tokenizer[i]
            for i in range(char_index, char_index + token_char_length)
        }
        utterance_to_tokenizer_index[index] = sorted(tokenizer_tokens)
        char_index += token_char_length

    return utterance_to_tokenizer_index


In [6]:
def tagger_to_childes_mapping(utterance, tokenizer_info):
    '''
    Align the tokens produced by the tagger with the tokens of a childesdb utterance.

    Both token sequences are laid out over one shared "alignment string": the utterance
    with all whitespace, '_' and '+' removed. A tagger token maps to every childes token
    that covers at least one of its characters in that string.

    utterance: utterance as recorded in childesdb (contains spaces, underscores, plus signs, apostrophes, etc.)
    tokenizer_info: list of dicts with (at least) a 'text' key, one per tagger token,
        e.g. the return value of get_nlp_tokenizer
    returns: dictionary mapping the (zero-based) token index in the tagger's tokenizer to the
        sorted list of childes token indices that the token maps to; a whitespace-only
        tagger token maps to an empty list
    raises: KeyError if the tagger tokens cover more characters than the utterance
    '''

    # map each character index of the alignment string to the childes token covering it
    index_to_childes = dict()
    char_index = 0
    for index, token in enumerate(utterance.split()):
        # '_' and '+' are replaced by spaces before tagging, so they take up no alignment characters
        token_char_length = len(token) - token.count("_") - token.count("+")
        for i in range(char_index, char_index + token_char_length):
            index_to_childes[i] = index
        char_index += token_char_length

    # walk the tagger tokens over the same alignment string and collect the childes tokens they touch
    tagger_to_childes = dict()
    char_index = 0
    for token_index, tagger_token in enumerate(tokenizer_info):
        # tokenizer_info holds dicts, so the text must be read with ['text'] (not .text)
        text = tagger_token['text']
        # whitespace never occurs in the alignment string, so whitespace emitted by the
        # tagger (e.g. a token for a double space) must not consume alignment characters
        visible_length = sum(1 for character in text if not character.isspace())
        indices = {
            index_to_childes[i]
            for i in range(char_index, char_index + visible_length)
        }
        tagger_to_childes[token_index] = sorted(indices)
        char_index += visible_length

    return tagger_to_childes


In [7]:
# if clitic exists, take first of POS "+" first of clitic
  # create new POS category for contractions
# work on a copy so the frame loaded from the CSV (sampled_full_tokens) is not modified
token_copy = sampled_full_tokens.copy()

# take the general category, i.e. anything before the colon, except for dets
def simplify_pos(pos):
    """
    Reduce a part-of-speech tag to the part before its first colon.

    The tag is returned unchanged when it has no colon, when it contains a space,
    or when the part before the colon is "det".
    """
    category, colon, _ = pos.partition(":")
    if not colon or " " in pos or category == "det":
        return pos
    return category

# determine if it's a contraction
def contraction_pos(x):
    """
    Decide whether a sampled token is a contraction and build its gloss / POS accordingly.

    x: one row of the token table; the fields 'part_of_speech', 'clitic', 'suffix',
       'gloss', 'utterance_gloss' and 'token_order' (one-indexed) are read

    returns: (gloss, part_of_speech) unchanged when the token aligns with at most one
    tagger token; otherwise (tagger sub-token texts joined with "_", one POS per
    sub-token joined with "+"), where the last POS is 'poss' if the suffix contains
    "POSS", else the first word of the clitic if there is one, else the token's own POS
    """
    pos_tag = x['part_of_speech']
    clitic_tag = x['clitic']
    suffix_tag = x['suffix']
    original_gloss = x['gloss']
    utterance_text = x['utterance_gloss']
    position = x['token_order']

    parsed = get_nlp_tokenizer(utterance_text)
    # token_order is one-indexed, the alignment is keyed by zero-based position
    subtoken_ids = childes_to_tagger_mapping(utterance_text, parsed)[position - 1]
    subtoken_count = len(subtoken_ids)

    # a token covered by a single tagger token is not a contraction
    if subtoken_count <= 1:
        return (original_gloss, pos_tag)

    joined_gloss = "_".join(parsed[tagger_id]['text'] for tagger_id in subtoken_ids)

    # determine part of speech of the final sub-token
    # this is somewhat english-centric (mostly for suffix), not sure how to get around that
    if "POSS" in suffix_tag:
        last_pos = 'poss'
    elif clitic_tag != "":
        last_pos = clitic_tag.split()[0]
    else:
        last_pos = pos_tag

    # every sub-token except the last one gets the token's own POS
    return (joined_gloss, (pos_tag + "+") * (subtoken_count - 1) + last_pos)

# collapse fine-grained childes tags to their general category first
token_copy['part_of_speech'] = token_copy['part_of_speech'].map(simplify_pos)

# contraction_pos returns a (gloss, part_of_speech) pair per row; "expand" turns it into columns 0 and 1
new = token_copy.apply(contraction_pos, axis=1, result_type="expand")
token_copy['gloss'], token_copy['part_of_speech'] = new[0], new[1]

# remove every column whose name contains "unnamed" (case-insensitive)
unnamed_columns = token_copy.columns[token_copy.columns.str.contains('unnamed', case=False)]
token_copy.drop(columns=unnamed_columns, inplace=True)
token_copy

# token_copy['gloss'], token_copy['part_of_speech'] = token_copy.apply(lambda x: contraction_pos(x), axis=1)
# print(token_copy[["gloss", "part_of_speech"]])

Unnamed: 0,token_id,gloss,language,token_order,prefix,stem,actual_phonology,model_phonology,suffix,num_morphemes,...,corpus_id,speaker_id,target_child_id,transcript_id,utterance_id,frequency,log_frequency,log_frequency_bin,utterance_gloss,part_of_speech
0,62438075,this,eng,1,,this,,,,1,...,328,22744,22743,42448,17084600,10570,9.265775,4,this one looks like she's driving a,pro
1,61402650,need,eng,3,,need,,,,1,...,328,22707,22704,42252,16839687,2591,7.859799,4,now I need,v
2,62051390,anything,eng,4,,anything,,,,1,...,328,22729,22728,42378,16986849,407,6.008813,2,did we see anything else when we were there,pro
3,61453077,merrily,eng,4,,merry,,,dadj LY,3,...,328,22721,22720,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv
4,61453075,merrily,eng,2,,merry,,,dadj LY,3,...,328,22721,22720,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,62816244,largest,eng,4,,large,,,SP,2,...,328,22756,22755,42518,17177704,18,2.890372,1,one of the largest snakes is the giant anacond...,adj
996,61971711,picture_'s,eng,17,,picture,,,,2,...,328,22729,22728,42372,16979363,3,1.098612,0,okay let's put them all take them out and put ...,n+cop
997,61474947,onto,eng,9,,onto,,,,1,...,328,22721,22720,42286,16859093,111,4.709530,2,it's a whole stack so Rusty's pushing Percy on...,prep
998,62551990,dumping,eng,2,,dump,dʌmpiŋ,dʌmpɪŋ,PRESP,2,...,328,22743,22743,42433,17094010,17,2.833213,1,it's dumping some dirt,part


In [8]:
# work on a copy of the contraction-annotated tokens
thousand_samples = token_copy.copy()
# position of a row's sub-token within its original childes token; stays 0 for rows that are not split
thousand_samples['split_index'] = 0

def is_contraction(gloss, pos):
    """
    Return True when a token should be split into several rows.

    That is the case when the part of speech contains "+", or when the gloss
    contains "_" or "+"; otherwise False is returned.
    """
    return "+" in pos or any(marker in gloss for marker in ("_", "+"))

def split_word(gloss):
    """
    Split a gloss into its sub-tokens.

    The first separator found, tried in the order "_", "+", " ", is used to split.

    returns: list of sub-token strings. A gloss without any separator is printed
    (as a diagnostic) and returned as a one-element list, so callers can always
    take len() of / iterate over the result instead of receiving None.
    """
    for separator in ("_", "+", " "):
        if separator in gloss:
            return gloss.split(separator)
    # nothing to split on: report the unexpected gloss and keep it as a single piece
    print(gloss)
    return [gloss]

# split contractions into one row per sub-token
#
# The frames are built from a boolean mask, a list of records and a single pd.concat:
#   * DataFrame.append (used here before) was removed in pandas 2.0 and copies the whole
#     frame on every call when used in a loop
#   * removing the contraction rows via concat + drop_duplicates(keep=False) would also
#     remove any sample rows that happen to be exact duplicates of each other

# True for every row that has to be split
contraction_mask = np.array(
    [
        is_contraction(gloss, pos)
        for gloss, pos in zip(thousand_samples['gloss'], thousand_samples['part_of_speech'])
    ],
    dtype=bool,
)

# save the original one-line contractions (these rows are left out of the result below)
contractions_to_delete = thousand_samples[contraction_mask]

split_records = []
split_labels = []  # original row label of every split row, kept as the new rows' index
for label, record in zip(contractions_to_delete.index, contractions_to_delete.to_dict('records')):
    # fall back to the unsplit gloss if split_word finds no separator
    split_gloss = split_word(record['gloss']) or [record['gloss']]
    split_pos = record['part_of_speech'].split("+")
    for i, sub_gloss in enumerate(split_gloss):
        new_record = dict(record)
        new_record['split_index'] += i
        new_record['gloss'] = sub_gloss
        # reuse the last POS if there are fewer POS parts than gloss parts
        new_record['part_of_speech'] = split_pos[min(i, len(split_pos) - 1)]
        split_records.append(new_record)
        split_labels.append(label)

split_contractions = pd.DataFrame(split_records, index=split_labels, columns=thousand_samples.columns)

samples_without_contractions = thousand_samples[~contraction_mask]
# skip the split frame when there was nothing to split, so its empty columns cannot affect dtypes
frames_to_combine = [samples_without_contractions]
if not split_contractions.empty:
    frames_to_combine.append(split_contractions)
samples_with_split_contractions = pd.concat(frames_to_combine).reset_index()

# unsplit rows first (original order), then the split rows; fresh 0..n-1 index
final_samples = samples_with_split_contractions.drop(columns=['index'])
final_samples

#final_samples = samples_with_split_contractions.filter(['id', 'token_id', 'gloss', "utterance_id", 'utterance_gloss', "part_of_speech", "log_frequency_bin", "speaker_role", "token_order", "split_index"])
#final_samples.rename(columns = {'id':'token_id'}, inplace = True) 
#print(final_samples)


Unnamed: 0,token_id,gloss,language,token_order,prefix,stem,actual_phonology,model_phonology,suffix,num_morphemes,...,speaker_id,target_child_id,transcript_id,utterance_id,frequency,log_frequency,log_frequency_bin,utterance_gloss,part_of_speech,split_index
0,62438075,this,eng,1,,this,,,,1,...,22744,22743,42448,17084600,10570,9.265775,4,this one looks like she's driving a,pro,0
1,61402650,need,eng,3,,need,,,,1,...,22707,22704,42252,16839687,2591,7.859799,4,now I need,v,0
2,62051390,anything,eng,4,,anything,,,,1,...,22729,22728,42378,16986849,407,6.008813,2,did we see anything else when we were there,pro,0
3,61453077,merrily,eng,4,,merry,,,dadj LY,3,...,22721,22720,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv,0
4,61453075,merrily,eng,2,,merry,,,dadj LY,3,...,22721,22720,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,61643679,'s,eng,8,,nephew,,,dn POSS,3,...,22729,22728,42318,16904905,2,0.693147,0,my niece's birthday was Monday and my nephew's...,poss,1
1098,61529288,Noah,eng,2,,Noah,,,dn POSS,3,...,22721,22720,42284,16873728,2,0.693147,0,the Noah's ark you have yeah,adj,0
1099,61529288,'s,eng,2,,Noah,,,dn POSS,3,...,22721,22720,42284,16873728,2,0.693147,0,the Noah's ark you have yeah,poss,1
1100,61971711,picture,eng,17,,picture,,,,2,...,22729,22728,42372,16979363,3,1.098612,0,okay let's put them all take them out and put ...,n,0


In [9]:
# add a tagger_part_of_speech and tagger_morphology column

"""
def tagger_pos(token, utterance, token_order, split_index):
  '''
  note that token_order is one-indexed
  '''
  info = get_nlp_tokenizer(utterance)
  alignment = childes_to_tagger_mapping(utterance, info)
  if token_order - 1 in alignment:
    tokenizer_index = alignment[token_order - 1]
    if split_index >= len(tokenizer_index):
        print(token)
        print(utterance)
        print(token_order)
        print(split_index)
        print(alignment)
    return info[tokenizer_index[split_index]].pos_

  # otherwise, the alignment didn't work or the token maps to multiple tagger categories
  print("1")
  print(token, token_order)
  print(utterance)
  print(str(alignment))
  info = nlp(token)
  return info[0].pos_

def tagger_morph(token, utterance, token_order, split_index):
  info = get_nlp_tokenizer(utterance)
  alignment = childes_to_tagger_mapping(utterance, info)
  if token_order - 1 in alignment:
    tokenizer_index = alignment[token_order - 1]
    return "|".join(info[tokenizer_index[split_index]].morph)

  # otherwise, the alignment didn't work or the token maps to multiple tagger categories
  print("2")
  print(token, token_order)
  print(utterance)
  print(str(alignment))
  info = nlp(token)
  return "|".join(info[0].morph)
"""

def _tagger_attribute_clean(token, utterance, token_order, split_index, key):
    '''
    shared implementation of tagger_pos_clean and tagger_morph_clean

    token: gloss of the (sub-)token, used for diagnostics and for the fallback parse
    utterance: utterance as recorded in childesdb
    token_order: one-indexed position of the childes token in utterance
    split_index: zero-based position of the sub-token within the childes token
    key: entry of the tagger token dict to return ('pos' or 'morph')

    returns: the requested tagger attribute of the aligned tagger token. If the token cannot
    be aligned (position missing from the alignment, not enough sub-tokens, or a tagger index
    out of range) diagnostics are printed and the token is tagged on its own instead; ''
    is returned if that standalone parse yields no tokens.
    '''
    utt_clean = " ".join(utterance.replace("xxx", "").replace("yyy", "").split())
    # ASSUMPTION FOR PROVIDENCE AND PARIS (NOT NECESSARILY TRUE FOR OTHER CORPUSES): NO CHILDES TOKEN HAS A SPACE IN IT
    # new token order is the original minus the number of xxx's and yyy's among the first
    # token_order words of the original utt (the slice includes the token itself)
    childes_tokens_before = utterance.split()[:token_order]
    tokord_clean = token_order - childes_tokens_before.count("xxx") - childes_tokens_before.count("yyy")
    info = get_nlp_tokenizer(utt_clean)
    alignment = childes_to_tagger_mapping(utt_clean, info)

    if tokord_clean - 1 in alignment:
        tokenizer_index = alignment[tokord_clean - 1]
        if split_index >= len(tokenizer_index):
            problem = "not enough subtokens"
        elif tokenizer_index[split_index] >= len(info):
            problem = "not enough tagger tokens"
        else:
            return info[tokenizer_index[split_index]][key]
        # the alignment exists but cannot serve this sub-token: report it, then fall back
        # (indexing anyway, as was done before, raised IndexError right after the report)
        print(problem)
        print(token)
        print(utterance)
        print(token_order)
        print(split_index)
        print(alignment)
    else:
        # otherwise, the alignment didn't work or the token maps to multiple tagger categories
        print(token, token_order)
        print(utterance)
        print(str(alignment))

    # fallback: tag the token in isolation. get_nlp_tokenizer returns dicts, so [key] works;
    # the raw nlp(token) result holds tagger token objects, which cannot be subscripted
    standalone = get_nlp_tokenizer(token)
    return standalone[0][key] if standalone else ''


def tagger_pos_clean(token, utterance, token_order, split_index):
    '''
    returns the tagger's part of speech for the given (sub-)token of utterance

    note that token_order is one-indexed
    "_clean" functions remove xxx and yyy from utterance before passing into tagger
    if the token cannot be aligned with the tagger's tokens, diagnostics are printed and
    the token is tagged on its own
    '''
    return _tagger_attribute_clean(token, utterance, token_order, split_index, 'pos')


def tagger_morph_clean(token, utterance, token_order, split_index):
    '''
    returns the tagger's morphology string for the given (sub-)token of utterance

    note that token_order is one-indexed
    "_clean" functions remove xxx and yyy from utterance before passing into tagger
    if the token cannot be aligned with the tagger's tokens, diagnostics are printed and
    the token is tagged on its own
    '''
    return _tagger_attribute_clean(token, utterance, token_order, split_index, 'morph')

# add one column per tagger attribute; each tagger function is called once per row with
# the row's gloss, utterance, one-indexed token position and sub-token position
for tagger_column, tagger_function in (
    ("tagger_part_of_speech", tagger_pos_clean),
    ("tagger_morphology", tagger_morph_clean),
):
    final_samples[tagger_column] = final_samples.apply(
        lambda row, tag=tagger_function: tag(
            row['gloss'], row['utterance_gloss'], row['token_order'], row['split_index']
        ),
        axis=1,
    )

final_samples

Unnamed: 0,token_id,gloss,language,token_order,prefix,stem,actual_phonology,model_phonology,suffix,num_morphemes,...,transcript_id,utterance_id,frequency,log_frequency,log_frequency_bin,utterance_gloss,part_of_speech,split_index,tagger_part_of_speech,tagger_morphology
0,62438075,this,eng,1,,this,,,,1,...,42448,17084600,10570,9.265775,4,this one looks like she's driving a,pro,0,DET,Number=Sing|PronType=Dem
1,61402650,need,eng,3,,need,,,,1,...,42252,16839687,2591,7.859799,4,now I need,v,0,VERB,Tense=Pres|VerbForm=Fin
2,62051390,anything,eng,4,,anything,,,,1,...,42378,16986849,407,6.008813,2,did we see anything else when we were there,pro,0,PRON,Number=Sing
3,61453077,merrily,eng,4,,merry,,,dadj LY,3,...,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv,0,ADV,
4,61453075,merrily,eng,2,,merry,,,dadj LY,3,...,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv,0,ADV,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,61643679,'s,eng,8,,nephew,,,dn POSS,3,...,42318,16904905,2,0.693147,0,my niece's birthday was Monday and my nephew's...,poss,1,PART,
1098,61529288,Noah,eng,2,,Noah,,,dn POSS,3,...,42284,16873728,2,0.693147,0,the Noah's ark you have yeah,adj,0,PROPN,NounType=Prop|Number=Sing
1099,61529288,'s,eng,2,,Noah,,,dn POSS,3,...,42284,16873728,2,0.693147,0,the Noah's ark you have yeah,poss,1,PART,
1100,61971711,picture,eng,17,,picture,,,,2,...,42372,16979363,3,1.098612,0,okay let's put them all take them out and put ...,n,0,NOUN,Number=Sing


In [10]:
# columns that are left out of the exported table
columns_to_drop = [
    'prefix', 'stem', 'actual_phonology', 'model_phonology', 'corpus_name', 'suffix',
    'clitic', 'num_morphemes', 'english', 'language', 'speaker_id', 'target_child_id',
    'utterance_type', 'transcript_id', 'speaker_code', 'speaker_name',
    'target_child_name', 'target_child_sex', 'collection_name', 'collection_id',
    'corpus_id', 'frequency', 'log_frequency', 'split_index',
]
final_samples_filtered = final_samples.drop(columns_to_drop, axis=1)

final_samples_filtered

Unnamed: 0,token_id,gloss,token_order,speaker_role,target_child_age,utterance_id,log_frequency_bin,utterance_gloss,part_of_speech,tagger_part_of_speech,tagger_morphology
0,62438075,this,1,Mother,29.624838,17084600,4,this one looks like she's driving a,pro,DET,Number=Sing|PronType=Dem
1,61402650,need,3,Mother,40.329370,16839687,4,now I need,v,VERB,Tense=Pres|VerbForm=Fin
2,62051390,anything,4,Mother,41.920779,16986849,2,did we see anything else when we were there,pro,PRON,Number=Sing
3,61453077,merrily,4,Mother,20.131830,16851507,1,merrily merrily merrily merrily,adv,ADV,
4,61453075,merrily,2,Mother,20.131830,16851507,1,merrily merrily merrily merrily,adv,ADV,
...,...,...,...,...,...,...,...,...,...,...,...
1097,61643679,'s,8,Mother,19.033245,16904905,0,my niece's birthday was Monday and my nephew's...,poss,PART,
1098,61529288,Noah,2,Mother,24.821865,16873728,0,the Noah's ark you have yeah,adj,PROPN,NounType=Prop|Number=Sing
1099,61529288,'s,2,Mother,24.821865,16873728,0,the Noah's ark you have yeah,poss,PART,
1100,61971711,picture,17,Mother,36.822111,16979363,0,okay let's put them all take them out and put ...,n,NOUN,Number=Sing


In [30]:
# write the filtered, tagger-annotated samples to the output CSV named in the parameters cell
final_samples_filtered.to_csv(TAGGED_SAMPLED_TOKENS_CSV_NAME)
