In [1]:
import pandas as pd
import spacy
import numpy as np
import childespy
from os import path

In [9]:
# Number of equal-width log-frequency bins used to stratify tokens within a part of speech
NUM_BINS = 5
# Size of the final random sample drawn from the pooled per-POS samples
TOTAL_SAMPLES = 1000
# Sampling budget per part-of-speech group; it is divided evenly across the NUM_BINS bins
MAX_POS_SAMPLES = 250

# Query filters handed to childespy when tokens are not read from TOKEN_CSV_NAME
# (i.e. TOKEN_CSV_NAME is None or the file does not exist)
COLLECTION = None
LANGUAGE = None
CORPUS = 'Providence'
ROLE = None

# Optional cached CSVs: when a name is set and the file exists, it is read instead of querying childespy
UTTERANCE_CSV_NAME = None 
TOKEN_CSV_NAME = None

# Output path for the sampled tokens together with their utterance context
FULL_SAMPLED_TOKENS_CSV_NAME = 'sampled_full.csv'

In [14]:
# Prefer a cached token CSV when one is configured and present; otherwise query childespy.
use_cached_tokens = TOKEN_CSV_NAME is not None and path.exists(TOKEN_CSV_NAME)
all_tokens = (
    # keep_default_na=False keeps blank cells as empty strings rather than NaN
    pd.read_csv(TOKEN_CSV_NAME, keep_default_na=False, index_col=0)
    if use_cached_tokens
    else childespy.get_tokens(
        collection=COLLECTION,
        language=LANGUAGE,
        corpus=CORPUS,
        role=ROLE,
        token="%",
    )
)


In [15]:
# Work on a copy of the raw tokens and apply the filters one after another, in the same
# order as before, so each filter only sees the rows that survived the previous one.
token_copy = (
    all_tokens.copy()
    .loc[lambda df: df['part_of_speech'].astype(bool)]          # drop tokens with an empty POS
    .loc[lambda df: df['part_of_speech'].str.find(" ") == -1]   # drop tokens carrying several POS tags
    .loc[lambda df: ~df['gloss'].str.contains('xxx')]           # drop glosses containing 'xxx'
    .loc[lambda df: ~df['gloss'].str.contains('yyy')]           # drop glosses containing 'yyy'
)

def simplify_pos(pos):
    '''
    Strip the sub-category from a POS tag: everything from the first ":" onward is removed.
    Tags without a ":" and tags containing a space are returned unchanged.
    '''
    if " " in pos or ":" not in pos:
        return pos
    main_category, _, _ = pos.partition(":")
    return main_category

def contraction_pos(part_of_speech, clitic, suffix):
    '''
    Extend a POS tag with a marker for possessives and contractions.

    this function is very english-centric and childes-centric
    could incorporate another model, e.g. spacy, to find contractions

    part_of_speech: the (already simplified) POS tag of the token
    clitic: the token's clitic annotation; only its first whitespace-separated
            piece is used. Empty, whitespace-only, or non-string (None/NaN)
            values are treated as "no clitic".
    suffix: the token's suffix annotation; "dn POSS" and "dn AGT POSS" mark possessives

    returns part_of_speech + "+poss" for possessive suffixes,
            part_of_speech + "+" + <first clitic piece> when a clitic is present,
            and part_of_speech unchanged otherwise
    '''
    # The possessive suffix takes precedence over any clitic.
    if suffix in ("dn POSS", "dn AGT POSS"):
        return part_of_speech + "+poss"

    # split() on a whitespace-only string yields an empty list, and a missing value
    # (None / NaN) has no split() at all; neither case should raise.
    clitic_parts = clitic.split() if isinstance(clitic, str) else []
    if not clitic_parts:
        return part_of_speech
    return part_of_speech + "+" + clitic_parts[0]

simplified_pos_tokens = token_copy.copy()
simplified_pos_tokens['part_of_speech'] = token_copy['part_of_speech'].map(simplify_pos)
simplified_pos_tokens['part_of_speech'] = simplified_pos_tokens.apply(lambda x: contraction_pos(x['part_of_speech'], x['clitic'], x['suffix']), axis=1)
simplified_pos_tokens


Unnamed: 0,id,gloss,language,token_order,prefix,part_of_speech,stem,actual_phonology,model_phonology,suffix,...,target_child_name,target_child_age,target_child_sex,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,utterance_id
1,61164901,where,eng,1,,pro,where,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
2,61164902,do,eng,2,,mod,do,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
3,61164903,you,eng,3,,pro,you,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
4,61164904,want,eng,4,,v,want,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
5,61164905,me,eng,5,,pro,me,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814599,63103187,lick,eng,3,,v,lick,liʔ,lɪk,,...,William,39.822173,male,Eng-NA,21,328,22764,22764,42569,17280964
1814600,63103188,hippo,eng,4,,n,hippo,ɪ,hɪ,,...,William,39.822173,male,Eng-NA,21,328,22764,22764,42569,17280964
1814601,63103189,hippo,eng,1,,n,hippo,hɪpo,hɪpoʊ,,...,William,39.822173,male,Eng-NA,21,328,22764,22764,42569,17280992
1814604,63103192,la,eng,2,,co,la,lɑ,lɑː,,...,William,39.822173,male,Eng-NA,21,328,22764,22764,42569,17281030


In [16]:
# Accumulator for the sampled tokens; it starts empty and each sample_from call later in this cell extends it
all_samples = pd.DataFrame()

def sample_from(pos_df, all_samples, num_bins=None, max_pos_samples=None, random_state=None):
    """
    Draw a log-frequency-stratified sample of tokens and append it to ``all_samples``.

    Each token is annotated with the number of rows sharing its ``gloss`` within
    ``pos_df`` (``frequency``), the natural log of that count (``log_frequency``)
    and the index of the equal-width log-frequency bin it falls into
    (``log_frequency_bin``). Up to ``max_pos_samples / num_bins`` distinct tokens
    are then drawn from every non-empty bin.

    pos_df: DataFrame of tokens; must contain a ``gloss`` column
    all_samples: DataFrame of previously collected samples (may be empty or None)
    num_bins: number of log-frequency bins; defaults to the notebook constant NUM_BINS
    max_pos_samples: total sampling budget for this call, split evenly across the bins;
                     defaults to the notebook constant MAX_POS_SAMPLES
    random_state: optional seed (int) or numpy random state for reproducible draws;
                  None uses numpy's global random state

    returns ``all_samples`` with the newly sampled rows appended; ``all_samples`` itself
            is returned unchanged when ``pos_df`` has no rows
    """
    # Fall back to the notebook-level configuration when no override is given.
    if num_bins is None:
        num_bins = NUM_BINS
    if max_pos_samples is None:
        max_pos_samples = MAX_POS_SAMPLES

    # pd.cut cannot bin an empty column, so an empty group simply contributes nothing.
    if len(pos_df) == 0:
        return all_samples

    # Count type frequency within this POS and attach it to every token row.
    tokens = pos_df.reset_index(drop=True)
    tokens['frequency'] = tokens['gloss'].map(tokens['gloss'].value_counts())
    tokens['log_frequency'] = np.log(tokens['frequency'])

    # Assign each token to one of num_bins equal-width log-frequency bins (labelled 0..num_bins-1).
    tokens['log_frequency_bin'] = pd.cut(tokens['log_frequency'], num_bins, labels=list(range(num_bins)))

    samples_per_bin = int(max_pos_samples / num_bins)

    # Turn an integer seed into one shared generator so the bins do not all replay the same draw.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # Sample without replacement, capped at the bin size. Sampling with replacement and
    # de-duplicating afterwards would silently return fewer rows than the quota.
    sampled_bins = [
        bin_tokens.sample(n=min(len(bin_tokens), samples_per_bin), random_state=rng)
        for _, bin_tokens in tokens.groupby('log_frequency_bin', observed=True)
    ]
    if not sampled_bins:
        return all_samples

    logfreq_sampled_tokens = pd.concat(sampled_bins).reset_index(drop=True)
    logfreq_sampled_tokens = logfreq_sampled_tokens.drop_duplicates(keep='first')

    # Nothing collected yet: the new sample is the whole result.
    if all_samples is None or len(all_samples) == 0:
        return logfreq_sampled_tokens
    return pd.concat([all_samples, logfreq_sampled_tokens])

# Group by a scalar key (not a one-element list) so get_group can be called with plain POS strings.
pos_groupby = simplified_pos_tokens.groupby('part_of_speech')
available_pos = set(simplified_pos_tokens['part_of_speech'].unique())

# Parts of speech that each receive their own stratified sample.
iterator = ["n", "v", "adj", "mod", "adv", "part", "prep", "pro", "det"]
for name in iterator:
    if name not in available_pos:
        # get_group raises KeyError for an absent key; a corpus lacking this POS just contributes nothing.
        print("no tokens with part of speech '" + name + "'; skipping")
        continue
    pos_df = pos_groupby.get_group(name) # dataframe with only the tokens of this part of speech
    all_samples = sample_from(pos_df, all_samples)

# Contractions and possessives are the tokens whose POS was extended with "+<marker>".
# Match the "+" literally rather than through a regex with an invalid "\+" escape sequence.
contraction_df = simplified_pos_tokens[simplified_pos_tokens['part_of_speech'].str.contains("+", regex=False)]
if len(contraction_df) > 0:
    all_samples = sample_from(contraction_df, all_samples)

print(all_samples[["id", "gloss"]])


           id        gloss
0    61606126        kayak
1    62654815  Nasturtiums
2    62006551        curls
3    61949202        toads
4    61754789         Palm
..        ...          ...
245  62547206        gonna
246  62978642        who's
247  61208021         it's
248  62224254       that's
249  61594043       that's

[2347 rows x 2 columns]


In [17]:
# Draw the final sample from the pooled per-POS samples. Cap the request at the pool size:
# DataFrame.sample without replacement raises ValueError when n exceeds the number of rows.
num_final_samples = min(TOTAL_SAMPLES, len(all_samples))
thousand_samples = all_samples.sample(n=num_final_samples).reset_index(drop=True)
thousand_samples

Unnamed: 0,id,gloss,language,token_order,prefix,part_of_speech,stem,actual_phonology,model_phonology,suffix,...,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,utterance_id,frequency,log_frequency,log_frequency_bin
0,62438075,this,eng,1,,pro,this,,,,...,Eng-NA,21,328,22744,22743,42448,17084600,10570,9.265775,4
1,61402650,need,eng,3,,v,need,,,,...,Eng-NA,21,328,22707,22704,42252,16839687,2591,7.859799,4
2,62051390,anything,eng,4,,pro,anything,,,,...,Eng-NA,21,328,22729,22728,42378,16986849,407,6.008813,2
3,61453077,merrily,eng,4,,adv,merry,,,dadj LY,...,Eng-NA,21,328,22721,22720,42274,16851507,19,2.944439,1
4,61679240,stretching,eng,3,,part,stretch,,,PRESP,...,Eng-NA,21,328,22721,22720,42294,16923698,2,0.693147,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,62816244,largest,eng,4,,adj,large,,,SP,...,Eng-NA,21,328,22756,22755,42518,17177704,18,2.890372,1
996,61971711,picture's,eng,17,,n+cop,picture,,,,...,Eng-NA,21,328,22729,22728,42372,16979363,3,1.098612,0
997,61474947,onto,eng,9,,prep,onto,,,,...,Eng-NA,21,328,22721,22720,42286,16859093,111,4.709530,2
998,62551990,dumping,eng,2,,part,dump,dʌmpiŋ,dʌmpɪŋ,PRESP,...,Eng-NA,21,328,22743,22743,42433,17094010,17,2.833213,1


In [18]:
# Prefer a cached utterance CSV when one is configured and present; otherwise query childespy.
if UTTERANCE_CSV_NAME is not None and path.exists(UTTERANCE_CSV_NAME):
    # keep_default_na=False keeps blank cells as empty strings rather than NaN
    utterances = pd.read_csv(UTTERANCE_CSV_NAME, keep_default_na=False)
else:
    # Apply the same filters as the token query so both tables cover the same data;
    # passing only the corpus leaves the query unrestricted whenever CORPUS is None.
    utterances = childespy.get_utterances(collection=COLLECTION, language=LANGUAGE, corpus=CORPUS, role=ROLE)


In [22]:
# Only the utterance id and its text are needed to give each sampled token its context.
utterance_glosses = utterances.filter(items=['id', 'gloss'])
# Un-simplified POS tags, looked up by token id.
original_pos = all_tokens[['id', 'part_of_speech']]

# Attach the utterance to every sampled token; clashing utterance columns get the '_utterance' suffix.
tokens_with_utterance = thousand_samples.merge(
    utterance_glosses,
    how='inner',
    left_on='utterance_id',
    right_on='id',
    suffixes=(None, '_utterance'),
)
# Columns whose name contains 'unnamed' (any case) are discarded.
unnamed_mask = tokens_with_utterance.columns.str.contains('unnamed', case=False)

samples_with_context = (
    tokens_with_utterance.loc[:, ~unnamed_mask]
    .rename(columns={'gloss_utterance': 'utterance_gloss', 'id': 'token_id'})
    # drop the simplified POS so the original tag can be merged back in under the same name
    .drop(columns=['id_utterance', 'part_of_speech'])
    .merge(original_pos, how="left", left_on='token_id', right_on='id')
    .drop(columns=['id'])
)
samples_with_context


Unnamed: 0,token_id,gloss,language,token_order,prefix,stem,actual_phonology,model_phonology,suffix,num_morphemes,...,corpus_id,speaker_id,target_child_id,transcript_id,utterance_id,frequency,log_frequency,log_frequency_bin,utterance_gloss,part_of_speech
0,62438075,this,eng,1,,this,,,,1,...,328,22744,22743,42448,17084600,10570,9.265775,4,this one looks like she's driving a,pro:dem
1,61402650,need,eng,3,,need,,,,1,...,328,22707,22704,42252,16839687,2591,7.859799,4,now I need,v
2,62051390,anything,eng,4,,anything,,,,1,...,328,22729,22728,42378,16986849,407,6.008813,2,did we see anything else when we were there,pro:indef
3,61453077,merrily,eng,4,,merry,,,dadj LY,3,...,328,22721,22720,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv
4,61453075,merrily,eng,2,,merry,,,dadj LY,3,...,328,22721,22720,42274,16851507,19,2.944439,1,merrily merrily merrily merrily,adv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,62816244,largest,eng,4,,large,,,SP,2,...,328,22756,22755,42518,17177704,18,2.890372,1,one of the largest snakes is the giant anacond...,adj
996,61971711,picture's,eng,17,,picture,,,,2,...,328,22729,22728,42372,16979363,3,1.098612,0,okay let's put them all take them out and put ...,n
997,61474947,onto,eng,9,,onto,,,,1,...,328,22721,22720,42286,16859093,111,4.709530,2,it's a whole stack so Rusty's pushing Percy on...,prep
998,62551990,dumping,eng,2,,dump,dʌmpiŋ,dʌmpɪŋ,PRESP,2,...,328,22743,22743,42433,17094010,17,2.833213,1,it's dumping some dirt,part


In [23]:
# Persist the sampled tokens with their utterance context; the DataFrame index is written as the first column
samples_with_context.to_csv(FULL_SAMPLED_TOKENS_CSV_NAME)