In [1]:
import pandas as pd
import spacy
import numpy as np
import childespy
from os import path

In [20]:
# --- Sampling parameters -------------------------------------------------
# Number of tokens kept in the final random draw.
TOTAL_SAMPLES = 1000
# Upper bound on tokens drawn per part-of-speech group; it is split evenly
# across the log-frequency bins (MAX_POS_SAMPLES / NUM_BINS per bin).
MAX_POS_SAMPLES = 250
# Number of equal-width log-frequency bins used when stratifying tokens.
NUM_BINS = 5

# --- CHILDES query filters -----------------------------------------------
# These are passed to childespy when the tokens are not read from
# TOKEN_CSV_NAME (i.e. it is None or the file does not exist).
CORPUS = 'Providence'
COLLECTION = None
LANGUAGE = None
ROLE = None

# --- Optional cached inputs ----------------------------------------------
# Paths to previously exported CSVs; None means "query the database instead".
TOKEN_CSV_NAME = None
UTTERANCE_CSV_NAME = None

# --- Output --------------------------------------------------------------
# Destination CSV for the sampled tokens joined with their utterance text.
FULL_SAMPLED_TOKENS_CSV_NAME = 'sampled_full.csv'

In [4]:
# Read tokens from the local CSV when one is configured and present on disk;
# otherwise query CHILDES through childespy with the configured filters.
have_token_csv = TOKEN_CSV_NAME is not None and path.exists(TOKEN_CSV_NAME)
if not have_token_csv:
    all_tokens = childespy.get_tokens(
        collection=COLLECTION,
        language=LANGUAGE,
        corpus=CORPUS,
        role=ROLE,
        token="%",
    )
else:
    # keep_default_na=False keeps empty cells as "" instead of NaN, which the
    # string-based filters further down rely on.
    all_tokens = pd.read_csv(TOKEN_CSV_NAME, keep_default_na=False)


R[write to console]: Using current database version: '2020.1'.

R[write to console]: Getting data from 7 children in 1 corpus ...



In [6]:
# Work on a copy of the raw tokens and narrow it down step by step; each
# .loc filter sees only the rows that survived the previous one.
token_copy = (
    all_tokens.copy()
    # omit the empty POS
    .loc[lambda tokens: tokens['part_of_speech'].astype(bool)]
    # omit multiple-POS entries (their tag contains a space)
    .loc[lambda tokens: tokens['part_of_speech'].str.find(" ") == -1]
    # omit glosses containing the 'xxx' placeholder
    .loc[lambda tokens: ~tokens['gloss'].str.contains('xxx')]
    # omit glosses containing the 'yyy' placeholder
    .loc[lambda tokens: ~tokens['gloss'].str.contains('yyy')]
)

def simplify_pos(pos):
    '''
    Strip the subcategory from a part-of-speech tag.

    A tag with a colon and no space is cut at its first colon
    (e.g. "n:prop" -> "n"). Tags without a colon, or containing a space,
    are returned unchanged.
    '''
    if " " in pos or ":" not in pos:
        return pos
    main_category, _, _ = pos.partition(":")
    return main_category

def contraction_pos(part_of_speech, clitic, suffix):
    '''
    Extend a part-of-speech tag with a possessive or clitic marker.

    this function is very english-centric and childes-centric
    could incorporate another model, e.g. spacy, to find contractions

    Parameters:
        part_of_speech: base tag of the token (string).
        clitic: clitic annotation of the token; only its first
            whitespace-separated word is used. Empty, whitespace-only or
            non-string values (e.g. None / NaN) mean "no clitic".
        suffix: suffix annotation of the token.

    Returns:
        part_of_speech + "+poss" when suffix is exactly "dn POSS" or
        "dn AGT POSS"; otherwise part_of_speech + "+" + the first word of
        clitic, or part_of_speech unchanged when there is no clitic.
    '''
    if suffix == "dn POSS" or suffix == "dn AGT POSS":
        return part_of_speech + "+poss"

    # Split defensively: a blank clitic has no first word, and a missing
    # value is not a string at all, so both are treated as "no clitic"
    # instead of raising.
    clitic_words = clitic.split() if isinstance(clitic, str) else []
    if not clitic_words:
        return part_of_speech
    return part_of_speech + "+" + clitic_words[0]

def _pos_with_contraction(token_row):
    '''Return the row's POS tag extended by contraction_pos using its clitic and suffix.'''
    return contraction_pos(token_row['part_of_speech'], token_row['clitic'], token_row['suffix'])

# Step 1: collapse POS subcategories on a fresh copy of the filtered tokens.
simplified_pos_tokens = token_copy.assign(
    part_of_speech=token_copy['part_of_speech'].map(simplify_pos)
)
# Step 2: append the possessive / clitic marker, row by row.
simplified_pos_tokens['part_of_speech'] = simplified_pos_tokens.apply(_pos_with_contraction, axis=1)
simplified_pos_tokens


Unnamed: 0,id,gloss,language,token_order,prefix,part_of_speech,stem,actual_phonology,model_phonology,suffix,...,target_child_name,target_child_age,target_child_sex,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,utterance_id
1,61164901,where,eng,1,,pro,where,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
2,61164902,do,eng,2,,mod,do,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
3,61164903,you,eng,3,,pro,you,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
4,61164904,want,eng,4,,v,want,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
5,61164905,me,eng,5,,pro,me,,,,...,Alex,16.887410,male,Eng-NA,21,328,22708,22704,42204,16759250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814599,63103187,lick,eng,3,,v,lick,liʔ,lɪk,,...,William,39.822173,male,Eng-NA,21,328,22764,22764,42569,17280964
1814600,63103188,hippo,eng,4,,n,hippo,ɪ,hɪ,,...,William,39.822173,male,Eng-NA,21,328,22764,22764,42569,17280964
1814601,63103189,hippo,eng,1,,n,hippo,hɪpo,hɪpoʊ,,...,William,39.822173,male,Eng-NA,21,328,22764,22764,42569,17280992
1814604,63103192,la,eng,2,,co,la,lɑ,lɑː,,...,William,39.822173,male,Eng-NA,21,328,22764,22764,42569,17281030


In [9]:
all_samples = pd.DataFrame()

def sample_from(pos_df, all_samples, num_bins=None, max_pos_samples=None, random_state=None):
    '''
    Draw a log-frequency-stratified sample of tokens and append it to all_samples.

    Each token is annotated with the number of rows in pos_df sharing its
    gloss ('frequency'), the natural log of that count ('log_frequency') and
    the equal-width bin of the log count ('log_frequency_bin', labelled
    0 .. num_bins - 1). From every non-empty bin, int(max_pos_samples / num_bins)
    rows are drawn with replacement, and repeated draws of the same row are
    then dropped, so a bin may contribute fewer rows than requested.

    Parameters:
        pos_df: DataFrame of tokens with at least a 'gloss' column.
        all_samples: DataFrame of samples accumulated so far.
        num_bins: number of log-frequency bins; defaults to NUM_BINS.
        max_pos_samples: sampling budget for this call before
            de-duplication; defaults to MAX_POS_SAMPLES.
        random_state: optional seed (int) or numpy random state for
            reproducible draws; None keeps pandas' default randomness.

    Returns:
        all_samples with the newly sampled rows appended. When pos_df is
        empty, all_samples is returned unchanged.
    '''
    # pd.cut cannot bin an empty column, so there is nothing to sample.
    if len(pos_df) == 0:
        return all_samples

    if num_bins is None:
        num_bins = NUM_BINS
    if max_pos_samples is None:
        max_pos_samples = MAX_POS_SAMPLES

    # aggregate by type to count type frequency within POS
    type_counts = pos_df.groupby('gloss').size().rename('frequency').reset_index()
    tokens_with_frequency = pos_df.merge(type_counts, on='gloss')
    tokens_with_frequency['log_frequency'] = np.log(tokens_with_frequency['frequency'])

    # assign frequency bin to each token
    tokens_with_frequency['log_frequency_bin'] = pd.cut(
        tokens_with_frequency['log_frequency'], num_bins, labels=list(range(num_bins))
    )

    samples_per_bin = int(max_pos_samples / num_bins)

    # Turn an integer seed into one shared generator so that successive bins
    # get different draws; other values are handed to pandas untouched.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # observed=True skips bins that contain no tokens. Sampling group by group
    # keeps every column, including the bin label, in the result.
    per_bin_samples = [
        bin_tokens.sample(n=samples_per_bin, replace=True, random_state=rng)
        for _, bin_tokens in tokens_with_frequency.groupby('log_frequency_bin', observed=True)
    ]
    logfreq_sampled_tokens = (
        pd.concat(per_bin_samples)
        .reset_index(drop=True)
        .drop_duplicates(keep='first')
    )

    # Nothing accumulated yet: the new samples are the whole result.
    if all_samples.shape == (0, 0):
        return logfreq_sampled_tokens
    return pd.concat([all_samples, logfreq_sampled_tokens])

# Group by the plain column name (not a one-element list) so that
# get_group() can be called with a scalar tag.
pos_groupby = simplified_pos_tokens.groupby('part_of_speech')
iterator = ["n", "v", "adj", "mod", "adv", "part", "prep", "pro", "det"]
for name in iterator:
    if name not in pos_groupby.groups:
        # no token carries this tag; get_group() would raise KeyError
        continue
    pos_df = pos_groupby.get_group(name) # dataframe with only the tokens of this part of speech
    all_samples = sample_from(pos_df, all_samples)

# Tags produced by contraction_pos carry a "+"; match it literally rather
# than through a regex escape sequence.
contraction_df = simplified_pos_tokens[simplified_pos_tokens['part_of_speech'].str.contains("+", regex=False)]
if not contraction_df.empty:
    all_samples = sample_from(contraction_df, all_samples)

print(all_samples[["id", "gloss"]])


           id    gloss
0    62750920    mamas
1    62662987  grating
2    61799987  rockery
3    61648025     haul
4    61386720     PPAC
..        ...      ...
245  61395347  there's
246  62173036   what's
247  62493825    gonna
248  61844456    don't
249  62014613   what's

[2357 rows x 2 columns]


In [12]:
# Draw the final sample without replacement. Cap the size at the pool size,
# because DataFrame.sample raises ValueError when asked for more rows than exist.
n_final_samples = min(TOTAL_SAMPLES, len(all_samples))
thousand_samples = all_samples.sample(n=n_final_samples).reset_index(drop=True)
thousand_samples

Unnamed: 0,id,gloss,language,token_order,prefix,part_of_speech,stem,actual_phonology,model_phonology,suffix,...,collection_name,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,utterance_id,frequency,log_frequency,log_frequency_bin
0,62102927,see,eng,4,,v,see,,,,...,Eng-NA,21,328,22744,22743,42392,17011174,8831,9.086024,4
1,61333767,when,eng,3,,pro,when,,,,...,Eng-NA,21,328,22704,22704,42236,16810692,149,5.003946,2
2,62715979,the,eng,4,,det,the,,,,...,Eng-NA,21,328,22756,22755,42513,17152784,60451,11.009588,4
3,61461882,shall,eng,3,,mod,shall,,,,...,Eng-NA,21,328,22721,22720,42278,16849153,210,5.347108,2
4,62646188,needs,eng,2,,v,need,,,3S,...,Eng-NA,21,328,22756,22755,42497,17133390,456,6.122493,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,62689743,Dinah,eng,1,,n,Dinah,,,,...,Eng-NA,21,328,22756,22755,42506,17148056,15,2.708050,1
996,61439761,car,eng,3,,n,car,,,,...,Eng-NA,21,328,22721,22720,42269,16845998,1796,7.493317,4
997,61594706,zero,eng,14,,det,zero,,,,...,Eng-NA,21,328,22721,22720,42298,16890213,36,3.583519,1
998,62459454,shall,eng,1,,mod,shall,,,,...,Eng-NA,21,328,22744,22743,42444,17084032,210,5.347108,2


In [13]:
# Read utterances from the local CSV when one is configured and present on
# disk; otherwise query CHILDES through childespy.
have_utterance_csv = UTTERANCE_CSV_NAME is not None and path.exists(UTTERANCE_CSV_NAME)
if not have_utterance_csv:
    # NOTE(review): only CORPUS is forwarded here, whereas the token query
    # also passes collection, language and role -- confirm this is intended.
    utterances = childespy.get_utterances(corpus=CORPUS)
else:
    utterances = pd.read_csv(UTTERANCE_CSV_NAME, keep_default_na=False)


R[write to console]: Using current database version: '2020.1'.

R[write to console]: Getting data from 7 children in 1 corpus ...



In [16]:
# Only the utterance id (join key) and its text are needed from the utterance table.
utterance_glosses = utterances.filter(items=['id', 'gloss'], axis=1)

# Attach each sampled token's utterance; on a name clash the utterance-side
# columns get the '_utterance' suffix and the token-side names stay as they are.
tokens_joined = thousand_samples.merge(
    utterance_glosses,
    how='inner',
    left_on='utterance_id',
    right_on='id',
    suffixes=(None, '_utterance'),
)

# Columns whose name contains "unnamed" (any casing) are discarded.
unnamed_columns = tokens_joined.columns[tokens_joined.columns.str.contains('unnamed', case=False)]
samples_with_context = (
    tokens_joined
    .drop(columns=unnamed_columns)
    .rename(columns={'gloss_utterance': 'utterance_gloss', 'id': 'token_id'})
    # the utterance id is already available as 'utterance_id'
    .drop(columns=['id_utterance'])
)
samples_with_context


Unnamed: 0,token_id,gloss,language,token_order,prefix,part_of_speech,stem,actual_phonology,model_phonology,suffix,...,collection_id,corpus_id,speaker_id,target_child_id,transcript_id,utterance_id,frequency,log_frequency,log_frequency_bin,utterance_gloss
0,62102927,see,eng,4,,v,see,,,,...,21,328,22744,22743,42392,17011174,8831,9.086024,4,they like to see you
1,61333767,when,eng,3,,pro,when,,,,...,21,328,22704,22704,42236,16810692,149,5.003946,2,then that's when I'll play with you okay
2,62715979,the,eng,4,,det,the,,,,...,21,328,22756,22755,42513,17152784,60451,11.009588,4,it's best for the lamp+shade
3,61461882,shall,eng,3,,mod,shall,,,,...,21,328,22721,22720,42278,16849153,210,5.347108,2,forever you shall hold your banner
4,62646188,needs,eng,2,,v,need,,,3S,...,21,328,22756,22755,42497,17133390,456,6.122493,3,Frosty needs a tub huh
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,62689743,Dinah,eng,1,,n,Dinah,,,,...,21,328,22756,22755,42506,17148056,15,2.708050,1,Dinah won'tchu blow
996,61439761,car,eng,3,,n,car,,,,...,21,328,22721,22720,42269,16845998,1796,7.493317,4,in the car train
997,61594706,zero,eng,14,,det,zero,,,,...,21,328,22721,22720,42298,16890213,36,3.583519,1,there's the number eight there's the number tw...
998,62459454,shall,eng,1,,mod,shall,,,,...,21,328,22744,22743,42444,17084032,210,5.347108,2,shall we have a civilized breakfast together


In [21]:
# Write the sampled tokens with their utterance context to the configured CSV
# (default to_csv options, so the row index is written as well).
samples_with_context.to_csv(path_or_buf=FULL_SAMPLED_TOKENS_CSV_NAME)