In [1]:
import pandas as pd
import random
from tqdm import tqdm
from animacy_tagging import *

try:
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable

from IPython.display import clear_output, display

In [2]:
# Run configuration: `lang` is the language code used in the Wikipedia data
# paths, `subset` selects which token table / context directory is used.
lang, subset = 'de', 'all'

In [3]:
# Paths of the 20 numbered CoNLL-U shards (indices 0..19) for the configured language.
_conllu_template = '/home/echeng/morph_systems_entropy/wiki/{}/{}_{}_wikipedia.conllu'
fpaths = [_conllu_template.format(lang, lang, shard) for shard in range(20)]

In [5]:
def subsample_nouns_and_save(lang, subset, genders, numbers, sample_size=125, random_state=None):
    """Subsample masculine and feminine nouns attested in both numbers and save them.

    Reads ``{subset}_tokens_{lang}_wikipedia.csv`` and uses its ``number``,
    ``gender`` and ``lemma`` columns. For every value in ``numbers`` it keeps
    the 'Masc' and 'Fem' rows whose lemma also occurs with the opposite number
    ('Sing' <-> 'Plur'), samples up to ``sample_size`` rows per gender, and
    collects those rows together with their opposite-number counterparts.
    The de-duplicated union is written to
    ``{subset}_tokens_contextentropy_subsample_{lang}_wikipedia.csv``.

    Args:
        lang: Language code used in the input/output paths.
        subset: Subset name used in the input/output file names.
        genders: Gender values for which per-number row counts are printed.
            Sampling itself is always done for 'Masc' and 'Fem'.
        numbers: Number values to sample from (e.g. ``['Sing', 'Plur']``).
        sample_size: Maximum rows sampled per gender and number.
        random_state: Forwarded to ``DataFrame.sample``; pass an int for a
            reproducible subsample.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    filepath = f'/home/echeng/morph_systems_entropy/wiki/{lang}/{subset}_tokens_{lang}_wikipedia.csv'

    tokens_df = pd.read_csv(filepath)
    print(tokens_df.head(10))

    # One frame per requested grammatical number.
    number_dfs = {
        number: tokens_df[tokens_df['number'] == number] for number in numbers
    }
    dfs = []
    for number, number_df in number_dfs.items():
        # Report how many rows each requested gender has for this number.
        gender_dfs = {
            gender: number_df[number_df['gender'] == gender] for gender in genders
        }
        for gender in gender_dfs:
            print(f'{number} {gender}: {len(gender_dfs[gender])}')

        # Filter straight from number_df so that sampling does not depend on
        # 'Masc'/'Fem' being listed in `genders`.
        masc_df = number_df[number_df['gender'] == 'Masc']
        fem_df = number_df[number_df['gender'] == 'Fem']

        # Get their "other" number counterpart. Taken from the full table so it
        # is available even when `numbers` does not list the opposite number.
        other_number = 'Sing' if number == 'Plur' else 'Plur'
        other_number_df = tokens_df[tokens_df['number'] == other_number]
        masc_other_df = other_number_df[other_number_df['gender'] == 'Masc']
        fem_other_df = other_number_df[other_number_df['gender'] == 'Fem']

        # Cut dfs to lemmas existing in both plur/sing.
        masc_lemmas_both_plur_sing = set(masc_df['lemma']).intersection(set(masc_other_df['lemma']))
        fem_lemmas_both_plur_sing = set(fem_df['lemma']).intersection(set(fem_other_df['lemma']))

        masc_df = masc_df[masc_df['lemma'].isin(masc_lemmas_both_plur_sing)]
        fem_df = fem_df[fem_df['lemma'].isin(fem_lemmas_both_plur_sing)]

        # Sample df (never ask for more rows than exist).
        sampled_masc_df = masc_df.sample(n=min(sample_size, len(masc_df)), random_state=random_state)
        sampled_fem_df = fem_df.sample(n=min(sample_size, len(fem_df)), random_state=random_state)
        print(f'Sampled masc {len(sampled_masc_df)}, fem {len(sampled_fem_df)}')

        # Match to other side.
        sampled_masc_other_df = masc_other_df[masc_other_df['lemma'].isin(sampled_masc_df['lemma'])]
        sampled_fem_other_df = fem_other_df[fem_other_df['lemma'].isin(sampled_fem_df['lemma'])]
        print(f'Matched {other_number} masc: {len(sampled_masc_other_df)}. fem: {len(sampled_fem_other_df)}')

        dfs.append(sampled_masc_other_df)
        dfs.append(sampled_masc_df)
        dfs.append(sampled_fem_df)
        dfs.append(sampled_fem_other_df)

    subsampled_df = pd.concat(dfs)
    print('Number of nouns before dedup: ', len(subsampled_df))

    # The same row can be picked once directly and once as a counterpart.
    subsampled_df = subsampled_df.drop_duplicates()
    print('After dedup: ', len(subsampled_df))
    savepath = f'/home/echeng/morph_systems_entropy/wiki/{lang}/{subset}_tokens_contextentropy_subsample_{lang}_wikipedia.csv'
    subsampled_df.to_csv(savepath)

In [4]:
# Load the token table for the configured language/subset and discard the
# 'Unnamed: 0' column (presumably an index written by an earlier to_csv - confirm).
tokens_csv_path = f'/home/echeng/morph_systems_entropy/wiki/{lang}/{subset}_tokens_{lang}_wikipedia.csv'
tokens = pd.read_csv(tokens_csv_path).drop(columns='Unnamed: 0')

In [28]:
# For English
# Discard the per-form columns so only the remaining columns are kept.
_dropped_columns = ['text', 'animate', 'gender', 'count']
plur_and_sing_df = tokens.drop(columns=_dropped_columns)
plur_and_sing_df

Unnamed: 0,lemma,number
0,1000,Plur
1,1000,Sing
2,100,Plur
3,100,Sing
4,11,Plur
...,...,...
74027,pib,Plur
74028,gen,Plur
74029,iron,Plur
74030,pr,Sing


In [29]:
# Count, per lemma, how many rows of plur_and_sing_df carry it.
transparent_lemmas = plur_and_sing_df.groupby(by='lemma').count()

In [31]:
# Keep lemmas with exactly two rows.
# NOTE(review): a count of 2 does not by itself guarantee one 'Sing' and one
# 'Plur' row - confirm this matches the intended notion of "transparent".
_has_two_rows = transparent_lemmas['number'].eq(2)
transparent_lemmas = transparent_lemmas.loc[_has_two_rows]

In [39]:
# Draw the lemma subsample. The seed is fixed so that re-running the notebook
# selects the same lemmas (the subsample is saved to disk and drives the later
# context extraction), and the size is capped so the cell does not raise when
# fewer than 500 lemmas qualify.
_n_lemmas = min(500, len(transparent_lemmas))
transparent_lemmas_subset = set(transparent_lemmas.sample(n=_n_lemmas, random_state=0).index)
transparent_lemmas_subset

{'90',
 'acceptance',
 'accordance',
 'actuary',
 'adam',
 'adherence',
 'admittance',
 'adolescence',
 'adventurer',
 'advocacy',
 'aerosol',
 'affliction',
 'aggravator',
 'airbrake',
 'alarmist',
 'altimeter',
 'ammunition',
 'analysand',
 'anarchist',
 'anarchy',
 'annalist',
 'antiquarian',
 'aquarium',
 'aqueduct',
 'arb',
 'archer',
 'arietta',
 'aristocracy',
 'artificer',
 'artistry',
 'asker',
 'aster',
 'atheneum',
 'automatism',
 'bacchant',
 'balanitis',
 'baldpate',
 'ballcock',
 'barracuda',
 'basidiospore',
 'bastille',
 'batter',
 'beadle',
 'beginning',
 'bequest',
 'beth',
 'bicorn',
 'blimp',
 'bmus',
 'boater',
 'bodybuilder',
 'bole',
 'bounder',
 'bourse',
 'boutonniere',
 'bowline',
 'bowling',
 'brachyuran',
 'branding',
 'brassica',
 'budge',
 'butte',
 'buxus',
 'byway',
 'campsite',
 'cantonment',
 'carcinoid',
 'caret',
 'carrycot',
 'castrato',
 'catechism',
 'category',
 'cauliflower',
 'celandine',
 'cervid',
 'chandlery',
 'chanfron',
 'chico',
 'chief'

In [40]:
# Restrict the token table to rows whose lemma was drawn into the subsample.
_in_subsample = tokens['lemma'].isin(transparent_lemmas_subset)
tokens_subset = tokens.loc[_in_subsample]
tokens_subset

Unnamed: 0,text,lemma,gender,number,animate,count
93,90s,90,,Plur,False,8183
94,90s,90,,Sing,False,11
429,acceptance,acceptance,,Sing,False,24159
430,acceptances,acceptance,,Plur,False,144
483,accordance,accordance,,Sing,False,28110
...,...,...,...,...,...,...
73533,xivs,xiv,,Plur,False,6
73632,yardstick,yardstick,,Sing,False,307
73633,yardsticks,yardstick,,Plur,False,38
73962,zoo,zoo,,Sing,False,14005


In [41]:
# Persist the subsampled token rows for the configured language/subset.
_subsample_csv_path = f'/home/echeng/morph_systems_entropy/wiki/{lang}/{subset}_tokens_contextentropy_subsample_{lang}_wikipedia.csv'
tokens_subset.to_csv(_subsample_csv_path)

In [42]:
# Read the saved subsample back and keep its surface forms ('text' column)
# as the list of nouns whose contexts will be collected.
_noun_list_path = f'/home/echeng/morph_systems_entropy/wiki/{lang}/{subset}_tokens_contextentropy_subsample_{lang}_wikipedia.csv'
_noun_table = pd.read_csv(_noun_list_path)
noun_list = list(_noun_table['text'])

In [52]:
import os
# Directory that receives one context file per noun. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists test.
containing_directory = f'/home/echeng/morph_systems_entropy/wiki/contexts/{lang}/{subset}'
os.makedirs(containing_directory, exist_ok=True)

In [43]:
# Map each noun form to the text file its context windows are appended to.
savepaths = dict(
    (noun, f'/home/echeng/morph_systems_entropy/wiki/contexts/{lang}/{subset}/{noun}10_10.txt')
    for noun in noun_list
)
savepaths

{'90s': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/90s10_10.txt',
 'acceptance': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/acceptance10_10.txt',
 'acceptances': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/acceptances10_10.txt',
 'accordance': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/accordance10_10.txt',
 'accordances': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/accordances10_10.txt',
 'actuaries': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/actuaries10_10.txt',
 'actuary': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/actuary10_10.txt',
 'adam': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/adam10_10.txt',
 'adams': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/adams10_10.txt',
 'adherence': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/adherence10_10.txt',
 'adherences': '/home/echeng/morph_systems_entropy/wiki/contexts/en/all/adherences10_10.txt',
 'admitt

In [46]:
# Sliding-window setup: the context buffer holds 21 tokens at a time.
import time
from collections import deque

max_len = 21

In [None]:
# Stream each CoNLL-U shard token by token through a sliding window of
# `max_len` tokens. Whenever the token in the middle of a full window is one of
# the nouns of interest, the whole window is appended as one line to that
# noun's context file.
# NOTE(review): fpaths[2:] skips the first two shards - presumably they were
# processed in an earlier run; confirm before a fresh run.
center_index = max_len // 2  # middle slot of the window (10 when max_len is 21)
for fpath in fpaths[2:]:
    if not os.path.exists(fpath):
        continue
    # The window is reset per shard but not per sentence, so a window may span
    # a sentence boundary.
    buffer = deque(maxlen=max_len)
    with open(fpath, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip comment lines and blank separator lines. Only a leading '#'
            # marks a comment; a token row that merely contains '#' is kept.
            if line.startswith('#') or not line.strip():
                continue

            # Read new token
            split_line = line.split('\t')
            if len(split_line) < 4:
                continue  # malformed row: no form/POS columns to read
            text = split_line[1].lower()
            pos = split_line[3]

            # Rows without a POS tag are not added to the window.
            if pos == '_':
                continue

            # Add it to the deque
            buffer.append(text + '_' + pos)  # abacus_NOUN

            # Check if the center noun is in the subset of nouns of interest
            if len(buffer) < max_len:
                continue
            # Strip only the trailing '_POS' so that forms which themselves
            # contain an underscore are compared in full.
            center_word = buffer[center_index].rsplit('_', 1)[0]

            if center_word in savepaths:
                # Append the buffer to the relevant file
                with open(savepaths[center_word], 'a', encoding='utf-8') as w:
                    w.write(' '.join(buffer) + '\n')