## modules, globals, and functions

In [11]:
doreco_folder = '/home/barend/Google Drive/data/doreco_v2/' # replace by location of doreco corpus

import os
import re
import pandas as pd
import numpy as np
import json
from unidecode import unidecode
from collections import Counter, defaultdict

In [2]:
# this generates a master list

punct = '–"()¡!., ¿?%@#;\t'

def eliminate_asterisks(x):
    """Drop placeholder tokens from an iterable of strings.

    A token is removed when it consists solely of '*' and/or '#' characters or
    is exactly the marker '<p:>'. Empty strings are kept on purpose (the
    pattern would otherwise match them too).

    Parameters
    ----------
    x : iterable of str
        Tokens to filter; any iterable (list, map, ...) is accepted.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    placeholder = re.compile(r'^([\*#]*|\<p\:\>)$')
    kept = []
    for tok in x:
        if placeholder.match(tok) is None or tok == '':
            kept.append(tok)
    return kept

def filter_wd(x, doc):
    """Normalise a raw `wd` string, or return '' when it should be discarded.

    A word is discarded when it
      * contains any digit other than 3, or contains a 3 and `doc` is not
        'arap1274';
      * looks like a comment or truncation (starts with '((', '-' or '[' and
        later contains ']', '-' or '))');
      * still contains an inner space after stripping surrounding spaces.

    Otherwise the angle-bracket markup is unwrapped/removed, the characters in
    the module-level `punct` are stripped from both ends, '.', '-' and '*' are
    deleted, and one pair of parentheses is removed while keeping its content.

    Parameters
    ----------
    x : str
        Raw word form.
    doc : str
        Corpus identifier (only 'arap1274' is treated specially).

    Returns
    -------
    str
        The cleaned word, or '' if the word is rejected.
    """
    has_other_digit = re.search('[012456789]', x) is not None
    if has_other_digit or ('3' in x and doc != 'arap1274'):
        return ''
    # comments and truncations
    if re.search(r'^(\(\(|[-\[]).*([\]-]|\)\))', x) is not None:
        return ''
    if ' ' in x.strip(' '):
        return ''
    # Same pipeline as the former nested call, innermost step first.
    x = re.sub(r'\<\<.*\>(.+)\>', r'\1', x)    # <<tag>word>  ->  word
    x = re.sub(r'\<.*\>', '', x)               # drop any remaining <...>
    x = x.strip(punct)
    x = re.sub(r'[.\-*]', '', x)
    x = re.sub(r'(.*)\((.*)\)(.*)', r'\1\2\3', x)
    return re.sub(r'<<[a-z]+>(.*)>', r'\1', x)

def mb_gl_align(mb, gl, doc):
    """Align a morpheme list with a (longer) gloss list.

    Glosses at the left and right edge that look like affix/grammatical
    glosses (they start with '=', '-', an uppercase ASCII letter or a digit,
    or end in '-' or '=') are paired one-to-one with the morphemes at the same
    edge. Whatever remains in the middle is treated as the stem: the remaining
    morphemes and the remaining glosses are each joined with '.'.

    For doc == 'goro1270' the morphemes are first split before every '=' that
    follows a non-space character; if that alone makes the lengths equal, the
    split morphemes and the glosses are returned as lists.

    Parameters
    ----------
    mb : list of str
        Morphemes. Not modified.
    gl : list of str
        Glosses. Not modified.
    doc : str
        Corpus identifier.

    Returns
    -------
    (mb, gl)
        Two equally long tuples (or two lists in the goro1270 early return).
        The stem entry may be '' if every morpheme was consumed as an affix.
    """
    affix_gloss = r'^([=\-A-Z0-9].*|.*[-=])$'
    # Work on copies so the caller's lists are not emptied by the pops below.
    mb, gl = list(mb), list(gl)
    if doc == 'goro1270':
        # Keep the character preceding '=' (the bare pattern '[^\s]=' used to
        # swallow it, turning 'ka=b' into 'k', '=b').
        mb = [m for M in mb for m in re.sub(r'([^\s])=', r'\1 =', M).split(' ')]
        if len(mb) == len(gl): return mb, gl
    pfx, sfx = [], []
    while mb and gl and re.search(affix_gloss, gl[0]) is not None:
        pfx.append((mb.pop(0), gl.pop(0)))
    while mb and gl and re.search(affix_gloss, gl[-1]) is not None:
        sfx.append((mb.pop(), gl.pop()))
    # Suffixes were collected from the outside in; restore left-to-right order.
    sfx.reverse()
    stem = [('.'.join(mb), '.'.join(gl))]
    mb, gl = zip(*pfx + stem + sfx)
    return mb, gl

## morphological list generation

### character normalization

In [3]:
from unidecode import unidecode

# Mapping from two-character sequences to a single replacement character,
# consumed by bicharacter_transduction below. Read with a context manager so
# the file handle is closed right away.
with open('combined_character_transduction.json') as fin:
    bichar_transducer = json.load(fin)

def train_global_combining_character_transducer():
    """Build a digraph -> single-character mapping from the master lists.

    Every file in the global `outdir` whose name ends in 'master.tsv' is read
    (three tab-separated fields per line: word, segmentation, frequency).
    Each two-character sequence whose second character is a combining mark
    (code points 768..879) is counted over the distinct lower-cased
    words/segmentations, and the digraphs are then given replacement
    characters in order of decreasing count.

    The replacement is the first character below code point 767 that is
    lower-case, has not been seen in the data or handed out already, differs
    from the plain (unidecode) form of the base character, but has the same
    unidecode form. If none exists, the first unused character in 161..190 is
    taken; if that range is exhausted as well the digraph maps to None.

    NOTE(review): `outdir` is not defined in this part of the notebook; it
    must exist as a global (directory path ending in a separator) before this
    function is called.

    Returns
    -------
    dict
        Maps each two-character string to its replacement character (or None).
    """
    combining_char = set(range(768,880))
    char_seen = set()
    combining_digraph_ctr = Counter()
    for f in sorted(filter(lambda f : f.endswith('master.tsv'), os.listdir(outdir))):
        char_f, str_f = set(), set()
        # Context manager: the original left one open handle per master file.
        with open(outdir + f) as fin:
            for w,seg,freq in map(lambda x : x.split('\t'), fin):
                char_f |= set(w+seg.lower())
                str_f |= {w.lower(), seg.lower()}
        char_seen |= char_f
        # Only scan the strings of files that contain a combining mark at all.
        if any(ord(c) in combining_char for c in char_f):
            for s in str_f:
                for ci,c in enumerate(s[1:]):
                    if ord(c) in combining_char:
                        # s[ci] is the base character, s[ci+1] == c the mark.
                        combining_digraph_ctr[s[ci:ci+2]] += 1
    transducer = {}
    for c, v in combining_digraph_ctr.most_common():
        if len(c) != 2: print(c,v); continue
        ci,cj = c
        # Fallback candidate, computed first because it is the default of the
        # lookup below. `chain` was never imported and wrapped a single range
        # anyway, so the range is iterated directly.
        fallback = next((cand for cand in map(chr, range(161,191)) if cand not in char_seen), None)
        replacement = next((cand for cand in map(chr, range(0,767))
                            if cand.islower() and cand not in char_seen
                            and cand != unidecode(ci) and unidecode(cand) == unidecode(ci)),
                           fallback)
        char_seen.add(replacement)
        # `replacement` is None when no free character is left.
        transducer[c] = replacement
    return transducer
            
def bicharacter_transduction(string, bichar_transducer=bichar_transducer):
    """Replace known two-character sequences by their single-character code.

    The input is scanned left to right. Whenever the current character and
    the one after it form a key of `bichar_transducer`, both are consumed and
    the mapped value is emitted; otherwise the current character is copied.

    Parameters
    ----------
    string : str
        Text to convert.
    bichar_transducer : dict
        Maps two-character strings to replacements; defaults to the mapping
        loaded at module level.

    Returns
    -------
    str
        The converted text.
    """
    chars = list(string)
    total = len(chars)
    out = ''
    pos = 0
    while pos < total:
        current = chars[pos]
        if pos + 1 < total and current + chars[pos + 1] in bichar_transducer:
            out += bichar_transducer[current + chars[pos + 1]]
            pos += 2
        else:
            out += current
            pos += 1
    return out

BT = bicharacter_transduction

In [8]:
# Write the in-memory `bichar_transducer` back to the JSON file it is loaded
# from; ensure_ascii=False keeps non-ASCII characters readable in the file
# instead of \uXXXX escapes.
with open('combined_character_transduction.json', 'w') as fout:
    fout.write(json.dumps(bichar_transducer, ensure_ascii=False))

In [4]:
# Characters stripped from both ends of every cleaned element.
PUNCT_WD = '()_…:"—§”.,?!<>[]\':-='
# Corpora in which apostrophe-like characters are rewritten as 'ʔ'.
APOSTROPHE_GLOTTAL_LANGUAGES = [
    'arap1274', 'cabe1245', 'even1259', 'goem1240', 'kama1351', 'sanz1248',
    'movi1243', 'vera1241', 'urum1249', 'trin1278', 'taba1259',
]
# Corpora in which ':' is rewritten as 'ː'.
COLON_LENGTHENING_LANGUAGES = ['apah1238','movi1243']
# Code points of the apostrophe variants that get replaced.
APOSTROPHE = [42892, 39, 8217, 700]

def clean_elt(w, langname, typ = 'wd'):
    """Normalise one word, morpheme or gloss element.

    For elements of type 'wd' or 'mb', apostrophe variants become 'ʔ' when
    `langname` is in APOSTROPHE_GLOTTAL_LANGUAGES and ':' becomes 'ː' when it
    is in COLON_LENGTHENING_LANGUAGES. These replacements happen before the
    final strip, so a replaced character at the edge survives. Finally the
    PUNCT_WD characters are stripped from both ends, for every type.

    Parameters
    ----------
    w : str
        Element to clean.
    langname : str
        Corpus identifier.
    typ : str
        'wd', 'mb' or anything else (e.g. 'gl'); only 'wd' and 'mb' receive
        the character replacements.

    Returns
    -------
    str
        The cleaned element (possibly '').
    """
    if langname in APOSTROPHE_GLOTTAL_LANGUAGES and typ in {'wd','mb'}:
        w = w.translate({cid: 'ʔ' for cid in APOSTROPHE})
    if langname in COLON_LENGTHENING_LANGUAGES and typ in {'wd','mb'}:
        w = w.translate({ord(':'): 'ː'})
    return w.strip(PUNCT_WD)

### write master list

In [5]:
# FOR PS:
# Corpora named in any of the three lists below do not get their `ps` values
# copied into the master list; the master-list loop writes the placeholder 'X'
# for every morpheme of these corpora instead.
ignore_ps = ['apah1238', 'bain1259', 'cabe1245', 'cash1254', 'jeha1242', 'jeju1234','kama1351', 
             'nisv1234', 'orko1234','sumi1235','taba1259']
single_ps = ['dolg1241', 'nort2641', 'nort2875','sanz1248', 'savo1255', 'urum1249'] # all caps, parens, nums
further_check_ps = ['goro1270', 'kaka1265', 'komn1238', 'movi1243', 'ngal1292', 'port1286', 'teop1238', 'trin1278', 'vera1241']
# TODO: replace <Not Sure> by 'X', and 'Attaches to any ...' by something as well

In [6]:
# FOR MB/GL:
# TODO: for arap, follow the dashes instead of using the misalign strategy
# TODO: quality control of alignments: check the gl per mb and see whether it makes sense when there are multiple

# Corpora for which rows with fewer mb than gl elements are re-aligned with mb_gl_align.
misalign_docs_mb = ['apah1238', 'arap1274', 'bain1259', 'jeju1234', 'kama1351', 'nort2875', 'orko1234', 'sout2856', 'trin1278', 'urum1249']

In [12]:
# Build one master list per corpus folder in `doreco_folder`.
# For every row of doreco_<doc>_wd.csv the word (wd), morphemes (mb), glosses
# (gl) and ps values are cleaned and aligned; each distinct
# (word, ((mb, gl, ps), ...)) analysis is counted. Per corpus this writes
#   ./generated/doreco_morphology/<doc>.master.tsv   word <TAB> mb:gl_ps ... <TAB> count
#   ./generated/doreco_morphology/errors_<doc>.xlsx  the rejected rows and why
# Corpora that yield no analysis at all produce no files.
os.makedirs('./generated/doreco_morphology', exist_ok=True)

for f in sorted(os.listdir(doreco_folder)):
    if not os.path.isdir(doreco_folder + f): continue
    name_parts = f.split('_')
    if len(name_parts) < 2: continue  # not named like doreco_<doc>_...; nothing to read
    doc = name_parts[1]
    print(f, doc, end = ' ')
    sf = next((fi for fi in os.listdir(doreco_folder + f) if os.path.isdir(doreco_folder + f + '/' + fi)),None)
    if sf is None:
        # Skip only this corpus; `break` would silently drop all later ones.
        print('no data sub-folder, skipped')
        continue
    df = pd.read_csv(doreco_folder + f + '/' + sf + '/doreco_%s_wd.csv' % doc)
    print(len(df))
    errors = []
    markers = Counter()
    for _, r in df.iterrows():
        wd, mb, ps, gl = r['wd'], r['mb'], r['ps'], r['gl']
        if pd.isna(wd) or pd.isna(mb) or pd.isna(gl) or gl.strip('* -?') == '' or mb.strip('* -?') == '':
            errors.append({'category':'has nan', 'wd' : wd, 'mb' : mb, 'gl' : gl, 'ps' : ps})
            continue
        if pd.isna(ps): ps = ''
        #
        # WD
        wdn = clean_elt(filter_wd(wd, doc).lower(), doc, 'wd')
        if wdn == '':
            errors.append({'category':'no word', 'wd' : wd, 'mb' : mb, 'gl' : gl, 'ps' : ps})
            continue
        #
        # GL and MB
        if 'wordplay' in gl:
            errors.append({'category':'wordplay', 'wd' : wd, 'mb' : mb, 'gl' : gl, 'ps' : ps})
            continue
        mbn = list(filter(lambda x : x != '', map(lambda x : clean_elt(x.lower(), doc, 'mb'), eliminate_asterisks(mb.split(' ')))))
        gln = eliminate_asterisks(map(lambda x : clean_elt(x, doc, 'gl'), gl.split(' ')))
        if len(mbn) < len(gln) and doc in misalign_docs_mb:
            mbc, glc = mb_gl_align(mbn, gln, doc)
        else: mbc, glc = mbn, gln
        if len(mbc) == 0:
            errors.append({'category':'no mb', 'wd' : wd, 'mb' : mb, 'gl' : gl, 'ps' : ps})
            continue
        elif len(glc) == 0:
            errors.append({'category':'no gl', 'wd' : wd, 'mb' : mb, 'gl' : gl, 'ps' : ps})
            continue
        elif len(mbc) != len(glc):
            errors.append({'category':'unalignable', 'wd' : wd, 'mb' : mb, 'gl' : gl, 'ps' : ps})
            continue
        elif any(e == '' for e in list(mbc) + list(glc)):
            errors.append({'category':'unalignable', 'wd' : wd, 'mb' : mb, 'gl' : gl, 'ps' : ps})
            continue
        #
        # PS
        psn = eliminate_asterisks(ps.split(' '))
        ps_good = doc not in ignore_ps + further_check_ps + single_ps
        if ps_good:
            # zip() stops at the shortest input, so a row with fewer ps values
            # than morphemes (e.g. an empty ps cell) used to lose its trailing
            # morphemes. Pad with the placeholder 'X' to keep every morpheme.
            # Surplus ps values are still ignored, as before.
            ps_tags = psn + ['X'] * (len(mbc) - len(psn))
        else:
            ps_tags = ['X'] * len(mbc)
        marker = (wdn, tuple(zip(mbc, glc, ps_tags)))
        markers[marker] += 1

    print(doc, len(markers))
    if len(markers) == 0: continue
    pd.DataFrame(errors).to_excel('./generated/doreco_morphology/errors_%s.xlsx' % doc)
    with open('./generated/doreco_morphology/%s.master.tsv' % doc, 'w') as fout:
        # Most frequent analyses first.
        fout.write('\n'.join('%s\t%s\t%d' % (word, ' '.join('%s:%s_%s' % xi for xi in analysis), count)
                             for (word, analysis), count in sorted(markers.items(), key = lambda x : -x[1])))

doreco_anal1239_extended_v2.0 anal1239 19014
anal1239 0
doreco_apah1238_extended_v2.0 apah1238 20479
apah1238 3039
doreco_arap1274_extended_v2.0 arap1274 15541
arap1274 4667
doreco_bain1259_extended_v2.0 bain1259 18701
bain1259 3785
doreco_beja1238_extended_v2.0 beja1238 20134
beja1238 8534
doreco_bora1263_extended_v2.0 bora1263 48340


  df = pd.read_csv(doreco_folder + f + '/' + sf + '/doreco_%s_wd.csv' % doc)


bora1263 12740
doreco_cabe1245_extended_v2.1 cabe1245 24280
cabe1245 3727
doreco_cash1254_extended_v2.0 cash1254 13669
cash1254 2766
doreco_dolg1241_extended_v2.0 dolg1241 22301
dolg1241 6227
doreco_even1259_extended_v2.0 even1259 14609
even1259 5723
doreco_goem1240_extended_v2.0 goem1240 28957
goem1240 1680
doreco_goro1270_extended_v2.0 goro1270 26085
goro1270 5300
doreco_guri1247_extended_v2.0 guri1247 11127
guri1247 0
doreco_hoch1243_extended_v2.0 hoch1243 12523
hoch1243 2947
doreco_jeha1242_extended_v2.0 jeha1242 11411
jeha1242 991
doreco_jeju1234_extended_v2.0 jeju1234 13264
jeju1234 3853
doreco_kaka1265_extended_v2.0 kaka1265 52615
kaka1265 5167
doreco_kama1351_extended_v2.1 kama1351 47359
kama1351 5488
doreco_kark1256_extended_v2.0 kark1256 13787
kark1256 0
doreco_komn1238_extended_v2.0 komn1238 43515
komn1238 6651
doreco_ligh1234_extended_v2.0 ligh1234 11265
ligh1234 0
doreco_lowe1385_extended_v2.0 lowe1385 14727
lowe1385 0
doreco_movi1243_extended_v2.0 movi1243 14554
movi1243 

  df = pd.read_csv(doreco_folder + f + '/' + sf + '/doreco_%s_wd.csv' % doc)


sout3282 6818
doreco_stan1290_extended_v2.0 stan1290 16867
stan1290 0
doreco_sumi1235_extended_v2.0 sumi1235 16026
sumi1235 3607
doreco_svan1243_extended_v2.0 svan1243 13343
svan1243 0
doreco_taba1259_extended_v2.0 taba1259 6864
taba1259 2255
doreco_teop1238_extended_v2.0 teop1238 15341
teop1238 1705
doreco_texi1237_extended_v2.0 texi1237 14571
texi1237 2087
doreco_toto1304_extended_v2.0 toto1304 16837
toto1304 2818
doreco_trin1278_extended_v2.0 trin1278 24333
trin1278 5334
doreco_tsim1256_extended_v2.0 tsim1256 11581
tsim1256 0
doreco_urum1249_extended_v2.1 urum1249 26462
urum1249 6376
doreco_vera1241_extended_v2.0 vera1241 21663
vera1241 2723
doreco_warl1254_extended_v2.0 warl1254 11211
warl1254 0
doreco_yong1270_extended_v2.0 yong1270 10035
yong1270 0
doreco_yuca1254_extended_v2.0 yuca1254 14223
yuca1254 0
doreco_yura1255_extended_v2.0 yura1255 46439
yura1255 0


### bitext generation

In [28]:
# Build one bitext per corpus folder in `doreco_folder`.
# Rows of doreco_<doc>_wd.csv are grouped by `ref`; the cleaned words of a
# group form the source side and the group's `ft` value (split on whitespace,
# each token stripped of `punct`) forms the target side. Output:
#   ./generated/doreco_bitexts/<doc>.txt   one "source ||| target" line per ref
# makedirs also creates ./generated when this cell is run on its own.
os.makedirs('./generated/doreco_bitexts', exist_ok=True)

for f in sorted(os.listdir(doreco_folder)):
    if not os.path.isdir(doreco_folder + f): continue
    name_parts = f.split('_')
    if len(name_parts) < 2: continue  # not named like doreco_<doc>_...; nothing to read
    doc = name_parts[1]
    print(f, doc, end = ' ')
    sf = next((fi for fi in os.listdir(doreco_folder + f) if os.path.isdir(doreco_folder + f + '/' + fi)),None)
    if sf is None:
        # Skip only this corpus; `break` would silently drop all later ones.
        print('no data sub-folder, skipped')
        continue
    df = pd.read_csv(doreco_folder + f + '/' + sf + '/doreco_%s_wd.csv' % doc)
    bitext_builder = {}   # ref -> [source words, target tokens]
    ctr = Counter()       # frequency of each cleaned word; not read within this cell
    for _, r in df.iterrows():
        wd, ref, ft = r['wd'], r['ref'], r['ft']
        if pd.isna(wd) or ' ' in wd: continue
        wdn = clean_elt(BT(filter_wd(wd, doc)).lower(), doc)
        if wdn == '': continue
        ctr[wdn] += 1
        if pd.isna(ref) or pd.isna(ft): continue
        if re.match(r'\*+', ft): continue   # ft starting with asterisks is skipped
        #
        if ref not in bitext_builder:
            bitext_builder[ref] = [[], [x.strip(punct) for x in ft.split()]]
        #
        bitext_builder[ref][0].append(wdn)
    print('%s N lines = %d; N types = %d' %
          (doc, len(bitext_builder), len(set([w for l in bitext_builder.values() for w in l[0]]))))
    if len(bitext_builder) == 0: continue
    with open('./generated/doreco_bitexts/%s.txt' % doc, 'w') as fout:
        # Lines whose target side is empty after punctuation stripping are left out.
        fout.write('\n'.join('%s ||| %s' % (' '.join(v[0]), ' '.join(v[1])) for k,v in sorted(bitext_builder.items())
                             if len(v[0]) > 0 and len(v[1]) > 0 and len(''.join(v[1])) > 0))

doreco_anal1239_extended_v2.0 anal1239 anal1239 N lines = 2855; N types = 4043
doreco_apah1238_extended_v2.0 apah1238 apah1238 N lines = 3268; N types = 2062
doreco_arap1274_extended_v2.0 arap1274 arap1274 N lines = 3231; N types = 4751
doreco_bain1259_extended_v2.0 bain1259 bain1259 N lines = 2933; N types = 4583
doreco_beja1238_extended_v2.0 beja1238 beja1238 N lines = 6330; N types = 7241
doreco_bora1263_extended_v2.0 bora1263 

  df = pd.read_csv(doreco_folder + f + '/' + sf + '/doreco_%s_wd.csv' % doc)


bora1263 N lines = 4027; N types = 11211
doreco_cabe1245_extended_v2.1 cabe1245 cabe1245 N lines = 2058; N types = 2863
doreco_cash1254_extended_v2.0 cash1254 cash1254 N lines = 1767; N types = 3487
doreco_dolg1241_extended_v2.0 dolg1241 dolg1241 N lines = 2429; N types = 5847
doreco_even1259_extended_v2.0 even1259 even1259 N lines = 2312; N types = 5171
doreco_goem1240_extended_v2.0 goem1240 goem1240 N lines = 2606; N types = 1381
doreco_goro1270_extended_v2.0 goro1270 goro1270 N lines = 4333; N types = 4612
doreco_guri1247_extended_v2.0 guri1247 guri1247 N lines = 1326; N types = 2203
doreco_hoch1243_extended_v2.0 hoch1243 hoch1243 N lines = 972; N types = 2802
doreco_jeha1242_extended_v2.0 jeha1242 jeha1242 N lines = 1332; N types = 1358
doreco_jeju1234_extended_v2.0 jeju1234 jeju1234 N lines = 1973; N types = 3773
doreco_kaka1265_extended_v2.0 kaka1265 kaka1265 N lines = 4610; N types = 4608
doreco_kama1351_extended_v2.1 kama1351 kama1351 N lines = 9099; N types = 6422
doreco_kark1

  df = pd.read_csv(doreco_folder + f + '/' + sf + '/doreco_%s_wd.csv' % doc)


sout3282 N lines = 2281; N types = 3475
doreco_stan1290_extended_v2.0 stan1290 stan1290 N lines = 0; N types = 0
doreco_sumi1235_extended_v2.0 sumi1235 sumi1235 N lines = 2478; N types = 3375
doreco_svan1243_extended_v2.0 svan1243 svan1243 N lines = 1301; N types = 4586
doreco_taba1259_extended_v2.0 taba1259 taba1259 N lines = 632; N types = 1881
doreco_teop1238_extended_v2.0 teop1238 teop1238 N lines = 1989; N types = 1153
doreco_texi1237_extended_v2.0 texi1237 texi1237 N lines = 2824; N types = 1974
doreco_toto1304_extended_v2.0 toto1304 toto1304 N lines = 3697; N types = 2972
doreco_trin1278_extended_v2.0 trin1278 trin1278 N lines = 2007; N types = 5421
doreco_tsim1256_extended_v2.0 tsim1256 tsim1256 N lines = 1596; N types = 3185
doreco_urum1249_extended_v2.1 urum1249 urum1249 N lines = 2308; N types = 5772
doreco_vera1241_extended_v2.0 vera1241 vera1241 N lines = 1938; N types = 1801
doreco_warl1254_extended_v2.0 warl1254 warl1254 N lines = 2012; N types = 2222
doreco_yong1270_ext

## spacy English

In [29]:
# spaCy pipeline used below to annotate the English side of the bitexts.
# spacy.load raises if the 'en_core_web_sm' model package is not installed.
import spacy
nlp = spacy.load('en_core_web_sm')

In [30]:
# For every corpus listed in doreco_english_doculects.txt, run spaCy over the
# target side of ./generated/doreco_bitexts/<lg>.txt and write <lg>.spc, where
# every target token is encoded as text/index/lemma/pos/tag/dep/head-index
# ('/' and ' ' inside a field are replaced by '#').
# Files are read through context managers so no handles are left open.
with open('./doreco_english_doculects.txt') as fin:
    english_ft = [l.strip('\n') for l in fin]
for lg in english_ft:
    new_bit = []
    with open('./generated/doreco_bitexts/%s.txt' % lg) as fin:
        for li, l in enumerate(fin):
            sides = l.strip('\n').split(' ||| ')
            source, target = sides[0], sides[1]
            spacied = ['/'.join(map(lambda e : str(e).replace('/','#').replace(' ', '#'),
                                    [w.text, w.i, w.lemma_, w.pos_, w.tag_, w.dep_, w.head.i]))
                       for w in nlp(target)]
            new_bit.append((source, ' '.join(spacied)))
            if li % 500 == 0: print(lg, li, new_bit[-1])   # progress sample
    with open('./generated/doreco_bitexts/%s.spc' % lg, 'w') as fout:
        fout.write('\n'.join('%s ||| %s' % ln for ln in new_bit))

anal1239 0 ('anal jeral', 'Anal/0/anal/ADJ/JJ/amod/1 proverbs/1/proverb/NOUN/NNS/ROOT/1')
anal1239 500 ('vahmi khol thung tingnǟl eh din chee choo nu̠', 'There/0/there/PRON/EX/expl/1 are/1/be/VERB/VBP/ROOT/1 lots/2/lot/NOUN/NNS/attr/1 of/3/of/ADP/IN/prep/2 earthworms/4/earthworm/NOUN/NNS/pobj/3 inside/5/inside/ADP/IN/prep/4 her/6/her/PRON/PRP$/poss/7 eyeholes/7/eyehole/NOUN/NNS/pobj/5')
anal1239 1000 ('leha vanu̠ meeting toh vaṭo val je', 'he/0/he/PRON/PRP/nsubj/1 went/1/go/VERB/VBD/ROOT/1 far/2/far/ADV/RB/advmod/3 away/3/away/ADV/RB/advmod/1 and/4/and/CCONJ/CC/cc/1 had/5/have/VERB/VBD/conj/1 a/6/a/DET/DT/det/7 meeting/7/meeting/NOUN/NN/dobj/5')
anal1239 1500 ('sinnu̠ maṭumnu̠ kanali oh chara kǟṭhope naka ve datinu̠', 'you/0/you/PRON/PRP/nsubj/1 told/1/tell/VERB/VBD/ROOT/1 me/2/I/PRON/PRP/dobj/1 that/3/that/SCONJ/IN/mark/10 the/4/the/DET/DT/det/6 youngest/5/young/ADJ/JJS/amod/6 girl/6/girl/NOUN/NN/nsubj/10 you/7/you/PRON/PRP/nsubj/8 married/8/marry/VERB/VBD/relcl/6 will/9/will/AUX/M

In [26]:
# Print the rows of a two-column LaTeX table of the corpora listed in
# doreco_english_doculects.txt: "Language (glottocode, family, area: citation) & tokens".
with open('./doreco_english_doculects.txt') as fin:
    english_ft = [l.strip('\n') for l in fin]
# The metadata file sits in the corpus folder configured at the top of the notebook.
metadata = pd.read_csv(doreco_folder + 'doreco_languages_metadata_v2.0.csv')
cols = ['%s (%s, %s, %s: \\citealp{doreco-%s}) & %d' % (r.Language, r.Glottocode, r.Family, r.Area, r.Glottocode.lower(), r['Extended word tokens'])
  for i,r in metadata[metadata.Glottocode.isin(english_ft)].iterrows()]

# Left column gets the first half (rounded up), right column the rest. With an
# odd number of entries the last row gets an empty right-hand pair instead of
# the last language being dropped.
half = (len(cols) + 1) // 2
for i in range(half):
    right = cols[i + half] if i + half < len(cols) else ' & '
    print(cols[i] + '  &  ' + right + '\\\\')

Anal (anal1239, Sino-Tibetan, Eurasia: \citealp{doreco-anal1239}) & 14026  &  Nǁng (nngg1234, Tuu, Africa: \citealp{doreco-nngg1234}) & 27035\\
Yali (Apahapsili) (apah1238, Nuclear Trans New Guinea, Papunesia: \citealp{doreco-apah1238}) & 15243  &  Northern Kurdish (Kurmanji) (nort2641, Indo-European, Eurasia: \citealp{doreco-nort2641}) & 9657\\
Arapaho (arap1274, Algic, North America: \citealp{doreco-arap1274}) & 10279  &  Northern Alta (nort2875, Austronesian, Papunesia: \citealp{doreco-nort2875}) & 11137\\
Baïnounk Gubëeher (bain1259, Atlantic-Congo, Africa: \citealp{doreco-bain1259}) & 12522  &  Fanbyak (orko1234, Austronesian, Papunesia: \citealp{doreco-orko1234}) & 18928\\
Beja (beja1238, Afro-Asiatic, Africa: \citealp{doreco-beja1238}) & 15454  &  Pnar (pnar1238, Austroasiatic, Eurasia: \citealp{doreco-pnar1238}) & 20485\\
Cabécar (cabe1245, Chibchan, North America: \citealp{doreco-cabe1245}) & 17528  &  Daakie (port1286, Austronesian, Papunesia: \citealp{doreco-port1286}) & 118