## Imports and globals

In [1]:
from collections import Counter, defaultdict
from itertools import chain
import os

from unidecode import unidecode
import pandas as pd
import re
import spacy

## Bible preprocessing code
See ENGWEB.csv for an example of a corpus file

In [27]:
def clean_word(w, do_unidecode=False):
    """
    Normalise a single word (string) w.

    Characters listed in the module-level ``punct`` string are stripped from
    both ends of the word, the result is lower-cased and any space characters
    are removed.

    If do_unidecode is true, the word is first lower-cased and passed through
    ``unidecode`` with ``errors='preserve'`` (presumably transliterating it to
    ASCII while keeping characters it cannot map -- confirm against the
    unidecode documentation) before the same clean-up is applied.

    Returns the cleaned string, which may be empty.
    """
    if not do_unidecode:
        return w.strip(punct).lower().replace(' ', '')
    transliterated = unidecode(w.lower(), errors='preserve')
    # lower-case again: the transliteration step may produce new characters
    return transliterated.strip(punct).lower().replace(' ', '')

def clean_sent(s):
    """
    Split the sentence s (a string) on whitespace, clean every token with
    clean_word and wrap it in '$' boundary markers.

    Tokens that are empty after cleaning (i.e. that would become '$$') are
    dropped. Returns the remaining marked tokens as a list of strings.
    """
    marked = ('$' + clean_word(token) + '$' for token in s.split())
    return [token for token in marked if token != '$$']

def read_in_bible(path):
    """
    Read a Bible translation from the csv file at path.

    The csv is expected to provide the columns book, chapter, verse and text.
    Rows whose text is not a string are skipped. Returns a dictionary mapping
    (book, chapter, verse) triples to the cleaned sentence (see clean_sent);
    verses whose cleaned sentence is empty are left out.
    """
    table = pd.read_csv(path)
    verses = {}
    for _, row in table.iterrows():
        if isinstance(row.text, str):
            verses[(row.book, row.chapter, row.verse)] = clean_sent(row.text)
    # filter in a second pass so that a later duplicate key still overrides an earlier one
    return {ref: words for ref, words in verses.items() if len(words) > 0}

In [49]:
def write_bitext(l, sbd):
    """
    Write the verse-aligned text file for the doculect l.

    The translation is read from ``bib_dir + '<l>.csv'`` with read_in_bible.
    For every verse key in sbd (iterated in order) one line is written to
    ``./bitexts/<l>.txt``: the cleaned words of that verse joined by spaces,
    or an empty line when the translation does not contain the verse. The
    output therefore has exactly one line per key of sbd.
    """
    tbd = read_in_bible(bib_dir + '%s.csv' % l)
    with open('./bitexts/%s.txt' % l, 'w') as fout:
        for v in sbd:
            # a missing verse is the only expected failure; an explicit
            # default avoids a bare except that would hide unrelated errors
            fout.write(' '.join(tbd.get(v, ())) + '\n')

## CELEX functions

In [47]:
def read_celex():
    """
    Load the two CELEX files located at emw_path and eml_path.

    Both files are read line by line and split on backslashes.

    Returns a pair (w_map, l_map):
      * w_map maps field 1 of every emw line to field 3 of that line
        (presumably word form -> lemma id; confirm against the CELEX docs);
      * l_map maps field 0 of every eml line with a non-empty field 21 to the
        list of '(form)[category]' chunks found in that field.
    """
    # context managers make sure both file handles are closed after reading
    with open(emw_path) as emw_file:
        emw = [line.strip('\n').split('\\') for line in emw_file]
    with open(eml_path) as eml_file:
        eml = [line.strip('\n').split('\\') for line in eml_file]

    w_map = {e[1]: e[3] for e in emw}
    l_map = {e[0]: re.findall(r'\(\w+\)\[.*?\]', e[21]) for e in eml if e[21] != ''}
    return w_map, l_map

In [29]:
def transform_celex_parse(p):
    """
    Convert a list p of '(form)[category]' chunks into 'form/TAG' strings.

    For each chunk the form is taken from the parentheses and the category
    from the square brackets, then:
      * a category containing 'x' is held back: its form is added to a buffer
        (forms joined by '_') and the buffer's category becomes the part
        before '|' when the part after '|' consists only of '.' and 'x',
        otherwise 'X';
      * any other category containing '|' is treated as 'X';
      * everything else keeps its category.
    Categories are translated through a fixed table of tag names. The
    buffered item, if any, is appended after all other items.

    Returns the list of 'form/TAG' strings. A category that is missing from
    the table raises KeyError.
    """
    tag_names = {'N' : 'NOUN', 'V' : 'VERB', 'P' : 'ADP', 'A' : 'ADJ', 'B' : 'ADV', 'Q' : 'NUM',
                 'O' : 'PRON', 'D' : 'DET', 'X' : 'AFX', 'C' : 'SCONJ'}
    finished = []
    held_forms = []
    held_cat = None
    for chunk in p:
        form = re.sub(r'\((\w+)\).*', r'\1', chunk)
        cat = re.sub(r'.*\[([^\]]+)\]', r'\1', chunk)
        if 'x' not in cat:
            tag = 'X' if '|' in cat else cat
            finished.append(form + '/' + tag_names[tag])
            continue
        # held back: emitted once, after the loop, as a single joined item
        held_forms.append(form)
        fields = cat.split('|')
        held_cat = fields[0] if set(fields[1]) == {'.', 'x'} else 'X'

    if held_forms:
        finished.append('_'.join(held_forms) + '/' + tag_names[held_cat])
    return finished

## Globals

In [None]:
# spaCy English pipeline; applied further down to every seed-Bible verse to obtain
# the lemma, POS tag and dependency head of each token.
nlp = spacy.load('en_core_web_sm')

# Characters that clean_word strips from both ends of a word (via str.strip), so
# only leading/trailing occurrences are removed. Some characters are listed more
# than once, which makes no difference to str.strip.
# NOTE(review): `\¡` is not a recognised escape sequence, so the backslash stays in
# the string literally; recent Python versions emit a warning for it.
punct = '.?!,"\':;()[]‘’“”–-«»\t\n\r!"#$%&\'()*+,-./:;<=>?[]^_`{|}~\¡£§©ª«¬´¶·»¿×ʻʼˆˇˈˉˊˋˍː˜ˬ˻˼̵̶̸̧̨̠̣̤̥̦̬̭̯̰̱̲̀́̂̃̄̆̇̈̌̍̓͏͘;·،؛؟‐‑–—―‘’‚‛“”„‟•…‸‹›⅓⅟ↄ→−≪≫⌊⌋⌞⌟、。〉「」『』【】〔〕ꓸꓹꓺꓻꓼꓽ꓾꓿꞉꞊ꞌ﹐﹕﹖﹗！（），－．：；＜＞？'
# Input locations used when reading the Bible csv files and the CELEX files;
# adjust them to the local setup.
bib_dir = '../../../../data/bibles_2021/' # replace by path to directory where bible files live
emw_path = './emw.cd' # path to CELEX file emw.cd
eml_path = './eml.cd' # path to CELEX file eml.cd

## Execution

In [46]:
# Book codes kept when the seed Bible is read; verses from any other book are skipped.
good_books = {
    '1CO', '1JN', '1PE', '1TH', '1TI',
    '2CO', '2JN', '2PE', '2TH', '2TI',
    '3JN', 'ACT', 'COL', 'EPH', 'GAL',
    'HEB', 'JAS', 'JHN', 'JUD', 'LUK',
    'MAT', 'MRK', 'PHM', 'PHP', 'REV',
    'ROM', 'TIT',
}

In [18]:
# Read the seed Bible (ENGWEB): keep only string-valued verses from the books in
# good_books, keyed by (book, chapter, verse), then parse every verse with spaCy.
sbd_full_raw = {}
for _, r in pd.read_csv(bib_dir + 'ENGWEB.csv').iterrows():
    if r.book in good_books and isinstance(r.text, str):
        sbd_full_raw[(r.book, r.chapter, r.verse)] = r.text

sbd_full_spc = {}
for ref, verse_text in sbd_full_raw.items():
    sbd_full_spc[ref] = nlp(verse_text)

In [37]:
# CELEX: for every lemma of the seed text that has a CELEX parse with more than
# one chunk, precompute the list of 'form/TAG' items that will replace it.
wmap, lmap = read_celex()
swords = Counter(tok.lemma_ for sent in sbd_full_spc.values() for tok in sent)
substitutions = {}
for lemma in swords:
    base = lemma.split('/')[0]
    if base not in wmap:
        continue
    chunks = lmap.get(wmap[base], ())
    if len(chunks) > 1:
        substitutions[lemma] = transform_celex_parse(chunks)

In [69]:
# Write out the seed doculect files, one line per seed verse and without
# punctuation tokens:
#   seed.txt - 'lemma/POS' items (or the CELEX substitution of the lemma)
#   seed.dep - the same items extended with '/<token index>/<head index>'
os.makedirs('./bitexts', exist_ok=True)

with open('./bitexts/seed.txt', 'w') as fout:
    for sent in sbd_full_spc.values():
        items = []
        for tok in sent:
            if tok.pos_ == 'PUNCT':
                continue
            if tok.lemma_ in substitutions:
                items.extend(substitutions[tok.lemma_])
            else:
                items.append(tok.lemma_+'/'+tok.pos_)
        fout.write(' '.join(items) + '\n')

with open('./bitexts/seed.dep', 'w') as fout:
    for sent in sbd_full_spc.values():
        items = []
        for tok in sent:
            if tok.pos_ == 'PUNCT':
                continue
            if tok.lemma_ in substitutions:
                # every substituted part inherits the index and head of the original token
                items.extend('%s/%s/%s' % (part, tok.i, tok.head.i)
                             for part in substitutions[tok.lemma_])
            else:
                items.append('/'.join(map(str, (tok.lemma_, tok.pos_, tok.i, tok.head.i))))
        fout.write(' '.join(items) + '\n')

In [56]:
# Write out the target doculects: one verse-aligned text file per language code
# listed in the `language` column of the (tab-separated) language sample.
doculects = pd.read_csv(
    './files/language_sample.csv',
    sep='\t',
).language.to_list()

for code in sorted(doculects):
    # progress indicator: codes are printed on a single line
    print(code, end = ' ')
    write_bitext(code, sbd_full_spc)

AAUWBT ACMAS3 ACUTBL AGGPNG AGMWBT ALYXXX AMFSIM AMKWBT AMMWBT AMNPNG AMPWBT AMRTBL AMUMVR AOJFIL ARLTBL AVAANT AVTWBT AZZTBL BBOBSM BDHWBT BFDWBT BIBWBT BKLLAI BOATBL BORWYI BRUNXB BSCWBT BVRXXX BVZYSS BYRWBT BYXWBT CABNVS CAPSBB CASNTM CAXSBB CBITBL CBTTBL CCOTBL CHEIBT CHRPDV CJPTJV CMEWBT CONWBT CRHIBT CRKWCV CRNWBT CRXWYI CSKATB CTGBSB DESWBT DIDWBT DIFXXX DJKWBT DTSABM DUDWYI ELLELL ESEE06 ESSWYI EUSNLT FRDWBT FUVLTBL GAHPNG GBILAI GHSPNG GRTBBS GUCTBL GUHWBT GUKBSE GUPXXX HADLAI HAKTHV HTOWBT HUNK90 HUVTBL HWCWYI IANPNG IBATIV IFBTBL INDASV IRKBST ITAR27 IZZTBL JAMBSW JAVNRF JBUIBS JICWBT KABCEB KBHWBT KERABT KFBNTA KGRLAI KHGNTV KHQBIV KIAWBT KMOWBT KMSPNG KNJSBI KORSYS KPVIBT KPWPNG KRLNEW KRSWYI KTOWBT KYCPNG LEFTBL LMEABT MAKLAI MBCWBT MCAWBT MDYBSE MEJTBL MFEBSM MFYWBT MHIBSU MHRIBT MIFWBT MILTBL MIQSBN MLPTBL MOPWBT MORBSS MPMTBL MPTWBT MSYPNG MTOTBL MWWHDV MZMWBT NABWBT NAFWBT NASPNG NHXNFB NIAIBS NIJLAI NLDHSV NOAWBT NTJXXX NTPTBL NUYXXX OPMTBL OTQTBL PADWBT PAMPBS PAONA

In [None]:
# Create morphologically segmented target language files.
# For every doculect, ./vorm/<doc>.lab supplies one line per word: the word, a tab
# and its space-separated morphemes. Each line of ./bitexts/<doc>.txt is rewritten
# morpheme by morpheme into ./bitexts/<doc>.<targtype>:
#   mrph - only morphemes whose label (the part after the first '/') is 'STM'
#   mrax - all morphemes
for targtype in ['mrph','mrax']:
    for doc in sorted(doculects):
        print(targtype, doc)
        # read the segmentation table inside a context manager so the file is closed
        morph = {}
        with open('./vorm/%s.lab' % doc) as lab_file:
            for entry in lab_file:
                fields = entry.strip('\n').split('\t')
                morph[fields[0]] = fields[1].split()
        with open('./bitexts/%s.%s' % (doc,targtype),'w') as fout:
            with open('./bitexts/%s.txt' % doc) as fin:
                for l1 in fin:
                    # words carry '$' boundary markers in the .txt file; the table does not
                    morphemes = chain.from_iterable(morph[x.strip('$')]
                                                    for x in l1.strip('\n').split())
                    if targtype == 'mrph':
                        stc = ' '.join(m for m in morphemes if m.split('/')[1] == 'STM')
                    elif targtype == 'mrax':
                        stc = ' '.join(morphemes)
                    fout.write(stc + '\n')