In [1]:
import json
import re
from collections import defaultdict

In [20]:
# Load the dictionary: every non-blank line of the file is parsed as one
# standalone JSON object. The encoding is given explicitly because the entries
# contain non-ASCII text and the platform default is not guaranteed to be UTF-8.
with open('./data/dictionary.txt', 'r', encoding='utf-8') as f:
    # Iterate the file lazily and skip blank lines (e.g. a trailing newline),
    # which json.loads would otherwise reject.
    dictionary_arr = [json.loads(line) for line in f if line.strip()]

In [19]:
# Peek at the first parsed entry to see which keys an entry carries.
dictionary_arr[0]

{'headers': ['aba', 'abo'],
 'entry': '<heading><strong>aba</strong></heading>, <heading><strong>abo</strong></heading>, <lang>свн.</lang> <lex>abe</lex> (нар.) <em>вниз, прочь, далеко</em>; <lex>aba</lex>, <lex>abe</lex>, <lex>ab</lex> (предл. с дат.) <em>с, от</em>, <lang>свн.</lang> <lex>abe</lex>, <lex>ab</lex>; <lang>нн.</lang> <lex>ab</lex>; <lang>дс.</lang> <lang>ди.</lang> <lang>го.</lang> <lex>af</lex>, <lang>да.</lang> <lang>а.</lang> <lex>of</lex>.',
 'sources': [],
 'other words': ['aba', 'abe', 'of', 'af', 'ab'],
 'linked words': [],
 'linked entries': []}

In [21]:
# Key every dictionary entry by its position in dictionary_arr.
entries = dict(enumerate(dictionary_arr))

In [21]:
# Entry 0 of the positional index; it is the same object as dictionary_arr[0].
entries[0]

{'headers': ['aba', 'abo'],
 'entry': '<heading><strong>aba</strong></heading>, <heading><strong>abo</strong></heading>, <lang>свн.</lang> <lex>abe</lex> (нар.) <em>вниз, прочь, далеко</em>; <lex>aba</lex>, <lex>abe</lex>, <lex>ab</lex> (предл. с дат.) <em>с, от</em>, <lang>свн.</lang> <lex>abe</lex>, <lex>ab</lex>; <lang>нн.</lang> <lex>ab</lex>; <lang>дс.</lang> <lang>ди.</lang> <lang>го.</lang> <lex>af</lex>, <lang>да.</lang> <lang>а.</lang> <lex>of</lex>.',
 'sources': [],
 'other words': ['aba', 'abe', 'of', 'af', 'ab'],
 'linked words': [],
 'linked entries': []}

In [22]:
# Remove every <...> tag from the first entry to view its plain text.
re.sub('<.*?>','',dictionary_arr[0]['entry'])

'aba, abo, свн. abe (нар.) вниз, прочь, далеко; aba, abe, ab (предл. с дат.) с, от, свн. abe, ab; нн. ab; дс. ди. го. af, да. а. of.'

In [22]:
# Word form -> list of entry ids; defaultdict(list) lets the first append for a
# new word form create its list automatically.
maps = defaultdict(list)

In [7]:
# Display the current state of the index.
maps

defaultdict(list, {})

In [23]:
# Build the word form -> entry ids index.
# The index is rebuilt from scratch here so the cell can be re-run safely:
# appending into a previously filled `maps` would record every entry id twice.
maps = defaultdict(list)
for entry, record in entries.items():
    # A form may appear both as a header and among the other words; the set
    # ensures each entry id is recorded once per form.
    variants = {*record['headers'], *record['other words']}
    for var in variants:
        maps[var].append(entry)

In [24]:
# Convert to a plain dict so that looking up an unknown form with maps[...]
# raises KeyError instead of silently inserting an empty list.
maps = dict(maps)

In [25]:
# Spot-check: ids of the entries whose headers or other words include 'of'.
maps['of']

[0, 1393]

In [27]:
# Inspect entry 1393 to see where the form 'of' occurs in it.
entries[1393]

{'headers': ['ibu', 'ipu', 'ubi', 'obe'],
 'entry': '<heading><strong>ibu</strong></heading>, <heading><strong>ipu</strong></heading>, <heading><strong>ubi</strong></heading>, <heading><strong>obe</strong></heading>, <lang>свн.</lang> <lex variants="obe, ob">ob(e)</lex>, <lex>op</lex> (сз.) <em>если, как будто, хотя (и), ли</em>; <lang>нн.</lang> <lex>ob</lex>; <lang>дс.</lang> <lex>ef</lex>, <lex>of</lex>, <lang>ди.</lang> <lex>ef</lex>, <lang>го.</lang> <lex variants="ibai, iba">iba(i)</lex>.',
 'sources': [],
 'other words': ['ef', 'iba', 'obe', 'op', 'ob', 'ibai', 'of'],
 'linked words': [],
 'linked entries': []}

In [28]:
# Number of distinct word forms in the index.
len(maps)

10648

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
import string
# ASCII punctuation characters; text_to_json reads this module-level name when
# deciding whether a token is punctuation.
punkt = string.punctuation 
punkt

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
# Non-word tokens starting with one of these attach to the token that follows.
_OPENING_MARKS = ('(', '[', '{', '<', '“')
# Non-word tokens starting with one of these attach to the token that precedes.
_CLOSING_MARKS = (')', ']', '}', '>', '.', ':', ',', '?', '!', '”', '…')


def _classify_token(word, latin, rom, num, prefixes):
    """Build the token dict (form, type, language, dictionary hits) for one token."""
    if word.startswith(prefixes):
        return {'wf': word, 'wtype': 'nontext'}
    if word in rom:
        return {'wf': word, 'wtype': 'word', 'language': 'rom'}
    if word in latin:
        return {'wf': word, 'wtype': 'word', 'language': 'latin'}
    if word in num:
        return {'wf': word, 'wtype': 'num', 'language': 'OHG'}
    # Test every character: a plain `word in punkt` is a substring test on the
    # punctuation string, so tokens such as '...', '--' or "''" would fall
    # through and be tagged as OHG words.
    if all(char in punkt for char in word):
        return {'wf': word, 'wtype': 'punct'}
    token = {'wf': word, 'wtype': 'word', 'language': 'OHG'}
    if word in maps:
        # Ids of the dictionary entries that list this exact form.
        token['entry'] = maps[word]
    return token


def _assign_offsets(words):
    """Add offsets and positions to each token in place; return the sentence text."""
    text = ''
    for index, word in enumerate(words):
        form = word['wf']
        is_word = word['wtype'] == 'word'
        word['off_start'] = len(text)
        if not is_word and form.startswith(_OPENING_MARKS):
            # No space after an opening mark: the next token starts right after it.
            text += form
            word['off_end'] = len(text)
        else:
            if not is_word and form.startswith(_CLOSING_MARKS) and text.endswith(' '):
                # Drop the space left by the previous token so the mark sits
                # directly after it.
                word['off_start'] -= 1
                text = text[:-1]
            text += form + ' '
            # off_end excludes the separating space that was just appended.
            word['off_end'] = len(text) - 1
        word['sentence_index'] = index
        word['next_word'] = index + 1
    return text


def text_to_json(arr, latin=(), rom=(), num=(), t=()):
    """Tokenise sentences and describe every token as a dict.

    Each sentence is split with ``word_tokenize``. Tokens are classified in this
    order: prefix match on ``t`` ('nontext'), member of ``rom``, member of
    ``latin``, member of ``num``, punctuation-only token ('punct'), otherwise an
    OHG word. OHG words found in the module-level ``maps`` index also get an
    'entry' key holding the matching dictionary entry ids.

    The sentence text is rebuilt from the tokens, and every token receives
    'off_start'/'off_end' character offsets into that text plus
    'sentence_index' (its position) and 'next_word' (position + 1).

    Parameters
    ----------
    arr : iterable of str
        Sentences to process.
    latin, rom, num : container of str
        Token forms to tag as Latin words, 'rom' words and numerals.
        NOTE(review): membership is tested per token, so a multi-word value in
        these containers can never match - confirm the tagged spans are single
        tokens.
    t : str or iterable of str
        Prefix(es) marking non-text tokens. The empty tuple matches nothing.

    Returns
    -------
    list of dict
        One ``{'text': str, 'words': [token dict, ...]}`` per sentence.
    """
    # str.startswith only accepts a str or a tuple of str.
    prefixes = t if isinstance(t, (str, tuple)) else tuple(t)
    result = []
    for sentence in arr:
        words = [_classify_token(token, latin, rom, num, prefixes)
                 for token in word_tokenize(sentence)]
        text = _assign_offsets(words)
        result.append({'text': text, 'words': words})
    return result

In [37]:
def read_return_text(meta, sep = '\n', u = True):
    """Read one source text and split it into tag-free segments.

    Parameters
    ----------
    meta : sequence
        Manifest row; ``meta[0]`` is the file name under ``./ahd-texts/texts/``.
    sep : str
        Separator on which the text is split into segments.
    u : bool
        When true, runs of two or more newlines are collapsed to one before
        splitting. Pass False when ``sep`` itself is a blank line.

    Returns
    -------
    tuple
        ``(arr, latin, rom, num)``: the list of segments with all ``<...>`` tags
        removed, and the sets of strings found inside ``<latin>``, ``<rom>`` and
        ``<num>`` tags.
    """
    # The context manager closes the file; the encoding is explicit so the
    # result does not depend on the platform default.
    with open('./ahd-texts/texts/{}'.format(meta[0]), 'r', encoding='utf-8') as f:
        text = f.read()

    # Move a newline that directly follows an opening <latin> tag in front of
    # the tag (presumably so the tag pair stays on one line for the single-line
    # pattern below - TODO confirm).
    text = re.sub('<latin>\n','\n<latin>', text).strip()

    # Collect the tagged spans before the tags are stripped.
    latin = set(re.findall('<latin>(.*?)</latin>', text))
    rom = set(re.findall('<rom>(.*?)</rom>', text))
    num = set(re.findall('<num>(.*?)</num>', text))
    if u:
        text = re.sub('\n{2,}','\n', text)
    # Rewrite "Verse: 12" as "Verse_12".
    text = re.sub('(Verse): ([0-9]{1,3})',r'\1_\2', text)
    # Line breaks left inside a segment (possible when sep is '\n\n') become
    # spaces; deleting them would glue the last word of one line to the first
    # word of the next.
    arr = [re.sub('<.*?>','', segment).replace('\n',' ').strip()
           for segment in text.split(sep)]
    return arr, latin, rom, num

def preprocess_and_save(meta, arr, latin, rom, num, t):
    """Tokenise the segments of one text and write the result as JSON.

    Parameters
    ----------
    meta : sequence
        Manifest row: ``meta[0]`` is the output file name under
        ``./ready_texts/``, ``meta[1]`` the German title and ``meta[2]`` the
        Russian title.
    arr : list of str
        Segments to tokenise.
    latin, rom, num : container of str
        Token forms forwarded to ``text_to_json`` for language/numeral tagging.
    t : str or tuple of str
        Non-text prefix(es) forwarded to ``text_to_json``.
    """
    # Pass num and t in their own positions; forwarding only (latin, rom, t)
    # would put t in the num slot and drop both the numerals and the prefixes.
    sentences = text_to_json(arr, latin, rom, num, t)
    document = {'meta': {
        'german_title': meta[1],
        'russian_title':meta[2]
    }, 'sentences':sentences}
    # Open the file only after tokenisation succeeded, so a failure cannot
    # leave a truncated output behind. ensure_ascii=False writes non-ASCII
    # characters verbatim, hence the explicit UTF-8 encoding.
    with open('./ready_texts/{}'.format(meta[0]), 'w', encoding='utf-8') as f:
        json.dump(document, f, ensure_ascii=False)

In [7]:
# Read the manifest of texts as one string. The context manager closes the
# file, and the encoding is explicit because the titles are non-ASCII.
with open('./ahd-texts/ahd_texts.csv', 'r', encoding='utf-8') as f:
    total = f.read()

In [15]:
# One row per manifest line: [file name, German title, Russian title].
# splitlines() copes with either line-ending style, and blank lines (such as
# the one after a trailing newline) are skipped so no [''] row is produced.
# NOTE: this rebinds `total` from str to list, so the cell cannot be run twice
# without re-running the cell that reads the file.
total = [line.split(',') for line in total.splitlines() if line.strip()]
total

[['merseburger1.txt',
  'Der Erste Merseburger Zauberspruch',
  'Первое Мерзебургское заклинание'],
 ['merseburger2.txt',
  'Der Zweite Merseburger Zauberspruch',
  'Второе Мерзебургское заклинание'],
 ['hildebrandslied.txt', 'Das Hildebrandslied', 'Песнь о Хильтибранте'],
 ['wessobrunner.txt', 'Das Wessobrunner Gebet', 'Вессобруннская молитва'],
 ['isidor.txt', 'Der althochdeutsche Isidor', 'Древневерхненемецкий Исидор'],
 ['muspilli.txt', 'Muspilli', 'Mуспилли'],
 ['tatian.txt', 'Der althochdeutsche Tatian', 'Древневерхненемецкий Татиан'],
 ['eide.txt', 'Die Straßburger Eide ', 'Страсбургские клятвы'],
 ['evangelienbuch.txt', 'Otrfrids Evangelienbuch', 'Евангелие Отфрида'],
 ['ludwigslied.txt', 'Ludwigslied ', 'Песнь о Людвиге'],
 ['glossen.txt',
  'Kasseler Glossen (Kasseler Gespräche)',
  'Кассельские глоссы'],
 ['notker.txt',
  'Notker - Boethius Consolatio philosophiae',
  'Ноткер - Перевод Боэция - Утешение философией'],
 ['physyologus.txt', 'Der Ältere Physyologus', 'Старший Фи

# Texts

In [33]:
# Merseburger1 (manifest row 0): split on single newlines, no non-text prefixes.
k = 0
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [35]:
# Merseburger2 (manifest row 1): split on single newlines, no non-text prefixes.
k = 1
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [36]:
# Hildebrandslied (manifest row 2): split on single newlines, no non-text prefixes.
k = 2
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [38]:
# Wessobrunner (manifest row 3): split on blank lines; u=False keeps runs of
# newlines uncollapsed so the '\n\n' separator is still present when splitting.
k = 3
arr, latin, rom, num = read_return_text(total[k], sep = '\n\n', u=False)
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [None]:
# Isidor (manifest row 4): split on single newlines, no non-text prefixes.
k = 4
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [None]:
# Muspilli (manifest row 5): split on single newlines, no non-text prefixes.
k = 5
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [None]:
# Tatian (manifest row 6): split on single newlines, no non-text prefixes.
k = 6
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [31]:
# Eide (manifest row 7): split on blank lines; u=False keeps runs of newlines
# uncollapsed so the '\n\n' separator is still present when splitting.
k = 7
arr, latin, rom, num = read_return_text(total[k], sep = '\n\n', u = False)
# Use total[k] like the other text cells do, rather than a second hard-coded
# index that could drift from k.
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [32]:
# Evangelienbuch (manifest row 8): split on single newlines, no non-text prefixes.
k = 8
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [None]:
# Ludwigslied (manifest row 9): split on single newlines, no non-text prefixes.
k = 9
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [None]:
# Glossen (manifest row 10): split on single newlines, no non-text prefixes.
k = 10
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [None]:
# Notker (manifest row 11): split on single newlines, no non-text prefixes.
k = 11
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())

In [None]:
# Physyologus (manifest row 12): split on single newlines, no non-text prefixes.
k = 12
arr, latin, rom, num = read_return_text(total[k], sep = '\n')
preprocess_and_save(total[k], arr, latin, rom, num, t=())