In [18]:
from collections import Counter, defaultdict
from bs4 import BeautifulSoup
import os

###  Создадим словарь морфологических обозначений

In [9]:
def open_file(path):
    """Read the UTF-8 text file at ``path`` and parse it with BeautifulSoup.

    The document is parsed with the ``'lxml'`` parser and the resulting
    soup object is returned.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return BeautifulSoup(handle.read(), 'lxml')

In [10]:
# Parse the corpus header file; the next cell reads the tag list from it.
soup = open_file('Формулы/xml/sagHdr.xml')

In [11]:
# Maps each entry's `id` attribute to the entry's text (`.string`).
CODE_POS = {}

# Walk the children of the 28th <list> element found in the header.
# NOTE(review): index 27 is hard-coded — presumably the list that holds the
# morphological tag set; confirm against sagHdr.xml.
for i in soup.find_all("list")[27]:
    # Skip the bare newline text nodes that sit between child elements;
    # every other child is expected to carry an `id` attribute.
    if i != '\n':
        CODE_POS[i.attrs['id']] = i.string

Определяем часть речи по коду

In [20]:
# Short tag for each part-of-speech name. POS() looks up the first word of a
# tag description in this table.
pos_short = {
    'unspecified': 'UNK',
    'numeral': 'NUM',
    'verb': 'V',
    'noun': 'N',
    'adjective': 'ADJ',
    'article': 'ART',
    'pronoun': 'PRO',
    'foreign-word': 'F',
    'conjunction': 'CONJ',
    'comparative-degree': 'COMP',
    'superlative-degree': 'SLAT',
    'preposition': 'PREP',
    'exclamation': 'EX',
    'abbreviation': 'ABB',
    'adverb': 'ADV',
}

In [14]:
def POS(code, CODE_POS):
    """Return the short part-of-speech tag for a morphological code.

    ``code`` is looked up in ``CODE_POS``; the first space-separated word of
    the stored description is then translated through ``pos_short``.
    If the code is missing (or its description is empty), the falsy lookup
    result is returned unchanged.
    """
    description = CODE_POS.get(code)
    if not description:
        return description

    head_word = description.split(' ')[0]
    return pos_short[head_word]

In [15]:
# Sanity check: look up the short tag for the code 'ao'.
POS('ao', CODE_POS)

'PREP'

### Открываем данные

In [12]:
# Directory whose files are listed and parsed by the loading loop below.
path = 'Формулы/xml'

In [67]:

def get_lemmas_from_xml(soup, word_lem_morph):
    """Extract per-sentence token lists from a parsed corpus document.

    Every ``<s>`` element is treated as a sentence. Its direct children are
    scanned: ``<w>`` elements are words (attributes ``type`` = morphological
    code, ``lemma`` = lemma) and ``<c>`` elements are copied verbatim into all
    four lists. Any other child is ignored.

    Args:
        soup: parsed document providing ``find_all("s")``.
        word_lem_morph: currently unused; kept so existing callers keep
            working.

    Returns:
        A 4-tuple of lists of sentences:
        ``data`` (lower-cased word forms), ``data_lemmas`` (lower-cased
        lemmas), ``data_morph`` (morphological codes) and ``data_all``
        (for each word, ``lemma_TAG`` followed by ``form_TAG``).
    """
    data, data_lemmas, data_morph, data_all = [], [], [], []

    for sent in soup.find_all("s"):

        forms, lemmas, codes, tagged = [], [], [], []

        for node in sent:

            if node.name == 'w':
                code = node.attrs['type']
                lemma = node.attrs['lemma'].lower()
                form = node.text.lower()

                # POS() returns None when the code is not in CODE_POS;
                # concatenating None would raise TypeError, so fall back to
                # the tag used for 'unspecified'.
                tag = POS(code, CODE_POS)
                if tag is None:
                    tag = 'UNK'

                forms.append(form)
                lemmas.append(lemma)
                codes.append(code)
                # Both the lemma and the surface form are recorded with the
                # tag, so each word contributes two entries here.
                tagged.append(lemma + '_' + tag)
                tagged.append(form + '_' + tag)

            elif node.name == 'c':
                forms.append(node.text)
                lemmas.append(node.text)
                codes.append(node.text)
                tagged.append(node.text)

        data.append(forms)
        data_lemmas.append(lemmas)
        data_morph.append(codes)
        data_all.append(tagged)

    return data, data_lemmas, data_morph, data_all


In [69]:

# Accumulators over the whole corpus (flattened across files and sentences),
# plus the per-file sentence structure in all_xmls.
all_xmls = []
all_words = []
all_lemmas = []
all_morphs = []
all_data = []
word_lem_morph = defaultdict(list)


# os.listdir() returns names in arbitrary order; sorting keeps the token
# order (and therefore the key order of the counters built later)
# reproducible between runs and machines.
for file_name in sorted(os.listdir(path)):

    # Skip the macOS metadata file and the header, which is not a text file
    # of the corpus.
    if file_name not in ('.DS_Store', 'sagHdr.xml'):

        p = os.path.join(path, file_name)
        # NOTE: this rebinds the global `soup`, which earlier held the header
        # document used to build CODE_POS.
        soup = open_file(p)
        data, data_lemmas, data_morph, data_all = get_lemmas_from_xml(soup, word_lem_morph)
        all_xmls.append([data, data_lemmas, data_morph])

        # Flatten the per-sentence lists of this file.
        all_w = [token for sentence in data for token in sentence]
        all_w_lem = [token for sentence in data_lemmas for token in sentence]
        all_w_mor = [token for sentence in data_morph for token in sentence]
        all_ = [token for sentence in data_all for token in sentence]

        all_words += all_w
        all_lemmas += all_w_lem
        all_morphs += all_w_mor
        all_data += all_

# COMPOUNDS

In [70]:
# Pool surface forms and lemmas, then turn raw counts into relative
# frequencies over the pooled token list.
all_ = [*all_words, *all_lemmas]
num = len(all_)

count = Counter(all_)
for word in list(count):
    count[word] /= num

In [71]:
# Relative frequency of every tagged entry ("lemma_TAG" / "form_TAG") and
# punctuation mark collected in all_data.
num = len(all_data)

count_pos = Counter(all_data)
for word in list(count_pos):
    count_pos[word] /= num

In [72]:
# Peek at the first ten keys of the tagged frequency table.
list(count_pos.keys())[:10]

['ófeigur_N',
 'heita_V',
 'hét_V',
 'maður_N',
 '.',
 'hann_PRO',
 'búa_V',
 'bjó_V',
 'norður_ADV',
 'í_PREP']

In [73]:
# Peek at the first ten keys of the untagged frequency table.
list(count.keys())[:10]

['ófeigur', 'hét', 'maður', '.', 'hann', 'bjó', 'norður', 'í', 'miðfirði', 'á']

In [76]:
# Relative frequency of 'maður' in the pooled forms + lemmas table.
count['maður']

0.0063796127421547

In [75]:
# Relative frequency of the tagged entry 'maður_N'.
count_pos['maður_N']

0.006730307456802969

In [31]:
def top_ngrams(text, num_n, top, all_=False):
    """Count the n-grams of a token sequence.

    Args:
        text: iterable of tokens (e.g. a list of words; a string yields
            character n-grams).
        num_n: size of each n-gram.
        top: how many of the most frequent n-grams to return when ``all_``
            is false.
        all_: when true, return the complete frequency table instead.

    Returns:
        A ``Counter`` mapping n-gram tuples to their counts if ``all_`` is
        true, otherwise a list of ``(ngram, count)`` pairs, most frequent
        first.
    """
    tokens = list(text)

    # Walk num_n staggered views of the token list in lockstep. zip() stops
    # at the shortest view, so only complete n-grams are produced.
    grams = zip(*(tokens[offset:] for offset in range(num_n)))
    gram_freq = Counter(grams)

    if all_:
        return gram_freq
    return gram_freq.most_common(top)

In [30]:
def add_values(beg_ngrams, ng, beg):
    """Copy the value stored for ``ng`` in ``beg_ngrams`` into ``beg``.

    A missing key, or a stored value of ``None``, is written as 0.
    ``beg`` is modified in place; nothing is returned.
    """
    found = beg_ngrams.get(ng)
    beg[ng] = 0 if found is None else found

In [120]:

def compound_part(word, count, end=False):
    """Pick the most frequent leading or trailing chunk of ``word``.

    Candidate chunks are generated for sizes 3..19:

    * ``end=False``: the first ``size`` characters of ``word``;
    * ``end=True``: the last ``size + 2`` characters (the extra two leave
      room for a two-character suffix such as ``_N``), clamped to the whole
      word when it is shorter.

    Each candidate is scored with ``count.get(candidate)`` (missing or
    ``None`` counts as 0). The candidate with the highest score is returned;
    ties are broken by taking the greatest string in ordinary string order.
    """
    scores = {}
    total = len(word)

    for size in range(3, 20):
        if end:
            start = max(total - size - 2, 0)
            chunk = word[start:]
        else:
            chunk = word[:size]

        freq = count.get(chunk)
        scores[chunk] = 0 if freq is None else freq

    best = max(scores.values())
    return max(chunk for chunk, freq in scores.items() if freq == best)
        


In [245]:
def compound_splitter(word, n):
    """Split a tagged compound word into ``n`` parts using corpus frequencies.

    The last part is chosen first, as the most frequent trailing chunk of
    ``word`` according to ``count_pos`` (tagged entries). The remaining
    prefix is then cut from the left, each time taking the most frequent
    leading chunk according to ``count``.

    Args:
        word: compound with a two-character tag suffix, e.g. ``'..._N'``.
            NOTE(review): both this function (``[:-2]``) and
            ``compound_part`` assume the suffix is exactly two characters;
            longer tags such as ``_ADV`` would not be stripped correctly.
        n: number of parts to split into; must be at least 2.

    Returns:
        List of parts without the tag suffix. For ``n > 2`` the list may be
        shorter than ``n`` (nothing left in front of the last part) or
        contain empty strings (prefix used up before ``n - 1`` cuts).

    Raises:
        ValueError: if ``n`` is smaller than 2. Previously this case fell
            through both branches and failed with UnboundLocalError.
    """
    if n < 2:
        raise ValueError('n must be at least 2, got %r' % (n,))

    # Last component: best-scoring tagged suffix of the whole word.
    tail = compound_part(word, count_pos, end=True)
    rest = word[:len(word) - len(tail)]

    if n == 2:
        # Exactly one leading part, taken even when the remainder is empty.
        return [compound_part(rest, count), tail[:-2]]

    parts = []
    if rest != '':
        # Cut n - 1 leading parts off the remainder, one at a time.
        for _ in range(n - 1):
            head = compound_part(rest, count)
            parts.append(head)
            rest = rest[len(head):]

    # Drop the two-character tag suffix from the last part.
    parts.append(tail[:-2])
    return parts
    

In [237]:
# Example: split a tagged compound into two parts.
compound_splitter('ólífismaður_N', 2)

['ólífis', 'maður']

In [238]:
# Example: three-part split.
compound_splitter('ólífisdóttirmaður_N', 3)

ólífisdóttir


['ólífis', 'dóttir', 'maður']

In [218]:
# Example: two-part split.
compound_splitter('höskuldsdóttir_N', 2) 

['höskulds', 'dóttir']

In [233]:
# Example: two-part split where the first part is a Cyrillic string.
compound_splitter('мамаdóttir_N', 2) 

['мама', 'dóttir']

In [248]:
# Example: three-part split.
compound_splitter('langskipamaður_N', 3) 

['langskip', 'a', 'maður']