In [1]:
# NOTE(review): presumably the root folder of the UD treebank data; no cell
# visible in this notebook reads this variable, so confirm it is still needed.
ud_path = 'ud/'

In [2]:
def make_text_file(filename, outfile):
    """Extract the raw sentence text from a CoNLL-U file.

    Every line of `filename` that starts with '# text = ' contributes one
    line (the part after that marker) to `outfile`. Both files are handled
    as UTF-8. The number of extracted sentences is printed; nothing is
    returned.
    """
    marker = '# text = '
    written = 0
    with open(filename, "r", encoding='utf8') as src, open(outfile, 'w', encoding='utf8') as dst:
        for raw in src:
            if not raw.startswith(marker):
                continue
            # Keep the original line ending so each sentence stays on its own line.
            dst.write(raw[len(marker):])
            written += 1
    print(f'Wrote {written} sentences to file.')

In [118]:
# Extract the plain sentence text of every English domain corpus.
for domain in ('formal', 'literature', 'news'):
    make_text_file(f'conllu-files/en_{domain}.conllu', f'en_{domain}.txt')

Wrote 5243 sentences to file.
Wrote 4054 sentences to file.
Wrote 1000 sentences to file.


In [119]:
# Extract the plain sentence text of every Swedish domain corpus.
for domain in ('formal', 'literature', 'news'):
    make_text_file(f'conllu-files/sv_{domain}.conllu', f'sv_{domain}.txt')

Wrote 5243 sentences to file.
Wrote 4054 sentences to file.
Wrote 1000 sentences to file.


In [83]:
def read_conllu(filename):
    '''Read a CoNLL-U file into a dict keyed by each sentence's token tuple.

    Args:
        filename: path of a UTF-8 encoded CoNLL-U file.

    Returns:
        dict mapping ``tuple(FORM, ...)`` to
        ``{'upos': [...], 'xpos': [...], 'deprel': [(head_id, deprel), ...]}``.
        Sentences with identical token sequences share one key, so the later
        occurrence overwrites the earlier one.

    The number of distinct sentences is printed as a side effect.

    NOTE(review): multiword-token range rows (ID like ``1-2``) and empty
    nodes (``1.1``) are stored like ordinary rows - confirm whether they
    should be skipped.
    '''
    sentences = {}
    tokens, upos_tags, xpos_tags, deprels = [], [], [], []

    def store(tokens, upos_tags, xpos_tags, deprels):
        # One entry per sentence, keyed by the hashable token tuple.
        sentences[tuple(tokens)] = {
            'upos': upos_tags,
            'xpos': xpos_tags,
            'deprel': deprels,
        }

    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            if line.startswith('#'):  # skip '# newdoc id = ...', '# sent_id = ...', '# text = ...'
                continue

            if not line.strip():  # blank line => the current sentence is complete
                if tokens:  # ignore repeated/leading blank lines instead of storing ()
                    store(tokens, upos_tags, xpos_tags, deprels)
                    tokens, upos_tags, xpos_tags, deprels = [], [], [], []
                continue

            # CoNLL-U fields are TAB-separated and FORM/LEMMA may contain
            # spaces, so split on tabs; fall back to whitespace splitting
            # only for rows that contain no tab at all.
            if '\t' in line:
                columns = line.rstrip('\n').split('\t')  # 10 cols
            else:
                columns = line.split()

            tokens.append(columns[1])
            upos_tags.append(columns[3])
            xpos_tags.append(columns[4])
            deprels.append((columns[6], columns[7]))  # head id, deprel tag

    # Keep the last sentence even when the file lacks a trailing blank line.
    if tokens:
        store(tokens, upos_tags, xpos_tags, deprels)

    print(f'Corpus contains {len(sentences)} sentences.')
    return sentences

In [117]:
# Load the three English domain corpora (read in the order formal, literature, news).
eng_formal, eng_literature, eng_news = (
    read_conllu(f'conllu-files/en_{domain}.conllu')
    for domain in ('formal', 'literature', 'news')
)


Corpus contains 5227 sentences.
Corpus contains 4046 sentences.
Corpus contains 1000 sentences.


In [85]:
import re
def read_sparv_xml(filename):
    '''Read a Sparv annotation XML export into a dict keyed by token tuple.

    The file is scanned line by line: a line starting with ``<se`` opens a
    sentence, each line starting with ``<w `` contributes one token, and a
    line starting with ``</s`` stores the sentence.

    Args:
        filename: path of the UTF-8 encoded Sparv XML file.

    Returns:
        dict mapping ``tuple(token, ...)`` to
        ``{'pos': [...], 'msd': [...], 'deprel': [(head_id, deprel), ...]}``.
        Sentences with identical token sequences share one key.

    Raises:
        IndexError: if a ``<w `` line has no token text, ``pos`` or ``msd``
            in the layout the patterns below expect.

    The number of distinct sentences is printed as a side effect.
    '''
    # Raw strings: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    token_re = re.compile(r'(?<=>)[^\s]+(?=</w>)')
    pos_re = re.compile(r'(?<=pos=")[^\s]+(?="\sm)')  # pos must be followed by an attribute starting with 'm'
    msd_re = re.compile(r'(?<=msd=")[^\s]+(?="\s)')
    head_re = re.compile(r'(?<=dephead=")[^\s]*(?="\s)')  # may be empty
    deprel_re = re.compile(r'(?<=deprel=")[^\s]+(?=">)')  # deprel must be the last attribute

    sentences = {}

    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            tag = line[:3]

            if tag == '<se':
                # New sentence: start fresh accumulators.
                tokens = []
                pos_tags = []
                msd_tags = []
                deprels = []

            elif tag == '<w ':
                token = token_re.findall(line)[0]
                pos = pos_re.findall(line)[0]
                msd = msd_re.findall(line)[0]
                try:
                    head_id = head_re.findall(line)[0]
                    deprel_tag = deprel_re.findall(line)[0]
                except IndexError:
                    # No dependency annotation on this token.
                    head_id, deprel_tag = '_', '_'

                if msd[0] == 'F' and msd[1:].islower():
                    msd = token  # change tag to the punctuation itself

                if pos == 'PROPN' and len(token.split('_')) > 1:
                    # Split a multiword proper noun joined with '_' into its
                    # parts; every part inherits the annotation of the whole.
                    propn_tokens = token.split('_')
                    tokens.extend(propn_tokens)
                    pos_tags.extend([pos] * len(propn_tokens))
                    msd_tags.extend([msd] * len(propn_tokens))
                    deprels.extend([(head_id, deprel_tag)] * len(propn_tokens))
                else:
                    tokens.append(token)
                    pos_tags.append(pos)
                    msd_tags.append(msd)
                    deprels.append((head_id, deprel_tag))

            elif tag == '</s':
                # Sentence finished: store it under its token tuple.
                sentences[tuple(tokens)] = {
                    'pos': pos_tags,
                    'msd': msd_tags,
                    'deprel': deprels,
                }

    print(f'Parser output contains {len(sentences)} sentences.')
    return sentences


In [87]:
# Load Sparv's annotation of the English PUD text (judging by the file name).
sparv_output = read_sparv_xml('ud-eng-pud-parsed.xml')

Parser output contains 986 sentences.


In [88]:
sparv_output_sv = read_sparv_xml('korpus.xml')

# TODO: reading 'ud-eng-pud-parsed.xml' yields only 986 sentences; find a way to get all 1000. NOTE(review): read_sparv_xml keys sentences by their token tuple, so identical sentences collapse into one entry - check whether that explains part of the gap.

Parser output contains 22 sentences.


In [89]:
def match(list1, list2):
    """Return the fraction of positions at which two tag sequences agree.

    Args:
        list1: predicted tags; its length is the denominator.
        list2: reference tags.

    Returns:
        float in [0, 1]: matched positions divided by ``len(list1)``.
        Sequences of different length cannot be aligned position by
        position and score 0.0; an empty ``list1`` also scores 0.0 instead
        of raising ZeroDivisionError.
    """
    if not list1 or len(list1) != len(list2):
        return 0.0

    matched_count = sum(1 for a, b in zip(list1, list2) if a == b)
    return matched_count / len(list1)  # matched / num_tokens


In [90]:
# Sentences (token tuples) that occur in both the Sparv output and the gold corpus.
# NOTE(review): `ud_eng_pud` is not assigned in any cell visible in this notebook,
# so this raises NameError on a fresh kernel - presumably it is the gold PUD corpus
# loaded with read_conllu; confirm and add the defining cell.
overlaps = [sentence for sentence in sparv_output if sentence in ud_eng_pud]
len(overlaps)

674

In [91]:
# Inventory of POS tags that occur in the Sparv output.
sparv_pos_set = {tag for anno in sparv_output.values() for tag in anno['pos']}
print(sorted(sparv_pos_set))

# Inventory of UPOS tags that occur in the gold corpus.
corpus_pos_set = {tag for anno in ud_eng_pud.values() for tag in anno['upos']}
print(sorted(corpus_pos_set))

['ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'VERB']
['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


In [92]:
# Mean per-sentence POS accuracy over the sentences shared by both sources.
# Every key in `overlaps` exists in both dicts, so no KeyError guard is used:
# a missing 'pos'/'upos' field would be a reader bug and should surface.
# NOTE: the two tag inventories printed above differ (Sparv emits CONJ and has
# no AUX/CCONJ/SCONJ), so exact string matching counts those tokens as errors.
scores = [
    match(sparv_output[key]['pos'], ud_eng_pud[key]['upos'])
    for key in overlaps
]
score_accumulated = sum(scores)
eval_count = len(scores)
# Avoid ZeroDivisionError when the two sources share no sentence.
score_accumulated / eval_count if eval_count else 0.0

0.8296837289373824

In [102]:
def eng_to_swe_equivalent(eng_corpus, swe_corpus, out_fn):
    """Write the Swedish counterparts of the given English sentences.

    The files are processed pairwise. For each pair, every
    '# sent_id = en...' line of the English file is turned into the
    matching Swedish id by replacing the leading 'en' with 'sv'. Each
    sentence of the Swedish file whose '# sent_id = ' value is in that set
    is copied to `out_fn`, from its sent_id line up to its last token line,
    followed by one blank line.

    Args:
        eng_corpus: iterable of English CoNLL-U file paths.
        swe_corpus: iterable of Swedish CoNLL-U file paths, aligned with
            `eng_corpus` (extra entries on either side are ignored).
        out_fn: path of the CoNLL-U file to write (UTF-8, overwritten).
    """
    en_prefix = '# sent_id = en'  # e.g. '# sent_id = en_lines-ud-dev-doc2-3296'
    id_prefix = '# sent_id = '

    with open(out_fn, "w", encoding='utf8') as o:
        for eng_corpora, swe_corpora in zip(eng_corpus, swe_corpus):
            with open(eng_corpora, "r", encoding='utf8') as en, open(swe_corpora, "r", encoding='utf8') as sv:
                # A set gives constant-time membership tests.
                swe_sent_ids = {
                    'sv' + line[len(en_prefix):].rstrip('\n')
                    for line in en
                    if line.startswith(en_prefix)
                }
                sv_lines = [l.rstrip('\n') for l in sv]

            # enumerate() gives the position of *this* id line directly;
            # list.index() would rescan the file and always return the first
            # occurrence of an identical line.
            for start, line in enumerate(sv_lines):
                if not line.startswith(id_prefix):
                    continue
                if line[len(id_prefix):] not in swe_sent_ids:
                    continue

                i = start
                # Bounds check: the last sentence may not be followed by a blank line.
                while i < len(sv_lines) and sv_lines[i] != '':
                    o.write(sv_lines[i] + '\n')  # write sent_id ~ last token
                    i += 1
                o.write('\n')
        
            

In [103]:

# Build 'sv_literature.conllu': for each split (train/dev/test), copy the Swedish
# LinES sentences whose ids correspond to the English literature sentences.
eng_to_swe_equivalent([f'domains/literature/en_lines-ud-{x}.conllu' for x in ['train', 'dev', 'test']], 
                     [f'UD/UD_Swedish-LinES/sv_lines-ud-{x}.conllu' for x in ['train', 'dev', 'test']],
                     'sv_literature.conllu')

In [105]:
# Sanity check: number of distinct sentences (read_conllu also prints this count).
len(read_conllu('sv_literature.conllu'))

Corpus contains 4048 sentences.


4048

In [107]:
def merge_corpus(corpus, out_fn):
    """Concatenate several text files into one.

    The files listed in `corpus` are appended to `out_fn` in the given
    order, line by line and unchanged. All files are handled as UTF-8 and
    `out_fn` is overwritten.
    """
    with open(out_fn, "w", encoding='utf8') as merged:
        for part_fn in corpus:
            with open(part_fn, "r", encoding='utf8') as part:
                merged.writelines(part)
# Concatenate the train/dev/test splits of the English literature data into one file.
merge_corpus([f'domains/literature/en_lines-ud-{x}.conllu' for x in ['train', 'dev', 'test']],
            'en_literature.conllu')


In [108]:
# Sanity check: number of distinct sentences (read_conllu also prints this count).
len(read_conllu('en_literature.conllu'))

Corpus contains 4046 sentences.


4046

In [113]:
def formal_corpus(lines_fn, lit_fn, out_fn):
    """Write the sentences of `lines_fn` that do not occur in `lit_fn`.

    LinES - Literature = formal. Sentences are identified by their
    '# sent_id = ' value with the leading language code removed
    ('en_lines-ud-dev-doc2-3296' and 'sv_lines-ud-dev-doc2-3296' both become
    'lines-ud-dev-doc2-3296'), so the two files may be in the same language
    or in different ones.

    Args:
        lines_fn: CoNLL-U file holding the full corpus.
        lit_fn: CoNLL-U file holding the literature subset to remove.
        out_fn: path of the CoNLL-U file to write (UTF-8, overwritten).

    Each kept sentence is copied from its sent_id line up to its last token
    line, followed by one blank line.
    """
    id_prefix = '# sent_id = '

    def neutral_id(sent_id):
        # Drop everything up to the first '_' (the language code); ids
        # without '_' are compared as they are.
        _lang, sep, rest = sent_id.partition('_')
        return rest if sep else sent_id

    # Read both inputs before the output file is opened for writing.
    with open(lit_fn, "r", encoding='utf8') as lit:
        lit_ids = {
            neutral_id(line[len(id_prefix):].rstrip('\n'))
            for line in lit
            if line.startswith(id_prefix)
        }

    with open(lines_fn, "r", encoding='utf8') as lines:
        lines_l = [l.rstrip('\n') for l in lines]

    with open(out_fn, "w", encoding='utf8') as o:
        # enumerate() yields the position of this very id line; list.index()
        # would rescan the list and return the first identical line.
        for start, line in enumerate(lines_l):
            if not line.startswith(id_prefix):
                continue
            if neutral_id(line[len(id_prefix):]) in lit_ids:
                continue  # literature sentence: leave it out

            i = start
            # Bounds check: the last sentence may not be followed by a blank line.
            while i < len(lines_l) and lines_l[i] != '':
                o.write(lines_l[i] + '\n')  # write sent_id ~ last token
                i += 1
            o.write('\n')
                            
