In [1]:
# Base directory of the treebank files; later cells append the treebank sub-path to it.
ud_path = 'ud/'

In [2]:
def make_text_file(filename, outfile):
    '''Copy the sentence texts of an annotated file into a plain text file.

    Every line of `filename` that begins with the '# text = ' prefix has that
    prefix removed and the remainder (line ending included) is written to
    `outfile`. All other lines are ignored. The number of lines written is
    printed once both files are closed.
    '''
    prefix = '# text = '
    written = 0
    with open(filename, "r", encoding='utf8') as src, open(outfile, 'w', encoding='utf8') as dst:
        for raw_line in src:
            if not raw_line.startswith(prefix):
                continue
            dst.write(raw_line[len(prefix):])
            written += 1
    print(f'Wrote {written} sentences to file.')

In [3]:
# Write the '# text = ' sentence lines of the English PUD test file to a plain text file.
make_text_file(ud_path+'UD_English-PUD/en_pud-ud-test.conllu', 'ud-eng-pud-sents.txt')

Wrote 1000 sentences to file.


In [17]:
def read_conllu(filename):
    '''Read a CoNLL-U file and collect the POS tags of every sentence.

    Returns a dict of sentence_text: {'upos': [upos_tags...], 'xpos': [xpos_tags...]},
    where sentence_text is the sentence's word forms (2nd column) joined by
    single spaces and the tag lists hold the 4th and 5th column of each token
    line. Sentences with identical text share one key, so a later one replaces
    an earlier one. The number of stored sentences is printed before returning.
    '''
    sentences = {}
    tokens, upos_tags, xpos_tags = [], [], []

    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            if line.startswith('#'):  # skip '# newdoc id = n01001', '# sent_id = n01001011'
                continue

            if not line.strip():  # blank line => the current sentence is complete
                if tokens:  # repeated blank lines carry no sentence; do not store an empty one
                    sentences[' '.join(tokens)] = {'upos': upos_tags, 'xpos': xpos_tags}
                    tokens, upos_tags, xpos_tags = [], [], []  # start fresh lists for the next sentence
                continue

            # CoNLL-U columns are tab-separated and a field may itself contain
            # spaces, so split on tabs; whitespace splitting is only the
            # fallback for lines that contain no tab at all.
            if '\t' in line:
                columns = line.rstrip('\n').split('\t')
            else:
                columns = line.split()
            tokens.append(columns[1])
            upos_tags.append(columns[3])
            xpos_tags.append(columns[4])

    # A file that does not end with a blank line still has its last sentence pending.
    if tokens:
        sentences[' '.join(tokens)] = {'upos': upos_tags, 'xpos': xpos_tags}

    print(f'Corpus contains {len(sentences)} sentences.')
    return sentences

In [18]:
# Load the tags of the English PUD test file, keyed by sentence text.
ud_eng_pud = read_conllu(ud_path+'UD_English-PUD/en_pud-ud-test.conllu')


Corpus contains 1000 sentences.


In [54]:
# `re` is imported here so this cell also runs on a fresh kernel, where no
# earlier cell has imported it yet.
import re

# Sample <w> line used to check the msd-extraction pattern.
t = '<w pos="PM" msd="PM.NOM" lemma="|Ikea|" lex="|Ikea..pm.1|" sense="|IKEA..1:-1.000|Ikea..1:-1.000|" complemgram="|" compwf="|" ref="01" dephead="" deprel="SS">Ikea</w>'
# Raw string: "\s" in a plain string literal is an invalid escape sequence.
re.findall(r'(?<=msd=")[^\s]+(?="\s)', t)[0]

'PM.NOM'

In [55]:
import re
def read_sparv_xml(filename):
    '''Read the XML file from Sparv annotation and return its sentences.

    The file is processed line by line, looking only at the first three
    characters of each line:
      '<se'  starts a new sentence (fresh token/tag lists),
      '<w '  is one token whose text and pos/msd/dephead/deprel attributes
             are extracted with regular expressions,
      '</s'  stores the sentence collected so far.

    Returns a dict of sentence_text: {'pos': [...], 'msd': [...],
    'deprel': [(head_id, deprel_tag), ...]}, where sentence_text is the
    tokens joined by single spaces. Sentences with identical text share one
    key, so a later one replaces an earlier one. The number of stored
    sentences is printed before returning.

    Raises IndexError when a '<w ' line lacks a token text, a pos attribute
    directly followed by msd, or an msd attribute.
    '''
    # Raw strings keep "\s" a regex escape; written in a plain string literal
    # it is an invalid escape sequence that newer Python versions warn about.
    # The patterns are compiled once instead of being looked up for every line.
    token_re = re.compile(r'(?<=>)[^\s]+(?=</w>)')
    pos_re = re.compile(r'(?<=pos=")[^\s]+(?="\sm)')
    msd_re = re.compile(r'(?<=msd=")[^\s]+(?="\s)')
    head_re = re.compile(r'(?<=dephead=")[^\s]*(?="\s)')
    deprel_re = re.compile(r'(?<=deprel=")[^\s]+(?=">)')

    sentences = {}

    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            prefix = line[:3]

            if prefix == '<se':
                # New sentence: start fresh lists so stored sentences never share state.
                tokens = []
                pos_tags = []
                msd_tags = []
                deprels = []

            elif prefix == '<w ':
                token = token_re.findall(line)[0]
                pos = pos_re.findall(line)[0]
                msd = msd_re.findall(line)[0]
                try:
                    head_id = head_re.findall(line)[0]
                    deprel_tag = deprel_re.findall(line)[0]
                except IndexError:
                    # No usable dependency attributes on this token: use placeholders for both.
                    head_id, deprel_tag = '_', '_'

                if msd[0] == 'F' and msd[1:].islower():
                    msd = token  # change tag to the punctuation itself

                if pos == 'PROPN' and len(token.split('_')) > 1:  # split multiword PropN
                    # Every part of the multiword name inherits the tags of the whole token.
                    propn_tokens = token.split('_')
                    tokens.extend(propn_tokens)
                    pos_tags.extend([pos] * len(propn_tokens))
                    msd_tags.extend([msd] * len(propn_tokens))
                    deprels.extend([(head_id, deprel_tag)] * len(propn_tokens))
                else:
                    tokens.append(token)
                    pos_tags.append(pos)
                    msd_tags.append(msd)
                    deprels.append((head_id, deprel_tag))

            elif prefix == '</s':
                sentences[" ".join(tokens)] = {
                    'pos': pos_tags,
                    'msd': msd_tags,
                    'deprel': deprels,
                }

    print(f'Parser output contains {len(sentences)} sentences.')       
    return sentences


In [41]:
# Parse the Sparv XML; presumably the annotation of 'ud-eng-pud-sents.txt' — TODO confirm.
sparv_output = read_sparv_xml('ud-eng-pud-parsed.xml')

Parser output contains 986 sentences.


In [56]:
# Parse a second Sparv XML export, 'korpus.xml'.
sparv_output_sv = read_sparv_xml('korpus.xml')

# TODO: the Sparv parser outputs only 986 of the 1000 input sentences; find a way to make it output all 1000.

Parser output contains 22 sentences.


In [124]:
def match(list1, list2):
    '''Count the positions at which two sequences hold equal items.

    Returns 0 when the two sequences differ in length; otherwise returns the
    number of indices whose items compare equal.
    '''
    if len(list1) != len(list2):
        return 0
    return sum(1 for left, right in zip(list1, list2) if left == right)
# NOTE(review): `key1` is not defined in any cell visible here; define it before this
# call so the notebook survives Restart & Run All. It must be a key of both dicts.
match(ud_eng_pud[key1]['xpos'], sparv_output[key1]['msd'])

26

In [22]:
# Sentence texts that occur verbatim as keys in both the parser output and the treebank.
overlaps = [sentence for sentence in sparv_output if sentence in ud_eng_pud]
len(overlaps)

674

In [57]:
# Inspect the full parse result of 'korpus.xml' (sentence text -> 'pos' / 'msd' / 'deprel' lists).
sparv_output_sv

{'Ikea ( namnet är bildat av initialerna för Ingvar Kamprad Elmtaryd Agunnaryd ) är ett multinationellt möbelföretag som grundades 1943 av Ingvar Kamprad .': {'pos': ['PM',
   'PAD',
   'NN',
   'VB',
   'VB',
   'PP',
   'NN',
   'PP',
   'PM',
   'PM',
   'PM',
   'PM',
   'PAD',
   'VB',
   'DT',
   'JJ',
   'NN',
   'HP',
   'VB',
   'RG',
   'PP',
   'PM',
   'PM',
   'MAD'],
  'msd': ['PM.NOM',
   'PAD',
   'NN.NEU.SIN.DEF.NOM',
   'VB.PRS.AKT',
   'VB.SUP.AKT',
   'PP',
   'NN.UTR.PLU.DEF.NOM',
   'PP',
   'PM.NOM',
   'PM.NOM',
   'PM.NOM',
   'PM.NOM',
   'PAD',
   'VB.PRS.AKT',
   'DT.NEU.SIN.IND',
   'JJ.POS.NEU.SIN.IND.NOM',
   'NN.NEU.SIN.IND.NOM',
   'HP.-.-.-',
   'VB.PRT.SFO',
   'RG.NOM',
   'PP',
   'PM.NOM',
   'PM.NOM',
   'MAD'],
  'deprel': [('14', 'SS'),
   ('01', 'IR'),
   ('04', 'SS'),
   ('01', 'MS'),
   ('04', '+F'),
   ('05', 'AG'),
   ('06', 'PA'),
   ('07', 'ET'),
   ('08', 'PA'),
   ('09', 'HD'),
   ('09', 'HD'),
   ('09', 'HD'),
   ('01', 'JR'),
   ('', 