In [1]:
# Directory prefix that is prepended to the treebank file paths used in the cells below.
ud_path = 'ud/'

In [2]:
def make_text_file(filename, outfile):
    '''Copy the sentence texts found in `filename` into `outfile`.

    Every line of `filename` that begins with the marker `# text = ` contributes
    the remainder of that line (line ending included) to `outfile`; all other
    lines are ignored. Both files are handled as UTF-8. The number of sentences
    written is printed once the input has been consumed. Returns None.
    '''
    prefix = '# text = '
    written = 0
    with open(filename, "r", encoding='utf8') as src, open(outfile, 'w', encoding='utf8') as dst:
        for raw in src:
            if not raw.startswith(prefix):
                continue  # not a sentence-text comment line
            dst.write(raw[len(prefix):])
            written += 1
    print(f'Wrote {written} sentences to file.')

In [3]:
# Write the `# text = ...` sentence lines of the English PUD test file to a plain-text file.
make_text_file(ud_path+'UD_English-PUD/en_pud-ud-test.conllu', 'ud-eng-pud-sents.txt')

Wrote 1000 sentences to file.


In [5]:
def read_conllu(filename):
    '''Read a CoNLL-U file and collect the POS tags of each sentence.

    Returns a dict of sentence_text: {'upos': [upos_tags...], 'xpos': [xpos_tags...]},
    where sentence_text is the sentence's token forms (column 2) joined by single
    spaces, 'upos' holds column 4 and 'xpos' holds column 5 of every token line.

    Lines starting with '#' (e.g. '# newdoc id = ...', '# sent_id = ...') are
    skipped. A blank line ends the current sentence; a last sentence that is not
    followed by a blank line is still returned. Blank lines with no pending
    tokens are ignored. Sentences with identical token sequences share one key,
    so the later one overwrites the earlier one.

    Prints the number of sentences collected.
    '''
    sentences = {}
    tokens, upos_tags, xpos_tags = [], [], []

    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            if line.startswith('#'):  # comment / metadata line
                continue

            columns = line.split()  # 10 cols on a token line
            if columns:
                tokens.append(columns[1])
                upos_tags.append(columns[3])
                xpos_tags.append(columns[4])
                continue

            # Blank line => the current sentence is complete.
            if tokens:
                sentences[' '.join(tokens)] = {'upos': upos_tags, 'xpos': xpos_tags}
                # Start fresh lists for the next sentence. The token list must be
                # reset too, otherwise every key would also contain all the
                # tokens of the preceding sentences.
                tokens, upos_tags, xpos_tags = [], [], []

    # The file may end without a trailing blank line; keep the pending sentence.
    if tokens:
        sentences[' '.join(tokens)] = {'upos': upos_tags, 'xpos': xpos_tags}

    print(f'Corpus contains {len(sentences)} sentences.')
    return sentences

In [6]:
# Load the UPOS/XPOS tags of the English PUD test file, keyed by sentence text.
ud_eng_pud = read_conllu(ud_path+'UD_English-PUD/en_pud-ud-test.conllu')


Corpus contains 1000 sentences.


In [9]:
import re
def read_sparv_xml(filename):
    '''Read the XML file from Sparv annotation and return the texts with
    their POS and MSD tags.

    The file is scanned line by line, looking only at the first three
    characters of each line:
      * '<se' starts a new sentence,
      * '<w ' is a token line carrying `pos="..."` and `msd="..."` attributes
        and the token text between '>' and '</w>',
      * '</s' ends the sentence and stores it.

    Returns a dict of sentence_text: {'pos': [pos_tags...], 'msd': [msd_tags...]},
    where sentence_text is the tokens joined by single spaces. Sentences with
    identical text share one key (the later one overwrites the earlier one).

    Raises IndexError if a '<w ' line lacks the token text or one of the two
    attributes. Prints the number of sentences collected.
    '''
    # Raw strings avoid invalid-escape warnings; `[^"]+` stops at the closing
    # quote of the attribute, so attributes that follow `msd` (or sit between
    # `pos` and `msd`) can no longer leak into the extracted value.
    token_re = re.compile(r'>(.+)</w>')
    pos_re = re.compile(r'pos="([^"]+)"')
    msd_re = re.compile(r'msd="([^"]+)"')

    sentences = {}
    # Initialised up front so a token or closing line that precedes the first
    # sentence-opening line does not hit unbound variables.
    tokens, pos_tags, msd_tags = [], [], []

    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            prefix = line[:3]

            if prefix == '<se':
                tokens, pos_tags, msd_tags = [], [], []

            elif prefix == '<w ':
                token = token_re.findall(line)[0]
                pos = pos_re.findall(line)[0]
                msd = msd_re.findall(line)[0]

                if msd[0] == 'F' and msd[1:].islower():
                    msd = token  # change tag to the punctuation itself

                # Split multiword PropN ("New_York") into one entry per word,
                # each carrying the same tags; any other token stays whole.
                parts = token.split('_') if pos == 'PROPN' else [token]
                tokens.extend(parts)
                pos_tags.extend([pos] * len(parts))
                msd_tags.extend([msd] * len(parts))

            elif prefix == '</s':
                sentences[" ".join(tokens)] = {'pos': pos_tags, 'msd': msd_tags}
                # Never let a stored sentence's lists be extended afterwards.
                tokens, pos_tags, msd_tags = [], [], []

    print(f'Parser output contains {len(sentences)} sentences.')
    return sentences


In [25]:
# Load the POS/MSD tags from the Sparv annotation of the sentences, keyed by sentence text.
sparv_output = read_sparv_xml('ud-eng-pud-parsed.xml')

# TODO: the Sparv parser outputs only 986 sentences; find a way to force it to output 1000?

Parser output contains 986 sentences.


In [124]:
def match(list1, list2):
    '''Count the positions at which `list1` and `list2` hold equal items.

    Lists of different lengths are treated as not comparable and yield 0.
    '''
    if len(list1) != len(list2):
        return 0
    # Each equal pair contributes 1 to the total.
    return sum(left == right for left, right in zip(list1, list2))
# Count the positions where the CoNLL-U XPOS tags and the Sparv MSD tags agree for one sentence.
# NOTE(review): `key1` is not defined in any visible cell -- presumably a sentence text
# present in both dicts; define it explicitly so this cell survives Restart & Run All.
match(ud_eng_pud[key1]['xpos'], sparv_output[key1]['msd'])

26

In [26]:
# Display the full Sparv result: sentence text -> {'pos': [...], 'msd': [...]}.
# NOTE(review): this dumps every sentence; consider showing only a few entries.
sparv_output

{'“ While much of the digital transition is unprecedented in the United States , the peaceful transition of power is not , ” Obama special assistant Kori Schulman wrote in a blog post Monday .': {'pos': ['PUNCT',
   'ADP',
   'ADV',
   'ADP',
   'DET',
   'ADJ',
   'NOUN',
   'VERB',
   'ADJ',
   'ADP',
   'DET',
   'PROPN',
   'PROPN',
   'PUNCT',
   'DET',
   'ADJ',
   'NOUN',
   'ADP',
   'NOUN',
   'VERB',
   'ADV',
   'PUNCT',
   'PUNCT',
   'PROPN',
   'ADJ',
   'NOUN',
   'PROPN',
   'PROPN',
   'VERB',
   'ADP',
   'DET',
   'NOUN',
   'NOUN',
   'NUM',
   'PUNCT'],
  'msd': ['“',
   'IN',
   'RB',
   'IN',
   'DT',
   'JJ',
   'NN',
   'VBZ',
   'JJ',
   'IN',
   'DT',
   'NP',
   'NP',
   ',',
   'DT',
   'JJ',
   'NN',
   'IN',
   'NN',
   'VBZ',
   'RB',
   ',',
   '”',
   'NP',
   'JJ',
   'NN',
   'NP',
   'NP',
   'VBD',
   'IN',
   'DT',
   'NN',
   'NN',
   'W',
   '.']},
 'For those who follow social media transitions on Capitol Hill , this will be a little different 