In [124]:
def make_text_file(filename, outfile, limit=1000):
    """Copy the raw sentence text out of a CoNLL-U file, one sentence per line.

    Every line of `filename` that begins with '# text = ' contributes the
    remainder of that line (including its line break) to `outfile`.
    Copying stops as soon as `limit` sentences have been written, and the
    number of sentences written is printed afterwards.
    """
    marker = '# text = '
    written = 0
    with open(filename, "r", encoding='utf8') as src, \
            open(outfile, 'w', encoding='utf8') as dst:
        for raw in src:
            if not raw.startswith(marker):
                continue  # not a sentence-text comment line
            dst.write(raw[len(marker):])
            written += 1
            if written == limit:
                break
    print(f'Wrote {written} sentences to file.')

In [125]:
# Extract the raw sentence text of each English domain corpus into a plain-text file
# (at most 1000 sentences per file, the default `limit` of make_text_file).
make_text_file('conllu-files/en_formal.conllu', 'en_formal.txt')
make_text_file('conllu-files/en_literature.conllu', 'en_literature.txt')
make_text_file('conllu-files/en_news.conllu', 'en_news.txt')

Wrote 1000 sentences to file.
Wrote 1000 sentences to file.
Wrote 1000 sentences to file.


In [126]:
# Extract the raw sentence text of each Swedish domain corpus into a plain-text file
# (at most 1000 sentences per file, the default `limit` of make_text_file).
make_text_file('conllu-files/sv_formal.conllu', 'sv_formal.txt')
make_text_file('conllu-files/sv_literature.conllu', 'sv_literature.txt')
make_text_file('conllu-files/sv_news.conllu', 'sv_news.txt')

Wrote 1000 sentences to file.
Wrote 1000 sentences to file.
Wrote 1000 sentences to file.


In [186]:
def read_conllu(filename, limit=1000):
    '''Read up to `limit` sentences from a CoNLL-U file.

    Args:
        filename: Path to a UTF-8 encoded CoNLL-U file.
        limit: Maximum number of distinct sentences to collect.

    Returns:
        Dict keyed by the tuple of sentence tokens (second column):
            {(sent_tokens...): {'upos': [upos_tags...],
                                'xpos': [xpos_tags...],
                                'deprel': ['<head id>_<deprel tag>', ...]}}
        Sentences with identical token sequences share one key, so a later
        occurrence overwrites an earlier one.
    '''
    sentences = {}
    tokens, upos_tags, xpos_tags, deprels = [], [], [], []

    def _flush():
        # Store copies of the buffers as one finished sentence, then reset them.
        sentences[tuple(tokens)] = {
            'upos': list(upos_tags),
            'xpos': list(xpos_tags),
            'deprel': list(deprels),
        }
        for buffer in (tokens, upos_tags, xpos_tags, deprels):
            buffer.clear()

    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            if line.startswith('#'):  # skip '# newdoc id = n01001', '# sent_id = n01001011'
                continue

            if not line.strip():  # blank line => the current sentence is complete
                if tokens:  # ignore repeated blank lines instead of storing an empty sentence
                    _flush()
                    if len(sentences) >= limit:
                        break
                continue

            # CoNLL-U columns are tab separated and a token may itself contain
            # a space, so split on tabs; fall back to whitespace splitting for
            # files that were not written with tabs.
            if '\t' in line:
                columns = line.rstrip('\n').split('\t')
            else:
                columns = line.split()

            tokens.append(columns[1])
            upos_tags.append(columns[3])
            xpos_tags.append(columns[4])
            deprels.append(columns[6] + '_' + columns[7])  # head id, deprel tag
        else:
            # End of file reached without hitting the limit: keep a final
            # sentence that is not followed by a blank line.
            if tokens and len(sentences) < limit:
                _flush()

    print(f'Corpus contains {len(sentences)} sentences.')
    return sentences

In [187]:
# Load the reference annotations of every domain corpus (English and Swedish).
# Each call reads at most 1000 sentences (the default `limit` of read_conllu).
# key = tuple of tokens of a sentence
# dict[key]['upos'] = true POS labels
eng_formal = read_conllu('conllu-files/en_formal.conllu')
eng_literature = read_conllu('conllu-files/en_literature.conllu')
eng_news = read_conllu('conllu-files/en_news.conllu')

swe_formal = read_conllu('conllu-files/sv_formal.conllu')
swe_literature = read_conllu('conllu-files/sv_literature.conllu')
swe_news = read_conllu('conllu-files/sv_news.conllu')


Corpus contains 1000 sentences.
Corpus contains 1000 sentences.
Corpus contains 1000 sentences.
Corpus contains 1000 sentences.
Corpus contains 1000 sentences.
Corpus contains 1000 sentences.


In [188]:
import re
def read_sparv_xml(filename):
    '''Read the XML file from Sparv annotation and return the texts and
    their POS, MSD and dependency tags.

    The reader is line based: a line starting with '<se' opens a sentence,
    a line starting with '<w ' describes one token, and a line starting with
    '</s' closes the sentence and stores it.

    Args:
        filename: Path to the UTF-8 encoded XML file.

    Returns:
        Dict keyed by the tuple of sentence tokens:
            {(tokens...): {'pos': [pos_tags...],
                           'msd': [msd_tags...],
                           'deprel': ['<head id>_<deprel tag>', ...]}}
        When the head id or the dependency relation cannot be extracted
        from a token line, both are recorded as '_'.
    '''
    # Raw strings keep '\s' a regex escape instead of an invalid string
    # escape; compiling once avoids re-parsing the patterns for every line.
    token_re = re.compile(r'(?<=>)[^\s]+(?=</w>)')
    pos_re = re.compile(r'(?<=pos=")[^\s]+(?="\sm)')
    msd_re = re.compile(r'(?<=msd=")[^\s]+(?="\s)')
    head_re = re.compile(r'(?<=dephead=")[^\s]*(?="\s)')
    deprel_re = re.compile(r'(?<=deprel=")[^\s]+(?=">)')

    with open(filename, "r", encoding='utf8') as f:

        sentences = {}

        for line in f:
            if line.startswith('<se'):
                # A new sentence starts: reset the per-sentence buffers.
                tokens = []
                pos_tags = []
                msd_tags = []
                deprels = []

            if line.startswith('<w '):
                token = token_re.findall(line)[0]
                pos = pos_re.findall(line)[0]
                msd = msd_re.findall(line)[0]
                try:
                    head_id = head_re.findall(line)[0]
                    deprel_tag = deprel_re.findall(line)[0]
                except IndexError:
                    # Either attribute is missing from this token line.
                    head_id, deprel_tag = '_', '_'

                if msd[0] == 'F' and msd[1:].islower():
                    msd = token  # change tag to the punctuation itself

                if pos == 'PROPN' and '_' in token:  # split multiword PropN
                    # Every part of the multiword name inherits the tags of the whole.
                    propn_tokens = token.split('_')
                    tokens.extend(propn_tokens)
                    pos_tags.extend([pos] * len(propn_tokens))
                    msd_tags.extend([msd] * len(propn_tokens))
                    deprels.extend([head_id + '_' + deprel_tag] * len(propn_tokens))
                    continue

                tokens.append(token)
                pos_tags.append(pos)
                msd_tags.append(msd)
                deprels.append(head_id + '_' + deprel_tag)

            if line.startswith('</s'):
                sentences[tuple(tokens)] = {
                    'pos': pos_tags,
                    'msd': msd_tags,
                    'deprel': deprels,
                }

    print(f'Parser output contains {len(sentences)} sentences.')       
    return sentences


In [189]:
# Load the Sparv parser output for every domain/language pair.
# key = tuple of tokenized sentence
# dict[key]['pos'] = predicted POS labels
sparv_en_news = read_sparv_xml('parsed/en_news_parsed.xml')
sparv_sv_news = read_sparv_xml('parsed/sv_news_parsed.xml')

sparv_en_lit = read_sparv_xml('parsed/en_lit_parsed.xml')
sparv_sv_lit = read_sparv_xml('parsed/sv_lit_parsed.xml')

sparv_en_formal = read_sparv_xml('parsed/en_formal_parsed.xml')
sparv_sv_formal = read_sparv_xml('parsed/sv_formal_parsed.xml')


Parser output contains 986 sentences.
Parser output contains 993 sentences.
Parser output contains 948 sentences.
Parser output contains 1025 sentences.
Parser output contains 866 sentences.
Parser output contains 911 sentences.


In [151]:
def shared_keys(dict1, dict2):
    """Return the set of keys that occur in both `dict1` and `dict2`."""
    return set(dict1) & set(dict2)

In [193]:
from typing import List, Dict
from sklearn.metrics import f1_score, accuracy_score

def model_performance(y_true: List[List[str]],
                      y_pred: List[List[str]]) -> Dict[str, float]:
    """Compute token-level accuracy and F1 scores for tagged sentences.

    Both inputs are flattened into one label sequence each before scoring,
    so the metrics are computed over tokens, not over sentences.

    Args:
      y_true: List of true labels of the tokenized sentence, one inner
        list per sentence.
      y_pred: List of predicted labels of the tokenized sentence, one inner
        list per sentence, in the same order as `y_true`.

    Returns:
      None when `y_true` is empty; otherwise a dict of metrics:

        {
          "accuracy": float,
          "f1_micro": float,
          "f1_macro": float,
          "f1_weighted": float,
        }

    Raises:
      ValueError: Exception occurred when input lists' length don't match,
        either in number of sentences or in total number of tokens.
      Exception: When the metric computation itself fails; the original
        error is attached as the cause.
    """
    if len(y_true) == 0:
        return None

    if len(y_true) != len(y_pred):
        raise ValueError("Lengths of input lists don't match.")

    def _list_flattener(inpt: List[List[str]]) -> List[str]:
        """Flattener for list of lists into a single list."""
        return [label for sentence in inpt for label in sentence]

    y_true = _list_flattener(y_true)
    y_pred = _list_flattener(y_pred)

    if len(y_true) != len(y_pred):
        raise ValueError("Number of tokens don't match between y_true and y_pred.")

    try:
        metrics = {
          "accuracy": accuracy_score(y_true, y_pred),
          "f1_micro": f1_score(y_true, y_pred, average='micro'),
          "f1_macro": f1_score(y_true, y_pred, average='macro'),
          "f1_weighted": f1_score(y_true, y_pred, average='weighted'),
        }
    except Exception as ex:
        # Chain the original error so its traceback is not lost.
        raise Exception(f"Metrics calculation error: {ex}") from ex
    return metrics

def eval_domain(corpora_dict, sparv_dict, true_key='upos', pred_key='pos', tag_map=None):
    """Print POS-tagging metrics for the sentences shared by a corpus and a Sparv parse.

    Sentences are matched on their key (the tuple of tokens); only keys that
    occur in both dicts are evaluated.

    Args:
      corpora_dict: Reference annotations, as returned by read_conllu.
      sparv_dict: Predicted annotations, as returned by read_sparv_xml.
      true_key: Which label list of `corpora_dict` holds the reference tags.
      pred_key: Which label list of `sparv_dict` holds the predicted tags.
      tag_map: Optional dict that translates predicted tags into the
        reference tagset before scoring; tags missing from the map are kept
        unchanged. Use it when the two sides use different tagsets, since
        otherwise every token counts as an error.
    """
    common_keys = list(shared_keys(corpora_dict, sparv_dict))
    print(f'UD corpora has {len(corpora_dict)} sentences.')
    print(f'Sparv parsing has {len(sparv_dict)} sentences.')
    print(f'{len(common_keys)} sentences in common.')
    print(f'Evaluate {len(common_keys)} sentences and their POS tagging prediction.')
    print()

    true_pos = [corpora_dict[key][true_key] for key in common_keys]
    pred_pos = [sparv_dict[key][pred_key] for key in common_keys]
    if tag_map is not None:
        pred_pos = [[tag_map.get(tag, tag) for tag in sentence] for sentence in pred_pos]

    results = model_performance(true_pos, pred_pos)
    if results is None:
        # model_performance returns None when there is nothing to score.
        print('No sentences in common; nothing to evaluate.')
        return
    for x in results.items():
        print(x)

### Evaluations

In [194]:
# News - English
# Compares the corpus 'upos' tags with Sparv's 'pos' tags on the sentences both sides share.
eval_domain(eng_news, sparv_en_news)

UD corpora has 1000 sentences.
Sparv parsing has 986 sentences.
674 sentences in common.
Evaluate 674 sentences and their POS tagging prediction.

('accuracy', 0.8282548476454293)
('f1_micro', 0.8282548476454293)
('f1_macro', 0.5532475082428272)
('f1_weighted', 0.803869911880206)


In [195]:
# Formal - English
# Compares the corpus 'upos' tags with Sparv's 'pos' tags on the sentences both sides share.
eval_domain(eng_formal, sparv_en_formal)

UD corpora has 1000 sentences.
Sparv parsing has 866 sentences.
563 sentences in common.
Evaluate 563 sentences and their POS tagging prediction.

('accuracy', 0.8055015552099534)
('f1_micro', 0.8055015552099534)
('f1_macro', 0.5339737946716463)
('f1_weighted', 0.7783730346142022)


In [196]:
# Lit - English
# Compares the corpus 'upos' tags with Sparv's 'pos' tags on the sentences both sides share.
eval_domain(eng_literature, sparv_en_lit)

UD corpora has 1000 sentences.
Sparv parsing has 948 sentences.
680 sentences in common.
Evaluate 680 sentences and their POS tagging prediction.

('accuracy', 0.8209931368591038)
('f1_micro', 0.8209931368591038)
('f1_macro', 0.5411824644851406)
('f1_weighted', 0.7964196366156389)


In [190]:
# News - Swedish
# NOTE(review): the 0.0 scores printed for this cell look like a tagset mismatch rather
# than a tagger failure - the sparv_sv_news dump in this notebook shows 'pos' values such
# as 'NN', 'VB', 'PAD', which are presumably never equal to the corpus 'upos' labels.
# Confirm, and map the tags to a common tagset before comparing.
eval_domain(swe_news, sparv_sv_news)

UD corpora has 1000 sentences.
Sparv parsing has 993 sentences.
941 sentences in common.
Evaluate 941 sentences and their POS tagging prediction.

('accuracy', 0.0)
('f1_micro', 0.0)
('f1_macro', 0.0)
('f1_weighted', 0.0)


In [191]:
# Formal - Swedish
# NOTE(review): all scores printed for this cell are 0.0; presumably Sparv's Swedish 'pos'
# tags use a different tagset than the corpus 'upos' labels - confirm before trusting this.
eval_domain(swe_formal, sparv_sv_formal)

UD corpora has 1000 sentences.
Sparv parsing has 911 sentences.
763 sentences in common.
Evaluate 763 sentences and their POS tagging prediction.

('accuracy', 0.0)
('f1_micro', 0.0)
('f1_macro', 0.0)
('f1_weighted', 0.0)


In [192]:
# Lit - Swedish
# NOTE(review): all scores printed for this cell are 0.0; presumably Sparv's Swedish 'pos'
# tags use a different tagset than the corpus 'upos' labels - confirm before trusting this.
eval_domain(swe_literature, sparv_sv_lit)

UD corpora has 1000 sentences.
Sparv parsing has 1025 sentences.
935 sentences in common.
Evaluate 935 sentences and their POS tagging prediction.

('accuracy', 0.0)
('f1_micro', 0.0)
('f1_macro', 0.0)
('f1_weighted', 0.0)


In [102]:
def eng_to_swe_equivalent(eng_corpus, swe_corpus, out_fn):
    """Write the Swedish counterparts of the English sentences to `out_fn`.

    The files are processed pairwise. For every '# sent_id = en...' line of
    an English file the matching Swedish id is formed by replacing the
    leading 'en' with 'sv'. Each Swedish sentence with such an id is copied
    from its '# sent_id' line up to the end of the sentence, followed by one
    blank line.

    Args:
      eng_corpus: Paths of the English CoNLL-U files.
      swe_corpus: Paths of the Swedish CoNLL-U files, paired by position
        with `eng_corpus`.
      out_fn: Path of the file to write.
    """
    en_prefix = '# sent_id = en'
    id_prefix = '# sent_id = '

    with open(out_fn, "w", encoding='utf8') as o:

        for eng_corpora, swe_corpora in zip(eng_corpus, swe_corpus):
            with open(eng_corpora, "r", encoding='utf8') as en, open(swe_corpora, "r", encoding='utf8') as sv:
                # sent_id = en_lines-ud-dev-doc2-3296 -> sv_lines-ud-dev-doc2-3296
                # A set makes the membership test below constant time.
                swe_sent_ids = {
                    'sv' + line[len(en_prefix):].rstrip('\n')
                    for line in en
                    if line.startswith(en_prefix)
                }
                sv_lines = [l.rstrip('\n') for l in sv]

            for start, line in enumerate(sv_lines):
                if not line.startswith(id_prefix):
                    continue
                if line[len(id_prefix):] not in swe_sent_ids:
                    continue
                # Copy from the sent_id line to the end of the sentence; the
                # bounds check covers a file whose last sentence is not
                # followed by a blank line.
                i = start
                while i < len(sv_lines) and sv_lines[i] != '':
                    o.write(sv_lines[i] + '\n')  # write send_id ~ last token
                    i += 1
                o.write('\n')
        
            

In [103]:

# Build sv_literature.conllu: the Swedish LinES sentences whose ids correspond to
# the sentences of the English literature files (train, dev and test splits).
eng_to_swe_equivalent([f'domains/literature/en_lines-ud-{x}.conllu' for x in ['train', 'dev', 'test']], 
                     [f'UD/UD_Swedish-LinES/sv_lines-ud-{x}.conllu' for x in ['train', 'dev', 'test']],
                     'sv_literature.conllu')

In [107]:
def merge_corpus(corpus, out_fn):
    """Concatenate the files listed in `corpus`, in order, into `out_fn`."""
    with open(out_fn, "w", encoding='utf8') as merged:
        for path in corpus:
            with open(path, "r", encoding='utf8') as part:
                merged.writelines(part)
# Build en_literature.conllu by concatenating the train, dev and test splits.
merge_corpus([f'domains/literature/en_lines-ud-{x}.conllu' for x in ['train', 'dev', 'test']],
            'en_literature.conllu')


In [113]:
def formal_corpus(lines_fn, lit_fn, out_fn):
    """Write the sentences of `lines_fn` that are not literature sentences to `out_fn`.

    LinEs - Literature = formal: every sentence of `lines_fn` whose sent_id
    is not among the literature ids is copied from its '# sent_id' line up
    to the end of the sentence, followed by one blank line.

    The literature ids are taken from the '# sent_id = en...' lines of
    `lit_fn`. Each id is registered both as it is and with the leading 'en'
    replaced by 'sv', so `lines_fn` may be the English or the Swedish file.

    Args:
      lines_fn: Path of the full CoNLL-U corpus.
      lit_fn: Path of the English literature CoNLL-U file.
      out_fn: Path of the file to write.
    """
    en_prefix = '# sent_id = en'
    id_prefix = '# sent_id = '

    with open(out_fn, "w", encoding='utf8') as o:

        with open(lines_fn, "r", encoding='utf8') as lines, open(lit_fn, "r", encoding='utf8') as lit:

            lit_ids = set()
            for line in lit:
                if line.startswith(en_prefix):  # sent_id = en_lines-ud-dev-doc2-3296
                    en_id = line[len(id_prefix):].rstrip('\n')
                    lit_ids.add(en_id)
                    # Drop the 'en' before prepending 'sv'; keeping it would
                    # produce 'sven_lines-...', which matches no sentence.
                    lit_ids.add('sv' + en_id[2:])

            lines_l = [l.rstrip('\n') for l in lines]

        for start, line in enumerate(lines_l):
            if not line.startswith(id_prefix):
                continue
            if line[len(id_prefix):] in lit_ids:
                continue
            # Copy from the sent_id line to the end of the sentence; the
            # bounds check covers a file whose last sentence is not followed
            # by a blank line.
            i = start
            while i < len(lines_l) and lines_l[i] != '':
                o.write(lines_l[i] + '\n')  # write send_id ~ last token
                i += 1
            o.write('\n')
                            


In [180]:
# Inspect the parsed Swedish news sentences. This displays the whole dict, so the
# output is long; it shows which 'pos' and 'msd' tags Sparv assigned.
sparv_sv_news

{('”',
  'Fast',
  'mycket',
  'av',
  'den',
  'digitala',
  'övergången',
  'är',
  'utan',
  'tidigare',
  'motstycke',
  'i',
  'USA',
  ',',
  'är',
  'det',
  'fredliga',
  'överlämnandet',
  'av',
  'makten',
  'inte',
  'det',
  '”',
  ',',
  'skrev',
  'Obamas',
  'specialassistent',
  'Kori',
  'Schulman',
  'i',
  'ett',
  'blogginlägg',
  'i',
  'måndags',
  '.'): {'pos': ['PAD',
   'KN',
   'PN',
   'PP',
   'DT',
   'JJ',
   'NN',
   'VB',
   'PP',
   'JJ',
   'NN',
   'PP',
   'PM',
   'MID',
   'VB',
   'DT',
   'JJ',
   'NN',
   'PP',
   'NN',
   'AB',
   'PN',
   'PAD',
   'MID',
   'VB',
   'PM',
   'NN',
   'PM',
   'PM',
   'PP',
   'DT',
   'NN',
   'PP',
   'NN',
   'MAD'],
  'msd': ['PAD',
   'KN',
   'PN.NEU.SIN.IND.SUB+OBJ',
   'PP',
   'DT.UTR.SIN.DEF',
   'JJ.POS.UTR+NEU.SIN.DEF.NOM',
   'NN.UTR.SIN.DEF.NOM',
   'VB.PRS.AKT',
   'PP',
   'JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.NOM',
   'NN.NEU.SIN.IND.NOM',
   'PP',
   'PM.NOM',
   'MID',
   'VB.PRS.AKT',
   'DT.NEU