# Universal Dependencies

In [2]:
import os
import re
import glob
import pickle
import logging
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
from io import StringIO
from datetime import datetime
from collections import Counter
from tqdm import tqdm
tqdm.monitor_interval = 0
import spacy
import numpy as np
import pandas as pd

# Log to logs/UD.log, overwritten on every run (filemode='w').
# logging.basicConfig opens the log file immediately, so the directory has to
# exist first; without this a fresh checkout fails with FileNotFoundError.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/UD.log', filemode='w', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Git clone URLs of the Universal Dependencies treebanks used in this notebook,
# keyed by the two-letter language code that also names the output sub-folder.
DATASETS = {
    'en': [
        'https://github.com/UniversalDependencies/UD_English-ParTUT.git',
        'https://github.com/UniversalDependencies/UD_English-GUM.git',
        'https://github.com/UniversalDependencies/UD_English-EWT.git',
        'https://github.com/UniversalDependencies/UD_English-PUD.git',
        'https://github.com/UniversalDependencies/UD_English-LinES.git',
    ],
    'fr': [
        'https://github.com/UniversalDependencies/UD_French-ParTUT.git',
        'https://github.com/UniversalDependencies/UD_French-GSD.git',
        'https://github.com/UniversalDependencies/UD_French-Sequoia.git',
        'https://github.com/UniversalDependencies/UD_French-PUD.git',
        # 'https://github.com/UniversalDependencies/UD_French-FTB.git' # licence restrictions so must copy in
    ],
    'it': [
        'https://github.com/UniversalDependencies/UD_Italian-ISDT.git',
        'https://github.com/UniversalDependencies/UD_Italian-ParTUT.git',
        'https://github.com/UniversalDependencies/UD_Italian-PUD.git',
    ],
    'es': [
        'https://github.com/UniversalDependencies/UD_Spanish-AnCora.git',
        'https://github.com/UniversalDependencies/UD_Spanish-GSD.git',
        'https://github.com/UniversalDependencies/UD_Spanish-PUD.git',
    ],
}

# Per-language aliases; each one is the very same list object stored in DATASETS.
EN_LINKS = DATASETS['en']
FR_LINKS = DATASETS['fr']
IT_LINKS = DATASETS['it']
ES_LINKS = DATASETS['es']

# Root folder under which every treebank is cloned and every result is written.
OUT_DIR = 'UD'

In [84]:
def download(url, path):
    """Clone the git repository at ``url`` into the local directory ``path``.

    git is invoked with an argument list rather than through a shell command
    string, so URLs or paths containing spaces or shell metacharacters are
    passed through verbatim instead of being re-parsed by the shell.

    Parameters
    ----------
    url : str
        Repository URL handed to ``git clone``.
    path : str
        Destination directory handed to ``git clone``.

    Returns
    -------
    int
        Exit status of ``git clone`` (0 on success), or 127 when the git
        executable could not be started at all.
    """
    import subprocess  # local import: the notebook's import cell does not load it

    try:
        completed = subprocess.run(['git', 'clone', url, path])
    except OSError as err:
        # e.g. git is not installed; keep the best-effort behaviour but say so.
        logging.error('Could not run git to clone %s: %s', url, err)
        return 127
    if completed.returncode != 0:
        logging.warning('git clone of %s into %s exited with status %s',
                        url, path, completed.returncode)
    return completed.returncode

def fetch_datasets(datasets):
    """Clone every repository in ``datasets`` that is not on disk yet.

    ``datasets`` maps a language code to a list of repository URLs. Each
    repository is cloned to ``OUT_DIR/<language code>/<repository name>``,
    where the repository name is the last URL component without its
    extension. A target path that already exists is left alone.
    """
    for language, urls in datasets.items():
        for url in tqdm(urls):
            repo_name, _extension = os.path.splitext(os.path.basename(url))
            target = os.path.join(OUT_DIR, language, repo_name)
            if os.path.exists(target):
                continue
            download(url, target)

# Clone every treebank listed in DATASETS; targets that already exist under OUT_DIR are skipped.
fetch_datasets(DATASETS)

100%|██████████| 5/5 [00:00<00:00, 5614.86it/s]
100%|██████████| 4/4 [00:00<00:00, 4902.75it/s]
100%|██████████| 3/3 [00:00<00:00, 4696.87it/s]
100%|██████████| 3/3 [00:00<00:00, 4735.76it/s]


In [85]:
class Sentence:
    """One CoNLL-U sentence together with features of its main tense marker.

    Language-specific subclasses implement ``find_tense_id``, which picks the
    word carrying the tense of the sentence.

    Attributes set by the constructor
    ---------------------------------
    text : str
        Content of the ``# text = ...`` comment line.
    words : pandas.DataFrame
        One row per word line, indexed by the CoNLL-U id (as a string), with
        columns ``form, lemma, upos, head, deprel, tense, verbform, multiword``.
        Missing values are ``NULL_VALUE``. ``multiword`` holds the id of the
        multiword-token range (e.g. ``'1-2'``) a word belongs to.
    root_id : str
        Id of the first word whose head is ``'0'``.
    tense_id : str or None
        Result of ``find_tense_id``.
    length, tense, tense_wordform, tense_lemma, relation_to_head,
    dist_from_right, other_tensed_words, other_tensed_words_conflicting,
    other_tensed_words_right, other_tensed_words_conflicting_right
        See ``extract_features``.
    """

    TEXT_PATTERN = re.compile(r'#\s+?text\s+?=\s+?(.*)\n')
    WORDLINES_PATTERN = re.compile(r'\n(1.*\n)\n', re.DOTALL)
    TENSE_PATTERN = re.compile(r'Tense=([A-Z0-9][a-zA-Z0-9]*)')
    VERBFORM_PATTERN = re.compile(r'VerbForm=([A-Z0-9][a-zA-Z0-9]*)')
    NULL_VALUE = '_'
    COLUMNS = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']

    def __init__(self, raw):
        """Parse ``raw``, the text of one sentence block ending in a blank line.

        Raises AttributeError when ``raw`` has no ``# text`` line or no word
        lines, and IndexError when no word has head ``'0'``.
        """
        self.text = self.TEXT_PATTERN.search(raw).group(1)
        self.words = self.parse_words(raw)
        self.root_id = self.words[self.words['head'] == '0'].index[0]
        self.tense_id = self.find_tense_id()
        self.extract_features()

    def find_tense_id(self):
        """Return the id of the word carrying the sentence's tense, or None.

        Must be overridden by the language-specific subclasses.
        """
        raise NotImplementedError(
            '{} must implement find_tense_id()'.format(type(self).__name__))

    def parse_words(self, raw):
        """Return the word lines of ``raw`` as a DataFrame indexed by word id.

        Empty nodes (ids containing a dot) are dropped. ``tense`` and
        ``verbform`` are extracted from the FEATS column, and members of a
        multiword-token range get that range's id in ``multiword``.
        """
        word_lines = self.WORDLINES_PATTERN.search(raw).group(1)
        # dtype=str + keep_default_na=False: every field stays the literal
        # string found in the file, so tokens such as "NA", "null" or "nan"
        # are not silently parsed as missing values and numeric-looking forms
        # are not converted to numbers. quoting=3 is csv.QUOTE_NONE.
        words = pd.read_csv(StringIO(word_lines), sep='\t', names=self.COLUMNS,
                            quoting=3, dtype=str, keep_default_na=False)
        is_emptynode = words['id'].str.contains(r'\.')
        # Build a new frame instead of mutating a filtered slice in place.
        words = words[~is_emptynode].set_index('id')
        words['tense'] = words['feats'].str.extract(self.TENSE_PATTERN, expand=False)
        words['verbform'] = words['feats'].str.extract(self.VERBFORM_PATTERN, expand=False)
        words = words.fillna(self.NULL_VALUE)
        words['multiword'] = self.NULL_VALUE
        is_range = words.index.str.contains('-')
        for range_id in words.index[is_range]:
            start, end = range_id.split('-')
            members = [str(i) for i in range(int(start), int(end) + 1)]
            # Single .loc[rows, column] assignment: the chained form
            # words.loc[i]['multiword'] = ... writes to a temporary copy.
            words.loc[words.index.isin(members), 'multiword'] = range_id
        return words.drop(['xpos', 'feats', 'deps', 'misc'], axis=1)

    def extract_features(self):
        """Compute the per-sentence features from ``words`` and ``tense_id``.

        ``length`` counts the rows that are not members of a multiword range
        (the range row itself is counted instead of its members). When
        ``tense_id`` is None all other features are None. Otherwise:

        * ``tense``, ``tense_lemma``, ``relation_to_head``: the tense, lemma
          and deprel of the tense word; ``tense_wordform``: see
          ``get_tensed_wordform``.
        * ``other_tensed_words``: number of rows with a tense value, minus one.
        * ``other_tensed_words_conflicting``: rows with a tense value that
          differs from ``tense``.
        * ``dist_from_right``: rows after the tense word that are not members
          of a multiword range.
        * ``other_tensed_words_right`` / ``..._conflicting_right``: the same
          two counts restricted to the rows after the tense word.
        """
        null = self.NULL_VALUE
        self.length = len(self.words[self.words['multiword'] == null])
        if self.tense_id is None:
            self.tense = None
            self.tense_wordform = None
            self.tense_lemma = None
            self.relation_to_head = None
            self.dist_from_right = None
            self.other_tensed_words = None
            self.other_tensed_words_conflicting = None
            self.other_tensed_words_right = None
            self.other_tensed_words_conflicting_right = None
            return

        self.tense = self.words.loc[self.tense_id, 'tense']
        self.tense_wordform = self.get_tensed_wordform()
        self.tense_lemma = self.words.loc[self.tense_id, 'lemma']
        self.relation_to_head = self.words.loc[self.tense_id, 'deprel']

        all_tensed_words = self.words[self.words['tense'] != null]
        self.other_tensed_words = len(all_tensed_words) - 1
        self.other_tensed_words_conflicting = (all_tensed_words['tense'] != self.tense).sum()

        next_word = str(int(self.tense_id) + 1)
        if int(next_word) > int(self.words.index[-1]):
            # The tense word is the last word: nothing lies to its right.
            self.dist_from_right = 0
            self.other_tensed_words_right = 0
            self.other_tensed_words_conflicting_right = 0
        else:
            words_to_the_right = self.words.loc[next_word:]
            self.dist_from_right = (words_to_the_right['multiword'] == null).sum()
            tensed_words_to_the_right = words_to_the_right[words_to_the_right['tense'] != null]
            self.other_tensed_words_right = len(tensed_words_to_the_right)
            self.other_tensed_words_conflicting_right = (
                tensed_words_to_the_right['tense'] != self.tense).sum()

    def get_tensed_wordform(self):
        """Return the surface form of the tense word.

        When the tense word is a member of a multiword-token range, the form
        of the range row is returned instead of the word's own form.
        """
        multiword_id = self.words.loc[self.tense_id, 'multiword']
        if multiword_id != self.NULL_VALUE:
            return self.words.loc[multiword_id, 'form']
        return self.words.loc[self.tense_id, 'form']

class EnglishSentence(Sentence):
    """Sentence whose tense word is selected with the rules for English."""

    def find_tense_id(self):
        """Return the id of the word carrying the sentence's tense, or None.

        1) AUX direct dependent of root, related by (aux, cop or aux:pass), is finite and has tense
        2) If found above but no tense,
            if lemma is will, take
            otherwise UD is wrong
        3) VERB that is root, is finite and has tense

        In case 2 the returned word has no tense value of its own, i.e. its
        ``tense`` entry is ``NULL_VALUE``. Rule 3 is only tried when the root
        has no finite auxiliary at all.
        """
        words = self.words
        is_aux = words['upos'] == 'AUX'
        is_dep_head = words['head'] == self.root_id
        is_aux_cop_deprel = words['deprel'].isin(['aux', 'aux:pass', 'cop'])
        is_finite = words['verbform'] == 'Fin'
        finite_aux = words[is_aux & is_dep_head & is_aux_cop_deprel & is_finite]

        if len(finite_aux):
            # Filter on the subset's own column rather than indexing it with a
            # mask built on the full frame (which relies on implicit reindexing).
            tensed_finite_aux = finite_aux[finite_aux['tense'] != self.NULL_VALUE]
            if len(tensed_finite_aux):
                return tensed_finite_aux.index[0]
            # .iloc[0]: the index holds string ids, so [0] would be a label
            # lookup (deprecated positional fallback, KeyError on new pandas).
            if finite_aux['lemma'].iloc[0] == 'will':
                return finite_aux.index[0]
            return None

        root = words.loc[self.root_id]
        if (root['upos'] == 'VERB') and (root['verbform'] == 'Fin') and (root['tense'] != self.NULL_VALUE):
            return self.root_id
        return None

class ItalianSentence(Sentence):
    """Sentence whose tense word is selected with the rules for Italian."""
    
    def find_tense_id(self):
        """Return the id of the word carrying the sentence's tense, or None.

        The checks are tried in this order; the first one that applies wins.
        "Dependent" means a word whose head is the root.

        1) Root is VERB, is finite and has tense -> root
        2) AUX dependent related by cop, is finite and has tense -> that AUX
        3) AUX dependent related by aux:pass, is finite and has tense,
           and root is a participle -> that AUX
        4) Only if some AUX dependent related by aux is finite and has tense:
            a) an AUX dependent related by aux:pass is an infinitive without
               tense, and root is a VERB participle with tense -> the finite AUX
            b) an AUX dependent related by aux is a participle with tense,
               and root is a VERB infinitive without tense -> that participle AUX
            c) root is a VERB participle with tense -> root
            d) root is a VERB infinitive without tense -> the finite AUX
        5) Otherwise None
        """
        # 1) the root itself is a finite, tensed verb
        root_is_verb = self.words.loc[self.root_id]['upos'] == 'VERB'
        root_is_fin = self.words.loc[self.root_id]['verbform'] == 'Fin'
        root_has_tense = self.words.loc[self.root_id]['tense'] != self.NULL_VALUE
        if root_is_verb and root_is_fin and root_has_tense:
            return self.root_id
        # 2) finite, tensed copula attached to the root
        is_aux = self.words['upos'] == 'AUX'
        is_dep_head = self.words['head'] == self.root_id
        is_cop = self.words['deprel'] == 'cop'
        has_tense = self.words['tense'] != self.NULL_VALUE
        is_finite = self.words['verbform'] == 'Fin'
        finite_aux = self.words[is_aux & is_dep_head & is_cop & is_finite & has_tense]
        if len(finite_aux):
            return finite_aux.index[0]
        # 3) finite, tensed passive auxiliary with a participle root
        is_aux_pass = self.words['deprel'] == 'aux:pass'
        finite_aux_pass = self.words[is_aux & is_dep_head & is_aux_pass & is_finite & has_tense]
        root_is_part = self.words.loc[self.root_id]['verbform'] == 'Part'
        if len(finite_aux_pass) and root_is_part:
            return finite_aux_pass.index[0]
        # 4) finite, tensed plain auxiliary; finite_aux is rebound here
        is_aux_deprel = self.words['deprel'] == 'aux'
        finite_aux = self.words[is_aux & is_dep_head & is_aux_deprel & is_finite & has_tense]
        if len(finite_aux):
            # 4a) an additional untensed infinitive passive auxiliary
            is_inf = self.words['verbform'] == 'Inf'
            nonfinite_aux = self.words[is_aux & is_dep_head & is_aux_pass & (~has_tense) & is_inf]
            if len(nonfinite_aux):
                if root_is_verb and root_has_tense and root_is_part:
                    return finite_aux.index[0]
            # 4b) an additional tensed participle auxiliary with an infinitive root
            is_part = self.words['verbform'] == 'Part'
            part_aux = self.words[is_aux & is_dep_head & is_aux_deprel & has_tense & is_part]
            root_is_inf = self.words.loc[self.root_id]['verbform'] == 'Inf'
            if len(part_aux):
                if root_is_verb and (not root_has_tense) and root_is_inf:
                    return part_aux.index[0]
            # 4c) tensed participle root: the root is returned, not the auxiliary
            if root_is_verb and root_has_tense and root_is_part:
                return self.root_id
            # 4d) untensed infinitive root: the finite auxiliary is returned
            if root_is_verb and (not root_has_tense) and root_is_inf:
                return finite_aux.index[0]
        return None
        
class FrenchSentence(Sentence):
    """Sentence whose tense word is selected with the rules for French."""

    def find_tense_id(self):
        """Return the id of the word carrying the sentence's tense, or None.

        "Dependent" means a word whose head is the root. Tried in order:

        1) Root is VERB, finite and tensed -> root.
        2) If some AUX dependent is finite and tensed:
            a) one of them is related by aux:pass or cop -> the first such AUX;
            b) else root is a VERB participle with tense -> root;
            c) else a tensed participle AUX dependent related by cop
               -> the first such AUX.
        3) Root is a VERB infinitive and a finite, tensed AUX dependent is
           related by aux -> the first such AUX. The lemma "aller" gets no
           special treatment here.
        4) Otherwise None.
        """
        words = self.words
        root = words.loc[self.root_id]
        root_is_verb = root['upos'] == 'VERB'
        root_verbform = root['verbform']
        root_has_tense = root['tense'] != self.NULL_VALUE
        if root_is_verb and root_verbform == 'Fin' and root_has_tense:
            return self.root_id

        deprel = words['deprel']
        verbform = words['verbform']
        # Boolean masks over the full frame, combined as needed below.
        tensed_aux_of_root = (
            (words['upos'] == 'AUX')
            & (words['head'] == self.root_id)
            & (words['tense'] != self.NULL_VALUE)
        )
        finite_tensed_aux = tensed_aux_of_root & (verbform == 'Fin')

        if finite_tensed_aux.any():
            passive_or_copula = words[finite_tensed_aux & deprel.isin(['aux:pass', 'cop'])]
            if len(passive_or_copula):
                return passive_or_copula.index[0]
            if root_is_verb and root_has_tense and root_verbform == 'Part':
                return self.root_id
            participle_copula = words[tensed_aux_of_root & (deprel == 'cop') & (verbform == 'Part')]
            if len(participle_copula):
                return participle_copula.index[0]

        if root_is_verb and root_verbform == 'Inf':
            plain_aux = words[finite_tensed_aux & (deprel == 'aux')]
            if len(plain_aux):
                return plain_aux.index[0]  # doesn't take aller into account
        return None

class SpanishSentence(Sentence):
    """Sentence whose tense word is selected with the rules for Spanish."""

    def find_tense_id(self):
        """Return the id of the word carrying the sentence's tense, or None.

        "Dependent" means a word whose head is the root. Tried in order:

        1) Root is VERB, finite and tensed -> root.
        2) A finite, tensed AUX dependent related by aux or aux:pass exists and
           root is a VERB participle with tense -> root.
        3) A finite, tensed dependent tagged VERB and related by cop
           -> the first such word.
        4) A finite, tensed AUX dependent related by aux exists and root is a
           VERB infinitive without tense -> the first such AUX.
        5) Otherwise None.
        """
        words = self.words
        root = words.loc[self.root_id]
        root_is_verb = root['upos'] == 'VERB'
        root_verbform = root['verbform']
        root_has_tense = root['tense'] != self.NULL_VALUE
        if root_is_verb and root_verbform == 'Fin' and root_has_tense:
            return self.root_id

        upos = words['upos']
        deprel = words['deprel']
        # Finite, tensed words attached directly to the root.
        finite_tensed_dep = (
            (words['head'] == self.root_id)
            & (words['verbform'] == 'Fin')
            & (words['tense'] != self.NULL_VALUE)
        )
        finite_tensed_aux = finite_tensed_dep & (upos == 'AUX')

        has_auxiliary = (finite_tensed_aux & deprel.isin(['aux:pass', 'aux'])).any()
        if has_auxiliary and root_is_verb and root_has_tense and root_verbform == 'Part':
            return self.root_id

        copulas = words[finite_tensed_dep & (upos == 'VERB') & (deprel == 'cop')]
        if len(copulas):
            return copulas.index[0]

        plain_aux = words[finite_tensed_aux & (deprel == 'aux')]
        if len(plain_aux) and root_is_verb and (not root_has_tense) and root_verbform == 'Inf':
            return plain_aux.index[0]
        return None

class CoNLLUReader:
    """Reads one ``.conllu`` file into a list of language-specific sentences.

    After construction ``sentences`` holds one parsed sentence object per
    block that starts at a ``# sent_id`` line and ends at the next blank line.
    """

    SENTENCE_PATTERN = re.compile(r'# sent_id.*?\n\n', re.DOTALL)

    # Language code -> class used to parse each sentence block.
    SENTENCE_READERS = dict(en=EnglishSentence,
                            it=ItalianSentence,
                            fr=FrenchSentence,
                            es=SpanishSentence)

    def __init__(self, lg, fname):
        """Read ``fname`` and parse it with the sentence class for ``lg``."""
        self.lg = lg
        contents = self.read(fname)
        self.sentences = self.parse_sentences(contents)

    def read(self, fname):
        """Return the whole content of ``fname``, decoded as UTF-8."""
        with open(fname, encoding='utf-8') as handle:
            contents = handle.read()
        return contents

    def parse_sentences(self, raw):
        """Return the sentences found in ``raw`` as a list of sentence objects.

        Typographic apostrophes are replaced by ``'`` before parsing. Raises
        KeyError when ``self.lg`` has no entry in ``SENTENCE_READERS``.
        """
        sentence_cls = self.SENTENCE_READERS[self.lg]
        normalised = raw.replace('’', "'")
        return [sentence_cls(match.group(0))
                for match in self.SENTENCE_PATTERN.finditer(normalised)]

In [92]:
class UDReader:
    """Builds the ground-truth table and vocabulary metadata for one language.

    ``prepare()`` parses every ``.conllu`` file found one directory below
    ``OUT_DIR/<lg>``, keeps the sentences for which a tense was found, and
    writes ``ground_truth.csv`` and ``metadata.pkl`` into ``OUT_DIR/<lg>``.
    """

    # Sentence attributes copied, in this order, into the ground-truth columns.
    FEATURES = ['text', 'length', 'tense', 'tense_wordform', 'tense_lemma', 'relation_to_head',
                'dist_from_right', 'other_tensed_words', 'other_tensed_words_conflicting',
                'other_tensed_words_right', 'other_tensed_words_conflicting_right']

    def __init__(self, lg):
        """Load the spaCy tokenizer for ``lg`` and list the treebank files."""
        self.lg = lg
        nlp = spacy.load(lg, disable=['tagger', 'parser', 'ner', 'textcat'])
        self.tokenizer = nlp.tokenizer
        # glob returns files in filesystem order; sorting makes the row order
        # of the ground-truth table reproducible across machines and runs.
        self.fnames = sorted(glob.glob(os.path.join(OUT_DIR, lg, '**/*.conllu')))
        # Defined up front so add_tensed_words() works before prepare() is called.
        self.tensed_types = set()

    def prepare(self):
        """Run the whole pipeline and write its results to disk."""
        self.tensed_types = set() # built up during prepare_gt_and_tensed_types()
        self.ground_truth = self.prepare_gt_and_tensed_types()
        self.freq_dist = self.prepare_freq_dist()
        self.responsible_types = self.prepare_responsible()
        # here is where I make sure that the tensed types and responsible types have the same tokenization
        # as the wikipedia data
        self.tensed_types = self.ensure_same_tokenization(self.tensed_types)
        self.responsible_types = self.ensure_same_tokenization(self.responsible_types)
        self.write()

    def ensure_same_tokenization(self, words):
        """Return the set of token texts obtained by tokenizing all ``words``.

        The words are joined with single spaces and run through
        ``self.tokenizer``; ``words`` itself is not modified.
        """
        joined = ' '.join(words)
        return {token.text for token in self.tokenizer(joined)}

    def prepare_gt_and_tensed_types(self):
        """Return the ground-truth DataFrame and fill ``self.tensed_types``.

        Only sentences with a truthy ``tense`` are kept. The ``text`` and
        ``tense_wordform`` columns are lower-cased.
        """
        gt = []
        for fname in tqdm(self.fnames):
            reader = CoNLLUReader(self.lg, fname)
            for sentence in reader.sentences:
                if sentence.tense:
                    gt.append(self.extract(sentence, self.FEATURES))
                    self.add_tensed_words(sentence)
        df = pd.DataFrame(gt, columns=self.FEATURES)
        df['text'] = df['text'].str.lower()
        df['tense_wordform'] = df['tense_wordform'].str.lower()
        return df

    def prepare_responsible(self):
        """Return the set of distinct ``tense_wordform`` values in the ground truth."""
        return set(self.ground_truth['tense_wordform'])

    def add_tensed_words(self, sent):
        """Add the lower-cased form of every tensed word of ``sent`` to ``tensed_types``.

        For a word that belongs to a multiword-token range, the form of the
        range row is used instead of the word's own form.
        """
        words = sent.words
        for word_id in words[words['tense'] != Sentence.NULL_VALUE].index:
            multiword_id = words.loc[word_id, 'multiword']
            if multiword_id != Sentence.NULL_VALUE:
                word_id = multiword_id
            self.tensed_types.add(words.loc[word_id, 'form'].lower())

    def prepare_freq_dist(self):
        """Return a Counter of token texts over all ground-truth sentence texts."""
        freq_dist = Counter()
        for text in self.ground_truth['text']:
            freq_dist.update(token.text for token in self.tokenizer(text))
        return freq_dist

    def extract(self, sent, features):
        """Return the values of the attributes named in ``features`` as a list."""
        return [getattr(sent, ft) for ft in features]

    def write(self):
        """Write ``ground_truth.csv`` and ``metadata.pkl`` into ``OUT_DIR/<lg>``.

        The pickle holds a dict with the keys ``freq_dist``, ``tensed_types``
        and ``responsible_types``.
        """
        ofile = os.path.join(OUT_DIR, self.lg, 'ground_truth.csv')
        self.ground_truth.to_csv(ofile, index=False)
        ofile = os.path.join(OUT_DIR, self.lg, 'metadata.pkl')
        obj = {'freq_dist': self.freq_dist, 'tensed_types': self.tensed_types,
               'responsible_types': self.responsible_types}
        with open(ofile, mode='wb') as f:
            pickle.dump(obj, f)

In [88]:
def main(lgs):
    """Prepare the UD data of every language code in ``lgs``.

    For each language a ``UDReader`` is built and run, and the number of
    ground-truth sentences plus the elapsed time are written to the log.
    """
    for lg in lgs:
        started = datetime.now()
        reader = UDReader(lg)
        reader.prepare()
        elapsed = datetime.now() - started
        logging.info('Processing {} sentences for {} took {}'.format(
            len(reader.ground_truth), lg, elapsed))

In [None]:
# Build ground_truth.csv and metadata.pkl for all four languages.
main(['en', 'fr', 'it', 'es'])

## Postprocessing

In [3]:
def contains(tokens, types):
    """Return True when ``tokens`` and ``types`` have at least one element in common."""
    shared = set(tokens) & set(types)
    return len(shared) > 0

def postprocess_ud(lg, p=0.7, tolerance=0.05, seed=None, max_tries=10000):
    """Enrich the UD ground truth of language ``lg`` and split it into train/test.

    Reads ``OUT_DIR/<lg>/ground_truth.csv`` and ``OUT_DIR/<lg>/metadata.pkl``
    plus ``metadata.pkl``, ``lemmata.pkl`` and ``unk-metadata.pkl`` from
    ``wikipedia/<lg>``, adds the columns below, writes the result to
    ``OUT_DIR/<lg>/prepared.csv`` and returns it.

    Added columns
    -------------
    freq_wordform, freq_lemma
        Look-ups of ``tense_wordform`` / ``tense_lemma`` in the Wikipedia
        frequency objects.
    pres_other_tensed_words, pres_other_tensed_words_conflicting,
    pres_other_tensed_words_right, pres_other_tensed_words_conflicting_right
        Whether the corresponding count column is greater than zero.
    tokens
        spaCy tokenization of ``text``.
    contains_tensed, contains_responsible
        Whether the sentence contains a tensed / responsible type that is
        listed under ``'not-in-wiki'``.
    train
        Train/test flag; all sentences sharing a ``tense_wordform`` get the
        same value.

    Parameters
    ----------
    lg : str
        Language code; names the sub-folders and the spaCy model to load.
    p : float
        Target fraction of sentences in the training set.
    tolerance : float
        A split is accepted when its training fraction is within this
        distance of ``p``.
    seed : int or None
        Seed for the random split. None keeps the previous behaviour of
        drawing from numpy's global random state.
    max_tries : int
        Number of random splits to try before giving up.

    Raises
    ------
    RuntimeError
        When no acceptable split is found within ``max_tries`` attempts
        (previously this situation looped forever).
    """
    df = pd.read_csv(os.path.join(OUT_DIR, lg, 'ground_truth.csv'))

    wiki_freq_dist = _load_pickle(os.path.join('wikipedia', lg, 'metadata.pkl'))['freq_dist']
    # NOTE(review): presumably a Counter, so unseen word forms count as 0 — confirm.
    df['freq_wordform'] = df['tense_wordform'].apply(lambda w: wiki_freq_dist[w])
    for count_column in ('other_tensed_words', 'other_tensed_words_conflicting',
                         'other_tensed_words_right', 'other_tensed_words_conflicting_right'):
        df['pres_' + count_column] = df[count_column] > 0

    lemmata = _load_pickle(os.path.join('wikipedia', lg, 'lemmata.pkl'))
    df['freq_lemma'] = df['tense_lemma'].apply(lambda w: lemmata[w])

    ud_metadata = _load_pickle(os.path.join(OUT_DIR, lg, 'metadata.pkl'))
    not_in_wiki = _load_pickle(os.path.join('wikipedia', lg, 'unk-metadata.pkl'))['not-in-wiki']
    missing_tensed = not_in_wiki.intersection(ud_metadata['tensed_types'])
    missing_responsible = not_in_wiki.intersection(ud_metadata['responsible_types'])

    nlp = spacy.load(lg)
    df['tokens'] = df['text'].apply(lambda s: [t.text for t in nlp.tokenizer(s)])
    df['contains_tensed'] = df['tokens'].apply(lambda tokens: contains(tokens, missing_tensed))
    df['contains_responsible'] = df['tokens'].apply(
        lambda tokens: contains(tokens, missing_responsible))

    df['train'] = _draw_train_flags(df['tense_wordform'], p, tolerance, seed, max_tries)

    df.to_csv(os.path.join(OUT_DIR, lg, 'prepared.csv'), index=False)
    return df


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code; only open files that
    # were produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _draw_train_flags(wordforms, p, tolerance, seed, max_tries):
    """Return a boolean Series marking the rows whose word form is in the training set.

    A fraction ``p`` of the distinct word forms is sampled repeatedly until
    the fraction of rows covered is within ``tolerance`` of ``p``.
    """
    # One generator for all attempts, so successive draws differ; None falls
    # back to numpy's global random state.
    rng = np.random.RandomState(seed) if seed is not None else None
    counts = wordforms.value_counts()
    for _ in range(max_tries):
        train_types = counts.sample(frac=p, random_state=rng).index
        is_train = wordforms.isin(train_types)
        # mean() of a boolean Series is the fraction of True values, and unlike
        # value_counts(normalize=True)[True] it does not fail when nothing is True.
        if abs(p - is_train.mean()) < tolerance:
            return is_train
    raise RuntimeError(
        'No train/test split within {} of p={} found in {} attempts'.format(
            tolerance, p, max_tries))

In [4]:
# Postprocess the French data (also writes UD/fr/prepared.csv) and preview the first rows.
df = postprocess_ud('fr')
df.head()

Unnamed: 0,text,length,tense,tense_wordform,tense_lemma,relation_to_head,dist_from_right,other_tensed_words,other_tensed_words_conflicting,other_tensed_words_right,...,freq_wordform,pres_other_tensed_words,pres_other_tensed_words_conflicting,pres_other_tensed_words_right,pres_other_tensed_words_conflicting_right,freq_lemma,tokens,contains_tensed,contains_responsible,train
0,l'association a changé les décors et avec l'ai...,34,Past,changé,changer,root,30,2,1,1,...,561,True,True,True,False,3443,"[l', association, a, changé, les, décors, et, ...",False,False,False
1,"quant au sous-préfet, il apprécie l'énergie dé...",14,Pres,apprécie,apprécier,root,8,1,1,1,...,268,True,True,True,True,1469,"[quant, au, sous-préfet, ,, il, apprécie, l', ...",False,False,True
2,les membres du club auront l'occasion de s'ent...,20,Fut,auront,avoir,root,15,0,0,0,...,385,False,False,False,False,290023,"[les, membres, du, club, auront, l', occasion,...",False,False,False
3,m. hosneld avait 44 ans.,6,Imp,avait,avoir,root,3,0,0,0,...,14837,False,False,False,False,290023,"[m., hosneld, avait, 44, ans, .]",False,False,True
4,"c'est le cas de ce brave joseph bari, toujours...",44,Pres,est,être,cop,42,2,2,2,...,304797,True,True,True,True,604255,"[c', est, le, cas, de, ce, brave, joseph, bari...",False,False,True
