In [61]:
import re

from conllu import parse  # https://github.com/EmilStenstrom/conllu
from conllu import parse_tree

In [74]:
# Directory holding the UD French-FTB CoNLL-U splits.
# Edit this single line to point the notebook at another copy of the corpus.
CORPUS_DIR = '/Users/desialex/Dropbox/Maitrise/Corpora/UD_French-FTB/CoNLL-U_no_contractions'

# Paths to the three corpus splits (plain strings, as before).
DEV = CORPUS_DIR + '/fr_ftb-ud-dev.conllu'
TEST = CORPUS_DIR + '/fr_ftb-ud-test.conllu'
TRAIN = CORPUS_DIR + '/fr_ftb-ud-train.conllu'

In [73]:
def load_conll(file_name: str):
    """
    Read a CoNLL-U file and return its content as a single string,
    skipping the lines that describe contractions (multiword tokens).

    A contraction line is recognised by its ID column, which holds a range
    such as ``3-4``. Only the start of the line is inspected, so comment
    lines (``# text = ... 1990-1991 ...``) and ordinary tokens whose form
    or lemma contains a digit-hyphen-digit sequence are kept.

    :param file_name: path of the CoNLL-U file to read
    :return: the file content, minus contraction lines, as one string
    """
    # Range ID ("3-4") followed by the column separator, at line start only.
    contraction_id = re.compile(r'^\d+-\d+\s')
    # CoNLL-U files are UTF-8; do not depend on the platform default encoding.
    with open(file_name, 'r', encoding='utf-8') as conll:
        return ''.join(line for line in conll
                       if not contraction_id.match(line))

def parse_conll(file_name: str):
    """
    Read a CoNLL file and return
    a list of sentences as TokenTree objects.

    The file is first read with ``load_conll`` and the resulting string is
    handed to ``conllu.parse_tree``.

    :param file_name: path of the CoNLL-U file to parse (required; the
        parameter is annotated as ``str`` rather than defaulting to the
        ``str`` type)
    :return: whatever ``parse_tree`` returns for the loaded text
    """
    return parse_tree(load_conll(file_name))


In [75]:
# Parse each split, then pool the three splits into one list.
dev, test, train = (parse_conll(split_path) for split_path in (DEV, TEST, TRAIN))
corpus = dev + test + train

## conllu library basic functionality

In [79]:
sentences = parse(load_conll(DEV))  # parse the CoNLL-U file into a list of linear sentences (TokenList objects)
sentences[:2]

[TokenList<Nous, prions, les, cinéastes, et, tous, nos, lecteurs, de, bien, vouloir, nous, en, excuser, .>,
 TokenList<La, diffusion, de, les, prévisions, météorologiques, était, fortement, perturbée, ,, mardi, 7, janvier, ,, par, le, mouvement, de, grève, nationale, de, trente, -, six, heures, déclenché, la, veille, à, le, soir, ,, à, l', appel, de, les, délégués, CGT, ,, CFDT, et, FO, de, le, personnel, technique, de, Météo, -, France, .>]

In [80]:
sentence = sentences[0]  # the sentence as a list of tokens linearly ordered
sentence

TokenList<Nous, prions, les, cinéastes, et, tous, nos, lecteurs, de, bien, vouloir, nous, en, excuser, .>

In [81]:
token = sentence[0]  # the token as a nested ordered dictionary
token

OrderedDict([('id', 1),
             ('form', 'Nous'),
             ('lemma', 'il'),
             ('upostag', 'PRON'),
             ('xpostag', None),
             ('feats',
              OrderedDict([('Gender', 'Masc'),
                           ('Number', 'Plur'),
                           ('Person', '1')])),
             ('head', 2),
             ('deprel', 'nsubj'),
             ('deps', None),
             ('misc', None)])

In [82]:
structures = parse_tree(load_conll(DEV))  # parse the corpus into a list of sentence trees (TokenTree objects)

In [219]:
structures[:5]

[TokenTree<token={id=2, form=prions}, children=[...]>,
 TokenTree<token={id=(3, '-', 4), form=des}, children=None>,
 TokenTree<token={id=7, form=veulent}, children=[...]>,
 TokenTree<token={id=3, form=protestent}, children=[...]>,
 TokenTree<token={id=5, form=empêche}, children=[...]>]

In [259]:
structure = structures[0]
structure

TokenTree<token={id=2, form=prions}, children=[...]>

In [260]:
structure.print_tree()

(deprel:root) form:prions lemma:prier upostag:VERB [2]
    (deprel:nsubj) form:Nous lemma:il upostag:PRON [1]
    (deprel:obj) form:cinéastes lemma:cinéaste upostag:NOUN [4]
        (deprel:det) form:les lemma:le upostag:DET [3]
        (deprel:conj) form:lecteurs lemma:lecteur upostag:NOUN [8]
            (deprel:cc) form:et lemma:et upostag:CCONJ [5]
            (deprel:amod) form:tous lemma:tout upostag:ADJ [6]
            (deprel:det) form:nos lemma:son upostag:DET [7]
    (deprel:xcomp) form:vouloir lemma:vouloir upostag:AUX [11]
        (deprel:mark) form:de lemma:de upostag:ADP [9]
        (deprel:advmod) form:bien lemma:bien upostag:ADV [10]
        (deprel:xcomp) form:excuser lemma:excuser upostag:VERB [14]
            (deprel:iobj) form:nous lemma:le/lui upostag:PRON [12]
            (deprel:obj) form:en lemma:en upostag:PRON [13]
    (deprel:punct) form:. lemma:. upostag:PUNCT [15]


In [79]:
structure.children  # returns a list of the children as TokenTree objects

[TokenTree<token={id=1, form=Nous}, children=None>,
 TokenTree<token={id=4, form=cinéastes}, children=[...]>,
 TokenTree<token={id=11, form=vouloir}, children=[...]>,
 TokenTree<token={id=15, form=.}, children=None>]

In [76]:
structure.token  # returns the token as an OrderedDict

OrderedDict([('id', 2),
             ('form', 'prions'),
             ('lemma', 'prier'),
             ('upostag', 'VERB'),
             ('xpostag', None),
             ('feats',
              OrderedDict([('Mood', 'Ind'),
                           ('Number', 'Plur'),
                           ('Person', '1'),
                           ('Tense', 'Pres'),
                           ('VerbForm', 'Fin')])),
             ('head', 0),
             ('deprel', 'root'),
             ('deps', None),
             ('misc', None)])

## POS -rel-> POS query

In [3]:
# Search the forest tree by tree

def search_trees(forest, relation, gov, dep):
    """
    Run ``find_relations`` on every tree of ``forest``.

    :param forest: iterable of tree roots exposing ``metadata['text']``
    :param relation: dependency relation to look for
    :param gov: POS tag of the governor
    :param dep: POS tag of the dependent
    """
    for tree in forest:
        # The sentence text is passed along so each match can be shown in context.
        find_relations(tree, tree.metadata['text'], relation, gov, dep)


In [4]:
# Walk through the tree and find matching (POS rel POS) triplets

def find_relations(root, meta, relation, gov, dep):
    """
    Recursively print every (governor POS, relation, dependent POS) match
    found below ``root``.

    A match is a node whose ``upostag`` equals ``gov.upper()`` with a direct
    child whose ``upostag`` equals ``dep.upper()`` and whose ``deprel``
    equals ``relation`` (compared as given, without case folding).

    :param root: node exposing ``token`` (a mapping) and ``children``
    :param meta: text printed after each match (the sentence)
    :param relation: dependency relation label to match
    :param gov: POS tag of the governor, in any case
    :param dep: POS tag of the dependent, in any case
    """
    if not root.children:
        return
    # The governor test does not depend on the child, so evaluate it once.
    governor_matches = root.token['upostag'] == gov.upper()
    for dependent in root.children:
        if (governor_matches
                and dependent.token['upostag'] == dep.upper()
                and dependent.token['deprel'] == relation):
            print(root.token['upostag'],
                  root.token['form'],
                  '---',
                  relation,
                  '-->',
                  dependent.token['upostag'],
                  dependent.token['form'],
                  '\n',
                  meta,
                  '\n')
        # Keep descending whether or not this edge matched.
        find_relations(dependent, meta, relation, gov, dep)

In [5]:
search_trees(dev, 'csubj', 'noun', 'verb')

NOUN corvée --- csubj --> VERB remplir 
 PENDANT des années, remplir les bulletins de paie de l'indispensable et industrieuse "employée de maison" (terme plus noble que celui, légèrement méprisant, de "femme de ménage") était une corvée pesante à assumer. 

NOUN façon --- csubj --> VERB calculer 
 Un de nos lecteurs, retraité et polytechnicien, a écrit à l'URSSAF pour faire remarquer que calculer les cotisations à partir du salaire net (après déduction des cotisations salariales) en ignorant complètement le salaire brut n'était sans doute pas la meilleure façon de procéder. 



In [None]:
search_trees(test, 'csubj', 'verb', 'noun')

In [None]:
search_trees(train, 'det', 'noun', 'num')

## Children per branch
For each branch (= relation), collect every observed number of children of the node introduced by that relation, then count how many nodes have each number of children. 
> ex: For the relation **det** get all possible configurations of children of the node introduced by this relation (0, 1, 2, 3, 4, 5, 6, 8) and the total for each configuration (81325, 231, 2528, 29, 16, 3, 1, 1)

(See the results in the file FTB_rel_stats.numbers, under tab Children)

In [22]:
def children_per_branch(forest):
    """
    Walk every tree of ``forest`` with ``climb_tree`` and return the
    dictionary that the walk fills in.

    :param forest: iterable of tree roots
    :return: the dictionary populated by ``climb_tree``
    """
    counts = {}
    for tree in forest:
        climb_tree(tree, counts)
    return counts

In [23]:
def climb_tree(root, dictionary):
    """
    Visit ``root`` and then each of its descendants, parent before children,
    calling ``count_children`` on every node.

    :param root: node exposing ``children``
    :param dictionary: accumulator passed through to ``count_children``
    """
    count_children(root, dictionary)
    # A falsy ``children`` value means there is nothing below this node.
    for subtree in root.children or ():
        climb_tree(subtree, dictionary)

In [24]:
def count_children(root, dictionary):
    """
    Record, in ``dictionary``, how many children ``root`` has, filed under
    the relation that attaches ``root``.

    ``dictionary`` maps a relation label to an inner dict, which maps the
    number of children (as a string) to the number of nodes seen with it.
    The dictionary is updated in place; nothing is returned.

    :param root: node exposing ``token['deprel']`` and ``children``
    :param dictionary: nested counter to update
    """
    deprel = root.token['deprel']
    arity = str(len(root.children))
    per_relation = dictionary.setdefault(deprel, {})
    per_relation[arity] = per_relation.get(arity, 0) + 1

In [25]:
children = children_per_branch(dev+test+train)

## Tree rel stats
Creates a dictionary where, for each sentence (identified by its `sent_id`), we have: 
* the text
* the sentence object
* the set of relations and for each relation, 
    * the number of children of the node introduced by this relation.

In [26]:
def tree_rel_stats(forest):
    """
    Build one entry per tree, keyed by ``str(metadata['sent_id'])``.

    Each entry starts with the keys ``'text'`` (the sentence text) and
    ``'object'`` (the tree root itself); ``get_rels`` then adds one key per
    relation found in the tree.

    :param forest: iterable of tree roots exposing ``metadata``
    :return: dict mapping sentence id to its entry
    """
    stats = {}
    for tree_root in forest:
        sent_id = str(tree_root.metadata['sent_id'])
        stats[sent_id] = {'text': tree_root.metadata['text'], 'object': tree_root}
        get_rels(sent_id, tree_root, stats)
    return stats

In [27]:
def get_rels(tree, root, stats):
    """
    Store in ``stats[tree]`` the number of children of ``root`` under the
    key ``root.token['deprel']``, then do the same for every descendant.

    NOTE: the entry holds one value per relation label, so when a relation
    occurs several times in a tree only the last node visited (parent
    before children, children in order) is kept.

    :param tree: key of the entry to fill in ``stats``
    :param root: node exposing ``token['deprel']`` and ``children``
    :param stats: dict of entries, updated in place
    """
    entry = stats[tree]
    deprel = root.token['deprel']
    entry[deprel] = len(root.children)
    for subtree in root.children or ():
        get_rels(tree, subtree, stats)

In [46]:
forest_stats = tree_rel_stats(test)

In [50]:
list(forest_stats)[0]

'flmf7ad1co-290'

In [52]:
forest_stats['flmf7ad1co-290']

{'acl': 1,
 'advmod': 0,
 'amod': 0,
 'aux:pass': 0,
 'case': 0,
 'cc': 0,
 'conj': 4,
 'det': 0,
 'fixed': 0,
 'nmod': 1,
 'nsubj': 2,
 'nummod': 0,
 'object': TokenTree<token={id=35, form=fixée}, children=[...]>,
 'obl': 3,
 'punct': 0,
 'root': 6,
 'text': "La limite des intérêts des emprunts contractés depuis le 18 septembre 1991 pour l'acquisition d'un logement neuf (intérêts déductibles des impôts à raison de 25%) est fixée à 20 000 francs pour les personnes seules contre 15 000 francs précédemment et à 40 000 francs pour les couples mariés soumis à une imposition commune, contre 30 000 francs précédemment."}

In [None]:
# Get the arborescent structure of an entry
forest_stats['flmf7ad1co-290']['object'].print_tree()

### Query the tree rel stats dictionary

In [54]:
# Get all structures with a given relation and number of children.

def get_branches(forest, relation: str, children: int):
    """
    Select the entries of ``forest`` in which ``relation`` is present and
    its recorded value equals ``children``.

    :param forest: mapping of sentence id to entry (as built by
        ``tree_rel_stats``)
    :param relation: key to look up in each entry
    :param children: value the entry must hold for ``relation``
    :return: list of matching entries, in the iteration order of ``forest``
    """
    return [
        forest[sent_id]
        for sent_id in forest
        if relation in forest[sent_id] and forest[sent_id][relation] == children
    ]

In [33]:
dets = get_branches(forest_stats, 'det', 2)

In [44]:
dets[3]['text']

"Sur cela se greffe la perspective d'une augmentation de capital d'Hachette au cours de 76,30 francs, ce qui représente une décote de 20% par rapport au dernier cours enregistré (95,40 francs)."

In [35]:
dets[3]['object'].print_tree()

(deprel:root) form:greffe lemma:greffer upostag:VERB [4]
    (deprel:obl) form:cela lemma:cela upostag:PRON [2]
        (deprel:case) form:Sur lemma:sur upostag:ADP [1]
    (deprel:expl) form:se lemma:le/lui upostag:PRON [3]
    (deprel:nsubj) form:perspective lemma:perspective upostag:NOUN [6]
        (deprel:det) form:la lemma:le upostag:DET [5]
        (deprel:nmod) form:augmentation lemma:augmentation upostag:NOUN [9]
            (deprel:case) form:d' lemma:de upostag:ADP [7]
            (deprel:det) form:une lemma:un upostag:DET [8]
            (deprel:nmod) form:capital lemma:capital upostag:NOUN [11]
                (deprel:case) form:de lemma:de upostag:ADP [10]
            (deprel:nmod) form:Hachette lemma:Hachette upostag:PROPN [13]
                (deprel:case) form:d' lemma:de upostag:ADP [12]
            (deprel:nmod) form:cours lemma:cours upostag:NOUN [16]
                (deprel:case) form:à lemma:à upostag:ADP [14]
                (deprel:det) form:le lemma:le upostag:

In [201]:
# Sentences whose root governs more than one nsubj dependent

for root in dev:
    # Use a dedicated name here: `children` already holds the
    # children_per_branch() result and must not be overwritten by this loop.
    root_deprels = [child.token['deprel'] for child in root.children or []]
    if root_deprels.count('nsubj') > 1:
        print(root.metadata['sent_id'])
        

flmf7ab2ep-777
flmf7ab2ep-834
flmf7ab2ep-941
flmf7ab2ep-987
flmf7ae1ep-400
flmf7af2ep-591
flmf7af2ep-717
flmf7af2ep-719
flmf7af2ep-763
flmf7af2ep-801
flmf7af2ep-908
flmf7af2ep-1007
