# Complexity Measures

In [1]:
import os
from statistics import mean, median, StatisticsError
from itertools import combinations
from operator import mul

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from textdistance import levenshtein

import conllu
from conllu import parse, parse_tree
from conllu.exceptions import ParseException

## Load annotated files

In [4]:
# Load the annotated corpus: a tab-separated table with one text per row.
corpus_path = 'SyntCompCorpus.tsv'
data = pd.read_csv(corpus_path, sep='\t')

# Report the table size, then preview the first rows as the cell output.
print(data.shape)
data.head(3)

(49397, 29)


  data = pd.read_csv('SyntCompCorpus.tsv', sep='\t')


Unnamed: 0,corpus,subcorpus,language,speaker_type,dialect,language_background,text,spell_checked,annotated,error_annotation,...,gender,age,L1,level,institution,programme,study_year,term,module,week
0,actr,,russian,L1,,,Прокатившись по многим городам и странам за по...,Прокатившись по многим городам и странам за по...,"# generator = UDPipe 2, https://lindat.mff.cun...",,...,,,,Heritage 1,,,,,,
1,actr,,russian,L1,,,В моей жизни я встречала много разных людей. У...,В моей жизни я встречала много разных людей. У...,"# generator = UDPipe 2, https://lindat.mff.cun...",,...,,,,Heritage 1,,,,,,
2,actr,,russian,L1,,,"В прошлем году, я провела четыре месяцев в Лон...","В прошлом году, я провела четыре месяцев в Лон...","# generator = UDPipe 2, https://lindat.mff.cun...",,...,,,,Heritage 1,,,,,,


## Clause/T-unit extraction

### Clauses
* root: root of the sentence
* acl: clausal modifier of a noun (adnominal clause)
* acl:relcl: relative clause modifier
* advcl: adverbial clause modifier
* advcl:relcl: adverbial relative clause modifier
* ccomp: clausal complement
* csubj: clausal subject
* csubj:outer: outer clause clausal subject
* nsubj:outer: outer clause nominal subject
* parataxis: parataxis
* xcomp: open clausal complement (only for tokens with upos = VERB)
* conj: conjunct (only for tokens with upos = VERB)

### T-units
* root
* parataxis
* conj (only for tokens with upos = VERB)

In [6]:
class SentenceComplexity:
    """Clause, T-unit and noun-phrase units of one dependency-parsed sentence.

    Args:
        tokenlist: Iterable of token mappings read through the keys ``id``,
            ``form``, ``upos``, ``deprel`` and ``head``. It must also expose
            ``metadata['text']`` and ``filter(id=...)``.
        tree: Root node of the same sentence; every node exposes ``token``
            and ``children``.
        verbose: When true, ``tree.print_tree()`` is called.

    Attributes set by the constructor:
        length: Number of tokens whose UPOS is not PUNCT, SYM or ``_``.
        c_heads, t_heads, np_heads: Token ids heading clauses, T-units, NPs.
        pos_chain, dep_chain: UPOS / deprel sequences of the counted tokens.
        dep_dists: ``abs(head - id)`` for every counted token (the root,
            whose head is 0, is included).
        terminal: Counted token ids that are not the head of any counted token.
        nonterminal: Set of head ids of the counted tokens.
        clauses, t_units, nps: Lists of unit dicts (see ``get_clauses``).
        num_cl, num_tu, num_np: Lengths of the three lists above.
        tree_depth: Number of nodes on the longest root-to-leaf path.

    Raises:
        ValueError: If a clause, T-unit or NP head cannot be found in ``tree``.
    """

    # Tokens with these UPOS tags are not counted as words.
    _SKIPPED_UPOS = frozenset({'PUNCT', 'SYM', '_'})
    # Dependents with these relations head a T-unit (and therefore a clause);
    # 'conj' is added separately because it additionally requires upos == VERB.
    _T_UNIT_DEPRELS = frozenset({'root', 'parataxis'})
    # Dependents with these relations head a clause; 'xcomp' is added
    # separately because it additionally requires upos == VERB.
    _CLAUSE_DEPRELS = frozenset({'advcl', 'advcl:relcl', 'acl', 'acl:relcl', 'ccomp',
                                 'nsubj:outer', 'csubj:outer', 'csubj'})
    # UPOS tags of tokens treated as noun-phrase heads.
    _NP_HEAD_UPOS = frozenset({'NOUN', 'PROPN', 'PRON'})
    # Relations of dependents that are collected into a noun phrase.
    _NP_MODIFIER_DEPRELS = frozenset({'nmod', 'nmod:poss', 'nmod:tmod', 'appos', 'amod',
                                      'nummod', 'nummod:gov', 'det', 'case'})

    def __init__(self, tokenlist, tree, verbose=False):
        self.tokenlist = tokenlist
        self.tree = tree
        self.text = tokenlist.metadata['text']
        if verbose:
            tree.print_tree()

        self.length = 0
        self.c_heads, self.t_heads, self.np_heads = [], [], []
        self.pos_chain, self.dep_chain, self.dep_dists = [], [], []
        nodes, nonterminal = [], []

        for token in self.tokenlist:
            upos = token['upos']
            if upos in self._SKIPPED_UPOS:
                continue
            token_id, head_id, deprel = token['id'], token['head'], token['deprel']

            self.length += 1
            # every counted token is a node; its head is a non-terminal node
            nodes.append(token_id)
            nonterminal.append(head_id)
            # pos/deprel chains
            self.pos_chain.append(upos)
            self.dep_chain.append(deprel)
            # dependency distance
            self.dep_dists.append(abs(head_id - token_id))

            is_verb = upos == 'VERB'
            # a T-unit head is also a clause head
            if deprel in self._T_UNIT_DEPRELS or (deprel == 'conj' and is_verb):
                self.t_heads.append(token_id)
                self.c_heads.append(token_id)
            # remaining clause heads
            if deprel in self._CLAUSE_DEPRELS or (deprel == 'xcomp' and is_verb):
                self.c_heads.append(token_id)
            # NP heads
            if upos in self._NP_HEAD_UPOS:
                self.np_heads.append(token_id)

        self.terminal = set(nodes).difference(nonterminal)
        self.nonterminal = set(nonterminal)

        # extract clauses
        self.clauses = self.get_clauses()
        self.num_cl = len(self.clauses)

        # extract t-units
        self.t_units = self.get_tunits()
        self.num_tu = len(self.t_units)

        # extract NPs
        self.nps = self.get_nps()
        self.num_np = len(self.nps)

        # extract tree depth
        self.tree_depth = self.get_tree_depth(self.tree)

    def __len__(self):
        """Return the number of counted (non-punctuation) tokens."""
        return self.length

    def get_tree_depth(self, root):
        """Return the number of nodes on the longest path from ``root`` down to a leaf."""
        if not root.children:
            return 1
        return 1 + max(self.get_tree_depth(child) for child in root.children)

    def get_curr_node(self, root, curr_id):
        """Return the first node (pre-order) under ``root`` whose token id is ``curr_id``.

        Returns ``None`` when no such node exists. The search stops at the
        first match instead of visiting the rest of the tree.
        """
        if root.token['id'] == curr_id:
            return root
        for child in root.children:
            found = self.get_curr_node(child, curr_id)
            if found is not None:
                return found
        return None

    def get_descendants(self, curr_token, heads):
        """Return ids of the descendants of ``curr_token`` in pre-order.

        A child whose id is in ``heads``, or whose UPOS is PUNCT, SYM or ``_``,
        is left out together with its whole subtree.
        """
        descendants = []

        def recurse(node):
            for child in node.children:
                token = child.token
                if token['id'] in heads or token['upos'] in self._SKIPPED_UPOS:
                    continue
                descendants.append(token['id'])
                recurse(child)

        recurse(curr_token)
        return descendants

    def get_noun_descendants(self, curr_token, heads):
        """Return ids of the NP-internal descendants of ``curr_token`` in pre-order.

        Only children attached with one of the NP modifier relations (and not
        tagged PUNCT, SYM or ``_``) are followed. ``heads`` is accepted for
        signature parity with ``get_descendants`` but is not used.
        """
        descendants = []

        def recurse(node):
            for child in node.children:
                token = child.token
                if (token['upos'] not in self._SKIPPED_UPOS
                        and token['deprel'] in self._NP_MODIFIER_DEPRELS):
                    descendants.append(token['id'])
                    recurse(child)

        recurse(curr_token)
        return descendants

    def _require_node(self, head_id):
        """Return the tree node for ``head_id`` or raise ``ValueError`` if it is missing."""
        node = self.get_curr_node(self.tree, head_id)
        if node is None:
            raise ValueError('Token {!r} is not reachable from the tree root!'.format(head_id))
        return node

    def _make_unit(self, head_id, head_node, dep_ids, with_length=False):
        """Assemble the dict describing one unit (clause, T-unit or NP).

        The unit text is the head form plus the dependent forms, joined with
        spaces in ascending token-id order.
        """
        dep_tokens = [self.tokenlist.filter(id=dep_id)[0] for dep_id in dep_ids]
        id_to_text = {head_id: head_node.token['form']}
        for dep in dep_tokens:
            id_to_text[dep['id']] = dep['form']

        unit = {'head_id': head_id,
                'head_node': head_node,
                'dep_ids': [dep['id'] for dep in dep_tokens],
                'dep_nodes': dep_tokens,
                'rel_type': head_node.token['deprel']}
        if with_length:
            unit['length'] = len(id_to_text)
        unit['text'] = ' '.join(form for _, form in sorted(id_to_text.items()))
        return unit

    def get_clauses(self):
        """Return one unit dict per clause head.

        Each dict has the keys ``head_id``, ``head_node``, ``dep_ids``,
        ``dep_nodes``, ``rel_type`` and ``text``. Subtrees headed by another
        clause head are excluded from a clause.
        """
        clauses = []
        for head_id in self.c_heads:
            head_node = self._require_node(head_id)
            dep_ids = self.get_descendants(head_node, self.c_heads)
            clauses.append(self._make_unit(head_id, head_node, dep_ids))
        return clauses

    def get_tunits(self):
        """Return one unit dict per T-unit head (same keys as ``get_clauses``).

        Subtrees headed by another T-unit head are excluded from a T-unit.
        """
        t_units = []
        for head_id in self.t_heads:
            head_node = self._require_node(head_id)
            dep_ids = self.get_descendants(head_node, self.t_heads)
            t_units.append(self._make_unit(head_id, head_node, dep_ids))
        return t_units

    def get_nps(self):
        """Return one unit dict per noun phrase, with an extra ``length`` key.

        ``length`` is the number of tokens in the phrase (head included). A
        head already collected as a dependent of an earlier NP does not start
        a phrase of its own.
        """
        nps = []
        covered = set()
        for head_id in self.np_heads:
            if head_id in covered:
                continue
            head_node = self._require_node(head_id)
            dep_ids = self.get_noun_descendants(head_node, self.t_heads)
            phrase = self._make_unit(head_id, head_node, dep_ids, with_length=True)
            covered.update(phrase['dep_ids'])
            nps.append(phrase)
        return nps

In [7]:
# Take the CoNLL-U annotation of the first text in the table.
text = data['annotated'].iloc[0]

# Parse it twice: once as flat token lists, once as dependency trees.
sentences, trees = parse(text), parse_tree(text)

# Use the first sentence (and its tree) as the running example.
sent, tree = sentences[0], trees[0]

In [8]:
# Analyse the example sentence; verbose=True makes the constructor print its dependency tree.
ex = SentenceComplexity(sent, tree, verbose=True)

(deprel:root) form:запомнилось lemma:запомниться upos:VERB [12]
    (deprel:advcl) form:Прокатившись lemma:прокатиться upos:VERB [1]
        (deprel:obl) form:городам lemma:город upos:NOUN [4]
            (deprel:case) form:по lemma:по upos:ADP [2]
            (deprel:amod) form:многим lemma:много upos:NUM [3]
            (deprel:conj) form:странам lemma:страна upos:NOUN [6]
                (deprel:cc) form:и lemma:и upos:CCONJ [5]
        (deprel:obl) form:лет lemma:год upos:NOUN [10]
            (deprel:case) form:за lemma:за upos:ADP [7]
            (deprel:nummod) form:несколько lemma:несколько upos:NUM [9]
                (deprel:amod) form:последние lemma:последний upos:ADJ [8]
        (deprel:punct) form:, lemma:, upos:PUNCT [11]
    (deprel:nsubj) form:количество lemma:количество upos:NOUN [14]
        (deprel:amod) form:большое lemma:большой upos:ADJ [13]
        (deprel:nmod) form:мест lemma:место upos:NOUN [16]
            (deprel:amod) form:интереснейших lemma:интересный up

In [9]:
# Show the surface text of every clause extracted from the example sentence.
[clause['text'] for clause in ex.clauses]

['Прокатившись по многим городам и странам за последние несколько лет',
 'запомнилось большое количество интереснейших мест из которых',
 'страшно не хотелось',
 'уезжать']

## Whole text complexity

In [10]:
class TextComplexity:
    """Text-level syntactic complexity measures aggregated over all sentences.

    Args:
        annotation: Either a string in CoNLL-U format or a
            ``conllu.models.SentenceList``.
        verbose: Forwarded to every ``SentenceComplexity`` instance.

    Attributes set by the constructor:
        sentences, trees: Parsed sentences and their dependency trees.
        sent_comp: One ``SentenceComplexity`` per sentence.
        num_s, num_w, num_cl, num_tu: Sentence, word, clause, T-unit counts.
        msl, mcl, mtl: Mean sentence / clause / T-unit length in words.
        cps, cpt: Clauses per sentence / per T-unit.
        lev_pos, lev_dep: Mean pairwise Levenshtein distance between the
            sentences' UPOS / deprel chains (0 when there is a single sentence).
        mtd, mdtd, mxtd, mntd: Mean, median, max and min tree depth.
        mdd: Mean dependency distance.
        node_to_term: ``len(nonterminal) / len(terminal)`` over the
            concatenated per-sentence node sets.
        clause_counts: Number of clauses per head relation.
        clause_counter: Share of clauses per head relation (counts / num_cl).
        comb: ``num_cl - num_s``.
        coord: Number of clauses headed by ``conj`` or ``parataxis``.
        subord: ``comb - coord``.
        coord_to_comb, subord_to_comb, coord_to_subord: Ratios of the above
            (0 when the denominator is 0).
        coord_to_sent, subord_to_sent: ``coord`` / ``subord`` per sentence.
        avg_np_len: Mean NP length in tokens.
        comp_np_ratio: Share of NPs longer than one token.

    Raises:
        TypeError: If ``annotation`` is neither a string nor a SentenceList.
        ValueError: If the annotation has no words, no clause/T-unit heads, or
            (via ``statistics.StatisticsError``) no noun phrases.
    """

    def __init__(self, annotation, verbose=False):

        if isinstance(annotation, str):
            self.sentences = parse(annotation)
        elif isinstance(annotation, conllu.models.SentenceList):
            self.sentences = annotation
        else:
            raise TypeError('Input must be either a string in CoNLL-U format ' +
                            'or a conllu.models.SentenceList!')
        self.trees = [sent.to_tree() for sent in self.sentences]

        # initialize SentenceComplexity instances
        self.sent_comp = []
        self.num_w, self.num_cl, self.num_tu = 0, 0, 0
        self.pos_chains, self.dep_chains, self.tree_depths = [], [], []
        dep_dists, terminal, nonterminal, nps = [], [], [], []
        clause_counts = dict.fromkeys(['root', 'acl', 'acl:relcl', 'advcl', 'advcl:relcl',
                                       'ccomp', 'csubj', 'csubj:outer', 'nsubj:outer',
                                       'parataxis', 'xcomp', 'conj'], 0)

        for sentence, tree in zip(self.sentences, self.trees):
            sent = SentenceComplexity(sentence, tree, verbose=verbose)
            self.sent_comp.append(sent)
            if len(sent) == 0:  # sentences without words contribute nothing below
                continue
            self.num_w += len(sent)  # number of words
            dep_dists.extend(sent.dep_dists)  # dependency distances
            terminal.extend(sent.terminal)  # terminal nodes
            nonterminal.extend(sent.nonterminal)  # nonterminal nodes
            nps.extend(sent.nps)  # noun phrases
            self.num_cl += sent.num_cl
            self.num_tu += sent.num_tu
            self.pos_chains.append(sent.pos_chain)
            self.dep_chains.append(sent.dep_chain)
            self.tree_depths.append(sent.tree_depth)
            for clause in sent.clauses:
                clause_counts[clause['rel_type']] += 1

        if self.num_w == 0:
            raise ValueError('The annotation is empty!')
        # Every ratio below divides by these counts; fail with the same
        # exception type as an empty annotation instead of ZeroDivisionError.
        if self.num_cl == 0 or self.num_tu == 0:
            raise ValueError('The annotation contains no clause or T-unit heads!')

        # NOTE(review): this counts every sentence, including the word-less
        # ones skipped in the loop above -- confirm that this is intended.
        self.num_s = len(self.sent_comp)  # number of sentences

        self.msl = self.num_w / self.num_s  # mean sentence length
        self.mcl = self.num_w / self.num_cl  # mean clause length
        self.mtl = self.num_w / self.num_tu  # mean t-unit length

        self.cps = self.num_cl / self.num_s  # clauses per sentence
        self.cpt = self.num_cl / self.num_tu  # clauses per T-unit

        try:
            self.lev_pos = mean(self.pairwise_levenshtein(self.pos_chains))  # avg Levenshtein distance for POS
            self.lev_dep = mean(self.pairwise_levenshtein(self.dep_chains))  # avg Levenshtein distance for deprel
        except StatisticsError:
            # a single sentence yields no pairs to compare
            self.lev_pos, self.lev_dep = 0, 0

        self.mtd = mean(self.tree_depths)  # mean tree depth
        self.mdtd = median(self.tree_depths)  # median tree depth
        self.mxtd = max(self.tree_depths)  # max tree depth
        self.mntd = min(self.tree_depths)  # min tree depth

        self.mdd = mean(dep_dists)  # mean dependency distance
        self.node_to_term = len(nonterminal) / len(terminal)  # nonterminal to terminal node ratio

        # clausal measures
        self.clause_counts = clause_counts  # raw number of clauses per relation
        self.clause_counter = {rel: num / self.num_cl for rel, num in clause_counts.items()}
        self.comb = self.num_cl - self.num_s  # combined clauses
        # Coordination is measured in clauses, like ``comb``; using the
        # proportions from ``clause_counter`` here would mix units.
        self.coord = clause_counts['conj'] + clause_counts['parataxis']
        self.subord = self.comb - self.coord

        self.coord_to_comb = self._ratio(self.coord, self.comb)  # coordinate to combined clause ratio
        self.subord_to_comb = self._ratio(self.subord, self.comb)  # subordinate to combined clause ratio
        self.coord_to_subord = self._ratio(self.coord, self.subord)  # coordinate to subordinate clause ratio

        self.coord_to_sent = self.coord / self.num_s  # coordinate clause to sentence ratio
        self.subord_to_sent = self.subord / self.num_s  # subordinate clause to sentence ratio

        # phrasal measures; mean() raises StatisticsError (a ValueError) when
        # the text contains no noun phrases
        np_lengths = [phrase['length'] for phrase in nps]
        self.avg_np_len = mean(np_lengths)  # average NP length
        self.comp_np_ratio = sum(1 for length in np_lengths if length > 1) / len(np_lengths)  # share of complex NPs

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0 when the denominator is 0."""
        try:
            return numerator / denominator
        except ZeroDivisionError:
            return 0

    def pairwise_levenshtein(self, chains):
        """Return the Levenshtein distance for every unordered pair of chains."""
        return [levenshtein.distance(a, b) for a, b in combinations(chains, 2)]

In [11]:
# Compute text-level measures for the example text from its already parsed sentences.
comp = TextComplexity(sentences)

In [12]:
# Mean tree depth of the example text.
comp.mtd

4.583333333333333

## Calculate measures for all data

In [13]:
# Results are cached by row position in a dict created in its own cell,
# so re-running the processing loop below does not discard finished work.
comp_dict = {}

# CoNLL-U annotation of every text, in table order.
texts = data['annotated'].tolist()

In [20]:
# Compute the measures for every text. Rows already present in comp_dict are
# skipped, so the cell can be re-run to resume an interrupted pass.
for i, text in tqdm(enumerate(texts), total=len(texts)):
    if i in comp_dict:
        continue
    # A missing annotation is read from the table as NaN rather than a string;
    # record it as unusable instead of letting TextComplexity raise TypeError.
    if not isinstance(text, str):
        comp_dict[i] = np.nan
        continue
    try:
        comp_dict[i] = TextComplexity(text)
    except (ValueError, ZeroDivisionError, ParseException):
        # ValueError: empty annotation (also covers StatisticsError);
        # ZeroDivisionError: a text without the units a ratio divides by;
        # ParseException: malformed CoNLL-U. All are stored as NaN and
        # dropped when the results are exported.
        comp_dict[i] = np.nan

  0%|          | 0/49397 [00:00<?, ?it/s]

In [21]:
# Number of texts processed so far (successful or not).
len(comp_dict)

49397

In [33]:
# Columns carried over from the corpus table into the measures table.
selected_columns = [
    'corpus', 'language', 'speaker_type', 'language_background',
    'text', 'spell_checked', 'annotated', 'mark', 'level', 'text_type',
]
df = data[selected_columns]

# Report the size of the selection, then preview it as the cell output.
print(df.shape)
df.head(3)

(49397, 10)


Unnamed: 0,corpus,language,speaker_type,language_background,text,spell_checked,annotated,mark,level,text_type
0,actr,russian,L1,,Прокатившись по многим городам и странам за по...,Прокатившись по многим городам и странам за по...,"# generator = UDPipe 2, https://lindat.mff.cun...",,Heritage 1,
1,actr,russian,L1,,В моей жизни я встречала много разных людей. У...,В моей жизни я встречала много разных людей. У...,"# generator = UDPipe 2, https://lindat.mff.cun...",,Heritage 1,
2,actr,russian,L1,,"В прошлем году, я провела четыре месяцев в Лон...","В прошлом году, я провела четыре месяцев в Лон...","# generator = UDPipe 2, https://lindat.mff.cun...",,Heritage 1,


In [36]:
# One dict per corpus row; rows whose text could not be analysed are left
# without the measure keys and are dropped when the table is exported.
recs = df.to_dict(orient='records')

for i, rec in tqdm(enumerate(recs), total=len(recs)):
    text = comp_dict[i]  # TextComplexity instance, or NaN for a failed text
    if pd.isnull(text):
        continue

    rec['comp_object'] = text

    # basic counts
    rec['num_s'] = text.num_s
    rec['num_w'] = text.num_w
    rec['num_cl'] = text.num_cl
    rec['num_tu'] = text.num_tu

    # length and density measures
    rec['msl'] = text.msl
    rec['mcl'] = text.mcl
    rec['mtl'] = text.mtl
    rec['cps'] = text.cps
    rec['cpt'] = text.cpt

    # structural similarity between sentences
    rec['lev_pos'] = text.lev_pos
    rec['lev_dep'] = text.lev_dep

    # tree depth
    rec['mtd'] = text.mtd
    rec['mdtd'] = text.mdtd
    rec['mxtd'] = text.mxtd
    rec['mntd'] = text.mntd

    rec['mdd'] = text.mdd
    rec['node_to_term'] = text.node_to_term

    rec['pos_chains'] = text.pos_chains
    rec['dep_chains'] = text.dep_chains
    rec['tree_depth'] = text.tree_depths

    # coordination/subordination measures
    rec['clause_percentage'] = text.clause_counter
    # clause_counter holds proportions; multiplying back by num_cl is not
    # exact in floating point (e.g. 1 / 49 * 49 != 1), so round to recover
    # integer counts. Otherwise 'subord' can be a tiny non-zero float that
    # slips past the zero-division guards below.
    rec['clause_counter'] = {key: round(val * text.num_cl)
                             for key, val in text.clause_counter.items()}
    rec['comb'] = text.comb
    rec['coord'] = rec['clause_counter']['conj'] + rec['clause_counter']['parataxis']
    rec['subord'] = text.comb - rec['coord']

    # ratios default to 0 when the denominator is 0
    try:
        rec['coord_to_comb'] = rec['coord'] / rec['comb']
    except ZeroDivisionError:
        rec['coord_to_comb'] = 0
    try:
        rec['subord_to_comb'] = rec['subord'] / rec['comb']
    except ZeroDivisionError:
        rec['subord_to_comb'] = 0
    try:
        rec['coord_to_subord'] = rec['coord'] / rec['subord']
    except ZeroDivisionError:
        rec['coord_to_subord'] = 0
    rec['coord_to_sent'] = rec['coord'] / rec['num_s']
    rec['subord_to_sent'] = rec['subord'] / rec['num_s']

    # phrasal measures
    rec['avg_np_len'] = text.avg_np_len
    rec['comp_np_ratio'] = text.comp_np_ratio

0it [00:00, ?it/s]

In [None]:
# Rows without a 'comp_object' are the texts that could not be analysed.
df = pd.DataFrame(recs).dropna(subset=['comp_object'])

# The analysis objects themselves are not written out; only the measures are.
temp = df.drop(columns=['comp_object'])
temp.to_csv('CompMeasures.tsv', sep='\t', index=False)