In [1]:
import re 
import os

class Sentence:
    """A sentence: its raw text together with the tokens collected for it."""

    def __init__(self, text, tokens):
        # Both values are stored exactly as passed in; the token list is not copied.
        self.text, self.tokens = text, tokens

class Token:
    """One token row made of twelve columns.

    ``idx`` and ``head`` arrive as strings and are converted to numbers where
    possible; every other column is stored unchanged.
    """

    def __init__(self, idx, form, lemma, upos, xpos, feats, head, deprel, deps, misc, semslot, semclass):
        # An index containing '-' stays a string; any other index must be
        # numeric, so a malformed one raises ValueError.
        self.idx = idx if '-' in idx else self._to_number(idx)
        self.form, self.lemma = form, lemma
        self.upos, self.xpos = upos, xpos
        self.feats = feats
        # Unlike the index, a head that is not numeric is kept as the raw string.
        try:
            self.head = self._to_number(head)
        except ValueError:
            self.head = head
        self.deprel, self.deps = deprel, deps
        self.misc = misc
        self.semslot, self.semclass = semslot, semclass

    @staticmethod
    def _to_number(raw):
        """Parse *raw* as a float when it contains a dot, otherwise as an int."""
        return float(raw) if '.' in raw else int(raw)

    def __repr__(self):
        """Short debugging form showing only idx, form, lemma and head."""
        return f"{self.__class__.__name__}(idx={self.idx}, form={self.form}, lemma={self.lemma}, head={self.head})"

class FailedToken(Exception):
    """Error for a token whose number of fields is not the expected 12."""

    def __init__(self, num):
        super().__init__(num)
        # The number of fields that was actually found.
        self.num = num

    def __str__(self):
        return f'Token length is {self.num} instead of 12'

class Tester:
    """Consistency checker for a CoNLL-U-like file with 12 tab-separated columns.

    The file is read sentence by sentence; each sentence is checked for index,
    head and enhanced-dependency problems, and a report is printed for every
    sentence that has any.
    """

    # One or more `head:relation` items separated by '|'.
    _DEPS_FORMAT = re.compile(r"[\d.]+:[\w:]+(\|[\d.]+:[\w:]+)*")
    # The head of an item: a number at the very start or right after a '|'.
    # Digits inside relation labels are deliberately not matched.
    _DEPS_HEAD = re.compile(r"(?:^|\|)([\d.]+):")

    def __init__(self, path, log=None):
        """Remember the input *path* and, if *log* is given, open it for writing.

        Args:
            path: path of the file to check.
            log: optional path of a report file; it is opened (and truncated)
                immediately. Without it reports go to standard output.
        """
        self.path = path
        if log:
            self.log = open(log, 'w', encoding='utf8')
        else:
            self.log = log

    def process(self):
        """Read the input file and check every sentence in it.

        A sentence ends at a blank line or at the end of the file. The log
        file, if any, is closed when processing finishes, even on error.

        Raises:
            FailedToken: a token line does not have exactly 12 fields.
        """
        name = ''
        sent = Sentence('', [])
        try:
            with open(self.path, 'r', encoding='utf8') as file:
                for line in file:
                    if line.startswith('# sent_id'):
                        name = line.strip()
                    elif line.startswith('# text'):
                        # Take everything after '=' rather than a fixed
                        # offset, and ignore keys such as '# text_en'.
                        key, _, value = line.partition('=')
                        if key.rstrip() == '# text':
                            sent.text = value.strip()
                    elif line[0].isdigit():
                        token = line.strip().split('\t')
                        if len(token) != 12:
                            raise FailedToken(len(token))
                        sent.tokens.append(Token(*token))
                    elif not line.strip():
                        # Consecutive blank lines must not produce empty sentences.
                        if sent.tokens:
                            self.check(sent, name)
                        sent = Sentence('', [])
            # The file may end without a blank line after the last sentence.
            if sent.tokens:
                self.check(sent, name)
        finally:
            if self.log:
                self.log.close()

    def check(self, sent, name):
        """Run all checks on *sent* and report the problems found, if any.

        The report starts with ``name:`` followed by one line per failing
        category (INDEX, HEADS, DEPS). It is written to the log file when one
        was opened, otherwise to standard output.
        """
        sections = (
            ('INDEX', self.indexes(sent)),
            ('HEADS', self.heads(sent)),
            ('DEPS', self.depscheck(sent)),
        )
        report = [f'{label}: {errors}' for label, errors in sections if errors]
        if not report:
            return
        # print() falls back to sys.stdout when file is None.
        print('\n'.join([f'{name}:'] + report), file=self.log or None)

    def indexes(self, sent):
        """Return the set of problems found in the token indexes of *sent*.

        Checks for duplicated indexes, heads larger than the largest index,
        gaps in the integer word numbering, and range tokens (``a-b``) that
        are malformed or not followed by the words ``a`` … ``b``.
        """
        errors = set()
        idxs = [token.idx for token in sent.tokens]

        if len(set(idxs)) != len(idxs):
            errors.add('Duplicates')

        # `default` keeps this safe for empty sentences and for sentences
        # without a single numeric head.
        numeric = (float, int)
        max_head = max((t.head for t in sent.tokens if isinstance(t.head, numeric)), default=None)
        max_idx = max((i for i in idxs if isinstance(i, numeric)), default=None)
        if max_head is not None and max_idx is not None and max_head > max_idx:
            errors.add('Maximum index out of limits')

        # Plain words must be numbered consecutively even when range tokens
        # or decimal indexes sit between them.
        words = [i for i in idxs if isinstance(i, int)]
        if any(after - before != 1 for before, after in zip(words, words[1:])):
            errors.add('Index order interrupted')

        for pos, idx in enumerate(idxs):
            if not isinstance(idx, str):
                continue
            nextids = re.findall(r'(\d+)-(\d+)', idx)
            if len(nextids) != 1:
                errors.add(r'Indexes \d-\d wrong: ' + str(nextids))
                continue
            first, last = (int(n) for n in nextids[0])
            if first >= last:
                # A range has to cover at least two words.
                errors.add(r'Indexes \d-\d wrong: ' + str(nextids))
                continue
            # The range must be followed by every word it covers, in order.
            expected = list(range(first, last + 1))
            following = idxs[pos + 1:pos + 1 + len(expected)]
            if len(following) < len(expected):
                errors.add(r'Indexes \d-\d: at the end of the sentence')
            elif following != expected:
                errors.add(r'Indexes \d-\d: the following indexes interrupted: '
                           + f"{nextids[0]} != {', '.join(map(str, following))}")
        return errors

    def heads(self, sent):
        """Return the set of problems found in the heads of *sent*.

        Reports tokens whose head is a float and integer-indexed tokens whose
        head is ``_``.
        """
        errors = set()
        floats = [t for t in sent.tokens if isinstance(t.head, float)]
        missing = [t for t in sent.tokens if t.head == '_' and isinstance(t.idx, int)]
        if floats:
            errors.add(f"Has floating heads: {' '.join([f'idx={t.idx}, head={t.head}' for t in floats])}")
        if missing:
            errors.add(f"Has _ heads: {' '.join([f'idx={t.idx}, head={t.head}' for t in missing])}")
        return errors

    def depscheck(self, sent):
        """Return the set of problems found in the ``deps`` column of *sent*.

        Only tokens with a numeric index are inspected. A value must be a
        '|'-separated list of ``head:relation`` items, and the same head must
        not occur twice unless the value contains ``:as|``.
        """
        errors = set()
        deps = [token.deps for token in sent.tokens if isinstance(token.idx, (float, int))]
        for dep in deps:
            if not self._DEPS_FORMAT.fullmatch(dep):
                errors.add(f"Has troubles with deps: {dep}")
            heads = self._DEPS_HEAD.findall(dep)
            if heads and len(heads) != len(set(heads)):
                if ':as|' not in dep:  # self-loop seems to be okay in some constructions with ref tag
                    errors.add(f"Has self-loops: {dep}")
        return errors

In [2]:
# Build <parent of the working directory>/data/res.conllu with os.path so the
# platform's own separator is used instead of a hard-coded backslash.
# NOTE(review): the previous code cut the last four characters off
# os.getcwd(), which presumably removed a four-letter folder name - confirm
# that the data directory is a sibling of the working directory.
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'res.conllu')
test = Tester(data_path, 'log.txt')
test.process()