In [102]:
import attr
import re

from collections import defaultdict

In [61]:
def parse_int(text):
    """Return the first run of ASCII digits in `text` as an int.

    Returns None when `text` contains no digits at all.
    """
    first_run = re.search('[0-9]+', text)
    if first_run is None:
        return None
    return int(first_run.group(0))

In [62]:
@attr.s
class Token:
    """One token read from a gold annotation file.

    The field notes below describe how `GoldFile.tokens` populates each
    attribute; the class itself places no constraints on the values.
    """
    
    # Token text (column 3 of the split line).
    text = attr.ib()
    # Column 1 of the split line, converted to int; used as the grouping key
    # by `GoldFile.documents`.
    document_id = attr.ib()
    # Integer id of the coreference tag open at this token, or None.
    coref_id = attr.ib()

In [95]:
class Document:
    """A container for the tokens that make up one document."""

    def __init__(self, tokens):
        """Store `tokens` as given (the collection is not copied)."""
        self.tokens = tokens

    def __repr__(self):
        """Return a short summary showing how many tokens are held."""
        token_count = len(self.tokens)
        return 'Document<%d tokens>' % token_count

In [103]:
class GoldFile:
    """Reader for a whitespace-delimited gold annotation file.

    The file is opened lazily: constructing a GoldFile only stores the path,
    and the file is read each time `lines()` (directly or via `tokens()` /
    `documents()`) is iterated.
    """

    def __init__(self, path):
        """Remember the path of the annotation file; no I/O happens here."""
        self.path = path

    def lines(self):
        """Yield each data line as a list of whitespace-separated columns.

        Blank lines and lines starting with '#' are skipped.
        """
        with open(self.path) as fh:
            for line in fh:
                line = line.strip()
                if line and not line.startswith('#'):
                    yield line.split()

    def tokens(self):
        """Yield one Token per data line.

        Column 3 supplies the token text, column 1 (as int) the document id,
        and the last column the coreference tag. A tag that starts with '('
        and contains a number opens a mention with that id; a tag that ends
        with ')' closes the current mention *after* the token is emitted, so
        the closing token still carries the id.

        NOTE(review): only one open mention is tracked at a time and only the
        first number in the tag is used, so nested or multiple tags on one
        line (e.g. '(12|(34)') are not fully represented -- confirm this is
        acceptable for the data being read.
        """
        open_tag = None
        for line in self.lines():
            tag = line[-1]
            digit = parse_int(tag)

            if digit is not None and tag.startswith('('):
                open_tag = digit

            yield Token(line[3], int(line[1]), open_tag)

            # Close after yielding so the last token of a mention is tagged.
            if tag.endswith(')'):
                open_tag = None

    def documents(self):
        """Group tokens by `document_id` and yield one Document per group.

        All tokens are read before the first Document is yielded.
        """
        groups = defaultdict(list)

        for token in self.tokens():
            groups[token.document_id].append(token)

        for tokens in groups.values():
            yield Document(tokens)

In [104]:
# Point a GoldFile at one annotation file under the CoNLL-2012 English
# training data directory (path is relative to the working directory).
# Nothing is read yet: GoldFile only stores the path until it is iterated.
gf = GoldFile('../../data/conll-2012-english/v4/data/train/data/english/annotations/pt/nt/40/nt_4001.v4_auto_conll')

In [105]:
# documents() is a generator; wrap it in list() so the cell displays the
# resulting Document objects.
list(gf.documents())

[Document<670 tokens>]