In [1]:
# Define file path
filepath = "./a-proof/sample_data/INCEpTION_output/Avelli+wk_project_2020-07-24_1202/annotation/notities_2017_deel1_cleaned.csv---2503.conll/avelli.tsv"

In [2]:
class BertContainer:
    def __init__(self, key, sen_id, sen, encoding):
        self.key = key
        self.sen_id = sen_id
        self.sen = sen
        self.encoding = encoding
        
        self.annot = []

    
    def add_anno(self, anno):
        self.annot.append(anno)
        
    def print_container(self):
        info = []
        print(self.key)
        print(self.sen_id)
        print(self.sen)
        for anno in self.annot:
            anno.print_annotation()
        print(self.encoding)

In [3]:
class Annotation:
    def __init__(self, tokens, label):
        self.tokens = tokens
        self.label = label

    def print_annotation(self):
        print(self.tokens)
        print(self.label)

In [4]:
# Example

key = 'Notities_VUmc_2017'
sen_id = 17
sen = 'Patiënt loopt moeilijk'
encoding = 1880983
tokens1 =  [("t1", "Patient"),("t2", "loopt")]
label1 = 'Lopen'

tokens2 =  [("t3", "moeilijk")]
label2 = 'Stemming'

anno1 = Annotation(tokens=tokens1, label=label1)
anno2 = Annotation(tokens2, label2)

 
instance = BertContainer(key, sen_id, sen, encoding)
instance.add_anno(anno1)
instance.add_anno(anno2)

In [5]:
instance.print_container()

Notities_VUmc_2017
17
Patiënt loopt moeilijk
[('t1', 'Patient'), ('t2', 'loopt')]
Lopen
[('t3', 'moeilijk')]
Stemming
1880983


In [6]:
def read_tsv(filepath):
    """
    Reads tsv file. Skips lines starting with '#' (except for '#Text=') and empty lines.
    :param filepath: filepath to tsv file
    :return: data in list of list.
    """
    with open(filepath, 'r') as infile:
        data = []
        for line in infile:
            # Remove unnecessary lines
            if line.startswith('#') and not line.startswith('#Text='):
                continue
            if line.startswith('\t'):
                continue
            # Remove '\n' at end of line
            line = line[:-1]
            # Split line on tab
            line = line.split('\t')

            data.append(line)
    return data


def get_sentence_lvl(data):
    """
    Creates list of lists on sentence level. Each element of resulting list is a list where the first element
    is a str of the sentence. The second element is a list of the rows of tokens. 
    :param data: List of list with sentence elements starting with '#Text='
    :return: List of list. Text separated by sentence.
    """
    text_list = []

    for index, line_list in enumerate(data):
        # If line is whole sentence
        if line_list[0].startswith('#'):
            # If not the first sentence
            if index != 0:
                # Add info from previous sentence
                text_list.append(sentence_list)
            # Create empty sentence list and append string
            sentence_list = []
            sentence_list.append(line_list[0][6:])
        # If line is last in text
        elif index == len(data) - 1:
            sentence_list.append(line_list)
            text_list.append(sentence_list)

        # Else append token level info
        else:
            sentence_list.append(line_list)
    return text_list


def get_labels_tokens(sentence_obj):
    """
    Collects tokens related to same label. First loops through all tokens in sentence to collect all labels in a set. 
    Then loops through labels and through all tokens in a sentence to gather the tokens with that label. 
    This code could be made more efficient.
    :param sentence: List containing rows which belong to a single sentence.
    :return: dictionary of labels with matching tokens {'label_id': [('t1', 'token_1'), ('t2', 'token_2')], ...}
    """
    # Define set to collect labels in this sentence
    label_set = set()
    # For token in sentence
    for index, row in enumerate(sentence_obj):
        # Continue only if row contains single token (so is not the full sentence) and has a label
        if type(row) == list and row[3] != '_':
            # Split if there are multiple labels for token. If-statement is for bug fix
            if type(row[3]) == str:
                row[3] = row[3].split('|')
            # Add label to set
            label_set.update(row[3])
    
    # Create dictionary to match label to tokens
    label_token_dict = dict()
    # Loop through labels and create dictionary entry
    for label in label_set:
        label_token_dict[label] = []
        # Loop through sentence, if the row has the label, then create token tuple and add to dict
        for index, row in enumerate(sentence_obj):
            if label in row[3]:
                token_tuple = ('t' + str(index), row[2])
                label_token_dict[label].append(token_tuple)

    return label_token_dict

In [None]:
sentence_1 = "NF : Pijn in de rug , verlicht met warme handdoek en pcm ."
sentence_2 = "Mw was emotioneel , schoonzoon hoort morgen de uitslag van een aantal onderzoeken en mw maakt zich hier veel zorgen om ."

def get_BERTje_encoding(sentence):
    # Write function
    
    return encoding

In [10]:
# Read in data and get it in correct format
data = read_tsv(filepath)
text_list = get_sentence_lvl(data)

# Extract key and annotator name from file name
key = 'Notities_xyz' # Extract this info
# Include annotator name in BertContainer()

# For every sentence in the text
for sentence_obj in text_list:
    # Extract sentence, sentence_id and encoding
    sen = sentence_obj[0]
    sen_id = sentence_obj[1][0].split('-')[0]
    encoding = 123148 # get_BERTje_encoding(sen)

    # Define BertContainer instance
    instance = BertContainer(key, sen_id, sen, encoding)

    # Create label dictionary and loop through it
    label_dict = get_labels_tokens(sentence_obj)
    for label, token_list in label_dict.items():
        label_clean = label.split('[')[0]
        # Add the labels to the BertContainer instance
        anno = Annotation(token_list, label_clean)
        instance.add_anno(anno)
        
    instance.print_container()
    print()

Notities_xyz
1
NF : Pijn in de rug , verlicht met warme handdoek en pcm .
123148

Notities_xyz
2
Mw was emotioneel , schoonzoon hoort morgen de uitslag van een aantal onderzoeken en mw maakt zich hier veel zorgen om .
[('t3', 'emotioneel')]
.B152: Stemming
[('t19', 'veel'), ('t20', 'zorgen')]
STM 0
[('t5', 'schoonzoon'), ('t6', 'hoort'), ('t7', 'morgen'), ('t8', 'de'), ('t9', 'uitslag'), ('t10', 'van'), ('t11', 'een'), ('t12', 'aantal'), ('t13', 'onderzoeken')]
stm\_reaction
[('t16', 'maakt'), ('t17', 'zich'), ('t18', 'hier'), ('t19', 'veel'), ('t20', 'zorgen')]
.B152: Stemming
[('t3', 'emotioneel')]
STM 1
123148

