# Annotation to BIO
This notebook converts the manually annotated files "annotation_test.tsv" and "annotation_dev.tsv" into an evaluation-ready format.

Each output file contains only the conversation id, sentence id, token id, token, and the token's BIO label.

In [1]:
import pandas as pd
import csv

In [2]:
def initialise_label_list(input_file):
    """
    Initialise a list of labels with "O"s for each token in the input file.
    The list will be used to store the labels of the tokens in the input file.

    The file is parsed as tab-separated values and the first row is treated as
    the header. A row counts as a token row when its first field parses as an
    integer; blank rows and rows whose first field is not an integer (the
    whole-sentence rows) are ignored. This is the same rule the later passes
    over the file use, so the list length matches the rows they will index.

    Args:
        input_file: Path to the tab-separated annotation file (UTF-8).

    Returns:
        A new list holding one 'O' string per token row.
    """

    line_count = 0

    # newline='' lets the csv module handle line endings itself.
    with open(input_file, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')

        # Skip the header row (None default keeps an empty file from raising).
        next(reader, None)

        for row in reader:
            # A truly blank line is parsed as [], a tabs-only line as ['', ...].
            if not row or row[0].strip() == "":
                continue

            # Skip the whole sentences: only integer ids mark a token row.
            # The whole first field is tested, not just its first character.
            try:
                int(row[0])
            except ValueError:
                continue

            # Increment the line count when a token is found
            line_count += 1

    print('Number of tokens in the file: {}'.format(line_count))

    # Initialise the list of labels with "O"s
    return ['O'] * line_count

In [3]:
def add_B_prefixes(input_file, label_list):
    """
    Add the B- prefixes to the labels in the label list.
    The B- prefixes are added to the labels that are not '-'

    The file is read as tab-separated values with the first row as header.
    For every token row (first field parses as an integer), each cell from
    the fifth column onwards whose stripped text is not '-' sets that token's
    label to "B-<column header>". When several columns are annotated for the
    same token, the right-most one wins.

    Args:
        input_file: Path to the tab-separated annotation file (UTF-8).
        label_list: List with one entry per token row; it is updated in place.

    Returns:
        The same ``label_list`` object, after the update.

    Raises:
        IndexError: If the file has more token rows than ``label_list`` has
            entries, or a row has more columns than the header.
    """

    # newline='' lets the csv module handle line endings itself.
    with open(input_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')

        # Read the header; an empty file yields no header and no rows.
        # Names are stripped so labels never carry stray whitespace.
        header = [name.strip() for name in next(reader, [])]

        row_index = 0
        for row in reader:

            # Skip empty lines: csv yields [] for a truly blank line, so the
            # row must be checked before indexing it.
            if not row or row[0].strip() == "":
                continue

            # Skip if it is a whole sentence
            try:
                int(row[0])
            except ValueError:
                continue

            # NOTE(review): an empty cell also counts as annotated, because
            # only '-' means "no label" -- confirm against the annotation
            # convention.
            for column, cell in enumerate(row[4:], start=4):
                if cell.strip() != '-':
                    label_list[row_index] = f"B-{header[column]}"
            row_index += 1

    # Number of token rows processed
    print(row_index)

    return label_list

In [4]:
def update_labels(label_list):
    """
    Turn runs of identical B- labels into B-/I- sequences.

    A label is rewritten from "B-<name>" to "I-<name>" when it equals the
    label directly before it in ``label_list``. The first label, labels that
    differ from their predecessor, and labels without a 'B-' prefix are
    copied unchanged.

    Args:
        label_list: Sequence of label strings; it is not modified.

    Returns:
        A new list with the same length as ``label_list``.
    """

    relabelled = []

    for position, current in enumerate(label_list):
        # A label continues a span only if it repeats its predecessor from
        # the ORIGINAL list and is an entity label.
        continues_span = (
            position > 0
            and current == label_list[position - 1]
            and current.startswith('B-')
        )
        relabelled.append('I-' + current[2:] if continues_span else current)

    return relabelled


In [5]:
def write_updated_labels(input_file, output_file, new_label_list):
    """
    Write the updated labels to the output file as a new column.

    The input is read as tab-separated values. The output gets a header made
    of the first four (stripped) input header names plus 'label', followed by
    one row per token row: its first four fields plus the matching entry of
    ``new_label_list``. Blank rows and rows whose first field is not an
    integer (the whole-sentence rows) are not written.

    Args:
        input_file: Path to the tab-separated annotation file (UTF-8).
        output_file: Path of the tab-separated file to create or overwrite.
        new_label_list: One label per token row, in file order.

    Returns:
        None. If the input file is empty, the output file is left empty.

    Raises:
        IndexError: If the file has more token rows than ``new_label_list``
            has entries.
    """

    # newline='' on both files, as the csv module requires: without it the
    # writer's '\r\n' terminator becomes '\r\r\n' on Windows.
    with open(input_file, 'r', encoding='utf-8', newline='') as f, \
            open(output_file, 'w', encoding='utf-8', newline='') as out_f:
        reader = csv.reader(f, delimiter='\t')
        writer = csv.writer(out_f, delimiter='\t')

        # Read the header; an empty input has none, so nothing is written.
        header = next(reader, None)
        if header is None:
            return
        header = [h.strip() for h in header]
        writer.writerow(header[:4] + ['label'])

        row_index = 0
        for row in reader:

            # Skip empty lines: csv yields [] for a truly blank line, so the
            # row must be checked before indexing it.
            if not row or row[0].strip() == "":
                continue

            # Skip if it is a whole sentence
            try:
                int(row[0])
            except ValueError:
                continue

            # Write the token's first four fields plus its label
            row[0] = row[0].strip()
            writer.writerow(row[:4] + [new_label_list[row_index]])
            row_index += 1

In [6]:
def add_BIO_labels(input_file, output_file):
    """
    Run the full annotation-to-BIO conversion for one file.

    Builds an all-'O' label list for the tokens in ``input_file``, fills in
    the B- labels from the annotation columns, rewrites repeated B- labels
    as I- labels, and writes the result to ``output_file``.
    """
    b_labels = add_B_prefixes(input_file, initialise_label_list(input_file))
    write_updated_labels(input_file, output_file, update_labels(b_labels))

In [8]:
# Input/output paths for the two annotated splits (nothing is loaded here).
# Each pair is (manually annotated file, BIO-labelled file to write).
input_file_test, output_file_test = (
    './response_data/dataset/annotation_test.tsv',
    './response_data/dataset/annotation_test_processed.tsv',
)

input_file_dev, output_file_dev = (
    './response_data/dataset/annotation_dev.tsv',
    './response_data/dataset/annotation_dev_processed.tsv',
)

In [9]:
# Process the test set: reads input_file_test and writes the BIO-labelled
# token rows to output_file_test (the token count is printed along the way).
add_BIO_labels(input_file_test, output_file_test)

# Process the dev set the same way, from input_file_dev to output_file_dev.
add_BIO_labels(input_file_dev, output_file_dev)

Number of tokens in the file: 5695
5695
