# Prepare for Annotation

- In this notebook, the dev and test datasets are pre-processed into a format that accommodates manual annotation.

- After pre-processing, each dataset will have 8 columns, namely: 'conversation', 'sent_id', 'token_id', 'token', 'event', 'time', 'place', 'participant'.

- Annotators fill in the 'event', 'time', 'place', and 'participant' columns to mark the event-related information for each token.

In [1]:
import os
import glob
import pandas as pd
import spacy

# Load the spaCy pipeline once at module level; tsv_add_id calls it to tokenize each sentence.
nlp = spacy.load("en_core_web_sm")

In [2]:
def tsv_add_id(tsv_file):
    '''
    Read a tsv file of utterances and attach conversation, sentence, and token IDs to every token.

    The first line of the file is treated as a header and skipped. A line ending in
    '--------------------' marks a conversation boundary: the conversation counter is
    incremented and the sentence counter restarts at 1. For every other non-empty line, the
    text before the first tab is taken as the sentence and tokenized with the module-level
    spaCy pipeline ``nlp``.

    Parameters
    ----------
    tsv_file : str or path-like
        Path to the input tsv file. It is read as UTF-8.

    Returns
    -------
    list of tuple
        4-tuples of strings in the order (conversation, sent_id, token_id, token):
        - (sentence, '', '', '') precedes the tokens of each sentence, for readability;
        - (conversation, sent_id, token_id, token) for each token, with all IDs starting at 1;
        - ('', '', '', '') follows each sentence and each conversation boundary.
        An empty input file yields an empty list.
    '''
    data_with_id = []
    blank_row = ('', '', '', '')
    conversation = 1
    sentence_id = 1

    # Explicit encoding so that non-ASCII text decodes the same way on every platform.
    with open(tsv_file, 'r', encoding='utf-8') as file:
        next(file, None)  # Skip the "utterance" header line; the default avoids StopIteration on an empty file
        for line in file:
            line = line.strip()

            if line.endswith('--------------------'):
                # Conversation boundary: advance the conversation and restart sentence numbering
                conversation += 1
                sentence_id = 1
                data_with_id.append(blank_row)
            elif line:
                # Emit the raw sentence first so annotators can read it in context
                sentence = line.split('\t')[0]
                data_with_id.append((sentence, '', '', ''))

                for token_id, token in enumerate(nlp(sentence), start=1):
                    data_with_id.append((str(conversation), str(sentence_id), str(token_id), token.text))

                sentence_id += 1
                data_with_id.append(blank_row)  # Blank row separates sentences

    return data_with_id


In [3]:
def add_placeholder(data_with_id, output_file):
    '''
    Write ID-tagged rows to a tsv file, extended with placeholder annotation columns.

    data_with_id is loaded into a DataFrame with the columns 'conversation', 'sent_id',
    'token_id' and 'token'. The four annotation columns 'event', 'time', 'place' and
    'participant' are then appended, each filled with '-', and the table is saved to
    output_file as tab-separated text without the index.
    '''
    id_columns = ['conversation', 'sent_id', 'token_id', 'token']
    annotation_columns = ('event', 'time', 'place', 'participant')

    table = pd.DataFrame(data_with_id, columns=id_columns)

    # Every annotation column starts out as '-' for the annotators to overwrite
    for label in annotation_columns:
        table[label] = '-'

    table.to_csv(output_file, sep='\t', index=False)

In [4]:
def files_to_ready_to_annotate(tsv_file, output_file):
    '''
    Convert a raw tsv file into an annotation-ready tsv file.

    The input file is passed through tsv_add_id to obtain rows tagged with conversation,
    sentence, and token IDs. Those rows are handed to add_placeholder, which appends the
    '-' placeholder columns used for manual annotation and writes the result to output_file.
    '''
    add_placeholder(tsv_add_id(tsv_file), output_file)

In [5]:
# Paths to the dev and test input files
tsv_test = './response_data/resized_files/test.tsv'
tsv_dev = './response_data/resized_files/dev.tsv'

# Output directory for the annotation-ready files. exist_ok=True creates it when missing
# and is a no-op when it already exists, so the cell can be re-run safely.
directory = './response_data/dataset'
os.makedirs(directory, exist_ok=True)

output_file_test = f"{directory}/annotation_test.tsv"
output_file_dev = f"{directory}/annotation_dev.tsv"

# Build the annotation-ready file for each split
files_to_ready_to_annotate(tsv_test, output_file_test)
files_to_ready_to_annotate(tsv_dev, output_file_dev)