# Rule-based System

- This notebook includes the process of defining rules to detect event-related tokens using dependency parser and NER processor of SpaCy.
- It writes the predictions of the rule-based system to the directory 'response_data/dataset/rule-based/'.

In [1]:
import spacy
import pandas as pd
import os
import glob
from collections import defaultdict


# Load spaCy model
# The "en_core_web_sm" pipeline has to be installed for this call to succeed.
# The resulting module-level `nlp` object is what extract_information() calls
# to parse each sentence. (spacy is already imported at the top of this cell,
# so the import below is redundant but harmless.)
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Collect the text of all children of a token
def get_all_children_text(token):
    """
    Return the text of a token and all of its descendants as one phrase.

    The dependency subtree rooted at ``token`` is walked recursively and the
    collected tokens are joined in sentence order (by ``token.i``), so the
    result reads like the original span, e.g. "in the park" instead of the
    traversal order "in park the".

    Args:
        token: A spaCy-style token exposing ``text``, ``i`` and ``children``.

    Returns:
        The space-separated texts of the token and every token below it.
    """
    def collect(node, bucket):
        # Gather (position, text) pairs for the node and its whole subtree.
        bucket.append((node.i, node.text))
        for child in node.children:
            collect(child, bucket)
        return bucket

    # Sort by position so that left-hand dependents (determiners, possessives)
    # appear before their head, as they do in the sentence.
    ordered = sorted(collect(token, []), key=lambda pair: pair[0])
    return ' '.join(text for _, text in ordered)

def get_all_children_ids(token):
    """
    Collect the index of a token together with the indices of every token
    beneath it in the dependency tree.

    The ids are returned in depth-first order: the token itself first, then
    each child's subtree in the order the children are yielded.
    """
    collected = []
    pending = [token]
    while pending:
        current = pending.pop()
        collected.append(current.i)
        # Push the children reversed so the first child is popped first,
        # which keeps the same order as a recursive pre-order walk.
        pending.extend(reversed(list(current.children)))
    return collected


In [7]:
# Extract information from sentences
def is_valid_participant(chunk):
    """
    Decide whether a noun chunk is acceptable as a participant.

    A chunk is rejected when its whole text is one of a few placeholder words
    ('it', 'that', 'this', 'there', 'here'), or when any of its tokens is a
    vague quantifier ('some', 'any', 'every') or is tagged as an adjective.
    """
    placeholder_words = ('it', 'that', 'this', 'there', 'here')
    vague_quantifiers = ('some', 'any', 'every')

    if chunk.text.lower() in placeholder_words:
        return False

    for word in chunk:
        if word.text.lower() in vague_quantifiers or word.pos_ == 'ADJ':
            return False
    return True

def extract_information(sentence, sentence_id):
    """
    Extract event, time, place and participant spans from one sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline and a set of
    hand-written rules over dependency labels, POS tags and entity types is
    applied to every token / noun chunk.

    Args:
        sentence: The raw sentence text.
        sentence_id: Identifier stored on every extracted span (the caller
            passes the 0-based position of the sentence).

    Returns:
        A dict with the keys ``'events'``, ``'times'``, ``'places'`` and
        ``'participants'``. Each value is a list of dicts holding the span
        text, the sentence id and the token indices (``token.i``) of the span.
        All time expressions of the sentence are merged into a single entry,
        so ``'times'`` has at most one element.
    """
    doc = nlp(sentence)

    # Prepositions treated as introducing a location (compared lower-cased so
    # that a sentence-initial "In"/"At"/"On" is matched as well).
    location_prepositions = {"in", "on", "at", "near", "by"}

    events = []
    times = []
    places = []
    participants = []

    for token in doc:
        # Time: temporal modifiers and DATE/TIME entity tokens.
        # NOTE(review): 'tmod' looks like a Stanford-dependencies label;
        # confirm that the loaded model actually emits it.
        if token.dep_ == 'tmod' or token.ent_type_ in ('DATE', 'TIME'):
            time_phrase = token.text
            time_token_ids = [token.i]
            for child in token.children:
                if child.dep_ in ('det', 'poss'):
                    time_phrase = child.text + " " + time_phrase
                    time_token_ids.insert(0, child.i)
            times.append({
                'time': time_phrase,
                'time_sentence_id': sentence_id,
                'time_token_ids': time_token_ids,
            })

        # Place: a location preposition together with everything below it.
        if token.pos_ == 'ADP' and token.text.lower() in location_prepositions:
            places.append({
                'place': get_all_children_text(token),
                'place_sentence_id': sentence_id,
                'place_token_ids': get_all_children_ids(token),
            })

        # Place: geopolitical entities and other named locations.
        if token.ent_type_ in ('GPE', 'LOC'):
            places.append({
                'place': token.text,
                'place_sentence_id': sentence_id,
                'place_token_ids': [token.i],
            })

        # Event: every verb, extended with its first direct object and that
        # object's determiner / possessive.
        if token.pos_ == 'VERB':
            phrase_tokens = [token]
            for child in token.children:
                if child.dep_ == 'dobj':
                    phrase_tokens.append(child)
                    phrase_tokens.extend(
                        sub for sub in child.children
                        if sub.dep_ in ('poss', 'det')
                    )
                    break  # only the first direct object is attached
            # Ids keep the order verb, object, modifiers; the text is built in
            # sentence order so it reads "ate the apple", not "the ate apple".
            verb_token_ids = [t.i for t in phrase_tokens]
            verb_phrase = ' '.join(
                t.text for t in sorted(phrase_tokens, key=lambda t: t.i)
            )
            events.append({
                'event': verb_phrase,
                'event_sentence_id': sentence_id,
                'event_token_ids': verb_token_ids,
            })

    # Merge all time entries of the sentence into one combined entry.
    if times:
        merged_ids = []
        for entry in times:
            merged_ids.extend(entry['time_token_ids'])
        times = [{
            'time': ' '.join(entry['time'] for entry in times),
            'time_sentence_id': sentence_id,
            'time_token_ids': merged_ids,
        }]

    # Participants: subjects of verbs that pass the validity filter, plus any
    # noun chunk whose root is a PERSON entity.
    for chunk in doc.noun_chunks:
        root = chunk.root
        # `== 'nsubj'` rather than `in ('nsubj')`: the latter is a substring
        # test against the string 'nsubj' and also accepts an empty label.
        is_verb_subject = (
            root.dep_ == 'nsubj'
            and root.head.pos_ == 'VERB'
            and is_valid_participant(chunk)
        )
        if is_verb_subject or root.ent_type_ == 'PERSON':
            participants.append({
                'participants': chunk.text,
                'participants_sentence_id': sentence_id,
                'participants_token_ids': [tok.i for tok in chunk],
            })

    return {
        'events': events,
        'times': times,
        'places': places,
        'participants': participants
    }


def rule_based_extraction(sentences):
    """
    Run the rule-based extractor over a list of sentences.

    Empty sentences are skipped (their position still counts towards the
    sentence id). The result is one flat list with an entry per detected
    event; every entry also carries the times, places and participants found
    in the sentence the event belongs to.
    """
    flattened = []
    for position, text in enumerate(sentences):
        if not text:
            continue

        extracted = extract_information(text, position)
        # One output record per event, each sharing the sentence-level lists.
        flattened.extend(
            {
                'event': found['event'],
                'event_sentence_id': found['event_sentence_id'],
                'event_token_ids': found['event_token_ids'],
                'times': extracted['times'],
                'places': extracted['places'],
                'participants': extracted['participants'],
            }
            for found in extracted['events']
        )

    return flattened

In [9]:
# Write the extracted spans into the token-level dataframe as BIO labels and save the result


def annotate(df, annotations, conversation_id):
    """
    Write BIO labels for one conversation's extracted spans into ``df``.

    Args:
        df: Token-level dataframe with the columns ``conversation``,
            ``sent_id``, ``token_id`` and ``label``. It is modified in place.
        annotations: Event entries as produced by ``rule_based_extraction``;
            each has ``event_sentence_id`` / ``event_token_ids`` and optional
            ``participants``, ``places`` and ``times`` lists of span dicts.
            Sentence ids and token ids in these entries are 0-based.
        conversation_id: Value of the ``conversation`` column whose rows are
            to be labelled.

    Returns:
        The same dataframe, with ``label`` set to ``B-<type>`` for the first
        token of each run of consecutive token ids and ``I-<type>`` for the
        rest. Types are applied in the order event, participant, place, time,
        so a later type overwrites an earlier one on the same token.
    """
    # Token ids are collected in sets: all events of a sentence carry the same
    # participants/places/times lists, so the same id arrives several times.
    # With duplicates, the repeated id would be compared with itself and
    # wrongly relabelled 'B-', turning every 'I-' token into 'B-'.
    events_by_sentence = defaultdict(set)
    participants_by_sentence = defaultdict(set)
    places_by_sentence = defaultdict(set)
    times_by_sentence = defaultdict(set)

    def collect(target, span, sentence_key, tokens_key):
        # Convert sentence and token indices from 0-based to 1-based.
        target[span[sentence_key] + 1].update(
            token_id + 1 for token_id in span[tokens_key]
        )

    # Combine the token ids of each argument type per sentence ID
    for ann in annotations:
        collect(events_by_sentence, ann, 'event_sentence_id', 'event_token_ids')

        for participant in ann.get('participants', []):
            collect(participants_by_sentence, participant,
                    'participants_sentence_id', 'participants_token_ids')

        for place in ann.get('places', []):
            collect(places_by_sentence, place,
                    'place_sentence_id', 'place_token_ids')

        for time_span in ann.get('times', []):
            collect(times_by_sentence, time_span,
                    'time_sentence_id', 'time_token_ids')

    # Only the 'label' column is written below, so this mask stays valid.
    in_conversation = df['conversation'] == conversation_id

    # Get the whole event-related information for each sentence
    def apply_labels(df, items_by_sentence, label_prefix):
        for sentence_id, token_ids in items_by_sentence.items():
            in_sentence = in_conversation & (df['sent_id'] == sentence_id)
            previous_token_id = None
            for token_id in sorted(token_ids):
                # A token directly following the previous one continues the
                # span (inside label); anything else starts a new span.
                continues_span = (
                    previous_token_id is not None
                    and token_id == previous_token_id + 1
                )
                label = ('I-' if continues_span else 'B-') + label_prefix

                df.loc[in_sentence & (df['token_id'] == token_id), 'label'] = label
                previous_token_id = token_id  # Update the previous token ID

    apply_labels(df, events_by_sentence, 'event')
    apply_labels(df, participants_by_sentence, 'participant')
    apply_labels(df, places_by_sentence, 'place')
    apply_labels(df, times_by_sentence, 'time')

    return df


# Iterate over the files and run the rule-based system on the dev and test
# sets only (files whose name contains neither "dev" nor "test" are skipped).
for file_path in glob.glob("./response_data/resized_files/*.tsv"):
    # basename instead of split('/') so the name is also correct when glob
    # returns paths with a different separator.
    file_name = os.path.basename(file_path)
    if "dev" not in file_name and "test" not in file_name:
        continue

    # Read the raw conversation text; the handle is closed before processing.
    with open(file_path, 'r') as file:
        next(file, None)  # Skip the header (an empty file no longer raises)
        content = file.read()

    # Read the TSV file with token ids, skip empty lines
    tsv_file_path = os.path.join("./response_data/files_with_id", file_name)
    df = pd.read_csv(tsv_file_path, delimiter='\t', skip_blank_lines=True)
    df = df.dropna(subset=['token'])

    # Initialise the label column to 'O'
    df['label'] = 'O'

    conversations = content.split('--------------------')

    # Iterate over the conversations
    # NOTE(review): the id is only advanced for conversations with more than
    # one line; confirm this matches the numbering of the 'conversation'
    # column in the files_with_id TSVs.
    conversation_id = 1
    for conversation in conversations:
        conversation = conversation.strip()
        if not conversation:
            continue

        sentences = conversation.split('\n')
        if len(sentences) > 1:
            # Extract information from conversation
            results = rule_based_extraction(sentences)

            # Annotate the data
            df = annotate(df, results, conversation_id)
            conversation_id += 1

    # Format the columns as integers
    df['conversation'] = df['conversation'].astype(int)
    df['sent_id'] = df['sent_id'].astype(int)
    df['token_id'] = df['token_id'].astype(int)

    # save the annotated data
    rule_based_dir = 'response_data/dataset/rule-based/'
    os.makedirs(rule_based_dir, exist_ok=True)

    output_tsv_file_path = os.path.join(rule_based_dir, 'improved_' + file_name)
    df.to_csv(output_tsv_file_path, sep='\t', index=False)
