# ACTER dataset


## Prepare Data

In [None]:
%cd ACTER

Training Data: corp, wind

Validation Data: equi

Test Data: htfl

In [None]:
#torch and transformers for model and training
import torch  
import sys
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import XLMRobertaTokenizerFast, RobertaTokenizerFast              
from transformers import XLMRobertaForTokenClassification, RobertaForTokenClassification
from transformers import AdamW                            
from transformers import get_linear_schedule_with_warmup
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
import sentencepiece

#sklearn for evaluation
from sklearn import preprocessing                       
from sklearn.metrics import classification_report        
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid         
from sklearn.model_selection import ParameterSampler      
# from sklearn.utils.fixes import loguniform

#nlp preprocessing
from nltk import ngrams                                 
# from spacy.pipeline import SentenceSegmenter
from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from sacremoses import MosesTokenizer, MosesDetokenizer


#utilities
import json
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import glob, os
import time
import datetime
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle         # for saving data structures
import sys
from pynvml import *  # for checking gpu memory

In [None]:
# Initial setting
# Hugging Face model id shared by the tokenizer and the token-classification model.
# The cells below load the XLM-R classes when the id contains "xlm" and the RoBERTa classes otherwise.
model_name = "FacebookAI/roberta-large"

In [None]:
import os
import re
import numpy as np
import pandas as pd
from IPython.core.debugger import set_trace

# Root folder of the English ACTER corpora; every sub-folder is one domain.
master_path = 'dataset/ACTER/en'
trainings_data = []
val_data = []
test_data = []

# IOB tag found in the annotation files -> label name used by the model.
boi_to_label = {'B': 'B-T', 'I': 'T', 'O': 'n'}

# Domain -> split list that receives its sentences.
# Every collected sentence is a (words, labels, domain) tuple.
split_for_domain = {
    'corp': trainings_data,
    'wind': trainings_data,
    'equi': val_data,
    'htfl': test_data,
}

for dataset_name in os.listdir(master_path):
    if dataset_name == 'cor':
        continue

    target_split = split_for_domain.get(dataset_name)
    if target_split is None:
        # Domain does not belong to any split, so there is nothing to collect.
        continue

    iob_annotation_path = os.path.join(
        master_path, dataset_name, 'annotated', 'annotations',
        'sequential_annotations', 'iob_annotations', 'with_named_entities',
    )
    if not os.path.isdir(iob_annotation_path):
        # Domain without sequential IOB annotations.
        continue

    # Files are read in the numeric order of the third '_'-separated part of their name.
    for filename in sorted(os.listdir(iob_annotation_path), key=lambda x: int(os.path.splitext(x)[0].split('_')[2])):
        file_path = os.path.join(iob_annotation_path, filename)
        # One token per line as "<word>\t<tag>"; blank lines are kept because they separate sentences.
        # NOTE(review): read_csv runs with its default NA strings and default quoting, so tokens such
        # as "NA"/"null" or a lone double quote may not survive parsing unchanged - confirm on the data.
        lines = pd.read_csv(file_path,  sep='\t', na_values=[], skip_blank_lines=False, names=['word', 'boi'])

        words = []
        labels = []
        for word, boi in zip(lines['word'], lines['boi']):
            # pd.isna() recognises every NaN object; `is np.nan` only matches the np.nan singleton.
            if pd.isna(word) and pd.isna(boi):
                # Blank line: the current sentence is complete.
                target_split.append((words, labels, dataset_name))
                words = []
                labels = []
                continue

            if pd.isna(word):
                # The token was parsed as a missing value; keep a placeholder string for it.
                word = 'None'

            label = boi_to_label[boi]

            words.append(word)
            labels.append(label)

        if words:
            # The file did not end with a blank line: keep its last sentence instead of dropping it.
            target_split.append((words, labels, dataset_name))

        

In [None]:
# separate tokens, tags and domain names
# every collected sentence is a (words, labels, domain) triple

# train
train_tags = [sentence_tags for _, sentence_tags, _ in trainings_data]
train_texts = [sentence_words for sentence_words, _, _ in trainings_data]
train_domains = [domain for _, _, domain in trainings_data]

# val
val_tags = [sentence_tags for _, sentence_tags, _ in val_data]
val_texts = [sentence_words for sentence_words, _, _ in val_data]
val_domains = [domain for _, _, domain in val_data]

# test
test_tags = [sentence_tags for _, sentence_tags, _ in test_data]
test_texts = [sentence_words for sentence_words, _, _ in test_data]
test_domains = [domain for _, _, domain in test_data]

# Tokenize 

In [None]:
# XLM-R checkpoints get the XLM-R tokenizer, every other checkpoint is loaded as RoBERTa.
if "xlm" in model_name:
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
else:
    # add_prefix_space=True because the inputs are passed as lists of words further down.
    tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

In [None]:
# align word-level labels with the sub-word tokenization
label_list = ["n", "B-T", "T"]
label_to_id = dict(zip(label_list, range(len(label_list))))
num_labels = len(label_list)


def tokenize_and_align_labels(texts, tags):
    """Tokenize pre-split sentences and attach one label id per sub-word token.

    Args:
        texts: list of sentences, each a list of words.
        tags: list of label sequences, one label name per word of the matching sentence.

    Returns:
        The tokenizer output with an extra "labels" entry. Only the first sub-word
        token of each word carries that word's label id; special tokens and the
        remaining sub-word tokens get -100 so the loss ignores them.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        # the sentences are lists of words, with one label per word
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(tags):
        row_label_ids = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None or word == last_word:
                # special token, or a continuation piece of the previous word
                row_label_ids.append(-100)
            else:
                # first piece of a new word
                row_label_ids.append(label_to_id[word_tags[word]])
            last_word = word
        aligned_labels.append(row_label_ids)

    encoded["labels"] = aligned_labels
    return encoded


# Tokenize each split; every result also carries the aligned "labels" entry.
train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)
val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)
test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)



In [None]:
# torch dataset wrapper so the tokenized data can be fed to the huggingface trainer
class OurDataset(torch.utils.data.Dataset):
    """Index-addressable view over tokenizer output and its label sequences."""

    def __init__(self, encodings, labels):
        # encodings: mapping from feature name to a per-sample sequence
        # labels: per-sample label id sequences; also defines the dataset length
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every feature of sample `idx` to a tensor.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        # Set last, so it wins over a "labels" entry that may already be in the encodings.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split; the aligned label ids are passed separately as the dataset's labels.
train_dataset = OurDataset(train_input_and_labels, train_input_and_labels["labels"])
val_dataset = OurDataset(val_input_and_labels, val_input_and_labels["labels"])
test_dataset = OurDataset(test_input_and_labels, test_input_and_labels["labels"])


# Training

In [None]:
# turn token-level tags into the list of terms they mark in the original words
no_term_token = "No term"


def extract_terms(token_predictions, val_texts):
    """Collect the terms marked by a tag sequence.

    A term starts at every "B-T" tag and is extended by the directly following
    "T" tags; its words are joined with single spaces. "T" tags that do not
    follow a "B-T" are ignored.

    Args:
        token_predictions: tag names, one per word.
        val_texts: the words the tags refer to.

    Returns:
        The terms in order of appearance (duplicates kept), or a list holding
        only `no_term_token` when no term was found.
    """
    found = []
    total = len(token_predictions)
    for start in range(total):
        if token_predictions[start] != "B-T":
            continue
        term = val_texts[start]
        nxt = start + 1
        # absorb the run of "T" tags right after the "B-T"
        while nxt < total and token_predictions[nxt] == "T":
            term = term + (" " + val_texts[nxt])
            nxt += 1
        found.append(term)

    if not found:
        found.append(no_term_token)
    return found

In [None]:
def f1_score(pred, ref, return_error=False):
    """Score one list of predicted items against one list of reference items.

    Matching is done on a multiset basis: every reference item can be matched
    by at most one prediction.

    Args:
        pred: predicted items.
        ref: reference items; the list itself is left untouched.
        return_error: when True, also report the unmatched items of both sides.

    Returns:
        dict with 'tp', 'len_pred', 'len_ref', 'precision', 'recall' and
        'f1_score', plus 'incorrect_pred' and 'incorrect_ref' when
        `return_error` is set.
    """
    unmatched_ref = ref[:]  # work on a copy, matched items get removed
    hits, n_pred, n_ref = 0, len(pred), len(unmatched_ref)

    unmatched_pred = []
    for item in pred:
        if item in unmatched_ref:
            unmatched_ref.remove(item)
            hits += 1
        elif return_error:
            unmatched_pred.append(item)

    prec = hits / n_pred if n_pred != 0 else 0
    rec = hits / n_ref if n_ref != 0 else 0

    if prec or rec:
        score = 2 * prec * rec / (prec + rec)
    else:
        score = 0

    individual_result = {
        'tp': hits,
        'len_pred': n_pred,
        'len_ref': n_ref,
        'precision': prec,
        'recall': rec,
        'f1_score': score,
    }
    if return_error:
        individual_result['incorrect_pred'] = unmatched_pred
        individual_result['incorrect_ref'] = unmatched_ref

    return individual_result

def micro_f1_score(
    total_tp: int, 
    total_len_pred: int, 
    total_len_ref: int, 
    ):
    """Compute micro-averaged precision, recall and F1 from summed counts.

    Args:
        total_tp: number of matched items over all samples.
        total_len_pred: number of predicted items over all samples.
        total_len_ref: number of reference items over all samples.

    Returns:
        dict with 'total_precision', 'total_recall' and 'micro_f1_score';
        a ratio whose denominator is not positive is reported as 0.
    """
    prec = 0
    if total_len_pred > 0:
        prec = total_tp / total_len_pred

    rec = 0
    if total_len_ref > 0:
        rec = total_tp / total_len_ref

    if prec or rec:
        score = 2 * prec * rec / (prec + rec)
    else:
        score = 0

    return {
        "total_precision": prec,
        "total_recall": rec,
        "micro_f1_score": score,
    }

In [None]:
# compute the metrics TermEval style for Trainer
# the reference words/tags are passed in, so the same function serves the validation and the test split

def _next_report_index(report_dir):
    """Return the number to use for the next per-sample report in `report_dir`.

    Only entries whose name ends in `_<number>` (before the extension) are
    considered, so `report_overall.csv` and unrelated entries such as
    `.ipynb_checkpoints` cannot break the numbering. Returns 0 when no
    numbered report exists yet.
    """
    indices = []
    for entry in os.listdir(report_dir):
        suffix = os.path.splitext(entry)[0].split('_')[-1]
        if suffix.isdecimal():
            indices.append(int(suffix))
    return max(indices) + 1 if indices else 0


def eval_metric(p, report_save_path, ref_texts, ref_tags):
    """Score token-level predictions on term level and write the reports.

    Args:
        p: pair (predictions, labels); the class index is taken with argmax over
            axis 2 of `predictions`, and positions whose label is -100 are dropped.
        report_save_path: path template containing `{0}`; it is formatted with a
            running number for the per-sample report and with 'overall' for the
            summary.
        ref_texts: reference sentences as lists of words, aligned with the predictions.
        ref_tags: reference tag names per word, aligned with `ref_texts`.

    Returns:
        dict with 'total_precision', 'total_recall' and 'micro_f1_score'.

    Raises:
        ValueError: if predictions, `ref_texts` and `ref_tags` differ in length.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens and non-first sub-word tokens)
    true_predictions = [
        [label_list[pred_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Fail early with a clear message; otherwise zip() below would silently truncate
    # and the mismatch would only surface when the report DataFrame is built.
    if not (len(true_predictions) == len(ref_texts) == len(ref_tags)):
        raise ValueError(
            'predictions ({0}), ref_texts ({1}) and ref_tags ({2}) must have the same length'.format(
                len(true_predictions), len(ref_texts), len(ref_tags)
            )
        )

    total_tp, total_len_pred, total_len_ref = 0, 0, 0
    total_extracted_terms, total_labels = [], []
    f1_scores, precisions, recalls = [], [], []
    for true_prediction, ref_text, ref_tag in zip(true_predictions, ref_texts, ref_tags):
        # terms according to the model vs. terms according to the reference tags
        extracted_terms = extract_terms(true_prediction, ref_text)
        total_extracted_terms.append(extracted_terms)
        label = extract_terms(ref_tag, ref_text)
        total_labels.append(label)

        individual_score = f1_score(extracted_terms, label)
        total_tp += individual_score['tp']
        total_len_pred += individual_score['len_pred']
        total_len_ref += individual_score['len_ref']

        f1_scores.append(individual_score['f1_score'])
        precisions.append(individual_score['precision'])
        recalls.append(individual_score['recall'])

    overall_result = micro_f1_score(total_tp, total_len_pred, total_len_ref)

    individual_report = pd.DataFrame({
        'input_text': [' '.join(ref_text) for ref_text in ref_texts],
        'prediction': true_predictions,
        'normalized_prediction': total_extracted_terms,
        'reference': ref_tags,
        'normalized_reference': total_labels,
        'f1_score': f1_scores,
        'precision': precisions,
        'recall': recalls
    })

    # Every call writes a new numbered per-sample report next to the previous ones.
    report_index = _next_report_index(os.path.dirname(report_save_path))
    individual_report.to_csv(report_save_path.format(report_index), index=False)

    overall_report = {
        'total_precision': overall_result['total_precision'],
        'total_recall': overall_result['total_recall'],
        'micro_f1_score': overall_result['micro_f1_score'],
    }

    # The summary is overwritten on every call; it is JSON text despite the template's extension.
    with open(report_save_path.format('overall'), 'w') as f:
        f.write(json.dumps(overall_report, ensure_ascii=False, indent=4))

    return overall_report


In [None]:
# Run configuration; the cells below update it and dump it to config.json in each output directory.
# NOTE(review): "output_dir" and "predict_category" are not read by the visible cells
# (the output root is set to 'outputs' directly further down) - confirm whether they are still needed.
config = {
    "model_name": model_name,
    "train_mode": True,
    "output_dir": "outputs",
    "dataset_name": "ACTER",
    "predict_category": False
}

In [None]:
# training arguments
# NOTE: this rebinds the name `datetime` from the module imported at the top to the class.
from datetime import datetime

config['dataset_name'] = 'ACTER'
config['train_mode'] = True

# Output layout: outputs/<train|test>/<dataset>/roberta/<MM_DD_HH_MM_SS>/
# NOTE(review): the 'roberta' path segment is fixed and does not follow model_name.
output_dir = 'outputs'
now = datetime.now().strftime("%m_%d_%H_%M_%S")
output_dir = os.path.join(output_dir, 'train' if config['train_mode'] else 'test', config['dataset_name'], 'roberta', now)
os.makedirs(output_dir, exist_ok=True)
# Path template for the reports written by eval_metric; '{0}' is filled with a number or 'overall'.
report_save_path = os.path.join(output_dir, 'report', 'report_{0}.csv')
os.makedirs(os.path.dirname(report_save_path), exist_ok=True)

# Keep a copy of the configuration next to the results.
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
    f.write(json.dumps(config, ensure_ascii=False, indent=4))
        
# NOTE(review): `evaluation_strategy` is spelled `eval_strategy` in newer transformers releases -
# confirm against the installed version.
training_args = TrainingArguments(
    output_dir=output_dir,          # output directory
    num_train_epochs=40,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0,                  # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy= "steps",     # or "epoch"
    eval_steps=50,
    save_steps=50,
    save_total_limit=4,
    load_best_model_at_end=True,   #loads the model with the best evaluation score
    metric_for_best_model="micro_f1_score",  # key of the dict returned by eval_metric
    greater_is_better=True
)

In [None]:
# initialize model
# label_list / num_labels repeat the definitions from the tokenization cell so this cell can run on its own.
label_list=["n", "B-T", "T"]
num_labels=len(label_list)
# Same "xlm" check as for the tokenizer: XLM-R head for XLM-R checkpoints, RoBERTa head otherwise.
model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=num_labels) if "xlm" in model_name else RobertaForTokenClassification.from_pretrained(model_name, num_labels=num_labels)


In [None]:
# initialize huggingface trainer
# Metrics are computed against the validation words/tags, which must stay aligned with val_dataset.
# The lambda reads report_save_path when it is called, not when it is defined.
compute_metrics = lambda x: eval_metric(x, report_save_path, val_texts, val_tags)
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [None]:
# train
# Release cached GPU memory before the run starts.
torch.cuda.empty_cache()
trainer.train()

# Test Set Evaluation

In [None]:
# training arguments
# Same settings as for training; the cell mainly switches the output/report location to the 'test' tree.
from datetime import datetime

config['dataset_name'] = 'ACTER'
config['train_mode'] = False

# Output layout: outputs/<train|test>/<dataset>/roberta/<MM_DD_HH_MM_SS>/
output_dir = 'outputs'
now = datetime.now().strftime("%m_%d_%H_%M_%S")
output_dir = os.path.join(output_dir, 'train' if config['train_mode'] else 'test', config['dataset_name'], 'roberta', now)
os.makedirs(output_dir, exist_ok=True)
# Path template for the reports written by eval_metric; '{0}' is filled with a number or 'overall'.
report_save_path = os.path.join(output_dir, 'report', 'report_{0}.csv')
os.makedirs(os.path.dirname(report_save_path), exist_ok=True)

# Keep a copy of the configuration next to the results.
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
    f.write(json.dumps(config, ensure_ascii=False, indent=4))

training_args = TrainingArguments(
    output_dir=output_dir,          # output directory
    num_train_epochs=40,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0,                  # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy= "steps",     # or "epoch"
    eval_steps=50,
    save_steps=50,
    save_total_limit=4,
    load_best_model_at_end=True,   #loads the model with the best evaluation score
    metric_for_best_model="micro_f1_score",  # key of the dict returned by eval_metric
    greater_is_better=True
)

In [None]:
# Metrics are now computed against the test words/tags.
# NOTE(review): eval_dataset is still val_dataset, so calling trainer.evaluate() with this trainer would
# pair validation predictions with test references; the next cell only calls trainer.predict(test_dataset).
compute_metrics = lambda x: eval_metric(x, report_save_path, test_texts, test_tags)

# initialize huggingface trainer
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [None]:
#test
# predict() returns the raw model outputs, the label ids and the metrics for the test split.
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
# Class index per token (argmax over the last of three axes).
test_predictions = np.argmax(test_predictions, axis=2)
# Remove ignored index (special tokens)
true_test_predictions = [
    [label_list[p] for (p, l) in zip(test_prediction, test_label) if l != -100]
    for test_prediction, test_label in zip(test_predictions, test_labels)
]

In [None]:
# example output: words, gold tags and predicted tags of one test sentence, right-aligned in columns
i = 4
print('{:>10}  {:>10}  {:>10}'.format("Text", "Label", "Prediction"))
for j, predicted_tag in enumerate(true_test_predictions[i]):
    print('{:>10}  {:>10}  {:>10}'.format(test_texts[i][j], test_tags[i][j], predicted_tag))

# ACL-RD and BCGM

In [None]:
import os
import re
import numpy as np
import pandas as pd
from IPython.core.debugger import set_trace
from datasets import Dataset, DatasetDict, load_from_disk
from datetime import datetime
from tqdm import tqdm

# Datasets handled by the loop below; each is loaded from dataset/<name>/huggingface.
dataset_names = ['ACL-RD', 'BCGM']

# split in sentences and tokenize
_MOSES_TOKENIZER = None  # created on first use and then shared by all calls


def tokenize_text(text, nlp):
    """Split `text` into sentences and return all Moses tokens as one flat list.

    Args:
        text: raw text to tokenize.
        nlp: callable whose result exposes the sentences via `.sents`
            (the notebook passes a spaCy English pipeline with a sentencizer).

    Returns:
        list of token strings for the whole text, in order; sentence
        boundaries are not kept in the result.
    """
    # The tokenizer does not depend on the input, so build it once instead of on every call.
    global _MOSES_TOKENIZER
    if _MOSES_TOKENIZER is None:
        _MOSES_TOKENIZER = MosesTokenizer(lang='en')

    tokens = []
    for sentence in nlp(text).sents:
        # return_str=True gives one space-separated string; split() turns it back into tokens.
        # NOTE(review): tokenize() runs with its default escaping settings - confirm that tokens
        # containing characters such as quotes or ampersands come out in the intended form.
        tokens.extend(_MOSES_TOKENIZER.tokenize(sentence, return_str=True).split())
    return tokens

# torch dataset wrapper so the tokenized data can be fed to the huggingface trainer
class OurDataset(torch.utils.data.Dataset):
    """Index-addressable view over tokenizer output and its label sequences."""

    def __init__(self, encodings, labels):
        # encodings: mapping from feature name to a per-sample sequence
        # labels: per-sample label id sequences; also defines the dataset length
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every feature of sample `idx` to a tensor.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        # Set last, so it wins over a "labels" entry that may already be in the encodings.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# For every dataset: load it, turn the term annotations into word-level tags, fine-tune a fresh
# model on the train split (model selection on the validation split) and report test metrics.
for dataset_name in tqdm(dataset_names):
    master_path = f'dataset/{dataset_name}/huggingface'

    # Load the saved dataset and work on pandas copies of its three splits.
    dataset = load_from_disk(master_path)
    train_dataset = dataset['train'].to_pandas()
    test_dataset = dataset['test'].to_pandas()
    validation_dataset = dataset['validation'].to_pandas()

    # Blank English pipeline with a sentencizer; only used for sentence splitting in tokenize_text.
    nlp = English()
    nlp.add_pipe("sentencizer")
    preprocess = lambda x: tokenize_text(x, nlp)

    # NOTE: the loop variable reuses the name `dataset`; the loaded object is not needed any more here.
    for dataset in [train_dataset, validation_dataset, test_dataset]:
        preprocessed_dataset = []
        # presumably `label` holds the term strings of a text in their order of appearance,
        # because the text is consumed left to right below - verify against the data
        for text_str, labels in zip(dataset['text'], dataset['label']):
            text = text_str[:]
            split_texts = preprocess(text)
            preprocessed_labels = []
            for label in labels:
                # NOTE(review): find() returns -1 when the term is not in the remaining text; the
                # slices below then cut at the wrong positions - confirm this cannot happen.
                label_index = text.find(label)
                prev_text = text[:label_index]
                split_text = preprocess(prev_text)
                # tokens before the term are 'n', the term itself is 'B-T' followed by 'T'
                preprocessed_label = len(split_text) * ['n']
                for i, token in enumerate(preprocess(label)):
                    if i == 0:
                        preprocessed_label.append('B-T')
                    else:
                        preprocessed_label.append('T')
                preprocessed_labels.extend(preprocessed_label)
                # continue after the term that was just handled
                text = text[label_index + len(label):]
                
            # everything after the last term is 'n'
            preprocessed_labels.extend(len(preprocess(text))*['n'])
            # Tokenizing the pieces separately can give a different token count than tokenizing the
            # whole text; in that case the token list is rebuilt from the same pieces as the tags.
            if len(preprocessed_labels) != len(split_texts):
                # print(f'Error: {len(preprocessed_labels)} != {len(split_texts)}')
                text = text_str[:]  
                split_texts = []
                preprocessed_labels = []
                for label in labels:
                    label_index = text.find(label)
                    prev_text = text[:label_index]
                    split_text = preprocess(prev_text)
                    split_labels = preprocess(label)
                    preprocessed_label = len(split_text) * ['n']
                    split_text.extend(split_labels)
                    for i, token in enumerate(split_labels):
                        if i == 0:
                            preprocessed_label.append('B-T')
                        else:
                            preprocessed_label.append('T')
                    split_texts.extend(split_text)
                    preprocessed_labels.extend(preprocessed_label)
                    text = text[label_index + len(label):]
                # NOTE(review): unlike the first pass, the text after the last term is not added
                # here, so such samples end at their last term - confirm this is intended.
                if len(split_texts) != len(preprocessed_labels):
                    print('Error')
                    # NOTE(review): this leaves the loop over the samples, so the remaining
                    # samples of the split are dropped as well.
                    break
                preprocessed_dataset.append([split_texts, preprocessed_labels, labels])
            else:
                preprocessed_dataset.append([split_texts, preprocessed_labels, labels])
        # Store a copy of the result under the name that belongs to the split just processed.
        if dataset is train_dataset:
            trainings_data = preprocessed_dataset[:]
        elif dataset is validation_dataset:
            val_data = preprocessed_dataset[:]
        elif dataset is test_dataset:
            test_data = preprocessed_dataset[:]      

        #separate tokens and tags

    # each entry is [words, tags, original term list]
    #train
    train_tags=[tup[1] for tup in trainings_data]
    train_texts=[tup[0] for tup in trainings_data]

    #val
    val_tags=[tup[1] for tup in val_data]
    val_texts=[tup[0] for tup in val_data]

    #test
    test_tags=[tup[1] for tup in test_data]
    test_texts=[tup[0] for tup in test_data]    
    
    # Sub-word tokenization and label alignment.
    train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)
    val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)
    test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)

    # NOTE: train_dataset / test_dataset are rebound here from the pandas frames to torch datasets.
    train_dataset = OurDataset(train_input_and_labels, train_input_and_labels["labels"])
    val_dataset = OurDataset(val_input_and_labels, val_input_and_labels["labels"])
    test_dataset = OurDataset(test_input_and_labels, test_input_and_labels["labels"])

    # initialize model
    # A new model is loaded from the pretrained checkpoint for every dataset.
    torch.cuda.empty_cache()    
    model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=num_labels) if "xlm" in model_name else RobertaForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    # training arguments
    config['dataset_name'] = dataset_name
    config['train_mode'] = True

    # Output layout: outputs/<train|test>/<dataset>/roberta/<MM_DD_HH_MM_SS>/
    output_dir = 'outputs'
    now = datetime.now().strftime("%m_%d_%H_%M_%S")
    output_dir = os.path.join(output_dir, 'train' if config['train_mode'] else 'test', config['dataset_name'], 'roberta', now)
    os.makedirs(output_dir, exist_ok=True)
    # Path template for the reports written by eval_metric.
    report_save_path = os.path.join(output_dir, 'report', 'report_{0}.csv')
    os.makedirs(os.path.dirname(report_save_path), exist_ok=True)

    # Keep a copy of the configuration next to the results.
    with open(os.path.join(output_dir, 'config.json'), 'w') as f:
        f.write(json.dumps(config, ensure_ascii=False, indent=4))

    training_args = TrainingArguments(
        output_dir=output_dir,          # output directory
        # num_train_epochs=2,              # total # of training epochs
        num_train_epochs=40,              # total # of training epochs
        per_device_train_batch_size=8,  # batch size per device during training
        per_device_eval_batch_size=16,   # batch size for evaluation
        warmup_steps=0,                  # number of warmup steps for learning rate scheduler
        weight_decay=0,                  # strength of weight decay
        learning_rate=2e-5,
        logging_dir='./logs',            # directory for storing logs
        evaluation_strategy= "steps",     # or "epoch"
        eval_steps=50,
        save_steps=50,
        save_total_limit=4,
        load_best_model_at_end=True,   #loads the model with the best evaluation score
        metric_for_best_model="micro_f1_score",
        greater_is_better=True
    )

    # initialize huggingface trainer
    # Metrics during training are computed against the validation words/tags.
    compute_metrics = lambda x: eval_metric(x, report_save_path, val_texts, val_tags)
    trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )
    
    # train
    trainer.train()

    # training arguments
    # Same settings again; this part mainly switches the output/report location to the 'test' tree.
    config['dataset_name'] = dataset_name
    config['train_mode'] = False

    output_dir = 'outputs'
    now = datetime.now().strftime("%m_%d_%H_%M_%S")
    output_dir = os.path.join(output_dir, 'train' if config['train_mode'] else 'test', config['dataset_name'], 'roberta', now)
    os.makedirs(output_dir, exist_ok=True)
    report_save_path = os.path.join(output_dir, 'report', 'report_{0}.csv')
    os.makedirs(os.path.dirname(report_save_path), exist_ok=True)

    with open(os.path.join(output_dir, 'config.json'), 'w') as f:
        f.write(json.dumps(config, ensure_ascii=False, indent=4))

    training_args = TrainingArguments(
        output_dir=output_dir,          # output directory
        num_train_epochs=40,              # total # of training epochs
        per_device_train_batch_size=8,  # batch size per device during training
        per_device_eval_batch_size=16,   # batch size for evaluation
        warmup_steps=0,                  # number of warmup steps for learning rate scheduler
        weight_decay=0,                  # strength of weight decay
        learning_rate=2e-5,
        logging_dir='./logs',            # directory for storing logs
        evaluation_strategy= "steps",     # or "epoch"
        eval_steps=50,
        save_steps=50,
        save_total_limit=4,
        load_best_model_at_end=True,   #loads the model with the best evaluation score
        metric_for_best_model="micro_f1_score",
        greater_is_better=True
    )

    # Metrics are now computed against the test words/tags.
    compute_metrics = lambda x: eval_metric(x, report_save_path, test_texts, test_tags)
    
    # initialize huggingface trainer
    # NOTE(review): eval_dataset is still val_dataset while the metrics use the test references;
    # only predict(test_dataset) is called on this trainer below.
    trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )
    
    #test
    test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
    test_predictions = np.argmax(test_predictions, axis=2)
    # Remove ignored index (special tokens)
    true_test_predictions = [
        [label_list[p] for (p, l) in zip(test_prediction, test_label) if l != -100]
        for test_prediction, test_label in zip(test_predictions, test_labels)
    ]
    
    print(f"Test metrics for {dataset_name}:")
    print(test_metrics)
    print()