Adapted code from https://github.com/Odeuropa/wp3-information-extraction-system

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd 'drive/MyDrive/Text Mining/Project'

/content/drive/MyDrive/Text Mining/Project


In [None]:
import pandas as pd
import numpy as np

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate==0.20.3
!pip install tokenizers -q
!pip install seqeval -q

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16
Collecting accelerate==0.20.3
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing coll

In [None]:
# Hyperparameter-search related packages.
# NOTE(review): cn_hp_space below uses Optuna-style `trial.suggest_*` calls, so Optuna looks
# like the backend actually exercised — confirm whether ray[tune], sigopt and wandb are needed.
!pip install optuna
!pip install 'ray[tune]'
!pip install sigopt
!pip install wandb

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.5.0
Collecting ray[tune]
  Downloading ray-2.9.3-cp310-cp310-manylinux2014_x86_64.whl (64.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Installs transformers with its torch extras; per the recorded output this pulls in accelerate>=0.21.0.
!pip install transformers[torch]


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.20.3
    Uninstalling accelerate-0.20.3:
      Successfully uninstalled accelerate-0.20.3
Successfully installed accelerate-0.27.2


In [None]:
import time
from torch import cuda

import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification, AutoConfig
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

from datasets import load_metric, Dataset

import numpy as np
import re

import argparse
import csv
import sys
from os import path
import pandas as pd
import json

# Run on the GPU when torch reports one, otherwise on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Fixed seed handed to transformers.set_seed.
seed = 22
transformers.set_seed(seed)

def sentence_num(row):
    """Return the part of the row's 'Sentence-Token' id that precedes the first '-'.

    The value is returned as a string, exactly as it appears in the id.
    """
    sentence_part, *_ = row['Sentence-Token'].split("-")
    return sentence_part


def to_label_id(row, id_dict):
    """Look up the id of the row's 'Tag' in id_dict.

    Tags that are not keys of id_dict are treated as 'O' (so 'O' must then be present,
    otherwise the lookup raises KeyError).
    """
    tag = row['Tag']
    return id_dict[tag if tag in id_dict else 'O']

def to_clean_label(row):
    """Normalise the raw 'Tag' value of a row.

    Steps, in order:
      1. drop every backslash (this also turns an escaped '\\_' into '_');
      2. keep only the first alternative of a '|'-separated tag;
      3. rewrite a 'B-I-' prefix to 'B-'.

    Returns the cleaned tag string.
    """
    # A separate replace of backslash-underscore is unnecessary: once all backslashes are
    # gone there is nothing left for it to match (and "\_" is an invalid escape sequence).
    tag = row['Tag'].replace("\\", "")
    tag = tag.split('|')[0]
    return tag.replace("B-I-", "B-")

def replace_punctuation(row):
    """Strip punctuation from a multi-character 'Word' so it stays a single token.

    Error case in Italian: 'bianco', '-', 'gialliccio' -> 'bianco-gialliccio'.
    The BERT tokenizer also splits on punctuation, even when sentences are passed with
    is_split_into_words=True. A word containing punctuation in the CONLL file therefore
    cannot be guaranteed to keep the same surface form after classification (which is
    needed to decide on a single label per word), e.g. bianco-gialliccio becomes three
    separate pieces: 1) bianco 2) - 3) gialliccio.

    Behaviour:
      * the word is stripped of surrounding whitespace;
      * words longer than one character lose every punctuation/symbol/underscore/space
        character; single characters (e.g. a lone '-') are kept as they are;
      * an empty result, or the literal string "nan" (what a missing value becomes after
        the column is cast to str), is replaced by a single space.

    Returns the normalised word.
    """
    word = row['Word'].strip()
    if len(word) > 1:
        # [\W_] removes the same ASCII characters as [^a-zA-Z0-9] did, but keeps accented
        # and other non-ASCII letters/digits, which the ASCII-only class silently deleted
        # (e.g. 'città' used to become 'citt').
        word = re.sub(r'[\W_]', '', word)
    # str.strip() never returns None, so only the empty and "nan" cases need handling.
    if word == "" or word == "nan":
        word = " "
    return word

def read_split_fold(split='dev', fold="0", label_dict=None):
    """Read one CONLL-style fold file and group it into sentences.

    Parameters:
        split: split name used in the file name ('train', 'dev', ...).
        fold: fold identifier used in the file name.
        label_dict: optional tag -> id mapping to reuse; when falsy, a mapping is built
            from the tags found in this file.

    Returns:
        (sentences, labels_to_ids, ids_to_labels) where `sentences` is a DataFrame with one
        row per (Document, sentence number) holding the word list ('sentence') and the
        matching label-id list ('word_labels'), and the two dicts are inverse mappings of
        each other. If the file cannot be read and split is not 'train', returns
        (None, None, None).

    Raises:
        SystemExit: if the file cannot be read and split == 'train'.
    """
    #change the path template as needed.
    fold_path = 'Output/folds_{}_{}.tsv'.format(fold, split)
    try:
        data = pd.read_csv(fold_path, sep='\t', skip_blank_lines=True,
                           encoding='utf-8', engine='python', quoting=csv.QUOTE_NONE,
                           names=['Document', 'Sentence-Token', 'Chars', 'Word', 'Tag', 'Empty'], header=None)
    except Exception as err:  # not a bare except: keep KeyboardInterrupt/SystemExit propagating
        print(f"Cannot read the file {fold_path}")
        print(f"Reason: {err!r}")
        if split == "train":
            sys.exit()
        # Callers unpack three values, so the failure result must be a 3-tuple as well.
        return None, None, None

    data = data.drop(columns='Empty')

    #For the reusability purposes, we still extract the label ids from the training data.
    data['Tag'] = data.apply(to_clean_label, axis=1)

    print("Number of tags: {}".format(len(data.Tag.unique())))
    frequencies = data.Tag.value_counts()
    print(frequencies)

    if not label_dict:
        labels_to_ids = {label: idx for idx, label in enumerate(data.Tag.unique())}
    else:
        labels_to_ids = label_dict

    # Always invert the mapping actually in use; enumerating this file's tags again would
    # disagree with a caller-supplied label_dict.
    ids_to_labels = {idx: label for label, idx in labels_to_ids.items()}

    data = data.astype({"Word": str})

    data['Word'] = data.apply(replace_punctuation, axis=1)
    data['Tag'] = data.apply(lambda row: to_label_id(row, labels_to_ids), axis=1)
    data['Num'] = data.apply(sentence_num, axis=1)

    # Important point is that we need unique document+Sentence-Token
    data = data.astype({"Num": int})
    df = data.groupby(['Document', 'Num'])['Word'].apply(list)
    df2 = data.groupby(['Document', 'Num'])['Tag'].apply(list)
    mergeddf = pd.merge(df, df2, on=['Document', 'Num'])
    mergeddf = mergeddf.rename(columns={'Word': 'sentence', 'Tag': 'word_labels'})

    print("Number of unique sentences: {}".format(len(mergeddf)))

    return mergeddf, labels_to_ids, ids_to_labels


def tokenize_and_align_labels(examples, tokenizer, label_all_tokens=True):
    """Tokenize pre-split sentences and spread the word-level labels over the sub-tokens.

    Parameters:
        examples: batch with "sentence" (lists of words) and "word_labels" (lists of ids).
        tokenizer: callable returning an encoding that offers word_ids(batch_index=...).
        label_all_tokens: when True every sub-token of a word gets the word's label;
            when False only the first sub-token does and the rest get -100.

    Returns the encoding with an added "labels" entry. Positions whose word id is None
    get -100 so that the loss function ignores them.
    """
    encoded = tokenizer(examples["sentence"], max_length=512, truncation=True, is_split_into_words=True)

    def _align(batch_index, word_labels):
        # Build the per-sub-token label list for one sentence of the batch.
        aligned = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=batch_index):
            if word_idx is None:
                aligned.append(-100)
            elif word_idx == last_word and not label_all_tokens:
                # Continuation piece of a word while only first pieces are labelled.
                aligned.append(-100)
            else:
                aligned.append(word_labels[word_idx])
            last_word = word_idx
        return aligned

    encoded["labels"] = [_align(i, word_labels) for i, word_labels in enumerate(examples["word_labels"])]
    return encoded

def cn_hp_space(trial):
    """Describe the hyperparameter search space by sampling from `trial`.

    Samples a learning rate and a per-device train batch size from fixed candidate lists
    and an epoch count as an integer between 3 and 10 (log-scaled), and returns them in a
    dict keyed by the corresponding training-argument names.
    """
    search_space = {}
    search_space["learning_rate"] = trial.suggest_categorical(
        "learning_rate", [1e-5, 2e-5, 3e-5, 4e-5, 5e-5]
    )
    search_space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8]
    )
    search_space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 10, log=True)
    return search_space


cpu


In [None]:
'''config = AutoConfig.from_pretrained("bert-base-uncased")
labels_to_ids = config.label2id
labels_to_ids
'''

{'LABEL_0': 0, 'LABEL_1': 1}

In [None]:
import accelerate
from transformers import TFBertForMaskedLM
import json
import numpy as np

# Run-mode switches: they select which branches of this cell execute
# (hyperparameter search, training, or prediction with an existing checkpoint).
hypsearch = False
do_train = False
do_test = True
# best found with hyperparam search
learning_rate = 4e-05
train_batch_size = 4
train_epochs = 7
#defaults from Tonelli code
#learning_rate = 4e-5
#train_batch_size = 8
#train_epochs = 10
# Base model name; in the visible code it is only referenced by the commented-out line below.
model = "bert-base-uncased"

#model_checkpoint = model
# Checkpoint directory (relative to the working directory) that the tokenizer, config and
# model weights are loaded from.
model_checkpoint = "bert-base-uncased-english-0-hyp/run-9/checkpoint-2511"
# Fold id and language are used to build file names and run names below.
fold = '0'
language = 'english'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): output_path is not referenced elsewhere in the visible code — confirm it is still needed.
output_path = "Output"
# tokenize_and_align_labels relies on word_ids(), so a fast tokenizer is required here.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# Fail early on an unsupported language or on contradictory run-mode switches.
if language not in ['english', 'german', 'italian', 'slovene', 'dutch', 'french']:
  raise Exception(f"Language error: {language} is not among the project languages.")

if do_train and hypsearch:
  # The first fragment ends with a space so the concatenated message reads "...first run hypsearch...".
  raise Exception("Action error: Cannot do hyperparameter search and train in a single run. Please first run "
                  "hypsearch and with the parameters obtained as the best, run do_train.")

# Start from the label mappings stored in the checkpoint's config; later branches may replace them.
config = AutoConfig.from_pretrained(model_checkpoint)
labels_to_ids = config.label2id
ids_to_labels = config.id2label

def model_init():
  """Load a fresh token-classification model from `model_checkpoint` with the shared
  `config`, move it to `device`, and return it."""
  return AutoModelForTokenClassification.from_pretrained(model_checkpoint, config=config).to(device)

if hypsearch or do_train:
  # Request the training split explicitly: read_split_fold defaults to split='dev', so
  # omitting it made the model train and validate on the same dev file.
  trn, labels_to_ids, ids_to_labels = read_split_fold(split="train", fold=fold)
  train_dataset = Dataset.from_pandas(trn, split="train")
  # Reuse the training label mapping so dev label ids agree with the training ones.
  val, _, _ = read_split_fold(fold=fold, split="dev", label_dict=labels_to_ids)
  val_dataset = Dataset.from_pandas(val, split="validation")

  print(labels_to_ids)
  tokenized_train = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)
  tokenized_val = val_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)
  # Make the model head match the label set found in the data.
  label_list = list(labels_to_ids.values())
  config.label2id = labels_to_ids
  config.id2label = ids_to_labels
  config.num_labels = len(label_list)

# Last path component of the checkpoint; used as the prefix of the run/output directory name.
model_name = model_checkpoint.split("/")[-1]

# Hyperparameter search leaves learning rate, train batch size and epoch count unset here
# (they are sampled via cn_hp_space); plain training fixes them from the values above.
if hypsearch:
  tr_args = TrainingArguments(
      f"{model_name}-{language}-{fold}-hyp",
      evaluation_strategy="epoch",
      save_strategy="epoch",
      per_device_eval_batch_size=8,
      warmup_ratio=0.1,
      seed=22,
      weight_decay=0.01
    )
elif do_train:
  tr_args = TrainingArguments(
      f"{model_name}-{language}-{fold}",
      evaluation_strategy="epoch",
      save_strategy="epoch",
      learning_rate=learning_rate,
      per_device_train_batch_size=train_batch_size,
      per_device_eval_batch_size=8,
      num_train_epochs=train_epochs,
      warmup_ratio=0.1,
      seed=22,
      weight_decay=0.01
    )

data_collator = DataCollatorForTokenClassification(tokenizer)

# NOTE(review): the recorded output warns that loading this metric will require
# `trust_remote_code=True` from the next major `datasets` release.
metric = load_metric("seqeval")

def compute_metrics(p):
  """Turn raw model outputs into overall seqeval scores.

  `p` unpacks into (predictions, labels). The predicted class is the argmax over axis 2.
  Positions whose gold label is -100 are dropped, the remaining ids are mapped to label
  names through `ids_to_labels`, and the overall precision, recall, F1 and accuracy
  reported by `metric` are returned in a dict.
  """
  scores, gold = p
  predicted_ids = np.argmax(scores, axis=2)

  true_predictions = []
  true_labels = []
  for pred_row, gold_row in zip(predicted_ids, gold):
    # Keep only positions that carry a real label (ignored index is -100).
    kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    true_predictions.append([ids_to_labels[pred_id] for pred_id, _ in kept])
    true_labels.append([ids_to_labels[gold_id] for _, gold_id in kept])

  results = metric.compute(predictions=true_predictions, references=true_labels)
  summary = {}
  summary["precision"] = results["overall_precision"]
  summary["recall"] = results["overall_recall"]
  summary["f1"] = results["overall_f1"]
  summary["accuracy"] = results["overall_accuracy"]
  return summary


if do_train or hypsearch:
  # model_init (rather than a model instance) lets the Trainer build a fresh model itself.
  trainer = Trainer(
      model_init=model_init,
      args=tr_args,
      train_dataset=tokenized_train,
      eval_dataset=tokenized_val,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )
elif do_test:
  #for testing
  # An optional "<language>-id2label.json" next to the checkpoint overrides the label
  # mappings taken from the checkpoint's config.
  id2label_path = f"{model_checkpoint}/{language}-id2label.json"
  if path.exists(id2label_path):
    # Context manager so the file handle is closed instead of being leaked.
    with open(id2label_path, "r", encoding="utf-8") as id2label_file:
      # JSON object keys are strings; the model config needs integer ids.
      ids_to_labels = {int(k): v for k, v in json.load(id2label_file).items()}
    labels_to_ids = {v: k for k, v in ids_to_labels.items()}
    config.label2id = labels_to_ids
    config.id2label = ids_to_labels
    label_list = list(labels_to_ids.values())
    config.num_labels = len(label_list)

  m = AutoModelForTokenClassification.from_pretrained(model_checkpoint, config=config)
  m.to(device)
  trainer = Trainer(m, data_collator=data_collator, tokenizer=tokenizer)

# Exactly one of the two actions runs; the combination was rejected earlier in this cell.
if hypsearch:
  # hyperparam search with compute_metrics: default maximization is through the sum of all the metrics returned
  best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize", hp_space=cn_hp_space)
  best_params = best_run.hyperparameters
  # Only the best hyperparameters are reported here; the message explains how to get the model itself.
  print(f"Best run is with the hyperparams:{best_params}. You either have to find the right run and checkpoint "
        f"from the models saved or retrain with the correct parameters: referring to "
        f"https://discuss.huggingface.co/t/accessing-model-after-training-with-hyper-parameter-search/20081")

elif do_train:
  trainer.train()

class NumpyEncoder(json.JSONEncoder):
  """JSON encoder that converts NumPy integers, floats and arrays to plain Python values."""

  def default(self, obj):
    """Return a JSON-serialisable equivalent of `obj`, deferring to the base class
    (which raises TypeError) for anything that is not a NumPy integer, float or array."""
    if isinstance(obj, np.ndarray):
      return obj.tolist()
    if isinstance(obj, np.integer):
      return int(obj)
    if isinstance(obj, np.floating):
      return float(obj)
    return super().default(obj)

# Prediction pass: run the loaded model over one fold file and prepare aligned
# sub-token / label sequences for the error report written in the next cell.
if do_test:
  print("TEST RESULTS")
  # Note that this reads the "train" split of the fold, using the label mapping already in scope.
  test, _, _ = read_split_fold(split="train", label_dict=labels_to_ids, fold=fold)
  test_dataset = Dataset.from_pandas(test, split="train")
  tokenized_test = test_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer),
                                    batched=True)
  # Sub-token strings per sentence with the first and last token dropped
  # (presumably the special start/end tokens — TODO confirm for other tokenizers).
  encoded_sents = []
  for batch in tokenized_test['input_ids']:
    encoded_sents.append(tokenizer.convert_ids_to_tokens(batch)[1:-1])

  predictions, labels, _ = trainer.predict(tokenized_test)
  # print(predictions.shape)
  # Predicted class id per position = argmax over axis 2.
  predictions = np.argmax(predictions, axis=2)
  # print('Predictions', predictions.shape)

  # TODO have a datastructure that maps original tokens to subtokens
  # this is the hardest part
  # record how tokens are matched with the subtoken indices
  # gap between len 75 and 82

  # Gold label names for every position whose label id is not -100.
  readable_labels = [
      [ids_to_labels[l] for (p, l) in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, labels)
  ]

  # Despite the name, this holds (predicted_id, gold_id) pairs — not label strings — for the
  # same positions; generate_readable_output reads the predicted id from element 0.
  readable_predictions = [
      [(p, l) for (p, l) in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, labels)
  ]



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


TEST RESULTS
Number of tags: 10
O                  230672
B-Smell_Source       1176
I-Smell_Source       1088
B-Smell_Word          992
B-Quality             736
I-Quality             488
I-Location             32
I-Smell_Word           16
B-Location              8
B-Odour_Carrier         8
Name: Tag, dtype: int64
Number of unique sentences: 1114


Map:   0%|          | 0/1114 [00:00<?, ? examples/s]

In [None]:
# Function to generate human-readable output
def generate_readable_output(predictions, labels, sentences, encoded_sents, json_filename="output_train.json"):
    """Collect the misclassified entity sub-tokens per sentence and dump them to JSON.

    Parameters:
        predictions: per sentence, one item per labelled sub-token. Each item is either a
            predicted label id or a sequence whose first element is the predicted id
            (e.g. a (predicted_id, gold_id) pair).
        labels: per sentence, the gold label names aligned with `predictions`.
        sentences: per sentence, the original word list.
        encoded_sents: per sentence, the sub-token strings aligned with `predictions`.
        json_filename: path of the JSON file to write.

    Only sub-tokens whose gold label is neither -100 nor 'O' and whose predicted label
    differs from the gold label are recorded. Predicted ids are mapped to names through the
    module-level `ids_to_labels`.

    Returns:
        The list of per-sentence dicts ({"sentence", "subtokens", "entities"}) that was
        written to `json_filename`.
    """
    output_list = []
    for sentence, preds, true_labels, sent in zip(sentences, predictions, labels, encoded_sents):
        sentence_output = {"sentence": sentence, "subtokens": sent, "entities": []}
        for pred, true_label, subtoken in zip(preds, true_labels, sent):
            # Skip ignored positions and non-entity tokens: only gold entities are reported.
            if true_label == -100 or true_label == 'O':
                continue
            # Accept both a bare id and a sequence carrying the predicted id first.
            if np.ndim(pred) > 0:
                if len(pred) == 0:
                    continue
                pred = pred[0]
            predicted_label = ids_to_labels[pred]
            if predicted_label != true_label:
                sentence_output["entities"].append({"word": subtoken,
                                                    "predicted_label": predicted_label,
                                                    "true_label": true_label})
        output_list.append(sentence_output)

    # Serialize the output_list to JSON using the custom encoder
    with open(json_filename, 'w') as json_file:
        json.dump(output_list, json_file, cls=NumpyEncoder)
    print(f"Test results saved to {json_filename}.")

    # Return the collected data so callers that assign the result do not get None.
    return output_list


# Word lists of the evaluated sentences, in the same order as the predictions.
original_sentences = test_dataset["sentence"]
# Generate human-readable output
# (readable_predictions holds (predicted_id, gold_id) pairs; readable_labels holds gold label names.)
output_list = generate_readable_output(readable_predictions, readable_labels, original_sentences, encoded_sents)

Test results saved to output_train.json.


In [None]:
import json

# Read the JSON report back and keep only the sentences that have at least one recorded entity.
with open('output_train.json', 'r') as f:
    data = list(filter(lambda entry: entry['entities'], json.load(f)))

# Function to format the entities list for a sentence
def format_entities(entities):
    """Render a sentence's entity records as tab-separated report lines.

    Each record contributes one line of the form
    '\\n"<word>"\\t\\t<predicted_label>\\t\\t<true_label>'.

    `entity['word']` may be a single string (one sub-token) or a sequence of strings; only
    sequences are joined with spaces. Joining a plain string would insert a space between
    every character ('odour' -> 'o d o u r').

    Returns the concatenated lines ('' for an empty list).
    """
    lines = []
    for entity in entities:
        word = entity['word']
        if not isinstance(word, str):
            word = ' '.join(word)
        predicted_label = entity['predicted_label']
        true_label = entity['true_label']
        lines.append(f"\n\"{word}\"\t\t{predicted_label}\t\t{true_label}")
    return "".join(lines)

# Write formatted sentences to a .txt file.
# The encoding is given explicitly because sentences may contain non-ASCII characters and
# the platform default encoding is not guaranteed to handle them.
with open('formatted_sentences_train.txt', 'w', encoding='utf-8') as outfile:
    for entry in data:
        sentence = ' '.join(entry['sentence'])
        entities = entry['entities']
        formatted_entities = format_entities(entities)
        outfile.write(f"\nsentence: {sentence}\n\nentities: [{formatted_entities}\n]")


In [None]:
# Debug output: prints the whole `original_sentences` object.
# NOTE(review): the recorded output below does not look like it came from this statement — re-run to confirm.
print(original_sentences)


82
82
82
75
['enormous', 'tropical', 'forests', ',', 'little', 'known', 'to', 'man', ',', 'and', 'from', 'which', 'he', 'gathers', 'here', 'and', 'there', 'the', 'treasures', 'for', 'our', 'orchid', '-', 'and', 'greenhouses', ';', 'great', 'island', 'conservatories', 'like', 'Java', 'and', 'Ceylon', 'and', 'Borneo', ',', 'rich', 'in', 'spices', 'and', 'lovely', 'plant', 'life', ':', 'Australian', 'Bush', ',', 'with', 'traces', 'of', 'plant', 'life', 'as', 'if', 'from', 'another', 'world', ',', 'but', 'often', 'most', 'delicate', 'in', 'odour', 'even', 'in', 'the', 'fragments', 'of', 'them', 'we', 'see', 'in', 'our', 'greenhouses']


NameError: name 'true_label' is not defined