In [4]:
! pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 9.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 52.7 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.7 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 70.6 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 50.2 MB/s 
Collecting fsspec[http]>=20

In [5]:
from pathlib import Path
from argparse import Namespace
from typing import Union, List
from fastprogress import progress_bar
from typing_extensions import TypedDict
import torch
from datasets import Dataset, DatasetDict, load_metric
import numpy as np
from transformers import DataCollatorForTokenClassification, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer


In [6]:
# Root of the shared Google Drive folder that holds the data splits.
data_dir = Path("/content/drive/Shareddrives/derzart@gmail.com/studies/nlp-final-project/data")

# Loader-style settings gathered in a single namespace.
args = Namespace(batch_size=64, num_workers=4)

# One file per split; the leading "%-" is literally part of each file name.
train_data_path = data_dir.joinpath('%-training')
valid_data_path = data_dir.joinpath('%-dev')
test_data_path = data_dir.joinpath('%-test')

In [7]:
# Spelled-out symbol names that occur in the word column, mapped to the literal symbol.
SYMBOL_DICT = {"COMMA": ","}

# Role labels; a label's position in this list is used as its class id.
LABEL_LIST = ["NONE", "PRED", "ARG1", "SUPPORT"]

# Accepted part-of-speech tags. "PU" is the bucket that punctuation-like tags are folded into.
POS_LIST = [
    "CC", "CD", "DT", "FW", "IN",
    "JJ", "JJR", "JJS",
    "LS", "MD",
    "NN", "NNS", "NNP", "NNPS",
    "PDT", "POS", "PRP", "PRP$",
    "RB", "RBR", "RBS",
    "SYM", "TO", "UH",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
    "PU", "EX", "RP",
]

# Despite the name, this table is applied to the POS column: every raw
# punctuation-style POS tag listed here is rewritten to "PU".
BIO_TAG_CONVERSION_DICT = dict.fromkeys(
    [".", ",", "COMMA", "$", ":", "(", ")", "``", "''", "#"],
    "PU",
)

# Chunk tags: "O" followed by a B-/I- pair for every chunk type, in this order.
_CHUNK_TYPES = ("NP", "VP", "PP", "ADJP", "ADVP", "SBAR", "PRT", "CONJP", "UCP")
BIO_TAG_LIST = ["O"] + [
    f"{prefix}-{chunk_type}"
    for chunk_type in _CHUNK_TYPES
    for prefix in ("B", "I")
]

In [8]:
class Word(TypedDict):
  """One token of a parsed sentence, in the shape built by parse_input."""
  # Surface form of the token (names found in SYMBOL_DICT are replaced by the symbol).
  word: str
  # Part-of-speech tag; parse_input only emits values that are in POS_LIST.
  pos: str
  # Chunk tag; parse_input only emits values that are in BIO_TAG_LIST.
  biotag: str
  # Role label from LABEL_LIST, or None when parse_input ran with drop_label=True.
  label: Union[str, None]

def parse_input(input_file: Union[str, Path], drop_label = False) -> List[Union[List[Word], None]]:
    """
    Parse a tab-separated, one-token-per-line file into a list of sentences.

    A line with at least five tab-separated fields describes one token:
    field 0 is the word, field 1 the POS tag, field 2 the chunk (BIO) tag and,
    when a sixth field exists, field 5 is the role label. Any line with fewer
    than five fields (a blank line, for instance) closes the current sentence.

    Normalisation applied to every token:
      * a word found in SYMBOL_DICT is replaced by the mapped symbol;
      * a POS tag found in BIO_TAG_CONVERSION_DICT is remapped; a POS tag that
        is still not in POS_LIST is reported and replaced by "PU";
      * a chunk tag not in BIO_TAG_LIST is reported and replaced by "O";
      * a missing label becomes "NONE"; a label not in LABEL_LIST is reported
        and replaced by "NONE".

    Args:
        input_file: Path of the file to read (decoded as UTF-8).
        drop_label: When true, every returned word carries ``label=None``.

    Returns:
        The non-empty sentences in file order, each a list of ``Word`` dicts.
    """
    # Explicit encoding: without it open() uses the locale's preferred encoding,
    # so the same file could decode differently from one machine to another.
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    sentences: List[Union[List[Word], None]] = []
    last_sentence: List[Word] = []
    print("Parsing input file lines...")
    # Line numbers are 1-based so warnings match what a text editor shows.
    for line_no, line in enumerate(progress_bar(lines), start=1):
        word_info = line.strip().split("\t")
        if len(word_info) >= 5:
            word_str = word_info[0].strip()
            word_str = SYMBOL_DICT.get(word_str, word_str)
            pos = word_info[1].strip()
            pos = BIO_TAG_CONVERSION_DICT.get(pos, pos)
            if pos not in POS_LIST:
                print(f"Warning: invalid POS on line {line_no} \"{pos}\", treated as PU.")
                pos = "PU"
            biotag = word_info[2].strip()
            if biotag not in BIO_TAG_LIST:
                print(f"Warning: invalid bio tag on line {line_no} \"{biotag}\", treated as O.")
                biotag = "O"
            # The label column is optional; its absence means "no role".
            if len(word_info) >= 6:
                label = word_info[5].strip()
            else:
                label = "NONE"
            if label not in LABEL_LIST:
                print(f"Warning: invalid label on line {line_no} \"{label}\", treated as NONE.")
                label = "NONE"
            if drop_label:
                label = None
            last_sentence.append(Word(word=word_str, pos=pos, biotag=biotag, label=label))
        else:
            # Sentence boundary: flush what has been collected, skipping empty runs.
            if len(last_sentence) > 0:
                sentences.append(last_sentence)
            last_sentence = []
    # The file may not end with a boundary line; keep the trailing sentence.
    if len(last_sentence) > 0:
        sentences.append(last_sentence)
    return sentences

In [9]:
# Training and dev splits keep their role labels; the test split is parsed without them.
train_sentences = parse_input(input_file=train_data_path)
valid_sentences = parse_input(input_file=valid_data_path)
test_sentences = parse_input(input_file=test_data_path, drop_label=True)

Parsing input file lines...


Parsing input file lines...


Parsing input file lines...


In [10]:
# The tokenizer and the classifier are both loaded from the same checkpoint.
MODEL_CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# The token-classification head gets one output per entry of LABEL_LIST.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(LABEL_LIST),
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [11]:
def build_dataset_from_sentences(sentences, drop_label = False):
    """
    Turn parsed sentences into a column-oriented ``Dataset``.

    Each sentence contributes one row. "tokens" holds the word strings, while
    "pos_tags" and "bio_tags" hold each word's position in POS_LIST and
    BIO_TAG_LIST. Unless ``drop_label`` is true, "partitive_roles" holds each
    word's label position in LABEL_LIST; with ``drop_label`` the column is
    omitted altogether.
    """
    keep_labels = not drop_label
    token_rows, role_rows, pos_rows, bio_rows = [], [], [], []

    for words in sentences:
        token_rows.append([entry['word'] for entry in words])
        if keep_labels:
            role_rows.append([LABEL_LIST.index(entry['label']) for entry in words])
        pos_rows.append([POS_LIST.index(entry['pos']) for entry in words])
        bio_rows.append([BIO_TAG_LIST.index(entry['biotag']) for entry in words])

    # Insert the columns in a fixed order: tokens, (roles), POS tags, chunk tags.
    columns = {"tokens": token_rows}
    if keep_labels:
        columns["partitive_roles"] = role_rows
    columns["pos_tags"] = pos_rows
    columns["bio_tags"] = bio_rows
    return Dataset.from_dict(columns)

In [12]:
# Labelled splits first, then the unlabelled test split (built without a role column).
train_raw_dataset = build_dataset_from_sentences(sentences=train_sentences)
valid_raw_dataset = build_dataset_from_sentences(sentences=valid_sentences)
test_raw_dataset = build_dataset_from_sentences(sentences=test_sentences, drop_label=True)

# Bundle the three splits under the names used throughout the rest of the notebook.
raw_datasets = DatasetDict({
    "train": train_raw_dataset,
    "valid": valid_raw_dataset,
    "test": test_raw_dataset,
})

In [13]:
# When True every sub-word token inherits its word's label; when False only the
# first sub-word token of each word is labelled and the rest get -100.
label_all_tokens = True

def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and, if present, align role labels.

    ``examples["tokens"]`` is tokenized with truncation. When the batch has a
    "partitive_roles" column, a "labels" entry is added with one id per
    produced token: -100 for tokens without a word id (special tokens), the
    word's label for the first token of a word, and either the word's label or
    -100 for the remaining tokens of that word depending on ``label_all_tokens``.
    Batches without "partitive_roles" are returned with no "labels" entry.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    if "partitive_roles" not in examples:
        return tokenized_inputs

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["partitive_roles"]):
        token_labels = []
        prev_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_idx is None:
                # No source word (special token): -100 is skipped by the loss.
                token_labels.append(-100)
            elif word_idx == prev_word and not label_all_tokens:
                # Continuation piece of a word, and only first pieces are labelled.
                token_labels.append(-100)
            else:
                token_labels.append(word_labels[word_idx])
            prev_word = word_idx
        aligned_labels.append(token_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [14]:
# Tokenize every split in batches; splits that have a "partitive_roles" column
# also receive a token-aligned "labels" column.
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
def compute_metrics(p):
    """
    Score a ``(predictions, labels)`` pair with the module-level ``metric``.

    Predictions are reduced to class ids with an argmax over axis 2. Positions
    whose reference id is -100 are dropped from both sides, and the remaining
    ids are mapped to their LABEL_LIST names before scoring.

    Returns a dict with the overall "precision", "recall", "f1" and "accuracy".

    NOTE(review): seqeval is designed around IOB-style tags, while the
    LABEL_LIST names carry no B-/I- prefix. Confirm the chunk-level scores
    mean what is intended.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only positions that carry a real label (ignored index is -100).
        kept = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        true_predictions.append([LABEL_LIST[predicted] for predicted, _ in kept])
        true_labels.append([LABEL_LIST[reference] for _, reference in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [16]:
# Same batch size for training and evaluation.
batch_size = 64

# Checkpoints and logs go to "bert_partitive_roles"; evaluation runs once per epoch.
train_args = TrainingArguments(
    output_dir="bert_partitive_roles",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
)

# Collates tokenized examples into batches for the token-classification model.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Scorer that compute_metrics reads as a module-level name.
metric = load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [17]:
# Wire the model, hyper-parameters, data splits and metric callback together.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [18]:
# Fine-tune with the settings in train_args; the "valid" split is evaluated after every epoch.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: partitive_roles, bio_tags, tokens, pos_tags. If partitive_roles, bio_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2367
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 370


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.158165,0.565327,0.453629,0.503356,0.943252
2,No log,0.082431,0.75,0.810484,0.77907,0.967408
3,No log,0.068327,0.756757,0.846774,0.799239,0.969709
4,No log,0.064097,0.765455,0.84879,0.804971,0.970475
5,No log,0.061665,0.778846,0.816532,0.797244,0.970475
6,No log,0.059725,0.76779,0.826613,0.796117,0.970475
7,No log,0.059067,0.788973,0.836694,0.812133,0.972776
8,No log,0.059283,0.792157,0.814516,0.803181,0.971626
9,No log,0.059789,0.767658,0.832661,0.798839,0.970475
10,No log,0.059383,0.776119,0.83871,0.806202,0.971626


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: partitive_roles, bio_tags, tokens, pos_tags. If partitive_roles, bio_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: partitive_roles, bio_tags, tokens, pos_tags. If partitive_roles, bio_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: partitive_roles, bio_tags,

TrainOutput(global_step=370, training_loss=0.08984487378919447, metrics={'train_runtime': 136.0864, 'train_samples_per_second': 173.934, 'train_steps_per_second': 2.719, 'total_flos': 421922697575040.0, 'train_loss': 0.08984487378919447, 'epoch': 10.0})

In [19]:
# Run the fine-tuned model over the unlabelled test split; test_results.predictions
# is decoded into per-word labels further down.
test_results = trainer.predict(tokenized_datasets['test'])

The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: bio_tags, tokens, pos_tags. If bio_tags, tokens, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 150
  Batch size = 64


In [23]:
# Scratch check: when the maximum occurs more than once, np.argmax returns the
# index of its first occurrence (1 for this input).
np.argmax([1, 5, 3, 5])

1

In [25]:
def _first_subword_positions(word_ids):
  """Map each word index to the position of its first sub-word token."""
  positions = {}
  for position, word_id in enumerate(word_ids):
    # Entries with no word id (special tokens) and later pieces of a word are skipped.
    if word_id is not None and word_id not in positions:
      positions[word_id] = position
  return positions


# Decode the test predictions into one list of (token, label-or-None) pairs per sentence.
out = []

# Words predicted as "NONE" are reported with no label at all.
none_label_id = LABEL_LIST.index("NONE")
test_split = tokenized_datasets['test']

for i in range(len(test_split)):
  sentence = test_split[i]
  # Re-tokenize to recover the token -> word mapping for this sentence.
  tokenized_input = tokenizer(sentence["tokens"], truncation=True, is_split_into_words=True)
  # One argmax over the class axis instead of a Python loop over tokens.
  label_ids = np.argmax(test_results.predictions[i], axis=-1)
  first_positions = _first_subword_positions(tokenized_input.word_ids())
  labelings = []
  for j, token in enumerate(sentence["tokens"]):
    label_idx = first_positions.get(j)
    if label_idx is None:
      # The word produced no sub-word token (cut off by truncation, or dropped
      # by the tokenizer): emit it unlabelled instead of raising KeyError.
      label = None
    else:
      label_id = label_ids[label_idx]
      label = LABEL_LIST[label_id] if label_id != none_label_id else None
    labelings.append((token, label))
  out.append(labelings)


In [26]:
# Peek at the first ten (token, label) pairs of the first test sentence.
out[0][:10]

[('Then', None),
 ('in', None),
 ('a', None),
 ('lightning', None),
 ('plunge', None),
 (',', None),
 ('the', None),
 ('Dow', None),
 ('Jones', None),
 ('industrials', 'ARG1')]

In [30]:
# Destination of the prediction file, placed inside the data directory.
out_path = data_dir / 'test-out2'

In [31]:
# Write one token per line, followed by a tab and its label when one was predicted;
# a blank line separates sentences. The encoding is explicit so the file does not
# depend on the locale of the machine that runs the notebook.
with open(out_path, 'w', encoding='utf-8') as f:
  for labelings in out:
    for token, label in labelings:
      if label:
        f.write(f"{token}\t{label}\n")
      else:
        f.write(f"{token}\n")
    f.write("\n")

In [None]:
def test_sentence_string(s: str):
  """
  Run the fine-tuned model on a raw sentence string.

  The string is tokenized by the module-level ``tokenizer`` and passed through
  the module-level ``model``.

  Args:
      s: The sentence to label.

  Returns:
      A list of ``(token, label)`` pairs, one per token produced by the
      tokenizer (special tokens included), where ``label`` is the LABEL_LIST
      name of the highest-scoring class for that token.
  """
  # The model needs tensors passed as keyword arguments; the tokenizer returns
  # plain Python lists unless return_tensors is given.
  tokenized_input = tokenizer(s, truncation=True, return_tensors="pt")
  # Put the inputs on whatever device the model currently lives on.
  tokenized_input = tokenized_input.to(model.device)
  # Inference only: turn dropout off and skip gradient tracking.
  model.eval()
  with torch.no_grad():
    logits = model(**tokenized_input).logits
  label_ids = logits.argmax(dim=-1)[0].tolist()
  tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0])
  return [(token, LABEL_LIST[label_id]) for token, label_id in zip(tokens, label_ids)]