In [1]:
! pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 7.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 60.1 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 50.2 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 41.7 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3

In [2]:
from pathlib import Path
from argparse import Namespace
from typing import Union, List
from fastprogress import progress_bar
from typing_extensions import TypedDict
import torch
from datasets import Dataset, DatasetDict, load_metric
import numpy as np
from transformers import DataCollatorForTokenClassification, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer


In [3]:
# Root folder holding the corpus splits.
# NOTE(review): absolute, user-specific Drive path — consider making it configurable.
data_dir = Path("/content/drive/Shareddrives/derzart@gmail.com/studies/nlp-final-project/data")

# Loader-style settings kept in one namespace.
# NOTE(review): nothing visible in this notebook reads `args` — confirm it is still needed.
args = Namespace(batch_size=64, num_workers=4)

# One file per split, all directly under data_dir.
train_data_path = data_dir.joinpath('%-training')
valid_data_path = data_dir.joinpath('%-dev')
test_data_path = data_dir.joinpath('total-test')

In [21]:
# Word strings that parse_input swaps for their literal symbol.
SYMBOL_DICT = {"COMMA": ","}

# Role labels. A label's position in the list is its integer class id, so order matters.
LABEL_LIST = ["NONE", "PRED", "ARG1", "SUPPORT"]
# LABEL_LIST = ["NONE", "ARG1"]

# Accepted part-of-speech tags; list position is the integer id used in the dataset.
POS_LIST = (
    "CC CD DT FW IN JJ JJR JJS "
    "LS MD NN NNS NNP NNPS PDT "
    "POS PRP PRP$ RB RBR RBS SYM "
    "TO UH VB VBD VBG VBN VBP "
    "VBZ WDT WP WP$ WRB PU EX "
    "RP AUX"
).split()

# Raw tag strings that parse_input collapses into the single "PU" POS tag
# (despite the name, parse_input applies this mapping to the POS column).
BIO_TAG_CONVERSION_DICT = dict.fromkeys(
    [".", ",", "COMMA", "$", ":", "(", ")", "``", "''", "#", "/", "-"],
    "PU",
)

# Chunk tags: "O" followed by a B-/I- pair per phrase type; list position is the integer id.
BIO_TAG_LIST = ["O"] + [
    f"{prefix}-{phrase}"
    for phrase in ("NP", "VP", "PP", "ADJP", "ADVP", "SBAR",
                   "PRT", "CONJP", "UCP", "LST", "INTJ")
    for prefix in ("B", "I")
]

In [22]:
# One parsed token row, as produced by parse_input.
Word = TypedDict(
    "Word",
    {
        "word": str,                 # token text (after symbol substitution)
        "pos": str,                  # part-of-speech tag
        "biotag": str,               # BIO chunk tag
        "label": Union[str, None],   # role label, or None when labels are dropped
    },
)

def parse_input(input_file: Union[str, Path], drop_label = False) -> "List[List[Word]]":
    """
    Parse a tab-separated token file into sentences.

    A line with at least five tab-separated fields is a token row:
    field 0 is the word, field 1 the POS tag, field 2 the BIO chunk tag and,
    when present, field 5 the role label (a missing label means "NONE").
    Any other line (e.g. a blank line) ends the current sentence.

    Unknown POS tags, chunk tags and labels are reported with a printed warning
    and replaced by "PU", "O" and "NONE" respectively.

    Args:
        input_file: Path of the file to read (decoded as UTF-8).
        drop_label: When True, every returned word has ``label`` set to None.

    Returns:
        A list of non-empty sentences, each a list of ``Word`` dicts.
    """
    # (The return annotation is quoted so it is not evaluated at definition time.)
    # Explicit encoding so parsing does not depend on the platform's default locale.
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    sentences: List[List[Word]] = []
    last_sentence: List[Word] = []
    print("Parsing input file lines...")
    for line_no, line in enumerate(progress_bar(lines), start=1):
        word_info = line.strip().split("\t")

        if len(word_info) < 5:
            # Not a token row: close the sentence collected so far (if any).
            if last_sentence:
                sentences.append(last_sentence)
            last_sentence = []
            continue

        word_str = word_info[0].strip()
        word_str = SYMBOL_DICT.get(word_str, word_str)

        pos = word_info[1].strip()
        pos = BIO_TAG_CONVERSION_DICT.get(pos, pos)
        if pos not in POS_LIST:
            print(f"Warning: invalid POS on line {line_no} \"{pos}\", treated as PU.")
            pos = "PU"

        biotag = word_info[2].strip()
        if biotag not in BIO_TAG_LIST:
            print(f"Warning: invalid bio tag on line {line_no} \"{biotag}\", treated as O.")
            biotag = "O"

        # The label column is optional; rows without it are unlabelled tokens.
        label = word_info[5].strip() if len(word_info) >= 6 else "NONE"
        if label not in LABEL_LIST:
            print(f"Warning: invalid label on line {line_no} \"{label}\", treated as NONE.")
            label = "NONE"
        if drop_label:
            label = None

        last_sentence.append(Word(word=word_str, pos=pos, biotag=biotag, label=label))

    # Flush the final sentence when the file does not end with a separator line.
    if last_sentence:
        sentences.append(last_sentence)
    return sentences

In [23]:
# Training and dev splits keep their labels; the test split is parsed with labels dropped.
train_sentences = parse_input(input_file=train_data_path, drop_label=False)
valid_sentences = parse_input(input_file=valid_data_path, drop_label=False)
test_sentences = parse_input(input_file=test_data_path, drop_label=True)

Parsing input file lines...


Parsing input file lines...


Parsing input file lines...


In [24]:
# Tokenizer and model are loaded from the same checkpoint; the classification
# head gets one output per entry in LABEL_LIST.
_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    _checkpoint,
    num_labels=len(LABEL_LIST),
)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.18.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10

In [25]:
def build_dataset_from_sentences(sentences, drop_label = False):
  """
  Turn parsed sentences into a column-oriented `Dataset`.

  Columns (one list per sentence):
    - "tokens": the word strings
    - "partitive_roles": index of each word's label in LABEL_LIST
      (omitted entirely when `drop_label` is True)
    - "pos_tags": index of each word's POS tag in POS_LIST
    - "bio_tags": index of each word's chunk tag in BIO_TAG_LIST
  """
  columns = {"tokens": [[word['word'] for word in sentence] for sentence in sentences]}
  if not drop_label:
    columns["partitive_roles"] = [
        [LABEL_LIST.index(word['label']) for word in sentence]
        for sentence in sentences
    ]
  columns["pos_tags"] = [
      [POS_LIST.index(word['pos']) for word in sentence]
      for sentence in sentences
  ]
  columns["bio_tags"] = [
      [BIO_TAG_LIST.index(word['biotag']) for word in sentence]
      for sentence in sentences
  ]
  return Dataset.from_dict(columns)

In [26]:
# Labelled splits carry a "partitive_roles" column; the test split is built without it.
train_raw_dataset = build_dataset_from_sentences(sentences=train_sentences, drop_label=False)
valid_raw_dataset = build_dataset_from_sentences(sentences=valid_sentences, drop_label=False)
test_raw_dataset = build_dataset_from_sentences(sentences=test_sentences, drop_label=True)

raw_datasets = DatasetDict({
    "train": train_raw_dataset,
    "valid": valid_raw_dataset,
    "test": test_raw_dataset,
})

In [27]:
# True: every sub-token of a word receives the word's label.
# False: only the first sub-token does; the rest get -100.
label_all_tokens = True


def _align_word_labels(word_ids, word_labels):
    """Expand one sentence's per-word labels to one label per sub-token."""
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            # Special tokens have no word id; -100 marks them as ignored.
            aligned.append(-100)
        elif word_idx == previous_word_idx and not label_all_tokens:
            # Continuation sub-token of the same word, and only first sub-tokens are labelled.
            aligned.append(-100)
        else:
            aligned.append(word_labels[word_idx])
        previous_word_idx = word_idx
    return aligned


def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and, when the batch has a
    "partitive_roles" column, add a "labels" entry aligned to the sub-tokens.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # Unlabelled batches are returned as tokenized, with no "labels" entry.
    if "partitive_roles" not in examples:
        return tokenized_inputs

    tokenized_inputs["labels"] = [
        _align_word_labels(tokenized_inputs.word_ids(batch_index=i), word_labels)
        for i, word_labels in enumerate(examples["partitive_roles"])
    ]
    return tokenized_inputs

In [28]:
# Apply tokenization (and label alignment where a split has "partitive_roles")
# to every split, processing examples in batches.
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [29]:
# Same batch size for training and evaluation.
batch_size = 64

# Evaluate once per epoch; checkpoints/logs go under "bert_partitive_roles".
train_args = TrainingArguments(
    output_dir="bert_partitive_roles",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
)

# Collator used by the Trainer to batch tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metric object used by compute_metrics.
metric = load_metric("seqeval")

def compute_metrics(p):
  """
  Convert raw model scores and gold label ids into label-name sequences and
  score them with the module-level `metric`.

  `p` is a (predictions, labels) pair; predictions are reduced with argmax over
  axis 2, and every position whose gold label is -100 is left out of both
  sequences. Returns overall precision, recall, f1 and accuracy.

  NOTE(review): LABEL_LIST entries carry no B-/I- prefix — confirm that
  seqeval's entity chunking treats them the way these scores are meant to be read.
  """
  scores, gold_ids = p
  predicted_ids = np.argmax(scores, axis=2)

  true_predictions = []
  true_labels = []
  for predicted_row, gold_row in zip(predicted_ids, gold_ids):
    # Keep only positions that carry a real label (ignored index is -100).
    kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
    true_predictions.append([LABEL_LIST[pred] for pred, _ in kept])
    true_labels.append([LABEL_LIST[gold] for _, gold in kept])

  results = metric.compute(predictions=true_predictions, references=true_labels)
  return {
      "precision": results["overall_precision"],
      "recall": results["overall_recall"],
      "f1": results["overall_f1"],
      "accuracy": results["overall_accuracy"],
  }

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
# Wire model, arguments, data and metric function together; evaluation runs on the "valid" split.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [31]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, partitive_roles, bio_tags, pos_tags. If tokens, partitive_roles, bio_tags, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2367
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 370


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.182983,0.515581,0.366935,0.42874,0.935199
2,No log,0.089108,0.740812,0.772177,0.75617,0.964724
3,No log,0.071319,0.775591,0.794355,0.784861,0.968175
4,No log,0.067488,0.766355,0.826613,0.795344,0.968942
5,No log,0.062558,0.786127,0.822581,0.803941,0.970475
6,No log,0.06053,0.768519,0.836694,0.801158,0.970475
7,No log,0.061665,0.759928,0.84879,0.801905,0.970475
8,No log,0.060739,0.774436,0.830645,0.801556,0.971242
9,No log,0.0608,0.778195,0.834677,0.805447,0.971626
10,No log,0.060686,0.777989,0.826613,0.801564,0.971242


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, partitive_roles, bio_tags, pos_tags. If tokens, partitive_roles, bio_tags, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, partitive_roles, bio_tags, pos_tags. If tokens, partitive_roles, bio_tags, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 83
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, partitive_roles, b

TrainOutput(global_step=370, training_loss=0.09212669166358742, metrics={'train_runtime': 145.6445, 'train_samples_per_second': 162.519, 'train_steps_per_second': 2.54, 'total_flos': 420538030861776.0, 'train_loss': 0.09212669166358742, 'epoch': 10.0})

In [32]:
# Key of the split in tokenized_datasets that is predicted on and exported.
test_set = 'test'

In [33]:
# Run the trained model over the chosen split; test_results.predictions is
# indexed per example when the predictions are decoded.
test_results = trainer.predict(tokenized_datasets[test_set])

The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, bio_tags, pos_tags. If tokens, bio_tags, pos_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 746
  Batch size = 64


In [38]:
# Decode model output into one list of (token, label-or-None) pairs per sentence.
out = []

test_dataset = tokenized_datasets[test_set]

for i in range(len(test_dataset)):
  sentence = test_dataset[i]
  # Re-tokenize the sentence to obtain word_ids(), i.e. which word each sub-token belongs to.
  tokenized_input = tokenizer(sentence["tokens"], truncation=True, is_split_into_words=True)
  # Best-scoring class id for every sub-token position of this example.
  label_ids = np.argmax(test_results.predictions[i], axis=-1)

  # Position of the FIRST sub-token of each word; that sub-token's prediction
  # is used as the word's label. Special tokens (word id None) are skipped.
  word_id_to_label_idx = {}
  for j, word_id in enumerate(tokenized_input.word_ids()):
    if word_id is not None:
      word_id_to_label_idx.setdefault(word_id, j)

  labelings = []
  for j, token in enumerate(sentence["tokens"]):
    label_idx = word_id_to_label_idx.get(j)
    if label_idx is None:
      # The word has no sub-token in the encoding (e.g. it was cut off by
      # truncation); emit it unlabelled instead of failing with a KeyError.
      label = None
    else:
      label_id = label_ids[label_idx]
      # Class id 0 is "NONE" in LABEL_LIST and is reported as no label.
      label = LABEL_LIST[label_id] if label_id != 0 else None
    labelings.append((token, label))
  out.append(labelings)


In [39]:
# Inspect the decoded (token, label) pairs of the first sentence.
out[0][:]

[('Then', None),
 ('in', None),
 ('a', None),
 ('lightning', None),
 ('plunge', None),
 (',', None),
 ('the', None),
 ('Dow', None),
 ('Jones', None),
 ('industrials', 'ARG1'),
 ('in', None),
 ('barely', None),
 ('an', None),
 ('hour', None),
 ('surrendered', None),
 ('about', None),
 ('a', None),
 ('third', None),
 ('of', None),
 ('their', None),
 ('gains', 'ARG1'),
 ('this', None),
 ('year', None),
 (',', None),
 ('chalking', None),
 ('up', None),
 ('a', None),
 ('190.58-point', None),
 (',', None),
 ('or', None),
 ('6.9', None),
 ('%', 'PRED'),
 (',', None),
 ('loss', 'SUPPORT'),
 ('on', None),
 ('the', None),
 ('day', None),
 ('in', None),
 ('gargantuan', None),
 ('trading', None),
 ('volume', None),
 ('.', None)]

In [40]:
# Predictions file: written to an `out` directory that sits next to the data directory.
out_path = data_dir / '..' / 'out' / 'test-out-distilbert'

In [41]:
# Write one token per line, followed by a tab and its label when it has one;
# sentences are separated by a blank line.

# Create the output directory if it is missing so open() does not fail.
out_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default.
with open(out_path, 'w', encoding='utf-8') as f:
  for labelings in out:
    for token, label in labelings:
      if label:
        f.write(f"{token}\t{label}\n")
      else:
        f.write(f"{token}\n")
    f.write("\n")

In [None]:
def test_sentence_string(s: str):
  tokenized_input = tokenizer(s, truncation=True)
  model(tokenized_input)