In [1]:
! pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 6.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 68.5 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 72.3 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 77.4 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 59.4 MB/s 
Collecting huggingface-hub<1.0.0,>=0

In [2]:
from pathlib import Path
from argparse import Namespace
from typing import Union, List, Tuple, Optional
from fastprogress import progress_bar
from typing_extensions import TypedDict
import torch
from datasets import Dataset, DatasetDict, load_metric
import numpy as np
from transformers import DataCollatorForTokenClassification, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DistilBertPreTrainedModel, PretrainedConfig, DistilBertModel
from transformers.modeling_outputs import TokenClassifierOutput
from dataclasses import dataclass

In [3]:
# Absolute location of the folder that holds the three split files.
data_dir = Path("/content/drive/Shareddrives/derzart@gmail.com/studies/nlp-final-project/data")

# Run-wide settings collected in one namespace.
args = Namespace(batch_size=64, num_workers=4)

# One file per split, each directly under data_dir.
train_data_path, valid_data_path, test_data_path = (
    data_dir.joinpath(split_file)
    for split_file in ('total-training', 'total-dev', 'total-test')
)

In [5]:
# Placeholder words that parse_input swaps for their literal symbol.
SYMBOL_DICT = {"COMMA": ","}

# Role labels; a label's position in the list is its integer class id.
LABEL_LIST = ["NONE", "PRED", "ARG1", "SUPPORT"]
# LABEL_LIST = ["NONE", "ARG1"]

# Accepted part-of-speech tags; the list position is the integer id fed to the model.
POS_LIST = (
    "CC CD DT FW IN JJ JJR JJS "
    "LS MD NN NNS NNP NNPS PDT "
    "POS PRP PRP$ RB RBR RBS SYM "
    "TO UH VB VBD VBG VBN VBP "
    "VBZ WDT WP WP$ WRB PU EX "
    "RP AUX"
).split()

# Despite its name, parse_input applies this mapping to the POS column:
# every punctuation-like tag collapses to the single tag "PU".
BIO_TAG_CONVERSION_DICT = dict.fromkeys(
    [".", ",", "COMMA", "$", ":", "(", ")", "``", "''", "#", "/", "-"],
    "PU",
)

# Chunk tags: "O" followed by a B-/I- pair per chunk type; position = integer id.
BIO_TAG_LIST = ["O"] + [
    f"{prefix}-{chunk}"
    for chunk in ("NP", "VP", "PP", "ADJP", "ADVP", "SBAR",
                  "PRT", "CONJP", "UCP", "LST", "INTJ")
    for prefix in ("B", "I")
]

In [6]:
class Word(TypedDict):
  """One token of a sentence together with the annotations read for it."""
  word: str              # surface form of the token
  pos: str               # part-of-speech tag
  biotag: str            # chunk tag
  label: Optional[str]   # role label, or None when labels were dropped

def parse_input(input_file: Union[str, Path], drop_label = False) -> List[Union[List[Word], None]]:
    """
    Read a tab-separated token file and group its tokens into sentences.

    A line with at least five tab-separated fields is a token: field 0 is the
    word, field 1 the POS tag, field 2 the chunk tag and, when present,
    field 5 the role label (otherwise the label is "NONE"). Any other line
    (e.g. a blank one) closes the current sentence.

    Unknown POS tags, chunk tags and labels are reported with a warning and
    replaced by "PU", "O" and "NONE" respectively.

    Args:
        input_file: path of the file to read.
        drop_label: when true, every returned word has ``label`` set to None.

    Returns:
        The non-empty sentences in file order, each a list of ``Word`` dicts.
    """
    with open(input_file, "r") as handle:
        raw_lines = handle.readlines()

    print("Parsing input file lines...")
    sentences: List[Union[List[Word], None]] = []
    current: List[Word] = []

    for line_no, raw_line in enumerate(progress_bar(raw_lines), start=1):
        fields = raw_line.strip().split("\t")

        # Too few columns: sentence boundary. Flush whatever was collected.
        if len(fields) < 5:
            if current:
                sentences.append(current)
            current = []
            continue

        word_str = fields[0].strip()
        word_str = SYMBOL_DICT.get(word_str, word_str)

        pos = fields[1].strip()
        pos = BIO_TAG_CONVERSION_DICT.get(pos, pos)
        if pos not in POS_LIST:
            print(f"Warning: invalid POS on line {line_no} \"{pos}\", treated as PU.")
            pos = "PU"

        biotag = fields[2].strip()
        if biotag not in BIO_TAG_LIST:
            print(f"Warning: invalid bio tag on line {line_no} \"{biotag}\", treated as O.")
            biotag = "O"

        label = fields[5].strip() if len(fields) >= 6 else "NONE"
        if label not in LABEL_LIST:
            print(f"Warning: invalid label on line {line_no} \"{label}\", treated as NONE.")
            label = "NONE"
        if drop_label:
            label = None

        current.append(Word(word=word_str, pos=pos, biotag=biotag, label=label))

    # The file may end without a trailing separator line.
    if current:
        sentences.append(current)
    return sentences


In [7]:
# Train and dev keep their labels; the test split is parsed with labels dropped.
train_sentences = parse_input(train_data_path, drop_label=False)
valid_sentences = parse_input(valid_data_path, drop_label=False)
test_sentences = parse_input(input_file=test_data_path, drop_label=True)

Parsing input file lines...


Parsing input file lines...


Parsing input file lines...


In [8]:
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
  """DistilBERT token classifier whose head also sees two per-token tag ids.

  The encoder output goes through dropout, then the raw `pos_tags` and
  `bio_tags` ids are appended as two extra feature columns to every token's
  hidden state before the linear classification layer.

  NOTE(review): the tag ids enter as plain scalars, and the collator in this
  notebook pads the tag columns with -100, so padded positions feed -100 as a
  feature value — confirm this is intended.
  """

  def __init__(self, config: PretrainedConfig):
    super().__init__(config)
    self.num_labels = config.num_labels

    self.distilbert = DistilBertModel(config)
    self.dropout = torch.nn.Dropout(config.dropout)
    # "+ 2": one extra input column each for the POS-tag id and the BIO-tag id.
    self.classifier = torch.nn.Linear(config.hidden_size + 2, config.num_labels)

    self.post_init()

  def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    pos_tags: Optional[torch.Tensor] = None,
    bio_tags: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    labels: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
  ) -> Union[TokenClassifierOutput, Tuple[torch.Tensor, ...]]:
    """Classify every token of the batch.

    Args:
      input_ids, attention_mask, head_mask, inputs_embeds, output_attentions,
        output_hidden_states: forwarded unchanged to the DistilBERT encoder.
      pos_tags: POS-tag id per token; required although it defaults to None.
      bio_tags: BIO-tag id per token; required although it defaults to None.
      labels: optional gold class ids; when given, a cross-entropy loss is computed.
      return_dict: return a `TokenClassifierOutput` instead of a tuple; falls
        back to `config.use_return_dict` when None.

    Returns:
      A `TokenClassifierOutput`, or the tuple `(loss?, logits, *encoder_extras)`
      when `return_dict` is false.

    Raises:
      ValueError: if `pos_tags` or `bio_tags` is missing.
    """
    # Fail early and clearly: both tag tensors are dereferenced below.
    if pos_tags is None or bio_tags is None:
      raise ValueError(
        "DistilBertForTokenClassification.forward requires both `pos_tags` and `bio_tags`."
      )

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.distilbert(
      input_ids,
      attention_mask=attention_mask,
      head_mask=head_mask,
      inputs_embeds=inputs_embeds,
      output_attentions=output_attentions,
      output_hidden_states=output_hidden_states,
      return_dict=return_dict,
    )

    sequence_output = self.dropout(outputs[0])

    # Turn each tag tensor into one trailing feature column and cast it to the
    # hidden-state dtype explicitly instead of relying on promotion inside cat.
    pos_feature = pos_tags.unsqueeze(-1).to(sequence_output.dtype)
    bio_feature = bio_tags.unsqueeze(-1).to(sequence_output.dtype)

    # Column order matters for the trained classifier: hidden state, POS, BIO.
    concat_output = torch.cat((sequence_output, pos_feature, bio_feature), 2)
    logits = self.classifier(concat_output)

    loss = None
    if labels is not None:
      # CrossEntropyLoss skips targets equal to its default ignore_index (-100).
      loss_fct = torch.nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    if not return_dict:
      output = (logits,) + outputs[1:]
      return ((loss,) + output) if loss is not None else output

    return TokenClassifierOutput(
      loss=loss,
      logits=logits,
      hidden_states=outputs.hidden_states,
      attentions=outputs.attentions,
    )

In [9]:
# Name the checkpoint once so the tokenizer and the model weights cannot drift apart.
MODEL_CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# The classification head is sized to the role label set.
model = DistilBertForTokenClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=len(LABEL_LIST))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [10]:
def build_dataset_from_sentences(sentences, drop_label = False):
  """Convert parsed sentences into a column-oriented `Dataset`.

  Each sentence contributes one row. Columns are "tokens" (the words),
  "pos_tags" and "bio_tags" (indices into POS_LIST / BIO_TAG_LIST) and, unless
  `drop_label` is set, "partitive_roles" (indices into LABEL_LIST).

  Args:
    sentences: iterable of sentences, each a list of word dicts with the keys
      'word', 'pos', 'biotag' and (when labels are kept) 'label'.
    drop_label: leave out the "partitive_roles" column.

  Returns:
    The result of `Dataset.from_dict` on the collected columns.
  """
  # Insertion order fixes the column order of the resulting dataset.
  columns = {"tokens": []}
  if not drop_label:
    columns["partitive_roles"] = []
  columns["pos_tags"] = []
  columns["bio_tags"] = []

  for words in sentences:
    columns["tokens"].append([entry['word'] for entry in words])
    if not drop_label:
      columns["partitive_roles"].append(
          [LABEL_LIST.index(entry['label']) for entry in words]
      )
    columns["pos_tags"].append([POS_LIST.index(entry['pos']) for entry in words])
    columns["bio_tags"].append([BIO_TAG_LIST.index(entry['biotag']) for entry in words])

  return Dataset.from_dict(columns)

In [11]:
# Labeled splits keep the role column; the test split is built without it.
train_raw_dataset = build_dataset_from_sentences(train_sentences, drop_label=False)
valid_raw_dataset = build_dataset_from_sentences(valid_sentences, drop_label=False)
test_raw_dataset = build_dataset_from_sentences(test_sentences, drop_label=True)

# Bundle the three splits under the names used by the rest of the notebook.
raw_datasets = DatasetDict({
    "train": train_raw_dataset,
    "valid": valid_raw_dataset,
    "test": test_raw_dataset,
})

In [12]:
# When True, every sub-token of a word carries the word's value; when False,
# only the first sub-token does and the remaining ones get -100.
label_all_tokens = True

def _align_to_sub_tokens(word_ids, word_values):
    """Spread per-word values over the sub-tokens described by `word_ids`.

    Positions whose word id is None get -100. The first sub-token of a word
    gets the word's value; later sub-tokens of the same word get it too when
    `label_all_tokens` is set, and -100 otherwise.
    """
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            aligned.append(word_values[word_idx])
        else:
            aligned.append(-100)
        # Must advance for every position, otherwise the continuation branch
        # above can never be reached.
        previous_word_idx = word_idx
    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the per-word columns.

    Args:
        examples: batch with "tokens", "pos_tags" and "bio_tags" columns and,
            optionally, a "partitive_roles" column (one value per word).

    Returns:
        The tokenizer output extended with "pos_tags" and "bio_tags" columns
        aligned to the sub-tokens, plus "labels" (built from "partitive_roles")
        when that column is present.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # (source column, target column); all three are aligned by the same rule.
    column_pairs = [("pos_tags", "pos_tags"), ("bio_tags", "bio_tags")]
    if "partitive_roles" in examples:
        column_pairs.append(("partitive_roles", "labels"))

    for source_column, target_column in column_pairs:
        tokenized_inputs[target_column] = [
            _align_to_sub_tokens(tokenized_inputs.word_ids(batch_index=i), word_values)
            for i, word_values in enumerate(examples[source_column])
        ]
    return tokenized_inputs

In [13]:
# Apply tokenize_and_align_labels to every split; batched=True hands the
# function batches of examples rather than single rows.
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
@dataclass
class PartitiveDataCollatorForTokenClassification(DataCollatorForTokenClassification):
    """Collator that also pads the extra per-token columns of this notebook.

    The tokenizer pads the columns it knows about; every column that is still
    shorter than the padded `input_ids` is then filled up (with 0 for
    "attention_mask" and -100 for everything else) before the whole batch is
    turned into int64 tensors.
    """

    def torch_call(self, features):
        """Pad `features` to a common length and return a dict of int64 tensors."""
        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            # No return_tensors here: the extra per-token columns are still
            # ragged at this point and could not be converted.
        )

        # Measure the target length AFTER tokenizer padding, so that
        # `max_length` / `pad_to_multiple_of` padding is matched by the extra
        # columns instead of leaving them shorter than `input_ids`.
        sequence_length = max(len(ids) for ids in batch["input_ids"])
        # Fill on the same side the tokenizer pads on.
        pad_on_right = self.tokenizer.padding_side == "right"

        padded = {}
        for k, v in batch.items():
            padding_value = 0 if k == "attention_mask" else -100
            rows = []
            for item in v:
                # A non-positive difference yields an empty filler.
                filler = [padding_value] * (sequence_length - len(item))
                rows.append(item + filler if pad_on_right else filler + item)
            padded[k] = rows

        return {k: torch.tensor(v, dtype=torch.int64) for k, v in padded.items()}

In [15]:
# Take the batch size from the shared `args` namespace instead of repeating the
# literal, so there is a single place to change it.
batch_size = args.batch_size

train_args = TrainingArguments(
    "bert_partitive_roles",          # output directory for checkpoints
    evaluation_strategy = "epoch",   # evaluate on the validation split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01
)

# Pads the extra pos_tags / bio_tags columns along with the standard inputs.
data_collator = PartitiveDataCollatorForTokenClassification(tokenizer)

metric = load_metric("seqeval")

def compute_metrics(p):
  """Score a batch of predictions with the module-level `metric`.

  Args:
    p: pair `(predictions, labels)` — class scores with the classes on axis 2,
      and gold label ids where -100 marks positions to ignore.

  Returns:
    Dict with the overall "precision", "recall", "f1" and "accuracy".
  """
  scores, gold_rows = p
  predicted_rows = np.argmax(scores, axis=2)

  true_predictions = []
  true_labels = []
  for predicted_row, gold_row in zip(predicted_rows, gold_rows):
    # Keep only positions with a real gold label (special tokens carry -100).
    kept = [
        (predicted_id, gold_id)
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    true_predictions.append([LABEL_LIST[predicted_id] for predicted_id, _ in kept])
    true_labels.append([LABEL_LIST[gold_id] for _, gold_id in kept])

  results = metric.compute(predictions=true_predictions, references=true_labels)

  summary = {}
  summary["precision"] = results["overall_precision"]
  summary["recall"] = results["overall_recall"]
  summary["f1"] = results["overall_f1"]
  summary["accuracy"] = results["overall_accuracy"]
  return summary

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [18]:
# Wire model, hyper-parameters, data, collator and metric into one Trainer.
trainer = Trainer(
  model=model,
  args=train_args,
  data_collator=data_collator,
  train_dataset=tokenized_datasets["train"],
  eval_dataset=tokenized_datasets["valid"],
  tokenizer=tokenizer,
  compute_metrics=compute_metrics,
)

In [19]:
# Run fine-tuning with the settings in train_args; the returned TrainOutput is
# the cell's displayed value.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, partitive_roles. If tokens, partitive_roles are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12991
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2030


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.079153,0.733924,0.712025,0.722809,0.968178
2,No log,0.080398,0.733902,0.700723,0.716929,0.967308
3,0.046500,0.077596,0.7485,0.676763,0.710826,0.968445
4,0.046500,0.080393,0.740037,0.721971,0.730892,0.96918
5,0.041800,0.083345,0.724879,0.746835,0.735694,0.968044
6,0.041800,0.083515,0.734432,0.725136,0.729754,0.968378
7,0.041800,0.08496,0.760599,0.689421,0.723263,0.969247
8,0.039500,0.084225,0.753972,0.707957,0.73024,0.969047
9,0.039500,0.083521,0.756063,0.718807,0.736964,0.969581
10,0.039300,0.08226,0.74475,0.721519,0.732951,0.969247


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, partitive_roles. If tokens, partitive_roles are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 426
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, partitive_roles. If tokens, partitive_roles are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 426
  Batch size = 64
Saving model checkpoint to bert_partitive_roles/checkpoint-500
Configuration saved in bert_partitive_roles/checkpoint-500/config.json
Model weights saved in bert_partitive_roles/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bert_partiti

TrainOutput(global_step=2030, training_loss=0.04176878494582153, metrics={'train_runtime': 500.8996, 'train_samples_per_second': 259.353, 'train_steps_per_second': 4.053, 'total_flos': 2551123345037256.0, 'train_loss': 0.04176878494582153, 'epoch': 10.0})

In [None]:
# Key of the split in tokenized_datasets that predictions are produced for.
test_set = 'test'

In [None]:
# Display the splits with their columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'partitive_roles', 'pos_tags', 'bio_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12991
    })
    valid: Dataset({
        features: ['tokens', 'partitive_roles', 'pos_tags', 'bio_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 426
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'bio_tags', 'input_ids', 'attention_mask'],
        num_rows: 746
    })
})

In [None]:
# Run the trained model over the selected split; the per-token class scores are
# read from test_results.predictions further down.
test_results = trainer.predict(tokenized_datasets[test_set])

The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 746
  Batch size = 64


In [None]:
# Display the raw prediction output.
# NOTE(review): the stored output below shows two scores per token while
# LABEL_LIST has four entries — presumably it was captured from a run with the
# two-label LABEL_LIST; re-run the notebook to refresh it.
test_results

PredictionOutput(predictions=array([[[   6.7547884,  -10.348889 ],
        [   5.801474 ,   -6.0498323],
        [   5.810015 ,   -6.3363113],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       [[   5.924703 ,   -9.496142 ],
        [   5.7230706,   -6.3419456],
        [   5.8068147,   -6.198631 ],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       [[   5.680058 ,   -9.30028  ],
        [   5.736114 ,   -6.3017   ],
        [  -3.6347826,    4.3016186],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       ...,

       [[   6.0553117,   -9.954834 ],
        [   5.7628174,   -6.003422 ],
        [   5.658694 ,   -5.971573 ],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -10

In [None]:
# For every test sentence, collect (word, label) pairs where label is None for
# class id 0 and the LABEL_LIST entry otherwise.
out = []

test_split = tokenized_datasets[test_set]
for i in range(len(test_split)):
  sentence = test_split[i]
  # Re-tokenize to recover the sub-token -> word mapping (word_ids), which is
  # not stored in the dataset.
  tokenized_input = tokenizer(sentence["tokens"], truncation=True, is_split_into_words=True)
  # One predicted class id per sub-token position.
  label_ids = np.argmax(test_results.predictions[i], axis=-1)

  # Position of the first sub-token of every word; a word is labeled by it.
  word_id_to_label_idx = {}
  for j, word_id in enumerate(tokenized_input.word_ids()):
    if word_id is not None and word_id not in word_id_to_label_idx:
      word_id_to_label_idx[word_id] = j

  labelings = []
  for j, token in enumerate(sentence["tokens"]):
    label_idx = word_id_to_label_idx.get(j)
    # A word without any sub-token (cut off by truncation, or tokenized to
    # nothing) has no prediction; emit it unlabeled instead of raising KeyError.
    label_id = label_ids[label_idx] if label_idx is not None else 0
    label = LABEL_LIST[label_id] if label_id != 0 else None
    labelings.append((token, label))
  out.append(labelings)


In [None]:
# Spot-check: show the (word, label) pairs of the first test sentence.
out[0][:]

[('Then', None),
 ('in', None),
 ('a', None),
 ('lightning', None),
 ('plunge', None),
 (',', None),
 ('the', None),
 ('Dow', None),
 ('Jones', None),
 ('industrials', None),
 ('in', None),
 ('barely', None),
 ('an', None),
 ('hour', None),
 ('surrendered', None),
 ('about', None),
 ('a', None),
 ('third', None),
 ('of', None),
 ('their', None),
 ('gains', 'ARG1'),
 ('this', None),
 ('year', None),
 (',', None),
 ('chalking', None),
 ('up', None),
 ('a', None),
 ('190.58-point', None),
 (',', None),
 ('or', None),
 ('6.9', None),
 ('%', None),
 (',', None),
 ('loss', None),
 ('on', None),
 ('the', None),
 ('day', None),
 ('in', None),
 ('gargantuan', None),
 ('trading', None),
 ('volume', None),
 ('.', None)]

In [None]:
# Output file, placed in an "out" folder that is a sibling of data_dir.
out_path = data_dir / '..' / 'out' / 'test-out-distilbert-enhanced'

In [None]:
# Write one word per line; sentences are separated by a blank line.
with open(out_path, 'w') as out_file:
  for sentence_labels in out:
    for token, label in sentence_labels:
      # Labeled words become "word<TAB>label", unlabeled words just "word".
      row = f"{token}\t{label}\n" if label else f"{token}\n"
      out_file.write(row)
    out_file.write("\n")

In [None]:
def test_sentence_string(s: str):
  """Tokenize the raw string `s` and pass the encoding to `model`.

  NOTE(review): as written this looks unfinished — the tokenizer output is
  passed positionally (i.e. as `input_ids`), no `pos_tags` / `bio_tags` are
  supplied although the model's forward dereferences both, and the model
  output is neither returned nor stored. Confirm the intended behavior.
  """
  # Tokenizer output for the single string, truncated to the model's maximum length.
  tokenized_input = tokenizer(s, truncation=True)
  model(tokenized_input)