This notebook looks at the output from the snippet repository
and how to use it to train NER, classification, and mlm models.

In [1]:

from typing import List
from spacy import displacy
import src.data.snippet_repository as sr

### Named Enitiy Recognition Models (NER)

In [2]:
ner_repo = sr.SnippetRepository(sr.SnippetRepositoryMode.NER)

In [14]:
ner_data = ner_repo.get_training_data(batch_size=10)
detected = False
while not detected:
    ner_df = next(ner_data)
    detected = any(ner_df['ner_tags'].apply(lambda ner_tags: any(map(lambda t: t!="O", ner_tags))))

In [15]:
text = ner_df.iloc[9].text
ner_tags = ner_df.iloc[9].ner_tags
sr.visualize_ner_tags(text, ner_tags)

In [51]:
# load the model here

from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from transformers import DataCollatorForTokenClassification
from datasets import Dataset


In [6]:
AutoConfig.from_pretrained("dslim/bert-base-NER")

BertConfig {
  "_name_or_path": "dslim/bert-base-NER",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MISC",
    "2": "I-MISC",
    "3": "B-PER",
    "4": "I-PER",
    "5": "B-ORG",
    "6": "I-ORG",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 1,
    "B-ORG": 5,
    "B-PER": 3,
    "I-LOC": 8,
    "I-MISC": 2,
    "I-ORG": 6,
    "I-PER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size"

In [110]:
tokenizer = AutoTokenizer.from_pretrained(
    "dslim/bert-base-NER",
    padding=True, truncation=True,

)
config = AutoConfig.from_pretrained(
    "dslim/bert-base-NER", 
    output_attentions=True,
    output_hidden_states=True,
)
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER", config=config)

In [40]:
features = tokenizer(
    [
        "This is sample @paulwalk, text national and it is grand".split(),
        "Hello this is Me, I am a person".split(),
    ], 
    is_split_into_words=True,
    return_tensors="pt",
    truncation=True,
    padding=True,
)
print(features.keys())
idx = 1
tokenizer.convert_ids_to_tokens(features["input_ids"][idx])
features.word_ids(batch_index=idx)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


[None, 0, 1, 2, 3, 3, 4, 5, 6, 7, None, None, None, None, None, None]

In [9]:
outs = model(**features)

In [10]:
outs.hidden_states[0].shape, outs.attentions[0].shape

(torch.Size([1, 10, 768]), torch.Size([1, 12, 10, 10]))

In [11]:
outs.hidden_states

(tensor([[[ 0.4593,  0.0671, -0.1690,  ...,  0.0050,  0.0154, -0.0556],
          [-1.5191, -0.0687,  0.8964,  ...,  0.4139,  0.3173,  0.3316],
          [-1.1648,  0.2270,  0.7271,  ...,  0.4671,  0.9646,  0.7556],
          ...,
          [-0.7087,  0.4728,  0.6344,  ...,  0.4425,  1.3855,  0.7695],
          [-0.8678,  0.2932,  0.3862,  ..., -0.9396, -0.1360,  0.5752],
          [ 0.0699,  0.0933,  0.3413,  ...,  0.5569, -0.5428,  0.4209]]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[[ 0.2290, -0.0552, -0.0077,  ..., -0.0809, -0.0217,  0.0072],
          [-1.5546, -0.0652,  0.9279,  ...,  0.4970,  0.0371,  0.2910],
          [-1.1191,  0.4731,  0.6059,  ...,  0.4033,  0.5977,  0.8683],
          ...,
          [-0.3756,  0.4558,  0.4442,  ...,  0.4746,  1.0184,  0.9340],
          [-0.9685,  0.2611,  0.3602,  ..., -1.4144, -0.7480,  0.7747],
          [ 0.1437, -0.4574,  0.4823,  ...,  0.6414, -0.6725,  0.4760]]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[[ 

In [16]:
batch = tokenizer(ner_df.text.tolist(), is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")

In [111]:
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")

In [95]:
ds = Dataset.from_pandas(ner_df.drop(columns=["tags"]).rename(columns={"ner_tags": "labels"}))
ds

Dataset({
    features: ['text', 'labels'],
    num_rows: 10
})

In [164]:
lbl_to_id = {"O":0, "B-DAT":1, "I-DAT":2}
id_to_lbl = {v:k for k,v in lbl_to_id.items()}

def convert_ner_tags_to_ids(ner_tags: List[str]) -> List[int]:
    return [lbl_to_id[ner_tag] for ner_tag in ner_tags]

ds_with_int_labels = ds.map(lambda x: {"labels":  [convert_ner_tags_to_ids(lbls) for lbls in x["labels"]]}, batched=True)
ds_with_int_labels

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'labels'],
    num_rows: 10
})

In [157]:
def tokenize_and_align_labels(tokenizer_f, examples):
    tokenized_inputs = tokenizer_f(examples["text"])

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100] * len(word_ids) # assume all tokens are special
        top_word_id = max(map(lambda x: x if x else -1, word_ids))
        for word_idx in range(top_word_id + 1):
            label_ids[word_ids.index(word_idx)] = label[word_idx]
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [194]:
tokenize_f = partial(tokenizer, is_split_into_words=True, truncation=True)
ds_tokenized = ds_with_int_labels.map(partial(tokenize_and_align_labels, tokenize_f), batched=True, remove_columns=["text"])
ds_tokenized

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10
})

In [193]:
from functools import partial


lbl_to_id = {"O":0, "B-DAT":1, "I-DAT":2}
id_to_lbl = {v:k for k,v in lbl_to_id.items()}
tokenize_f = partial(tokenizer, is_split_into_words=True, truncation=True)
convert_label_f = lambda ner_tags: list(map(lambda tag: lbl_to_id[tag], ner_tags))

formatted_ds = ds.map(
    lambda sample: tokenize_f(sample["text"]) | {"labels": convert_label_f(sample["labels"])},
    remove_columns=["text"]
)
formatted_ds

  0%|          | 0/10 [00:00<?, ?ex/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10
})

In [195]:
from torch.utils.data import DataLoader
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")

data = DataLoader(ds_tokenized, batch_size=1, collate_fn=data_collator)

In [196]:
for d in data:
    print(d)
    break

{'labels': tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, -100,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0, -100]]), 'input_ids': tensor([[  101,  1130,  1901,  1106, 11621,  1704,  6233,  5814,  1869,   117,
          2304, 24730,  4454,  1145,  4465,  2233,  1113,  1216,  3050,  1112,
          1705,  2060,   117,  3872,  3188,   117,  1105,  1295,  1104,  1278,
          6461,  1296,  1768,  1108,  3188,   117,  1111,  1296,  3397,  1214,
          1103,  3218,  1108,  4071,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 

In [182]:
d.keys()

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])

In [185]:
d["labels"].shape, d["input_ids"].shape, d["attention_mask"].shape

(torch.Size([1, 46]), torch.Size([1, 46]), torch.Size([1, 46]))

In [148]:
padded_ds[0]["labels"]


[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

In [119]:
data_collator(formatted_ds)

AttributeError: 'Dataset' object has no attribute 'keys'

In [113]:
features = [
    {"input_ids": [0, 1, 2], "labels": [0, 1, 2]},
    {"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]},
]

data_collator = DataCollatorForTokenClassification(tokenizer)

{'input_ids': tensor([[0, 1, 2, 0, 0, 0],
         [0, 1, 2, 3, 4, 5]]),
 'labels': tensor([[   0,    1,    2, -100, -100, -100],
         [   0,    1,    2,    3,    4,    5]]),
 'attention_mask': tensor([[1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1]])}