<a href="https://colab.research.google.com/github/git-grace/experiment/blob/main/base/preprocess_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoTokenizer

In [None]:
# Toy utterance used by the tokenization cells below.
sentence = "i want watch movies on visha"
# Each entity is (surface text, label, char start, char end); the end offset is
# exclusive, so sentence[start:end] yields the surface text.
entities = [("visha", "App", 23, 28)]

In [None]:
# Sanity check: slicing with the entity's (start, end) offsets should give back its text.
sentence[23:28]

'visha'

In [None]:
# Tokenizer whose sub-word pieces the token-level entity tags are aligned to.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [None]:
# return_offsets_mapping=True adds `offset_mapping`: one (char_start, char_end)
# pair per token, indexing into `sentence`. In the output below the special
# tokens at both ends come back as (0, 0).
tokens = tokenizer(sentence, return_offsets_mapping=True)
tokens

{'input_ids': [101, 1045, 2215, 3422, 5691, 2006, 25292, 3270, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (2, 6), (7, 12), (13, 19), (20, 22), (23, 26), (26, 28), (0, 0)]}

In [None]:
# Map the ids back to sub-word strings so tags can be shown next to readable tokens.
words = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
words

['[CLS]', 'i', 'want', 'watch', 'movies', 'on', 'vis', '##ha', '[SEP]']

In [None]:
def build_entities_by_token(entity_list, token_offsets, verbose=True):
  """Convert character-span entities into token-index spans.

  Args:
    entity_list: iterable of (text, label, char_start, char_end) tuples, where
      char_end is exclusive.
    token_offsets: per-token (char_start, char_end) pairs, e.g. a tokenizer's
      `offset_mapping`. Entries whose end offset is 0 (special tokens report
      (0, 0)) are ignored.
    verbose: when True (the default, matching the original behavior) print the
      offset lookup tables and the resolved span of every entity.

  Returns:
    A pair (entities_by_token, edge_mismatch):
      entities_by_token: list of (text, label, token_start, token_end) with
        token_end exclusive, one per entity whose boundaries both coincide
        with token boundaries.
      edge_mismatch: True if at least one entity could not be aligned and was
        therefore dropped.
  """
  # char offset -> index of the token starting there / one past the token ending there.
  start2id = {}
  end2id = {}
  for i, (s, e) in enumerate(token_offsets):
    if e == 0:
      continue
    start2id[s] = i
    end2id[e] = i + 1

  if verbose:
    print("start2id: ", start2id, "end2id: ", end2id)

  entities_by_token = []
  edge_mismatch = False
  for w, t, s, e in entity_list:
    # None (not 0) marks "no token boundary here": token index 0 is a valid
    # start when the offsets carry no leading special token.
    sid = start2id.get(s)
    eid = end2id.get(e)
    if sid is not None and eid is not None and eid > sid:
      entities_by_token.append((w, t, sid, eid))
    else:
      edge_mismatch = True
    if verbose:
      print(w, t, sid, eid)
  return entities_by_token, edge_mismatch


class TrainingData:
  """One tokenized sentence together with its entities in token coordinates.

  Args:
    sub_words: the sub-word strings of the sentence (stored as given).
    tokens: tokenizer output providing "input_ids", "token_type_ids",
      "attention_mask" and "offset_mapping".
    entities: (text, label, char_start, char_end) tuples; they are converted to
      token spans with `build_entities_by_token`.
  """

  def __init__(self, sub_words, tokens, entities):
    self.sub_words = sub_words
    self.input_ids = tokens["input_ids"]
    self.length = len(self.input_ids)
    self.token_type_ids = tokens["token_type_ids"]
    self.attention_mask = tokens["attention_mask"]
    self.offset_mapping = tokens["offset_mapping"]
    # edge_mismatch is True when at least one entity did not line up with
    # token boundaries and was dropped from entities_tokens.
    self.entities_tokens, self.edge_mismatch = build_entities_by_token(entities, self.offset_mapping)
    # Historical attribute name kept for existing callers; despite the name it
    # holds the *mismatch* flag, exactly as before.
    self.edge_match = self.edge_mismatch

  def get_tags(self, max_len):
    """Return a list of `max_len` tag strings for this sentence.

    Position 0 is "START_TAG", position `length - 1` is "END_TAG", each entity
    gets "B-<label>" on its first token and "I-<label>" on the rest, and every
    other position (including padding past `length`) is "O".

    Raises:
      IndexError: if `max_len` is smaller than the tokenized length.
    """
    if max_len < self.length:
      raise IndexError(
        f"max_len ({max_len}) is smaller than the tokenized length ({self.length})"
      )
    tags = ["O"] * max_len
    tags[0] = "START_TAG"
    tags[self.length-1] = "END_TAG"
    for w, t, s, e in self.entities_tokens:
      tags[s] = f'B-{t}'
      for j in range(s+1, e):
        tags[j] = f"I-{t}"
    return tags


# Bundle the tokenized demo sentence with its character-span entities.
train_data = TrainingData(
  sub_words=words,
  tokens=tokens,
  entities=entities,
)

# Ask for one more tag than there are tokens; zip() stops at the shorter
# sequence, so the padded tail position is not printed.
tags = train_data.get_tags(max_len=10)
for w, t in zip(words, tags):
  print(w, t)

start2id:  {0: 1, 2: 2, 7: 3, 13: 4, 20: 5, 23: 6, 26: 7} end2id:  {1: 2, 6: 3, 12: 4, 19: 5, 22: 6, 26: 7, 28: 8}
visha App 6 8
[CLS] START_TAG
i O
want O
watch O
movies O
on O
vis B-App
##ha I-App
[SEP] END_TAG


In [16]:
import re
from collections import Counter

def build_example(sentence_noted_list, verbose=True):
  """Parse `[value](name)`-annotated sentences into plain text plus slot spans.

  Args:
    sentence_noted_list: iterable of strings in which each slot is written as
      `[value](name)`, e.g. "i like eat [apple](fruit)".
    verbose: when True (the default, matching the original behavior) print the
      (start, end) position of every annotation match in the raw string.

  Returns:
    A list with one (raw_text, slots) pair per input sentence. `raw_text` is
    the sentence with each annotation replaced by its whitespace-stripped
    value; `slots` is a list of (value, name, start, end) where start/end are
    character offsets into `raw_text` (end exclusive).

  Raises:
    AssertionError: if a computed span does not slice back to its value.
  """
  slot_value_pattern = re.compile(r'\[(?P<value>.+?)\]\((?P<name>.+?)\)')
  data_slot_list = []

  for raw in sentence_noted_list:
    pieces = []
    slots = []
    cursor = 0     # index in `raw` of the first character not yet consumed
    plain_len = 0  # length of the plain text emitted so far
    for match_res in slot_value_pattern.finditer(raw):
      start_pos, end_pos = match_res.start(), match_res.end()
      if verbose:
        print(start_pos, end_pos)
      if start_pos > cursor:
        # Copy the un-annotated text between the previous match and this one.
        pieces.append(raw[cursor:start_pos])
        plain_len += start_pos - cursor
      slot_value = match_res.group("value").strip()
      slot_name = match_res.group("name")
      pieces.append(slot_value)
      # Offsets are measured in the plain text, not in the annotated string.
      slots.append((slot_value, slot_name, plain_len, plain_len + len(slot_value)))
      plain_len += len(slot_value)
      cursor = end_pos
    if cursor < len(raw):
      pieces.append(raw[cursor:])

    raw_text = "".join(pieces)
    for slot in slots:
      assert raw_text[slot[2]:slot[3]] == slot[0], f"{raw_text} {slot}"
    data_slot_list.append((raw_text, slots))

  return data_slot_list


# Demo on two annotated sentences. The printed number pairs are the annotation
# match positions in the raw strings; the displayed list is the return value.
build_example(sentence_noted_list=[
    "i like eat [apple](fruit) do you like it?",
    "do you want [apple](fruit) and [banana](fruit)",
])

11 25
12 26
31 46


[('i like eat apple do you like it?', [('apple', 'fruit', 11, 16)]),
 ('do you want apple and banana',
  [('apple', 'fruit', 12, 17), ('banana', 'fruit', 22, 28)])]

In [None]:
%pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [None]:
import os
import math
import torch
import torch.nn as nn
from torchcrf import CRF
from itertools import repeat
from transformers import BertModel