In [1]:
# Install the libraries used by this notebook (-q keeps pip's output quiet).
# NOTE(review): none of these installs pins a version; only the next cell pins transformers.
!pip install datasets -q
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q

In [2]:
# Install again with transformers pinned to 4.28.0 (datasets stays unpinned).
!pip install datasets transformers==4.28.0



In [3]:
# Terminate the current kernel session.
# NOTE(review): presumably done so the runtime restarts and loads the packages installed above
# (the execution counter starts over at In [1] in the following cell) - confirm.
exit()

In [1]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW, DataCollatorForTokenClassification

In [2]:
# Load the three JSON-lines splits (train / test / valid) and report their sizes.
df, test_df, valid_df = (
    pd.read_json(path, lines=True)
    for path in ('train.json', 'test.json', 'valid.json')
)
for split_frame in (df, test_df, valid_df):
    print(len(split_frame))

5228
5865
5330


In [3]:
# Preview the first two rows of the training frame.
df[:2]

Unnamed: 0,tags,tokens
0,"[1, 0, 0, 0, 0, 0, 1, 0]","[Naloxone, reverses, the, antihypertensive, ef..."
1,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[In, unanesthetized, ,, spontaneously, hyperte..."


In [4]:
# Label vocabulary. The list position is the integer id handed to ClassLabel below;
# note that "I-Disease" (3) comes before "I-Chemical" (4).
tag_name = [
    "O",           # 0
    "B-Chemical",  # 1
    "B-Disease",   # 2
    "I-Disease",   # 3
    "I-Chemical",  # 4
]

In [5]:
from datasets import Dataset, ClassLabel, Sequence, Features, Value, DatasetDict
# Label feature built from `tag_name`; used below to convert tags and to declare the dataset schema.
tags = ClassLabel(num_classes=len(tag_name), names=tag_name)

In [6]:
# Schema of the Hugging Face dataset: per-word class labels plus the words themselves.
dataset_structure = {
    "ner_tags": Sequence(tags),
    "tokens": Sequence(feature=Value(dtype="string")),
}

In [8]:
def df_to_dataset(df, columns=['tags', 'tokens']):
  """Convert a DataFrame with 'tags' and 'tokens' columns into a `datasets.Dataset`.

  Args:
    df: frame whose 'tags' column holds one list of labels per row and whose
      'tokens' column holds the matching list of words.
    columns: currently unused; the required columns are always 'tags' and 'tokens'.

  Returns:
    A Dataset with features 'ner_tags' and 'tokens' as declared in `dataset_structure`.

  Raises:
    AssertionError: if a required column is missing or the first row of either
      column is not a list.
  """
  required_columns = {'tags', 'tokens'}
  assert required_columns.issubset(df.columns)

  # Each row's labels go through ClassLabel.str2int before being collected.
  ner_tags = df['tags'].map(tags.str2int).values.tolist()
  tokens = df['tokens'].values.tolist()

  # Spot-check the first row: both columns must hold Python lists.
  for first_row in (tokens[0], ner_tags[0]):
    assert isinstance(first_row, list)

  # Add further fields (e.g. id, spans) to this mapping if they are needed.
  columns_by_name = {'ner_tags': ner_tags, 'tokens': tokens}
  return Dataset.from_dict(
      mapping=columns_by_name,
      features=Features(dataset_structure),
  )

# Turn each DataFrame split into a Dataset.
dataset = df_to_dataset(df)             # training split (from train.json)
test_dataset = df_to_dataset(test_df)   # held-out test split (from test.json)
valid_dataset = df_to_dataset(valid_df) # validation split (from valid.json)

# Alternative kept for reference: carve a validation set out of the training data.
# train = dataset.train_test_split(test_size=0.2)
# test_valid = train['test']

# Gather all splits in a single DatasetDict.
dataset = DatasetDict(
    train=dataset,        # used by the Trainer
    test=test_dataset,    # independent test data
    valid=valid_dataset,  # used by the Trainer for evaluation
)

label_names = dataset['train'].features["ner_tags"].feature.names
label_names

['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']

In [9]:
# dataset['train'][:1]

In [10]:
from transformers import AutoTokenizer
# Load the tokenizer of the distilbert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [11]:
def tokenize_function(examples):
    """Tokenize a batch of pre-split sentences with max-length padding and truncation.

    Args:
        examples: batch mapping whose "tokens" entry holds the word lists.

    Returns:
        Whatever the module-level `tokenizer` returns for that batch.
    """
    words_per_sentence = examples["tokens"]
    encoding_options = dict(
        padding="max_length",
        truncation=True,
        is_split_into_words=True,
    )
    return tokenizer(words_per_sentence, **encoding_options)

In [12]:
# Tokenize every split with padding to max_length.
# NOTE(review): `tokenized_datasets_` is only referenced by commented-out cells in this notebook;
# training uses `tokenized_dataset` (built by tokenize_adjust_labels) instead.
tokenized_datasets_ = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5228 [00:00<?, ? examples/s]

Map:   0%|          | 0/5865 [00:00<?, ? examples/s]

Map:   0%|          | 0/5330 [00:00<?, ? examples/s]

In [13]:
# tokenized_datasets_['train'][0]['input_ids'][:20]

In [14]:
# tokenized_datasets_['train'][0]['ner_tags'][:20]

In [15]:
# len(tokenized_datasets_['train'][0]['input_ids']) == len(tokenized_datasets_['train'][0]['ner_tags'])

In [16]:
#Get the values for input_ids, attention_mask, adjusted labels
def tokenize_adjust_labels(all_samples_per_split):
  """Tokenize a batch of pre-split sentences and align word-level tags to sub-tokens.

  Every sub-token receives the label of the word it came from; special tokens
  (word id None) receive -100 so the loss function ignores them.

  Labels are looked up by the word index reported by the tokenizer rather than by
  a running counter. If a word produces no sub-tokens at all (e.g. a zero-width
  space that the tokenizer strips), a running counter would fall one position
  behind and mislabel the rest of the sentence.

  Args:
    all_samples_per_split: batch mapping with "tokens" (one list of words per
      sentence) and "ner_tags" (one list of per-word label ids per sentence).

  Returns:
    The tokenizer's batch encoding with an extra "labels" entry holding one
    list of label ids per sentence, the same length as that sentence's input_ids.
  """
  tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"],
                is_split_into_words=True, truncation=True)

  total_adjusted_labels = []
  for k in range(len(tokenized_samples["input_ids"])):
    word_ids_list = tokenized_samples.word_ids(batch_index=k)
    existing_label_ids = all_samples_per_split["ner_tags"][k]

    adjusted_label_ids = [
        # None marks a special token; anything else is the index of the source word.
        -100 if word_idx is None else existing_label_ids[word_idx]
        for word_idx in word_ids_list
    ]
    total_adjusted_labels.append(adjusted_label_ids)

  # Add the adjusted labels to the tokenized samples.
  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

# Apply the label-aligning tokenization to every split and drop the original columns,
# leaving only what the tokenizer produced plus "labels".
original_columns = list(dataset["train"].features)
tokenized_dataset = dataset.map(
    tokenize_adjust_labels,
    batched=True,
    remove_columns=original_columns,
)

Map:   0%|          | 0/5228 [00:00<?, ? examples/s]

Map:   0%|          | 0/5865 [00:00<?, ? examples/s]

Map:   0%|          | 0/5330 [00:00<?, ? examples/s]

In [17]:
# Tokenize a sample sentence to inspect the encoding the tokenizer produces.
out = tokenizer("Fine tune NER in google colab!")
out

{'input_ids': [101, 2986, 8694, 11265, 2099, 1999, 8224, 15270, 2497, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
# Word index of each sub-token in the sample; the first and last entries (special tokens) are None.
out.word_ids(0)

[None, 0, 1, 2, 2, 3, 4, 5, 5, 6, None]

In [19]:
# Show the splits, columns and row counts of the label-aligned dataset.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5228
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5865
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5330
    })
})

In [20]:
# tokenized_dataset['train'][:2]

In [21]:
# Batch collator for token classification, built around the tokenizer.
# NOTE(review): expected to pad input_ids and labels per batch - confirm against the transformers docs.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [23]:
# Load distilbert-base-uncased with a token-classification head sized to the label set,
# then move it to the selected device.
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_names)) # the model to fine-tune
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [24]:
import numpy as np
from datasets import load_metric
# seqeval metrics are commonly used for token classification.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of the
# separate `evaluate` library; kept here because the notebook pins an older stack.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Compute overall seqeval precision/recall/F1/accuracy for a Trainer evaluation.

    Args:
        p: pair `(predictions, labels)`; `predictions` holds per-token logits with the
            class scores on axis 2, `labels` holds label ids with -100 on positions
            that must be ignored.

    Returns:
        Dict with keys "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = p

    # Select the class with the highest logit for each token.
    predicted_ids = np.argmax(logits, axis=2)

    # Drop ignored positions (label -100) and map ids to label names.
    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        kept = [(pred_id, label_id)
                for pred_id, label_id in zip(predicted_row, label_row)
                if label_id != -100]
        true_predictions.append([label_names[pred_id] for pred_id, _ in kept])
        true_labels.append([label_names[label_id] for _, label_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [25]:
# Sanity-check the metric: scoring an example's gold labels against themselves gives perfect scores.
example = dataset["train"][1]
labels = [label_names[i] for i in example[f"ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'Chemical': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Disease': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [26]:
from transformers import TrainingArguments, Trainer

batch_size = 16
# Number of full batches in the training split, so the training loss is logged about once per epoch.
logging_steps = len(tokenized_dataset['train']) // batch_size
epochs = 15 # training for more epochs can improve the score a little

training_args = TrainingArguments(
    # NOTE(review): absolute path under /content/drive/MyDrive - presumably assumes a mounted
    # Google Drive in Colab; no mount step is visible in this notebook.
    output_dir="/content/drive/MyDrive/bert-fine-tune-ner/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    disable_tqdm=False,
    logging_steps=logging_steps)


In [27]:
# Wire the model, training arguments, data, collator, tokenizer and metric function into a Trainer.
# The "valid" split is used for evaluation; the "test" split is not passed to the Trainer here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [28]:
# trainer.train_dataset[0]

In [29]:
# Fine-tune the model; evaluation on the "valid" split runs after every epoch (see training_args).
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2181,0.163517,0.839383,0.895959,0.866749,0.943021
2,0.0786,0.152547,0.873014,0.908795,0.890545,0.953509
3,0.0376,0.178547,0.908865,0.880781,0.894603,0.954533
4,0.0206,0.204167,0.902492,0.887844,0.895108,0.955626
5,0.0124,0.223464,0.899062,0.897963,0.898512,0.95646
6,0.0081,0.237895,0.895613,0.908081,0.901804,0.956847
7,0.0055,0.257144,0.909901,0.8909,0.9003,0.95585
8,0.0033,0.263398,0.904611,0.900679,0.902641,0.957111
9,0.0021,0.277479,0.905012,0.900679,0.90284,0.956887
10,0.0015,0.294945,0.88222,0.91691,0.899231,0.955348


TrainOutput(global_step=4905, training_loss=0.026015458824198454, metrics={'train_runtime': 901.9936, 'train_samples_per_second': 86.941, 'train_steps_per_second': 5.438, 'total_flos': 1542750764736792.0, 'train_loss': 0.026015458824198454, 'epoch': 15.0})

In [30]:
# Evaluate the trained model on the eval_dataset given to the Trainer (the "valid" split).
trainer.evaluate()

{'eval_loss': 0.2995509207248688,
 'eval_precision': 0.903942141183661,
 'eval_recall': 0.9039728353140917,
 'eval_f1': 0.9039574879883194,
 'eval_accuracy': 0.9573962891353753,
 'eval_runtime': 16.8278,
 'eval_samples_per_second': 316.737,
 'eval_steps_per_second': 19.848,
 'epoch': 15.0}

In [None]:
# Run inference on the test split and turn logits into predicted class ids.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Drop ignored positions (label -100, i.e. special tokens) and map ids to label names.
# Original author's note: labels are meaningless for unlabeled test data and were
# initially all filled with 'O' just to satisfy the expected format.
true_predictions = []
true_labels = []
for predicted_row, label_row in zip(predictions, labels):
    predicted_names = []
    gold_names = []
    for predicted_id, label_id in zip(predicted_row, label_row):
        if label_id == -100:
            continue
        predicted_names.append(label_names[predicted_id])
        gold_names.append(label_names[label_id])
    true_predictions.append(predicted_names)
    true_labels.append(gold_names)

results = metric.compute(predictions=true_predictions, references=true_labels)
# results
# true_predictions


In [37]:
# Overall F1 from the seqeval results computed on the test split.
results["overall_f1"]

0.8910791003285318

In [34]:
# predictions[0]

array([0, 1, 1, 1, 1, 0, 0, 2, 2, 2, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       2, 2, 0, 0, 0, 1, 1, 1, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [32]:
# Convert the encoded test inputs back to text. The result is still fragmented into
# word pieces, e.g. "Apple" may come back as "App", "le".
re_tokenized_tokens = [
  tokenizer.convert_ids_to_tokens(input_ids, skip_special_tokens=True)
  for input_ids in tokenized_dataset["test"]['input_ids']
]

# Pair every word piece with its predicted tag (still fragmented), one frame per sentence.
sentence_frames = []
for token, pred in zip(re_tokenized_tokens, true_predictions):
  token_list = list(token)
  tag_list = list(pred)

  # Decoding sometimes yields a different number of pieces than there are predictions.
  # Pad the shorter side so both columns line up: placeholder tokens are removed
  # again below, missing tags default to 'O'.
  max_ = max(len(token_list), len(tag_list))
  if max_ == 0:
    continue
  token_list += ['miss_string'] * (max_ - len(token_list))
  tag_list += ['O'] * (max_ - len(tag_list))

  sentence_frames.append(pd.DataFrame({'tokens': token_list, 'ner_tags': tag_list}))

# Concatenate once; growing a DataFrame inside the loop is quadratic.
if sentence_frames:
  new_df = pd.concat(sentence_frames, axis=0, ignore_index=True)
else:
  new_df = pd.DataFrame(columns=['tokens', 'ner_tags'], dtype=object)

# Strip only the leading word-piece continuation marker, then invisible zero-width spaces.
new_df['tokens'] = new_df['tokens'].map(lambda x: x[2:] if x[:2]=='##' else x)
new_df['tokens'] = new_df['tokens'].map(lambda x: x.replace('\u200b',''))
new_df = new_df[new_df['tokens']!='miss_string']
new_df = new_df.reset_index(drop=True)
new_df.head()

Unnamed: 0,tokens,ner_tags
0,fa,B-Chemical
1,mot,B-Chemical
2,idi,B-Chemical
3,ne,B-Chemical
4,-,O


### 新建答案卷
從原始資料裡面重新新建一個答案卷，格式符合example.txt的格式

In [33]:
  with open('test-submit.txt', 'r') as f:
  text = f.readlines()
  text = [i.rstrip('\n') for i in text]
submit_answer = pd.DataFrame(text)
submit_answer.columns = ["tokens"]
submit_answer['ner_tags'] = [""]*len(submit_answer)
submit_answer.loc[:, 'tokens'] = submit_answer.loc[:, 'tokens'].map(lambda x: x.replace('\u200b',''))
submit_answer.head()

IndentationError: ignored

## Dirty work
NLP工作裡資料處理最是麻煩，會有其他語言的文字，各種符號等等，我們的預測結果會以英文為主，所以對於這些特殊文字的識別能力並不好。
#### 但最困擾的是輸出格式的問題
這些特殊文字會讓輸出很難對上答案應有的格式，所以需要不斷地一一確認，用assert做檢查，然後判斷該如何處理。這是單純的資料處理，或者說工程問題。跟AI無關卻必要。

我這邊是取巧，只要輸出的文字跟特殊文字的長度能對的上，我就同意他的輸出。ner_tag則用'O'來表示。

In [None]:
# Manual inspection of rows 33194 to 33204 of the answer sheet.
# NOTE(review): the hard-coded index looks like a leftover from debugging one specific mismatch - confirm before keeping.
idx = 33194
submit_answer.loc[idx:33204,:]

In [None]:
# Manual inspection of row 64572 of the decoded word-piece frame.
# NOTE(review): the hard-coded index looks like a leftover from debugging one specific mismatch - confirm before keeping.
idx = 64572
new_df.loc[idx,:]

In [None]:
def most_frequent(List):
  """Return the element that occurs most often in ``List``.

  Ties are broken in favour of the element that appears first, so the result is
  the same on every run (iterating a set of strings is not).

  Args:
    List: non-empty sequence of hashable items (here: the tags of one word's pieces).

  Returns:
    The most common element.

  Raises:
    ValueError: if ``List`` is empty.
  """
  from collections import Counter

  if len(List) == 0:
    raise ValueError("most_frequent() arg is an empty sequence")
  counts = Counter(List)  # keeps first-seen order of the distinct elements
  # max() returns the first maximal key, i.e. the earliest element among ties.
  return max(counts, key=counts.get)

# Used to route tokens that contain special (non-ASCII) characters to their own handling.
def isEnglish(s): # python>=3.7
  """Return True when every character of ``s`` is ASCII, False otherwise.

  Despite the name this is a pure ASCII check: digits, punctuation and the
  empty string all count as "English".

  Examples:
    isEnglish('Ještě') # False
    isEnglish('123') # True
    isEnglish('abc') # True
  """
  import sys
  assert sys.version_info>=(3,7,0)  # str.isascii() needs Python 3.7 or newer
  return s.isascii()
# Debug trail of (new_df index, text assembled so far, submit_answer index, target token).
# The original note calls it deprecated, but its tail is still printed when alignment fails.
event = []
# Cursor into new_df: everything before it has already been matched to a token.
# It speeds up the search and stops repeated words from matching an earlier position.
attr = 0
MAX_LOOKAHEAD = 30   # word pieces to try before giving up on an ordinary token
URL_LOOKAHEAD = 200  # URLs split into many pieces, so they get a longer budget

for i in range(len(submit_answer)):
  target_token = submit_answer.at[i, 'tokens']

  # Special (non-ASCII) tokens: accept the decoded pieces as soon as their combined
  # length equals the token's length, and always tag the token 'O'.
  if not isEnglish(target_token):
    voc = ''
    search_start = attr
    for j in range(search_start, len(new_df)):
      voc += new_df.at[j, 'tokens']
      attr += 1  # advance the cursor past the consumed piece
      if len(voc)==len(target_token):
        break
      elif j>search_start+MAX_LOOKAHEAD:
        # Measured from the fixed start index. Comparing against `attr` could never
        # trigger because `attr` advances together with `j`, so a mismatch used to
        # consume the rest of new_df silently.
        raise AssertionError(
            f'could not align special token {target_token!r} (row {i}) '
            f'within {MAX_LOOKAHEAD} word pieces')
    most_freq_tag = 'O'
    submit_answer.at[i, 'ner_tags'] = most_freq_tag
    continue

  # Normal flow: fill submit_answer row by row. `most_freq_tag` is the prediction
  # written into the row's ner_tags.
  if target_token=='': # blank line: leave the tag empty
    most_freq_tag = ''
    continue
  voc = ""
  tag_candidate = []
  most_freq_tag = 'O'

  # Reassemble the fragmented word pieces and infer the tag.
  for j in range(attr, len(new_df)):
    voc += new_df.at[j, 'tokens']
    # Every piece carries its own tag; collect them and let most_frequent pick one.
    tag_candidate.append(new_df.at[j, 'ner_tags'])
    event.append((j, voc, i, target_token))

    # Give up after MAX_LOOKAHEAD pieces, or after URL_LOOKAHEAD pieces for URLs.
    # Failing here means an unknown character or event prevents the pieces from
    # being stitched back into the original token, so its tag cannot be determined.
    if j>attr+MAX_LOOKAHEAD:
      is_url = (target_token[:8]=='https://' or
                target_token[:7]=='http://' or
                target_token[:4]=='www.')
      if not (is_url and j<attr+URL_LOOKAHEAD):
        # Print the last 35 events to help diagnose the mismatch.
        for eve in event[-35:]:
          print(eve)
        raise AssertionError(
            f'could not rebuild token {target_token!r} (row {i}) from the decoded word pieces')

    if voc==target_token.lower(): # the pieces spell the original token
      attr = j+1 # move the cursor past the matched pieces
      most_freq_tag = most_frequent(tag_candidate) # the most common tag represents the word
      break

  submit_answer.at[i, 'ner_tags'] = most_freq_tag # write the tag
  print(f'{i}/{len(submit_answer)} processing...', end='\r') # progress
submit_answer.head()

In [None]:
# Final check: every non-blank token row must have received a tag.
tag_missing = submit_answer['ner_tags'] == ''
token_present = submit_answer['tokens'] != ''
check = submit_answer.loc[tag_missing & token_present, 'tokens']
assert len(check)==0 # fail if any real token is still untagged
check

In [None]:
# Write the answers. Original author's note: pd.to_csv misbehaved on special characters
# and URLs, hence this simple hand-written writer.
# The encoding is explicit so non-ASCII tokens do not depend on the platform default.
with open("answer.txt", "w", encoding="utf-8") as f:
  for token, tag in zip(submit_answer['tokens'], submit_answer['ner_tags']):
    text = token+'\t'+tag+'\n' # matches the expected answer format
    f.write(text)