In [1]:
# Install the third-party packages used by this notebook (-q keeps pip quiet).
# None of these four installs pins a version.
!pip install datasets -q
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q

In [4]:
# Install again with transformers pinned to 4.28.0 (datasets stays unpinned).
!pip install datasets transformers==4.28.0



In [5]:
# NOTE(review): presumably ends the kernel session so the packages installed
# above are picked up after a restart (the cell counter restarts at In [1]
# right after this cell) — confirm.
exit()

In [1]:
import torch
import numpy as np
import pandas as pd
from datasets import load_metric
from torch.utils.data import DataLoader
from datasets import Dataset, ClassLabel, Sequence, Features, Value, DatasetDict
from transformers import AutoTokenizer,AutoModelForTokenClassification, AdamW, DataCollatorForTokenClassification



In [2]:
# Read the three JSON-lines splits (one record per line) into DataFrames.
df = pd.read_json('/kaggle/input/515651651/train.json', lines=True)
test_df = pd.read_json('/kaggle/input/515651651/test.json', lines=True)
valid_df = pd.read_json('/kaggle/input/515651651/valid.json', lines=True)

# Row counts for train / test / valid, one per line.
print(len(df), len(test_df), len(valid_df), sep='\n')

# Preview the first two training rows.
df.iloc[:2]

5228
5865
5330


Unnamed: 0,tags,tokens
0,"[1, 0, 0, 0, 0, 0, 1, 0]","[Naloxone, reverses, the, antihypertensive, ef..."
1,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[In, unanesthetized, ,, spontaneously, hyperte..."


In [3]:
# Label names; a label's position in this list is later used as its integer id
# (note that "I-Disease" comes before "I-Chemical").
tag_name = [
    "O",
    "B-Chemical",
    "B-Disease",
    "I-Disease",
    "I-Chemical",
]

In [4]:
# ClassLabel feature built from tag_name; it has one class per name in the list.
tags = ClassLabel(num_classes=len(tag_name), names=tag_name)

In [5]:
tags

ClassLabel(num_classes=5, names=['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical'], id=None)

In [7]:
# Feature schema shared by all splits: a sequence of class labels per sentence
# ('ner_tags') and a sequence of strings per sentence ('tokens').
dataset_structure = {"ner_tags":Sequence(tags),
                 'tokens': Sequence(feature=Value(dtype='string'))}

In [8]:
dataset_structure

{'ner_tags': Sequence(feature=ClassLabel(num_classes=5, names=['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [9]:
dataset_structure["ner_tags"].feature.names

['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']

In [10]:
def df_to_dataset(df, columns=('tags', 'tokens')):
  """Convert a DataFrame of tagged sentences into a `datasets.Dataset`.

  Args:
    df: DataFrame with one row per sentence.
    columns: two column names, in this order: the column holding the per-token
      label ids and the column holding the token strings. Defaults to
      ('tags', 'tokens').

  Returns:
    A Dataset with the columns 'ner_tags' and 'tokens', typed with the
    module-level `dataset_structure` schema.

  Raises:
    ValueError: if `columns` does not contain exactly two names.
    KeyError: if a named column is missing from `df`.
  """
  if len(columns) != 2:
    raise ValueError(
        f"columns must name exactly two columns (tags, tokens), got {columns!r}")

  # Honour the caller-supplied names instead of hard-coding 'tags'/'tokens'.
  tag_column, token_column = columns
  d = {
      'ner_tags': df[tag_column].tolist(),
      'tokens': df[token_column].tolist(),
  }
  return Dataset.from_dict(mapping=d, features=Features(dataset_structure))

# Wrap each split's DataFrame as a Dataset using the shared feature schema.
dataset = df_to_dataset(df)
test_dataset =  df_to_dataset(test_df)
valid_dataset =  df_to_dataset(valid_df)

# Bundle the three splits. Note that `dataset` is rebound here: it was the
# train Dataset and becomes the DatasetDict holding all splits.
dataset = DatasetDict({
    'train': dataset,
    'test': test_dataset,
    'valid': valid_dataset})

# Label names read back from the train split's 'ner_tags' feature.
label_names = dataset['train'].features["ner_tags"].feature.names
label_names

['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']

In [11]:
dataset['train']

Dataset({
    features: ['ner_tags', 'tokens'],
    num_rows: 5228
})

In [12]:
dataset['train'][:1]

{'ner_tags': [[1, 0, 0, 0, 0, 0, 1, 0]],
 'tokens': [['Naloxone',
   'reverses',
   'the',
   'antihypertensive',
   'effect',
   'of',
   'clonidine',
   '.']]}

In [13]:
# Checkpoint name; used for the tokenizer here and for the model loaded later.
model_name="xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [14]:
def tokenize_function(sample):
    """Tokenize the pre-split words in ``sample["tokens"]``.

    The module-level ``tokenizer`` is called with padding to the maximum
    length, truncation enabled, and the input flagged as already split into
    words. Its output is returned unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "is_split_into_words": True,
    }
    words = sample["tokens"]
    return tokenizer(words, **encode_options)

In [15]:
# Apply tokenize_function to every split in batches. The original columns are
# kept, so 'ner_tags' is still one label per word, not one per sub-word piece.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [16]:
tokenized_datasets['train'][0]['input_ids'][:20]

[0,
 353,
 365,
 27012,
 13,
 39531,
 5908,
 70,
 2874,
 3038,
 1264,
 41745,
 272,
 21543,
 111,
 20450,
 93,
 30833,
 6,
 5]

In [17]:
tokenized_datasets['train'][0]['ner_tags'][:20]

[1, 0, 0, 0, 0, 0, 1, 0]

In [18]:
# len(tokenized_datasets['train'][0]['input_ids']) == len(tokenized_datasets['train'][0]['ner_tags'])

In [19]:
def tokenize_adjust_labels(all_samples_per_split):
  """Tokenize pre-split sentences and expand word-level tags to sub-word level.

  Args:
    all_samples_per_split: batch mapping with "tokens" (a list of word lists)
      and "ner_tags" (a list of per-word label-id lists).

  Returns:
    The tokenizer output with an added "labels" entry holding one list per
    sentence. A position whose word id is None gets -100; every other position
    gets the label id of the word it belongs to, so all sub-word pieces of a
    word carry that word's label.
  """
  tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"],
                is_split_into_words=True, truncation=True)

  total_adjusted_labels = []
  for k, existing_label_ids in enumerate(all_samples_per_split["ner_tags"]):
    # Look the label up by the word id itself rather than by a running counter:
    # if the tokenizer emits no piece for some word, the word ids skip a value
    # and a counter would assign every following piece the wrong word's label.
    adjusted_label_ids = [
      -100 if word_idx is None else existing_label_ids[word_idx]
      for word_idx in tokenized_samples.word_ids(batch_index=k)
    ]
    total_adjusted_labels.append(adjusted_label_ids)

  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

# Tokenize and align labels for every split in batches, dropping the original
# columns so only the function's outputs remain.
tokenized_dataset = dataset.map(tokenize_adjust_labels,batched=True,remove_columns=list(dataset["train"].features.keys()))

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [20]:
out = tokenizer("Fine tune NER in google colab!")
out

{'input_ids': [0, 67455, 56312, 6, 52952, 23, 26484, 552, 6114, 38, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
out.word_ids(0)

[None, 0, 1, 2, 2, 3, 4, 5, 5, 5, None]

In [22]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5228
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5865
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5330
    })
})

In [23]:
tokenized_dataset['train'][0]['input_ids']

[0,
 353,
 365,
 27012,
 13,
 39531,
 5908,
 70,
 2874,
 3038,
 1264,
 41745,
 272,
 21543,
 111,
 20450,
 93,
 30833,
 6,
 5,
 2]

In [24]:
tokenized_dataset['train'][0]['attention_mask']

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [25]:
tokenized_dataset['train'][0]['labels']

[-100, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, -100]

In [26]:
# Batch collator for token classification, built around the same tokenizer.
# NOTE(review): presumably pads inputs and labels per batch — confirm against
# the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [27]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [28]:
# Load the checkpoint as a token-classification model with one output label per
# entry in label_names, then move it to the selected device.
model = AutoModelForTokenClassification.from_pretrained(model_name,num_labels=len(label_names))
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to 

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_featu

In [29]:
# seqeval metric object; compute_metrics below calls its .compute().
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    ``p`` is a ``(predictions, labels)`` pair. The predicted class of each
    position is the argmax over axis 2 of the predictions. Positions whose
    label is -100 are ignored; the rest are mapped to label names and scored
    with the module-level ``metric``.

    Returns a dict with the keys "precision", "recall", "f1" and "accuracy",
    taken from the metric's corresponding "overall_*" entries.
    """
    logits, gold_ids = p

    # Predicted class per position = index of the largest logit.
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (-100 marks ignored ones).
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_names[predicted] for predicted, _ in kept_pairs])
        true_labels.append([label_names[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    for score_name in ("precision", "recall", "f1", "accuracy"):
        summary[score_name] = results["overall_" + score_name]
    return summary

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [30]:
# Sanity check: score one training sentence's gold tags against themselves.
example = dataset["train"][1]
labels = [label_names[tag_id] for tag_id in example["ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'Chemical': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Disease': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [31]:
from transformers import TrainingArguments, Trainer

epochs = 8
batch_size = 8
# Number of whole batches in the train split, used as the logging interval.
# NOTE(review): this equals roughly one log per epoch only when training runs
# on a single device — confirm.
logging_steps = len(tokenized_dataset['train']) // batch_size

# NOTE(review): output_dir looks like a Colab/Drive path, while the data cell
# reads from /kaggle/input — confirm this is the intended location.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/bert-fine-tune-ner/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps)


In [32]:
# Trainer wired to the label-aligned datasets: it trains on the "train" split,
# evaluates on the "valid" split and scores with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [33]:
# no_deprecation_warning=True
# Run fine-tuning with the arguments configured above.
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1998,0.152059,0.880237,0.886227,0.883221,0.955641
2,0.0985,0.140662,0.904148,0.892996,0.898538,0.96229
3,0.0689,0.168287,0.906583,0.89328,0.899882,0.962159
4,0.0392,0.187791,0.917092,0.890763,0.903736,0.962783
5,0.0217,0.183481,0.915424,0.911888,0.913653,0.965832
6,0.0154,0.21661,0.910696,0.914085,0.912387,0.965558
7,0.0064,0.243117,0.905977,0.918693,0.912291,0.96519
8,0.0029,0.252574,0.912343,0.914865,0.913602,0.965766


TrainOutput(global_step=5232, training_loss=0.05651471189676075, metrics={'train_runtime': 2498.2537, 'train_samples_per_second': 16.741, 'train_steps_per_second': 2.094, 'total_flos': 5403264983758728.0, 'train_loss': 0.05651471189676075, 'epoch': 8.0})

In [34]:
# os.environ.set('WANDB_DISABLED')

In [40]:
# Evaluate on the Trainer's eval_dataset (the "valid" split).
trainer.evaluate()

{'eval_loss': 0.252573698759079,
 'eval_precision': 0.9123427117206278,
 'eval_recall': 0.9148649606578294,
 'eval_f1': 0.9136020953527059,
 'eval_accuracy': 0.9657664081766052,
 'eval_runtime': 51.0759,
 'eval_samples_per_second': 104.355,
 'eval_steps_per_second': 13.059,
 'epoch': 8.0}

In [41]:
# Run the model on the test split; take the highest-scoring class per position.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)
# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
# The labels are meaningless for the test data; at the very start they were all
# filled with 'O' just to satisfy the format requirement.
# NOTE(review): if that is the case, the score computed below is not a real
# test score — confirm which labels the test file actually contains.
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
results = metric.compute(predictions=true_predictions, references=true_labels)
# results
# true_predictions


In [43]:
results["overall_f1"]

0.9078184154770589

In [38]:
# 將編碼後的資料轉換回文字，(但這些文字會是破碎的，舉例來說 "Apple"可能變成"App", "le")
# Convert the encoded ids back to token strings. The strings are sub-word
# pieces, so a word such as "Apple" may come back as "App", "le".
re_tokenized_tokens = [
  tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
  for ids in tokenized_dataset["test"]['input_ids']
]

# Pair every piece with its predicted tag, one small frame per sentence, and
# concatenate once at the end instead of growing the frame inside the loop.
frames = []
for token, pred in zip(re_tokenized_tokens, true_predictions):
  token_list = list(token)
  tag_list = list(pred)

  # The round trip through the tokenizer sometimes leaves the two lists with
  # different lengths. Pad the shorter one so they can be paired: missing tags
  # become 'O', missing tokens become the 'miss_string' sentinel, whose rows
  # are dropped further down.
  length_gap = len(token_list) - len(tag_list)
  if length_gap > 0:
    tag_list += ['O'] * length_gap
  elif length_gap < 0:
    token_list += ['miss_string'] * (-length_gap)

  frames.append(pd.DataFrame({'tokens': token_list, 'ner_tags': tag_list}))

if frames:
  new_df = pd.concat(frames, axis=0, ignore_index=True)
else:
  # No test sentences: keep the expected columns so the steps below still run.
  new_df = pd.DataFrame(columns=['tokens', 'ner_tags'])

# Strip a leading '##' continuation marker and the invisible zero-width space.
new_df['tokens'] = new_df['tokens'].map(lambda x: x.replace('##','') if x[:2]=='##' else x)
new_df['tokens'] = new_df['tokens'].map(lambda x: x.replace('\u200b',''))
# Drop the sentinel rows introduced by the padding above.
new_df = new_df[new_df['tokens']!='miss_string']
new_df = new_df.reset_index(drop=True)
new_df.head()

Unnamed: 0,tokens,ner_tags
0,▁Fam,B-Chemical
1,oti,B-Chemical
2,dine,B-Chemical
3,▁-,O
4,▁associated,O


In [39]:
new_df[:20]

Unnamed: 0,tokens,ner_tags
0,▁Fam,B-Chemical
1,oti,B-Chemical
2,dine,B-Chemical
3,▁-,O
4,▁associated,O
5,▁de,B-Disease
6,li,B-Disease
7,rium,B-Disease
8,▁,O
9,.,O
