In [None]:
! pip install transformers datasets tokenizers evaluate
! pip install transformers[sentencepiece]
! pip install torch
! pip install tensorflow
! pip install spacy
! pip install seqeval
! pip install ipywidgets
! pip install "ray[tune]" transformers datasets scipy scikit-learn torch

In [2]:
# mount to folder
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/mobile_privacy/cleaned/models

Mounted at /content/drive
/content/drive/MyDrive/mobile_privacy/cleaned/models


In [4]:
# imports
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import sys, os,json
sys.path.insert(1, '/content/drive/MyDrive/mobile_privacy/cleaned')
import datasets
import transformers
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np
from transformers import Trainer
from transformers import TrainingArguments
from ray import tune
from seqeval.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
RANDOM_SEED = 0

In [21]:
label_match_dict = {
    'B-PER': 'B-NOUN',
    'I-PER': 'I-NOUN',
    'B-ORG': 'B-CMPX',
    'I-ORG': 'I-CMPX',
    'B-LOC': 'B-QUES',
    'I-LOC': 'I-QUES',
    'B-Noun Phrase': 'B-NOUN',
    'I-Noun Phrase': 'I-NOUN',
    'B-Complex Terms': 'B-CMPX',
    'I-Complex Terms': 'I-CMPX',
    'B-Questions': 'B-QUES',
    'I-Questions': 'I-QUES',
    'O': 'O'
}

def transform_ner_tags_to_conll2003_format(ner_tags):
    '''
    Transform the ner_tags to CONLL-2003 format.
    '''
    res = []
    for tag in ner_tags:
        res.append(label_match_dict.get(tag,'O'))
    return res

def load_dataset(path, split=[0.9, 0.05, 0.05]):
    '''
    Load the dataset from the path as a Dataset object.
    Dataset format is:
        - id: the id of the scenario
        - tokens: tokennized words
        - ner_tags: the NER tags of the tokens
    @ param path: file path to dataset json file.
    @ param split: the [train, validation, test] split.
    '''
    with open(path, 'r') as f:
        data1 = json.load(f)
    res = {'id': [], 'tokens': [], 'ner_tags': []}
    for id, data in data1.items():
        res['id'].append(id)
        res['tokens'].append(data['words'])
        res['ner_tags'].append(transform_ner_tags_to_conll2003_format(data['codes']))
    
    dataset = datasets.Dataset.from_dict(res, features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                'O', 'B-NOUN', 'I-NOUN', 'B-CMPX', 'I-CMPX', 'B-QUES', 'I-QUES'
                            ]
                        )
                    ),
                }
            ))
    if len(split) != 3 or abs(split[0] + split[1] + split[2] - 1.0) > 1e-5:
      print(split)
      raise Exception("split must be in the form [train, validation, test], and the split should sum up to 1")
    train_testvalid = dataset.shuffle(seed=RANDOM_SEED).train_test_split(test_size=split[1] + split[2], seed=RANDOM_SEED)
    # Split the 10% test + valid in half test, half valid
    test_valid = train_testvalid['test'].train_test_split(test_size=split[2]/(split[1] + split[2]), seed=RANDOM_SEED)
    # gather everyone if you want to have a single DatasetDict
    train_test_valid_dataset = datasets.dataset_dict.DatasetDict(
        {
        'train': train_testvalid['train'],
        'validation': test_valid['train'],
        'test': test_valid['test']
        }
    )
    return train_test_valid_dataset

In [8]:
# tokenize and align dataset
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples, **kwargs):
    '''
    Input: a row of Dataset
    Output: use dataset.map() method to map this function for each row.
            This will tokenize each row and align the original labels.
    '''
    tokenizer = kwargs['tokenizer']
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [22]:
# execution codes
# 1. load dataset
# change DATASET_PATH to your dataset path
DATASET_PATH = '../datasets/dataset_300.json'
raw_datasets = load_dataset(DATASET_PATH, [0.8, 0.1, 0.1])
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names

# 2. auto tokenizer
model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
    fn_kwargs={'tokenizer':tokenizer}
)

# 3. data collator and metric
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    MultiLabelBinarizer().fit_transform(true_labels)
    MultiLabelBinarizer().fit_transform(true_predictions)
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    # return all_metrics
    print(classification_report(true_labels,true_predictions ))
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# 4. define model
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

def model_init():
    return AutoModelForTokenClassification.from_pretrained(
            model_checkpoint,
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
    
# 5. train
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
)
trainer = Trainer(
    # model=model,
    model_init = model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--dslim--bert-base-NER/snapshots/f7c2808a659015eeb8828f3f809a2f1be67a2446/config.json
Model config BertConfig {
  "_name_or_path": "dslim/bert-base-NER",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-NOUN",
    "2": "I-NOUN",
    "3": "B-CMPX",
    "4": "I-CMPX",
    "5": "B-QUES",
    "6": "I-QUES"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-CMPX": 3,
    "B-NOUN": 1,
    "B-QUES": 5,
    "I-CMPX": 4,
    "I-NOUN": 2,
    "I-QUES": 6,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "positi

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/433M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--dslim--bert-base-NER/snapshots/f7c2808a659015eeb8828f3f809a2f1be67a2446/pytorch_model.bin
All model checkpoint weights were used when initializing BertForTokenClassification.

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--dslim--bert-base-NER/snapshots/f7c2808a659015eeb8828f3f809a2f1be67a2446/config.json
Model config BertConfig {
  "_name_or_path

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.519263,0.307692,0.433498,0.359918,0.845686
2,No log,0.368536,0.464413,0.642857,0.539256,0.879622
3,No log,0.342457,0.51992,0.642857,0.57489,0.888739
4,No log,0.365781,0.486989,0.64532,0.555085,0.878271
5,No log,0.393313,0.50947,0.662562,0.576017,0.879791
6,No log,0.430155,0.552795,0.657635,0.600675,0.891102
7,No log,0.44836,0.549603,0.682266,0.608791,0.883674
8,No log,0.47,0.547945,0.689655,0.610687,0.884687
9,No log,0.474582,0.533333,0.689655,0.601504,0.88283
10,No log,0.47762,0.559184,0.674877,0.611607,0.884518


***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        CMPX       0.00      0.00      0.00        30
        NOUN       0.31      0.53      0.39       332
        QUES       0.00      0.00      0.00        44

   micro avg       0.31      0.43      0.36       406
   macro avg       0.10      0.18      0.13       406
weighted avg       0.25      0.43      0.32       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-30
Configuration saved in bert-finetuned-ner/checkpoint-30/config.json
Model weights saved in bert-finetuned-ner/checkpoint-30/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-30/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-30/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.00      0.00      0.00        30
        NOUN       0.57      0.75      0.65       332
        QUES       0.10      0.25      0.14        44

   micro avg       0.46      0.64      0.54       406
   macro avg       0.22      0.33      0.26       406
weighted avg       0.47      0.64      0.54       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-60
Configuration saved in bert-finetuned-ner/checkpoint-60/config.json
Model weights saved in bert-finetuned-ner/checkpoint-60/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-60/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-60/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.00      0.00      0.00        30
        NOUN       0.62      0.74      0.67       332
        QUES       0.16      0.34      0.22        44

   micro avg       0.52      0.64      0.57       406
   macro avg       0.26      0.36      0.30       406
weighted avg       0.52      0.64      0.57       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-90
Configuration saved in bert-finetuned-ner/checkpoint-90/config.json
Model weights saved in bert-finetuned-ner/checkpoint-90/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-90/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-90/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.00      0.00      0.00        30
        NOUN       0.60      0.72      0.66       332
        QUES       0.28      0.50      0.35        44

   micro avg       0.49      0.65      0.56       406
   macro avg       0.29      0.41      0.34       406
weighted avg       0.52      0.65      0.58       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-120
Configuration saved in bert-finetuned-ner/checkpoint-120/config.json
Model weights saved in bert-finetuned-ner/checkpoint-120/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-120/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-120/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.01      0.03      0.02        30
        NOUN       0.64      0.72      0.68       332
        QUES       0.35      0.64      0.46        44

   micro avg       0.51      0.66      0.58       406
   macro avg       0.34      0.46      0.38       406
weighted avg       0.56      0.66      0.61       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-150
Configuration saved in bert-finetuned-ner/checkpoint-150/config.json
Model weights saved in bert-finetuned-ner/checkpoint-150/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-150/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-150/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.00      0.00      0.00        30
        NOUN       0.66      0.73      0.69       332
        QUES       0.33      0.52      0.40        44

   micro avg       0.55      0.66      0.60       406
   macro avg       0.33      0.42      0.37       406
weighted avg       0.57      0.66      0.61       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-180
Configuration saved in bert-finetuned-ner/checkpoint-180/config.json
Model weights saved in bert-finetuned-ner/checkpoint-180/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-180/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-180/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.02      0.03      0.03        30
        NOUN       0.64      0.75      0.69       332
        QUES       0.40      0.61      0.49        44

   micro avg       0.55      0.68      0.61       406
   macro avg       0.36      0.47      0.40       406
weighted avg       0.57      0.68      0.62       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-210
Configuration saved in bert-finetuned-ner/checkpoint-210/config.json
Model weights saved in bert-finetuned-ner/checkpoint-210/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-210/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-210/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.02      0.03      0.03        30
        NOUN       0.64      0.76      0.69       332
        QUES       0.39      0.59      0.47        44

   micro avg       0.55      0.69      0.61       406
   macro avg       0.35      0.46      0.40       406
weighted avg       0.56      0.69      0.62       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-240
Configuration saved in bert-finetuned-ner/checkpoint-240/config.json
Model weights saved in bert-finetuned-ner/checkpoint-240/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-240/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-240/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.02      0.03      0.02        30
        NOUN       0.63      0.76      0.69       332
        QUES       0.38      0.59      0.46        44

   micro avg       0.53      0.69      0.60       406
   macro avg       0.34      0.46      0.39       406
weighted avg       0.56      0.69      0.61       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-270
Configuration saved in bert-finetuned-ner/checkpoint-270/config.json
Model weights saved in bert-finetuned-ner/checkpoint-270/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-270/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-270/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


              precision    recall  f1-score   support

        CMPX       0.02      0.03      0.03        30
        NOUN       0.66      0.75      0.70       332
        QUES       0.37      0.57      0.45        44

   micro avg       0.56      0.67      0.61       406
   macro avg       0.35      0.45      0.39       406
weighted avg       0.58      0.67      0.62       406



Saving model checkpoint to bert-finetuned-ner/checkpoint-300
Configuration saved in bert-finetuned-ner/checkpoint-300/config.json
Model weights saved in bert-finetuned-ner/checkpoint-300/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-300/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-300/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=300, training_loss=0.22299659729003907, metrics={'train_runtime': 7821.2481, 'train_samples_per_second': 0.307, 'train_steps_per_second': 0.038, 'total_flos': 309613907245104.0, 'train_loss': 0.22299659729003907, 'epoch': 10.0})