In [None]:
! pip install transformers datasets tokenizers evaluate
! pip install transformers[sentencepiece]
! pip install torch
! pip install tensorflow
! pip install spacy
! pip install seqeval
! pip install ipywidgets
! pip install "ray[tune]"

In [None]:
# Colab only: mount Google Drive so the project files under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the models folder; relative paths used later
# in the notebook (data files, the "bert-finetuned-ner" output dir) resolve from here.
%cd /content/drive/MyDrive/mobile_privacy/models

In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Alternative (disabled): load a locally fine-tuned checkpoint instead of the hub model.
# tokenizer = AutoTokenizer.from_pretrained("./bert-finetuned-ner/checkpoint-176")
# model = AutoModelForTokenClassification.from_pretrained("./bert-finetuned-ner/checkpoint-176",ignore_mismatched_sizes=True)


# Load the pretrained NER tokenizer and token-classification model by hub id.
# NOTE(review): `tokenizer` is re-created in the tokenization cell below and the
# training cell builds its model through model_init, so this `model` appears to be
# used only by the commented-out test cell -- confirm before removing.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER",ignore_mismatched_sizes=True)



In [None]:
## Manual smoke test of the NER pipeline on one example (disabled; uncomment to run)
# nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# example = '''The goals that I want to achieve through this screen is to check my previous payments, to see how many payments I have left(along with their amounts), and to possibly make an early payment. The way that I get to this particular page is by first opening the app, tapping on the profile icon on the bottom of the screen, tapping on "Your Orders", tapping on "Filter" and selecting "2021" under "Filter by order date" and then tapping "Apply". I then tap on the purchase, which is an ASUS laptop, scroll down to the "Order info" section and tap on "View and manage monthly payments". From there, I tap on the summary information for the payment plan, which then takes me to the more detailed page(the screenshot). In order to achieve my goals, I either just quickly review the previously paid amount and see what amount is still remaining to be paid or I scroll down a tad bit to tap on "Pay early" in order to make the next payment now instead of waiting for it to be automatically deducted from my payment method on the specified future date.'''
# ner_results = nlp(example)
# for i in ner_results:
#   if i['entity'].startswith('B'):
#     print("\n")
#   print(i['word'], "     ", i['entity'], "   ",i['score'], f"   {i['start']}-{i['end']}")

# print(ner_results)

In [None]:
import sys, os
sys.path.insert(1, '/content/drive/MyDrive/mobile_privacy')
import datasets
from lib_analysis import *
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [None]:
# 1. Convert the scenario data to the CoNLL-2003 format.
# 2. Load the converted data into a Dataset object.
# If the model only needs tokens and NER tags, this is easy; otherwise we also need POS tags and chunk tags, which would require another model to produce.


def transform_ner_tags_to_conll2003_format(ner_tags):
    '''
    Transform the ner_tags to CONLL-2003 format.

    Mapping: 'o' -> 'O', 'b-i' -> 'B-MISC', 'i-i' -> 'I-MISC'.

    Args:
        ner_tags: iterable of raw tag strings.

    Returns:
        A list holding one CoNLL-2003 tag per input tag, in the same order.

    Raises:
        ValueError: if a tag is not one of the three known raw tags. An
            unknown tag used to be dropped silently, which left the tag list
            shorter than the token list and shifted every later label onto
            the wrong token.
    '''
    tag_map = {'o': 'O', 'b-i': 'B-MISC', 'i-i': 'I-MISC'}
    res = []
    for position, tag in enumerate(ner_tags):
        # isinstance guard keeps unhashable junk from raising TypeError in the lookup.
        mapped = tag_map.get(tag) if isinstance(tag, str) else None
        if mapped is None:
            raise ValueError(
                f"Unknown NER tag {tag!r} at position {position}; "
                f"expected one of {sorted(tag_map)}"
            )
        res.append(mapped)
    return res

def load_dataset(path):
    '''
    Read the annotated file at `path` and wrap it in a datasets.Dataset.

    Columns of the returned Dataset:
        - id: the id of the scenario (stored as a string)
        - tokens: the tokenized words of the scenario
        - ner_tags: the NER tag of each token, encoded as a ClassLabel over
          the CoNLL-2003 tag set
    '''
    scenario_ids, token_lists, tag_lists = [], [], []
    for scenario_id, entry in read_and_parse_data(path).items():
        scenario_ids.append(scenario_id)
        token_lists.append(entry['words'])
        tag_lists.append(transform_ner_tags_to_conll2003_format(entry['codes']))

    # The position of a name in this list is the integer id stored in the dataset.
    conll_tag_names = [
        'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'
    ]
    schema = datasets.Features(
        {
            "id": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=conll_tag_names)
            ),
        }
    )
    columns = {'id': scenario_ids, 'tokens': token_lists, 'ner_tags': tag_lists}
    return datasets.Dataset.from_dict(columns, features=schema)

# Load both annotated files and merge them into a single dataset.
samples = load_dataset('sample1_3.txt')
scenarios = load_dataset('scenario1_2.txt')
# Hold out 10% for validation. The split is seeded as well as the shuffle:
# train_test_split draws its own random permutation, so without a seed the
# validation set (and every reported metric) would change from run to run.
raw_datasets = datasets.concatenate_datasets([samples, scenarios]).shuffle(seed=0).train_test_split(test_size=0.1, seed=0)
# Rename the held-out split from 'test' to 'validation'.
raw_datasets = datasets.dataset_dict.DatasetDict({'train': raw_datasets['train'], 'validation': raw_datasets['test']})
# label_names[i] is the string tag for integer label i.
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names

In [None]:
# Show the train/validation splits built above.
print(raw_datasets)

In [None]:
# auto tokenizer
from transformers import AutoTokenizer

# Hub id of the pretrained checkpoint; also read by model_init in the training cell.
model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Encode the first training example, which is already split into words.
# NOTE(review): `inputs` is not referenced again in the cells visible here.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)

# tokenize and align dataset
def align_labels_with_tokens(labels, word_ids):
    '''
    Expand word-level labels to one label per token.

    Args:
        labels: integer label of each word.
        word_ids: for each token, the index of the word it belongs to,
            or None for a special token.

    Returns:
        A list with one label per entry of word_ids:
            - -100 for special tokens (word id None),
            - the word's label for the first token of a word,
            - for the following tokens of the same word, the word's label,
              bumped by one when it is odd (B-XXX becomes I-XXX).
    '''
    aligned = []
    previous = None
    for idx in word_ids:
        if idx is None:
            # Special token
            aligned.append(-100)
        elif idx != previous:
            # First token of a new word
            aligned.append(labels[idx])
        else:
            # Continuation of the previous word: a B-XXX label turns into I-XXX
            tag = labels[idx]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous = idx
    return aligned

def tokenize_and_align_labels(examples):
    '''
    Tokenize a batch of pre-split examples and attach token-level labels.

    Reads examples["tokens"] and examples["ner_tags"], runs the module-level
    tokenizer with truncation enabled, and stores the labels produced by
    align_labels_with_tokens under the "labels" key of the tokenizer output,
    which is then returned.
    '''
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize every split in batches. The original columns are removed so that only
# what tokenize_and_align_labels returns (tokenizer output plus "labels") remains.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
# data collation
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below to assemble batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# metric
import evaluate

# seqeval metric; compute_metrics feeds it lists of string tags.
metric = evaluate.load("seqeval")

import numpy as np


def compute_metrics(eval_preds):
    '''
    Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: a (logits, labels) pair.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the "overall_*" entries computed by the module-level metric.
    '''
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Positions labelled -100 (ignored index) are left out; the remaining ids
    # are converted to their string tags.
    references = []
    for gold_row in labels:
        references.append([label_names[tag] for tag in gold_row if tag != -100])

    hypotheses = []
    for pred_row, gold_row in zip(predictions, labels):
        hypotheses.append(
            [label_names[pred] for pred, tag in zip(pred_row, gold_row) if tag != -100]
        )

    scores = metric.compute(predictions=hypotheses, references=references)
    return {
        name: scores[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }



In [None]:
# define model
# Two-way mapping between integer label ids and tag names, built from label_names
# and passed to the model config in model_init.
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
from transformers import AutoModelForTokenClassification

def model_init():
    '''
    Factory passed to the Trainer as model_init.

    Each call loads the token-classification model for model_checkpoint with
    the notebook's id2label / label2id mappings.

    NOTE(review): the checkpoint's own config may order its labels differently
    from label_names; if so the pretrained classification head starts out
    permuted relative to these ids -- confirm against the checkpoint config.
    '''
    fresh_model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        ignore_mismatched_sizes=True,
        label2id=label2id,
        id2label=id2label,
    )
    return fresh_model

# train
from transformers import Trainer
from transformers import TrainingArguments
from ray import tune

def ray_hp_space(trial):
    '''
    Search space passed as hp_space to trainer.hyperparameter_search.

    learning_rate is sampled log-uniformly between 1e-6 and 1e-4 and
    weight_decay is chosen from 0.01, 0.001 and 0.1. The trial argument
    is not used.
    '''
    search_space = {}
    search_space["learning_rate"] = tune.loguniform(1e-6, 1e-4)
    search_space["weight_decay"] = tune.choice([0.01, 0.001, 0.1])
    return search_space

# Baseline training configuration: output goes to "bert-finetuned-ner" (relative to
# the current working directory), with evaluation and checkpointing once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version, since the install
# cell does not pin one.
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
)
# The Trainer receives model_init rather than a model instance, which
# hyperparameter_search needs in order to build a model for each trial.
trainer = Trainer(
    # model=model,
    model_init = model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Plain single-run training, disabled in favour of the hyperparameter search below.
# trainer.train()

# Ray Tune search over ray_hp_space: 10 trials, run one at a time.
# No compute_objective is passed, so the Trainer's default objective is the one
# being maximized -- presumably derived from the compute_metrics values; verify
# it matches the quantity you want to optimize.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    hp_space=ray_hp_space,
    n_trials=10,
    max_concurrent_trials=1
)

In [None]:
# Show the result returned by the hyperparameter search.
print(best_trial)

In [None]:
# 1. Process the data, to create a raw dataset that fits the format of CONLL-2003
# 2. Create a tokenized dataset, that contains the tokenized version of the raw dataset
# 3. Collate the tokenized dataset with DataCollatorForTokenClassification
# 4. Define a compute_metrics() function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values.
# 5. Create a Trainer object, and train the model
# 6. Evaluate the model on the test set
# Details: https://huggingface.co/course/chapter7/2