In [None]:
! pip install transformers datasets tokenizers evaluate
! pip install transformers[sentencepiece]
! pip install torch
! pip install tensorflow
! pip install spacy
! pip install seqeval
! pip install ipywidgets

In [None]:
! pip install ipywidgets

In [None]:
import transformers
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Smoke test: load the "dslim/bert-base-NER" checkpoint and tag one sentence.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER",
    ignore_mismatched_sizes=True,
)

# "ner" pipeline built from the tokenizer/model pair loaded above.
nlp = pipeline("ner", tokenizer=tokenizer, model=model)

example = "Who is David Cooperfield? He is a magician from New York."
ner_results = nlp(example)
print(ner_results)


In [None]:
# 1. Convert the scenario data to CoNLL-2003 format.
# 2. Load the converted data into a Dataset object.
# If the model only needs tokens and NER tags, this is easy; otherwise we also need POS tags and chunk tags, which would require another model to produce.
import sys
# Raw string: '\G' and '\m' are invalid escape sequences in a regular string
# literal (a warning on current Python versions, slated to become an error).
# The resulting path value is unchanged.
# NOTE(review): machine-specific absolute path — consider making it configurable.
sys.path.insert(1, r'D:\GradProject\mobile_privacy\mobile_privacy')
import datasets
from notebooks.lib_analysis import *

def transform_ner_tags_to_conll2003_format(ner_tags):
    '''
    Transform the ner_tags to CONLL-2003 format.

    Mapping applied to each tag:
        'o'   -> 'O'
        'b-i' -> 'B-MISC'
        'i-i' -> 'I-MISC'

    Args:
        ner_tags: iterable of tag strings.

    Returns:
        A list with exactly one CoNLL-2003 tag per input tag, in the same order.

    Raises:
        ValueError: if a tag is not one of the three known codes. Silently
            skipping it would make the result shorter than the token list and
            shift every following label onto the wrong word.
    '''
    tag_map = {'o': 'O', 'b-i': 'B-MISC', 'i-i': 'I-MISC'}
    res = []
    for position, tag in enumerate(ner_tags):
        try:
            res.append(tag_map[tag])
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which can never be a valid tag.
            raise ValueError(
                f"Unknown NER tag {tag!r} at position {position}; "
                f"expected one of {sorted(tag_map)}"
            ) from None
    return res

def load_dataset(path):
    '''
    Parse `path` with read_and_parse_data and wrap the result in a Dataset object.

    Columns of the returned dataset:
        - id: the id of the scenario
        - tokens: tokenized words
        - ner_tags: the NER tags of the tokens (CoNLL-2003 label set)
    '''
    # One (id, words, converted tags) triple per parsed entry, in iteration order.
    rows = [
        (
            scenario_id,
            entry['words'],
            transform_ner_tags_to_conll2003_format(entry['codes']),
        )
        for scenario_id, entry in read_and_parse_data(path).items()
    ]
    columns = {
        'id': [row[0] for row in rows],
        'tokens': [row[1] for row in rows],
        'ner_tags': [row[2] for row in rows],
    }

    # Label order fixes the integer id assigned to each tag.
    conll_labels = [
        'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'
    ]
    schema = datasets.Features(
        {
            "id": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=conll_labels)
            ),
        }
    )
    return datasets.Dataset.from_dict(columns, features=schema)

# The scenario file becomes the training split; the sample file is the validation split.
samples = load_dataset('sample1_3.txt')
scenarios = load_dataset('scenario1_2.txt')
raw_datasets = datasets.dataset_dict.DatasetDict({'train': scenarios, 'validation': samples})
# Label names of the ner_tags feature; reused below by compute_metrics and the id/label maps.
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names

# auto tokenizer
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Tokenize the first training example; its tokens are already split into words.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)

# tokenize and align dataset
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: label ids indexed by word position.
        word_ids: for each token, the index of the word it came from, or
            None for a special token.

    Returns:
        A list with one entry per element of `word_ids`: -100 for special
        tokens, the word's label for the first token of a word, and for the
        remaining tokens of that word the same label with odd ids bumped by
        one (B-XXX becomes I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: no label, and it ends the current word.
            previous_word = None
            aligned.append(-100)
            continue

        token_label = labels[word_id]
        if word_id == previous_word and token_label % 2 == 1:
            # Continuation token of a word tagged B-XXX: switch to I-XXX.
            token_label += 1
        previous_word = word_id
        aligned.append(token_label)

    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: batch mapping with a "tokens" entry (word lists) and a
            "ner_tags" entry (word-level label ids), index-aligned.

    Returns:
        The tokenizer output for the batch, with a "labels" entry added that
        holds the result of align_labels_with_tokens for each example.
    """
    encoded = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(batch_index))
        for batch_index, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

# Apply tokenize_and_align_labels to every split in batches and drop the
# original columns (taken from the train split's column names).
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# data collation
from transformers import DataCollatorForTokenClassification

# Collator for token classification, built around the tokenizer loaded above.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# metric
import evaluate

# seqeval metric object; compute_metrics() below calls metric.compute on it.
metric = evaluate.load("seqeval")

import numpy as np


def compute_metrics(eval_preds):
    """Turn evaluation logits and label ids into overall seqeval scores.

    Args:
        eval_preds: pair of (logits, labels); the predicted class is the
            argmax of `logits` over its last axis.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", read from the
        "overall_*" entries of the metric result.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tags: drop positions marked -100 (ignored index / special tokens).
    references = []
    for label_row in labels:
        references.append([label_names[l] for l in label_row if l != -100])

    # Predicted tags at exactly the positions kept above.
    hypotheses = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        hypotheses.append(
            [label_names[p] for (p, l) in zip(predicted_row, label_row) if l != -100]
        )

    scores = metric.compute(predictions=hypotheses, references=references)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

# define model
# Integer id <-> label name lookups derived from the dataset's label list.
id2label = dict(enumerate(label_names))
label2id = {name: index for index, name in id2label.items()}
from transformers import AutoModelForTokenClassification

# Load the checkpoint with the label maps above; mismatched weight sizes are
# ignored rather than raising.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)



In [None]:
from transformers import Trainer

from transformers import TrainingArguments

# Training configuration: evaluate and save once per epoch for 30 epochs,
# with learning rate 2e-5 and weight decay 0.01; nothing is pushed to the Hub.
# "bert-finetuned-ner" is the first positional argument (the output directory
# in the TrainingArguments API).
# NOTE(review): `evaluation_strategy` appears to have been renamed to
# `eval_strategy` in newer transformers releases — confirm against the
# installed version, since the install cell does not pin one.
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=30,
    weight_decay=0.01,
    push_to_hub=False,
)
# Wire together the model, tokenized splits, collator and metric function
# defined in the previous cell.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Collect the id, words and codes of every parsed entry into parallel lists.
data1 = read_and_parse_data('../datasets/sample1-TH.json')
data = {'id': [], 'words': [], 'codes': []}
# .items() yields (key, value) pairs, so unpack exactly two names and read the
# fields from the record. Unpacking six names here raised ValueError.
# Assumes each record exposes 'words' and 'codes', as load_dataset() relies on
# — TODO confirm for this JSON file.
for scenario_id, record in data1.items():
    data['id'].append(scenario_id)
    data['words'].append(record['words'])
    data['codes'].append(record['codes'])


In [None]:
# 1. Process the data, to create a raw dataset that fits the format of CONLL-2003
# 2. Create a tokenized dataset, that contains the tokenized version of the raw dataset
# 3. Collate the tokenized dataset with DataCollatorForTokenClassification
# 4. Define a compute_metrics() function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values.
# 5. Create a Trainer object, and train the model
# 6. Evaluate the model on the test set
# Details: https://huggingface.co/course/chapter7/2