In [None]:
! pip install transformers[sentencepiece] datasets tokenizers evaluate scipy scikit-learn
! pip install torch
! pip install tensorflow
! pip install spacy
! pip install seqeval
! pip install ipywidgets
! pip install "ray[tune]"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloadi

In [None]:
# Mount Google Drive at /content/drive, then make the project's models folder the
# working directory so the relative paths used later in the notebook
# ('../datasets/...', './bert-finetuned-ner/...') resolve against it.
# NOTE(review): the saved output of this cell shows .../mobile_privacy/models (without
# "cleaned"), so the cell appears to have been edited after its last run — confirm
# which path is current.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/mobile_privacy/cleaned/models

Mounted at /content/drive
/content/drive/MyDrive/mobile_privacy/models


In [None]:
# imports
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, DataCollatorForTokenClassification, Trainer, TrainingArguments
import sys, os,json
sys.path.insert(1, '/content/drive/MyDrive/mobile_privacy/cleaned')
import datasets
from notebooks.lib_analysis import *
import evaluate
import numpy as np
from ray import tune
from seqeval.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
import spacy
from copy import deepcopy
from spacy.training import offsets_to_biluo_tags, biluo_to_iob
# spaCy "en_core_web_sm" pipeline; convert_jsonl_to_bio runs it on each text to get the
# tokens that character-offset labels are aligned to.
nlp_parser = spacy.load("en_core_web_sm")
# Seed passed to the shuffle / train_test_split calls in load_dataset.
RANDOM_SEED = 0

In [None]:
# NOTE(review): this repeats the import and Drive mount already done in the first mount
# cell earlier in the notebook, and its saved output is "MessageError: ignored".
# It is probably a leftover cell that can be deleted — confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [None]:
# Maps a source tag to the tag name used by the ClassLabel feature in load_dataset.
# Each (source type, target type) pair is expanded into its B- and I- variants, so
# e.g. 'B-PER' -> 'B-NOUN' and 'I-Complex Terms' -> 'I-CMPX'. 'O' maps to itself.
label_match_dict = {
    f'{position}-{source_type}': f'{position}-{target_type}'
    for source_type, target_type in (
        ('PER', 'NOUN'),
        ('ORG', 'CMPX'),
        ('LOC', 'QUES'),
        ('Noun Phrase', 'NOUN'),
        ('Complex Terms', 'CMPX'),
        ('Questions', 'QUES'),
    )
    for position in ('B', 'I')
}
label_match_dict['O'] = 'O'

def transform_ner_tags_to_conll2003_format(ner_tags):
    '''
    Map every tag in ner_tags through label_match_dict.

    Tags that have no entry in label_match_dict become 'O'.
    @ param ner_tags: iterable of tag strings.
    @ return: a new list holding one mapped tag per input tag, in the same order.
    '''
    return [label_match_dict.get(tag, 'O') for tag in ner_tags]

def load_dataset(path, split=(0.9, 0.05, 0.05)):
    '''
    Load the dataset from the path and return it as a shuffled
    train/validation/test DatasetDict.

    The JSON file is read as a mapping from scenario id to a record that has at least:
        - words: the tokenized words of the scenario
        - codes: the NER tags for those words
    Each example of the returned dataset has the fields:
        - id: the id of the scenario
        - tokens: tokenized words
        - ner_tags: the NER tags of the tokens, mapped through
          transform_ner_tags_to_conll2003_format and stored as a ClassLabel sequence
    @ param path: file path to dataset json file.
    @ param split: the [train, validation, test] fractions; three entries that sum to 1.
    @ return: DatasetDict with the keys 'train', 'validation' and 'test'.
    @ raise ValueError: if split does not have exactly three entries or does not sum to 1.
    '''
    # Validate first so a bad split fails before any file I/O or dataset construction.
    if len(split) != 3 or abs(sum(split) - 1.0) > 1e-5:
        raise ValueError(
            "split must be in the form [train, validation, test], "
            f"and the split should sum up to 1 (got {split})"
        )
    _, valid_frac, test_frac = split

    with open(path, 'r', encoding='utf-8') as f:
        scenarios = json.load(f)

    columns = {'id': [], 'tokens': [], 'ner_tags': []}
    for scenario_id, scenario in scenarios.items():
        columns['id'].append(scenario_id)
        columns['tokens'].append(scenario['words'])
        columns['ner_tags'].append(transform_ner_tags_to_conll2003_format(scenario['codes']))

    features = datasets.Features(
        {
            "id": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(
                    names=[
                        'O', 'B-NOUN', 'I-NOUN', 'B-CMPX', 'I-CMPX', 'B-QUES', 'I-QUES'
                    ]
                )
            ),
        }
    )
    dataset = datasets.Dataset.from_dict(columns, features=features)

    # First cut: train vs. everything held out (validation + test together).
    holdout_frac = valid_frac + test_frac
    train_holdout = dataset.shuffle(seed=RANDOM_SEED).train_test_split(
        test_size=holdout_frac, seed=RANDOM_SEED
    )
    # Second cut: divide the held-out part into validation and test. test_size here is
    # relative to the held-out part, hence the division by holdout_frac.
    valid_test = train_holdout['test'].train_test_split(
        test_size=test_frac / holdout_frac, seed=RANDOM_SEED
    )
    # Gather the three pieces into a single DatasetDict.
    return datasets.dataset_dict.DatasetDict(
        {
            'train': train_holdout['train'],
            'validation': valid_test['train'],
            'test': valid_test['test'],
        }
    )

In [None]:
def convert_jsonl_to_bio(jsonl_list):
    '''
    Convert character-offset annotations into per-token IOB tags.

    @ param jsonl_list: iterable of dicts with the keys 'scenario_id', 'text' and 'label';
        'label' is handed to spaCy's offsets_to_biluo_tags as the entity offsets.
    @ return: dict keyed by scenario id; each value holds 'id', 'clean_text',
        'words' (the nlp_parser tokens as strings) and 'codes' (the IOB tags).
    '''
    converted = {}
    for record in jsonl_list:
        scenario_id = record['scenario_id']
        text = record['text']
        offsets = record['label']
        parsed = nlp_parser(text)
        # Offsets -> BILUO tag per token, then BILUO -> IOB.
        tags = biluo_to_iob(offsets_to_biluo_tags(parsed, offsets))
        converted[scenario_id] = {
            'id': scenario_id,
            'clean_text': text,
            'words': [str(token) for token in parsed],
            'codes': tags,
        }
    return converted

def match_label(label):
    '''
    Return a new list with every tag in label mapped through label_match_dict.

    Tags that have no entry in label_match_dict are mapped to 'O', the same fallback
    transform_ner_tags_to_conll2003_format uses. This covers the '-' tag that spaCy's
    offsets_to_biluo_tags produces for tokens of a span it cannot align to token
    boundaries; a strict lookup raises KeyError on it.
    The input list is not modified.
    @ param label: list of tag strings.
    @ return: list of mapped tag strings, same length and order as label.
    '''
    return [label_match_dict.get(tag, 'O') for tag in label]

In [None]:
def test_a_scenario(test_scenario, nlp):
    '''
    Print a per-word table of the true label next to the predicted label.

    Input:
    @ param test_scenario: a scenario dict with the keys 'id', 'words', 'codes'
        and 'clean_text'
    @ param nlp: a pipeline generated from pipeline(); it is called on the scenario text
        and each item it yields is read for 'start', 'end' and 'entity_group'
    '''
    scenario_id = test_scenario['id']
    tokens = test_scenario['words']
    gold_tags = test_scenario['codes']
    text = test_scenario['clean_text']

    # Collect the predicted spans as [start, end, group] triples, then let
    # convert_jsonl_to_bio turn them into one tag per token.
    predicted_spans = [
        [entity['start'], entity['end'], entity['entity_group']]
        for entity in nlp(text)
    ]
    prediction = {'scenario_id': scenario_id, 'text': text, 'label': predicted_spans}
    predicted_tags = convert_jsonl_to_bio([prediction])[scenario_id]['codes']

    # Bring both tag sequences into the same naming scheme before comparing them.
    gold_tags = match_label(gold_tags)
    predicted_tags = match_label(predicted_tags)

    print(f" {'WORD':<16} {'TRUE LABEL':<16} {'PREDICTION'}")
    print(f"{'-'*48}")
    for position, word in enumerate(tokens):
        print(f" {word:<16} {gold_tags[position]:<16} {predicted_tags[position]}")


In [None]:
# for testing: build the test split and the inference pipeline used by test_a_scenario

# 1. load dataset
# change DATASET_PATH to your dataset path
# (relative path: resolved against the current working directory)
DATASET_PATH = '../datasets/dataset_300.json'
# 80% train / 10% validation / 10% test
raw_datasets = load_dataset(DATASET_PATH, [0.8, 0.1, 0.1])
test_set = raw_datasets["test"]
ner_feature = test_set.features["ner_tags"]
# tag names of the ClassLabel feature, in label-id order
label_names = ner_feature.feature.names
# Re-read the unsplit JSON: load_dataset only keeps id/tokens/ner_tags, while
# test_a_scenario reads 'words', 'codes' and 'clean_text' from the original records.
with open(DATASET_PATH) as f:
    raw_dataset = json.load(f)
# original records of the scenarios that ended up in the test split, in split order
test_scenarios = [raw_dataset[id] for id in test_set['id']]

# 2. testing
# change MODEL_PATH to your model's path
MODEL_PATH = "./bert-finetuned-ner/checkpoint-300"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# NOTE(review): ignore_mismatched_sizes=True lets loading continue when checkpoint weight
# shapes differ from the model's — confirm this is wanted for an already fine-tuned
# checkpoint, since any mismatched weights would not come from the checkpoint.
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH,ignore_mismatched_sizes=True)
# test_a_scenario reads 'start', 'end' and 'entity_group' from this pipeline's results
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")


In [None]:
# here is an example of making a prediction:
# print the true-vs-predicted label table for the first scenario of the test split

test_scenario = test_scenarios[0]
test_a_scenario(test_scenario, nlp)

 WORD             TRUE LABEL       PREDICTION
------------------------------------------------
 I                O                O
 use              O                O
 this             O                O
 for              O                O
 quick            O                O
 recordings       O                O
 ,                O                O
 mostly           O                O
 when             O                O
 I                O                O
 am               O                O
 writing          O                O
 songs            B-NOUN           B-NOUN
 or               O                O
 sketching        O                O
 ideas            B-CMPX           B-CMPX
 for              I-CMPX           I-CMPX
 future           I-CMPX           I-CMPX
 songs            I-CMPX           B-NOUN
 .                O                O
 I                O                O
 open             O                O
 the              O                O
 app              O          