In [None]:
import os
import sys
dirname = os.path.dirname
# Project root = two directory levels above the notebook's working directory.
# os.path.abspath('.') is already absolute and normalised, so no outer abspath is needed.
BASE_DIR = dirname(dirname(os.path.abspath('.')))
print(BASE_DIR)
# Put the project root first on sys.path so project-local packages (e.g. `utils`) can be imported.
# Guarded so that re-running this cell does not stack duplicate entries at the front.
if sys.path[:1] != [BASE_DIR]:
    sys.path.insert(0, BASE_DIR)

In [None]:
import json

In [None]:
import warnings; warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

import logging

In [None]:
# Grab the root logger and set its threshold to WARNING.
logger = logging.getLogger(name=None)
logger.setLevel("WARNING")

In [None]:
from utils.ner_processor import NerProcessor
from utils.input_example_to_tensors import InputExampleToTensors
from utils.utils import prune_examples
from transformers import BertTokenizer

# START

### Tokenizer

In [None]:
# Load the tokenizer from the local Swedish BERT checkpoint directory under BASE_DIR.
# NOTE(review): the checkpoint name says "uncased" yet do_lower_case=False is passed here,
# while NerProcessor below receives do_lower_case=True and the by-hand example calls
# .lower() explicitly. Presumably lowercasing is done outside the tokenizer on purpose; confirm.
pretrained_model_name = f'{BASE_DIR}/pretrained_models/bert-base-swedish-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name, do_lower_case=False)

### NerProcessor

In [None]:
# Build the project's NER data processor for the Swedish NER corpus stored under BASE_DIR.
# do_lower_case=True is passed here although the tokenizer was created with
# do_lower_case=False. Presumably the processor lowercases the text itself;
# confirm in utils.ner_processor.
dataset_path = f'{BASE_DIR}/datasets/ner/swedish_ner_corpus/'
processor = NerProcessor(dataset_path, tokenizer, do_lower_case=True)

In [None]:
# Label inventory reported by the processor; reused below to configure the tensor conversion.
label_list = processor.get_label_list()
label_list

In [None]:
# Fetch the input examples of the 'train' split from the processor.
train_input_examples_all = processor.get_input_examples('train')

In [None]:
# Reduce the training examples to a subset for this walkthrough.
# NOTE(review): ratio=0.01 presumably keeps about 1% of the examples; confirm in utils.utils.prune_examples.
train_input_examples = prune_examples(train_input_examples_all, ratio=0.01)

### InputExampleToTensors

In [None]:
# Callable that converts one input example into the four values unpacked further below
# (input_ids, input_mask, segment_ids, label_ids).
# NOTE(review): max_seq_length=16 is presumably the fixed output length (truncate/pad),
# kept small so the displayed values stay readable. TODO confirm in utils.input_example_to_tensors.
samples_transformer = InputExampleToTensors(tokenizer, 
                                            max_seq_length=16, 
                                            label_tuple=tuple(label_list))

### Example

In [None]:
# Pick one example (index 2 of the pruned training set) to walk through.
ex = train_input_examples[2]

In [None]:
# Display the example's text_a field (the text that is tokenized below).
ex.text_a

In [None]:
# Display the example's labels_a field. Presumably the labels aligned with text_a; confirm.
ex.labels_a

#### a. tokenize by hand (for comparison to b.)

In [None]:
# Lowercase manually (the tokenizer was created with do_lower_case=False), then tokenize.
# The cell displays the tokens together with their count.
tokens = tokenizer.tokenize(ex.text_a.lower())
tokens, len(tokens)

In [None]:
# Map the tokens to their vocabulary ids; the cell displays the ids and their count.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids, len(token_ids)

#### b. samples transformer

In [None]:
# Run the same example through the transformer; it returns four values, unpacked here.
input_ids, input_mask, segment_ids, label_ids = samples_transformer(ex)

In [None]:
# First value returned by samples_transformer(ex); compare with token_ids from the by-hand run.
input_ids

In [None]:
# Decode input_ids back into tokens so they can be compared with the by-hand `tokens`.
tokenizer.convert_ids_to_tokens(input_ids)

In [None]:
# Second value returned by samples_transformer(ex).
# Presumably marks real tokens vs. padding; confirm in utils.input_example_to_tensors.
input_mask

In [None]:
# Third value returned by samples_transformer(ex).
# Presumably the BERT segment (token-type) ids; confirm in utils.input_example_to_tensors.
segment_ids

In [None]:
# Fourth value returned by samples_transformer(ex).
# Presumably the per-token label ids; see label2id in the next cell for the mapping.
label_ids

In [None]:
# Display the transformer's label2id attribute.
# Presumably maps each label string to the integer used in label_ids; confirm.
samples_transformer.label2id

# Test: minimal BertForTokenClassification forward pass

In [None]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

In [None]:
# Load the 'bert-base-uncased' tokenizer and token-classification model for a standalone forward-pass check.
# NOTE: this rebinds `tokenizer`, replacing the Swedish tokenizer created earlier. Re-running
# the earlier cells that reference `tokenizer` after this point would use this one instead.
# NOTE(review): 'bert-base-uncased' presumably ships no token-classification head, so the
# head would be freshly initialised and the outputs below are only a smoke test; confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')

In [None]:
# Encode the sentence (special tokens included) and nest the id list once so the
# resulting tensor carries a leading batch dimension of size 1.
input_ids = torch.tensor(
    [tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)]
)
input_ids

In [None]:
# One label id (1) per input position, shaped (1, sequence length) like input_ids.
labels = torch.ones((1, input_ids.size(1)), dtype=torch.long)
labels

In [None]:
# Forward pass with labels supplied; the returned object is unpacked into loss and scores below.
outputs = model(input_ids, labels=labels)
outputs

In [None]:
# Take the first two entries of the output: the loss, then the scores.
loss, scores = outputs[:2]

In [None]:
# Display the loss taken from the labelled forward pass.
loss

In [None]:
# Display the scores taken from the labelled forward pass.
scores

In [None]:
# Forward pass without labels: the model returns an output container (no loss entry)
# whose first element holds the classification scores. Index it so `scores` is the
# score tensor itself, consistent with `loss, scores = outputs[:2]`, not the container.
scores = model(input_ids)[0]
scores