In [None]:
# Downstream task identifier: selects the dataset and checkpoint sub-directories
# and is passed to get_available_datasets() further down.
DOWNSTREAM_TASK = 'ner'

# START

In [None]:
import warnings; warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

import logging

import os
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertForTokenClassification
from pytorch_pretrained_bert import BertAdam

from utils.utils import get_available_models, get_available_datasets, prune_examples, ENV_VARIABLE
from utils.bert_dataset import BertDataset
from utils.input_example import InputExample
from utils.input_example_to_tensors import InputExampleToTensors
from utils.ner_processor import NerProcessor
from utils.ner_trainer import NERTrainer

In [None]:
# Root logger (getLogger() without a name): raise its threshold to WARNING so that
# lower-severity records are dropped.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

In [None]:
# Base locations come from the project's ENV_VARIABLE mapping; the dataset and
# checkpoint directories additionally get a per-task sub-directory.
DIR_PRETRAINED_MODELS = ENV_VARIABLE['DIR_PRETRAINED_MODELS']
DIR_DATASETS, DIR_CHECKPOINTS = (
    os.path.join(ENV_VARIABLE[env_key], DOWNSTREAM_TASK)
    for env_key in ('DIR_DATASETS', 'DIR_CHECKPOINTS')
)

### 0. Available Models & Datasets

In [None]:
# Models reported as available by the project helper; model_name is checked against this below.
available_models = get_available_models()
available_models

In [None]:
# Datasets reported as available for the current task; dataset is checked against this below.
available_datasets = get_available_datasets(DOWNSTREAM_TASK)
available_datasets

### 1. Settings

In [None]:
# Model to fine-tune; swap in the commented line to use the multilingual model instead.
model_name = 'bert-base-swedish-uncased'
#model_name = 'bert-base-multilingual-uncased'

# Fail early, and say which names are valid, if the chosen model is not available.
assert model_name in available_models, (
    f'Unknown model {model_name!r}; available models: {available_models}'
)

In [None]:
# Dataset to train on; swap in the commented line to use SUC instead.
#dataset = 'SUC'
dataset = 'swedish_ner_corpus'

# Fail early, and say which names are valid, if the chosen dataset is not available.
assert dataset in available_datasets, (
    f'Unknown dataset {dataset!r}; available datasets: {available_datasets}'
)

### 2. Tokenizer

In [None]:
# Lower-cased Swedish sample text used to sanity-check the tokenizer below.
# Adjacent string literals are concatenated by the parser, so no `+` is needed.
example_sentence = (
    'iran har hittills inte reagerat på någondera av de stora påkarna som saudier och irakier hött med :'
    ' landsbergis ansåg att gorbatjovs lördagsappell visade att denne ignorerar vädjanden från väst om att'
    ' börja tala med regeringen i vilnius .'
)

In [None]:
# Models with 'swedish' in their name are loaded from the local pretrained-models
# directory; any other model name is handed to from_pretrained unchanged.
pretrained_model_name = (
    f'{DIR_PRETRAINED_MODELS}/{model_name}' if 'swedish' in model_name else model_name
)

pretrained_model_name

In [None]:
# Load the WordPiece tokenizer that belongs to the selected model.
# NOTE(review): do_lower_case is False here although the model name says "uncased" and the
# NerProcessor below is created with do_lower_case=True; the original author marked this
# setting as required — confirm before changing it.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name, do_lower_case=False)  # needs to be False !!

In [None]:
# Sanity check: split the sample sentence into tokens and show the count plus the first ten.
tokenized_text = tokenizer.tokenize(example_sentence)
len(tokenized_text), tokenized_text[:10]

In [None]:
# Sanity check: map the tokens to vocabulary ids and show the count plus the first three.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
len(indexed_tokens), indexed_tokens[:3]

### 3. Processor (Data)

In [None]:
# Resolve the directory that holds the files of the selected dataset.
if dataset == 'SUC':
    dataset_path = f'{DIR_DATASETS}/SUC/moreTags/'
elif dataset == 'swedish_ner_corpus':
    dataset_path = f'{DIR_DATASETS}/swedish_ner_corpus/'
else:
    # Without this branch an unsupported dataset would leave dataset_path undefined
    # (NameError below) or silently reuse a stale value from an earlier run of this cell.
    raise ValueError(f'No dataset path configured for dataset {dataset!r}')

dataset_path

In [None]:
# Project helper that reads the dataset files and produces input examples and the label list.
# NOTE(review): do_lower_case=True here, while the tokenizer above was created with
# do_lower_case=False — confirm this combination is intended.
processor = NerProcessor(dataset_path, tokenizer, do_lower_case=True)
processor

In [None]:
# Labels known to the processor; len(label_list) later determines the size of the classifier head.
label_list = processor.get_label_list()
label_list

#### Prune Examples (Temp)

In [None]:
# Ratio handed to prune_examples() for both the training and the validation examples
# (temporary setting, per the section heading).
prune_ratio = 0.1

#### Train Data

In [None]:
# All input examples of the 'train' split, before pruning.
train_input_examples_all = processor.get_input_examples('train')

In [None]:
# Pruned set of training examples actually used below (controlled by prune_ratio).
train_input_examples = prune_examples(train_input_examples_all, ratio=prune_ratio)

In [None]:
# Inspect one training example: its id, raw text, the tokenizer's view of it and its labels.
sample_train_example = train_input_examples[8]
print(sample_train_example.guid)
print(sample_train_example.text_a)
print(tokenizer.tokenize(sample_train_example.text_a))
print(sample_train_example.labels_a)

#### Validation Data

In [None]:
# NOTE(review): the validation examples are read from the 'test' split — confirm this is
# intended (e.g. because the dataset ships no separate validation split).
valid_input_examples_all = processor.get_input_examples('test')

In [None]:
# Pruned set of validation examples actually used below (controlled by prune_ratio).
valid_input_examples = prune_examples(valid_input_examples_all, ratio=prune_ratio)

In [None]:
# Show how many validation examples remain, then inspect one of them:
# raw text, the tokenizer's view of it and its labels.
print(len(valid_input_examples))
sample_valid_example = valid_input_examples[1]
print(sample_valid_example.text_a)
print(tokenizer.tokenize(sample_valid_example.text_a))
print(sample_valid_example.labels_a)

#### Dataloader

In [None]:
# Number of examples per batch for both the training and the validation DataLoader.
batch_size = 16

In [None]:
# Example label sequence left by the original author for reference:
#B-LOC O B-TME O O O O O O O O O O O O O O O B-PRS O O B-PRS O O O O O O O O O O O O O O O B-LOC O
# Transform applied by BertDataset to each input example, using sequences of length 64.
samples_transformer = InputExampleToTensors(
    tokenizer,
    max_seq_length=64,
    label_tuple=tuple(label_list),
)

In [None]:
# Training set: examples are transformed on access and drawn in random order.
train_data = BertDataset(train_input_examples, transform=samples_transformer)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [None]:
# Validation set: examples are transformed on access and visited in their original order.
valid_data = BertDataset(valid_input_examples, transform=samples_transformer)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

### 4. Model

In [None]:
# Re-display the model name/path that the next cell loads.
pretrained_model_name

In [None]:
# Load the pretrained BERT weights with a token-classification head that has
# one output per entry of label_list.
n_labels = len(label_list)
model = BertForTokenClassification.from_pretrained(
    pretrained_model_name,
    num_labels=n_labels,
)

model

### 5. Train

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

In [None]:
# Build the trainer with separate training and validation loaders.
# The validation loader was previously train_dataloader as well, which left valid_dataloader
# unused and made every "valid" metric a training-set metric.
# NOTE(review): assumes NERTrainer's third positional parameter is the validation
# dataloader — confirm against utils/ner_trainer.py.
trainer = NERTrainer(model,
                     train_dataloader,
                     valid_dataloader,
                     label_list,
                     # Mixed precision only when a CUDA device is available.
                     fp16=torch.cuda.is_available()
                    )

trainer

In [None]:
# Fine-tune for 2 epochs at learning rate 2e-5.
# NOTE(review): the trailing "4" presumably records the epoch count used for full runs — confirm.
trainer.fit(learning_rate=2e-5, num_epochs=2)  # 4

In [None]:
def display(_metrics):
    """Print the collected training and validation metrics.

    Prints the per-batch metrics of the 'train' split, then the per-batch and
    per-epoch metrics of the 'valid' split, each under a short heading.

    Args:
        _metrics: Nested mapping indexed as ``_metrics[granularity][split]``;
            the entries read are ['batch']['train'], ['batch']['valid'] and
            ['epoch']['valid'].

    NOTE(review): this name shadows IPython's built-in ``display`` for the
    rest of the notebook session.
    """
    layout = (
        ('--- train ---', (('> batch', 'batch', 'train'),)),
        ('--- valid ---', (('> batch', 'batch', 'valid'),
                           ('> epoch', 'epoch', 'valid'))),
    )
    for banner, entries in layout:
        print(banner)
        for heading, granularity, split in entries:
            print(heading)
            print(_metrics[granularity][split])
    
# Print the metrics the trainer collected during fit().
display(trainer.metrics)

#### Different training rates

#### Save Model Checkpoint

In [None]:
# Persist only the model weights (state_dict), named after dataset and model.
# NOTE(review): the './' prefix makes the path relative to the working directory even if
# DIR_CHECKPOINTS is an absolute path — confirm this is intended.
checkpoint_path = f'./{DIR_CHECKPOINTS}/saved__{dataset}__{model_name}.pkl'
# torch.save does not create missing parent directories, so make sure the target exists.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

### 6. Investigate

In [None]:
# Hand-written test sentence, lower-cased before it is fed to the fine-tuned model.
sent_tokenizer = tokenizer
sentence_lower = 'En som arbetar mycket hårt är erik som är politisk aktiv inom anderst i sverige .'.lower()
print(sentence_lower)

In [None]:
# Wrap the test sentence in an InputExample with an empty guid.
# NOTE(review): a single label='O' is passed for the whole sentence, while the dataset
# examples above expose per-token labels via `labels_a` — confirm InputExample's signature.
example = InputExample("", sentence_lower, label='O')
example

In [None]:
# Convert the single example into model inputs.
# NOTE(review): max_seq_length is 128 here but 64 for the transformer used to build the
# training/validation data — confirm the difference is intended.
to_tensors = InputExampleToTensors(
    tokenizer,
    max_seq_length=128,
    label_tuple=tuple(label_list),
)
input_ids, input_mask, segment_ids, label_id = to_tensors(example)
input_ids, input_mask, segment_ids, label_id

In [None]:
# Reshape the token ids and segment ids to a single row each, i.e. a batch of one.
tokens_tensor, segments_tensors = (
    tensor.view(1, -1) for tensor in (input_ids, segment_ids)
)

In [None]:
# Put the model and its inputs on the selected device.
# - `device` is a torch.device, which does not compare equal to the plain string 'cuda',
#   so the former `if device == 'cuda'` guard never ran; moving unconditionally is
#   correct for both CPU and GPU.
# - Tensor.to() returns a new tensor instead of moving in place, so the results are
#   re-assigned (Module.to() does modify the module in place).
model.to(device)
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)

In [None]:
# Switch the model to evaluation mode (e.g. disables dropout) before running inference.
model.eval()

In [None]:
# Run inference on the single-sentence batch.
# - Arguments are passed by keyword: in `transformers` the second positional parameter of
#   forward() is `attention_mask`, so passing the segment ids positionally used them
#   (all zeros for a single sentence) as the attention mask.
# - The real attention mask (input_mask) is supplied so that padding positions are ignored.
#   Assumes input_mask is a 1-D tensor like input_ids — TODO confirm.
# - Inputs are moved to the device the model lives on, and no autograd graph is built.
model_device = next(model.parameters()).device
with torch.no_grad():
    logits = model(tokens_tensor.to(model_device),
                   attention_mask=input_mask.view(1, -1).to(model_device),
                   token_type_ids=segments_tensors.to(model_device))
logits

In [None]:
# Highest-scoring label id per position; iterating the result yields one tensor per batch row.
res = list(logits[0].argmax(-1))
res

In [None]:
# Same prediction via NumPy: copy the logits to host memory and take the argmax over the last of three axes.
np_logits = logits[0].detach().cpu().numpy()
np_logits.argmax(axis=2)

In [None]:
# Predicted label ids of the first (only) batch row as a plain list.
predicted_ids = np.argmax(np_logits, axis=2)[0].tolist()
# Drop the first position — presumably the [CLS] token — TODO confirm.
lst = predicted_ids[1:]
lst

In [None]:
# Tokens of the test sentence, to be paired with the predicted label ids below.
splitinput = tokenizer.tokenize(sentence_lower)
splitinput

In [None]:
# Pair each predicted label id with its token and print entity hits with a readable prefix;
# every other id is printed as-is next to its token. zip() stops at the shorter sequence,
# so predictions for padding positions are ignored.
# NOTE(review): the ids 4-8 are hard-coded and presumably correspond to positions in
# label_list for one specific dataset — confirm against label_list.
entity_prefix_by_id = {
    4: "PERSON: ",
    5: "ORG: ",
    6: "LOCATION: ",
    7: "WORK: ",
    8: "PRODUCT: ",
}
for tag_id, piece in zip(lst, splitinput):
    prefix = entity_prefix_by_id.get(tag_id)
    if prefix is None:
        print(tag_id, piece)
    else:
        print(prefix + piece)

In [None]:
# Ratio of correct to total counts per entity class, read from the trainer's counters.
# NOTE(review): raises ZeroDivisionError if a class total is 0 — confirm the counters are
# always populated after fit().
for tag, key in (("PER", "per"), ("LOC", "loc"), ("ORG", "org")):
    n_correct = getattr(trainer, f"total_{key}_correct")
    n_total = getattr(trainer, f"total_{key}")
    # !s forces str(), matching the original "... + str(ratio)" formatting.
    print(f"{tag}: {n_correct / n_total!s}")

In [None]:
# Display the trainer's labelDict attribute (presumably a label mapping — TODO confirm).
trainer.labelDict

In [None]:
# Display the trainer's val_f1_score_hist attribute (presumably the validation F1 history — TODO confirm).
trainer.val_f1_score_hist

In [None]:
def cluster(my_list, n):
    """Average consecutive chunks of ``my_list``.

    The input is split into consecutive chunks of ``n`` items (the final chunk
    may be shorter) and the arithmetic mean of every chunk is returned.

    Args:
        my_list: Sequence of numbers that supports ``len`` and slicing.
        n: Chunk size; must be a positive integer.

    Returns:
        A list with one mean per chunk; empty when ``my_list`` is empty.

    Raises:
        ValueError: If ``n`` is not positive. Previously ``n == 0`` failed with
            an opaque ZeroDivisionError and a negative ``n`` silently returned
            an empty list.
    """
    if n <= 0:
        raise ValueError(f'n must be a positive integer, got {n!r}')
    # Stepping through the start offsets never produces an empty chunk.
    chunks = (my_list[start:start + n] for start in range(0, len(my_list), n))
    return [sum(chunk) / len(chunk) for chunk in chunks]