In [None]:
import warnings; warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
import logging
import torch
from transformers import *
from pytorch_pretrained_bert import BertAdam
from utils.datasets import *
from utils.processors import *
from utils.train import *
from utils.datasets import BertDataset as BDS
#from utils import BertDataset, NERProcessor, InputExampleToTensors, NERTrainer,InputExample,SwedishNERCorpusProcessor,SUCProcessor
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd
import nltk
import matplotlib.pyplot as plt

In [None]:
# Configure the root logger so that only WARNING and more severe records are
# handled; INFO/DEBUG records propagated to the root are dropped.
LOG_LEVEL = logging.WARNING
logger = logging.getLogger()
logger.setLevel(LOG_LEVEL)

In [None]:
# Pretrained checkpoint used for both the tokenizer and (later) the model.
# Alternatives the author left commented out:
#   'bert-base-uncased'
#   './swe-uncased_L-12_H-768_A-12'
model_name = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

# Smoke test: tokenize one lower-cased Swedish sentence and map the resulting
# tokens to vocabulary ids.
sample_sentence = 'iran har hittills inte reagerat på någondera av de stora påkarna som saudier och irakier hött med : landsbergis ansåg att gorbatjovs lördagsappell visade att denne ignorerar vädjanden från väst om att börja tala med regeringen i vilnius .'
tokenized_text = tokenizer.tokenize(sample_sentence)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

In [None]:
# Build the project-defined SUCProcessor over the 'SUC/moreTags/' directory,
# sharing the tokenizer created above, and ask it for the set of labels.
processor = SUCProcessor('SUC/moreTags/', tokenizer, do_lower_case=True)
label_list = processor.get_label_list()
# Bare expression: shown as the cell output so the labels can be inspected.
label_list

In [None]:
# Load the training examples from the processor and eyeball one of them
# (index 8): its text followed by its label.
train_examples = processor.get_train_examples()

shown = train_examples[8]
print(shown.text_a)
print(shown.label)

In [None]:
# Inspect a second training example (index 1): text first, then its label.
shown = train_examples[1]
print(shown.text_a)
print(shown.label)

In [None]:
# Example tag sequence kept from the author's notes:
# B-LOC O B-TME O O O O O O O O O O O O O O O B-PRS O O B-PRS O O O O O O O O O O O O O O O B-LOC O

batch_size = 16

# Transform applied by the dataset to every example; sequences are limited to
# 64 positions via max_seq_length.
samples_transformer = InputExampleToTensors(
    tokenizer,
    max_seq_length=64,
    label_list=label_list,
)

# Training set wrapped in a DataLoader that draws samples in random order.
train_data = BDS(train_examples, transform=samples_transformer)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# NOTE(review): the examples are fetched again here although the dataset above
# already holds the previous list — looks redundant; confirm before removing.
train_examples = processor.get_train_examples()

shown = train_examples[5]
print(shown.text_a)
print(shown.label)

In [None]:
# NOTE(review): the validation data is taken from get_test_examples(); confirm
# that evaluating on the test split during training is intended.
val_examples = processor.get_test_examples()

shown = val_examples[1]
print(shown.text_a)
print(shown.label)

# Validation set is iterated in a fixed order (SequentialSampler) with the same
# transform and batch size as the training loader.
validation_data = BDS(val_examples, transform=samples_transformer)
valid_sampler = SequentialSampler(validation_data)
valid_dataloader = DataLoader(
    validation_data,
    sampler=valid_sampler,
    batch_size=batch_size,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per entry of label_list.
num_labels = len(label_list)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

In [None]:
# Hand the model, both dataloaders and the label list to the project's
# NERTrainer.
# NOTE(review): fp16=True presumably enables half-precision training and may
# need a CUDA GPU — confirm against the NERTrainer implementation.
trainer = NERTrainer(model, train_dataloader, valid_dataloader, label_list, fp16=True)

In [None]:
# Baseline run: train for 4 epochs with a learning rate of 2e-5.
trainer.fit(learning_rate=2e-5, num_epochs=4)

In [None]:
# Learning-rate sweep: for each rate, train two separate models for 4 epochs
# and print the trainer's labelDict after each run.
# `model` and `trainer` are rebound on every run, so once this cell finishes
# they refer to the LAST run (highest rate, second iteration); later cells that
# use `model` / `trainer` therefore see that run only.
training_rates = [4e-5, 5e-5]
runs_per_rate = 2
num_labels = len(label_list)

for rate in training_rates:
    print("\n\ntraining rate: " + str(rate))
    for run in range(runs_per_rate):
        print("iter: " + str(run))
        # Reload the pretrained checkpoint for every run.
        model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
        trainer = NERTrainer(model, train_dataloader, valid_dataloader, label_list, fp16=True)
        trainer.fit(learning_rate=rate, num_epochs=4)
        print(trainer.labelDict)
        print("\n\n")
    

In [None]:
!pwd
torch.save(model.state_dict(), '/home/jupyter/suc_ner_multilingual_saved.pkl')

In [None]:
import numpy as np

# Ad-hoc inference: tag one free-text sentence with the current `model` and
# print one line per word piece.

# The sentence is lower-cased explicitly, matching the do_lower_case=True
# setting used for the tokenizer and processor in the earlier cells.
sentence_lower = 'En som arbetar mycket hårt är erik som är politisk aktiv inom anderst i sverige .'
sentence_lower = sentence_lower.lower()
print(sentence_lower)

# Wrap the sentence in an InputExample with a placeholder 'O' label and run it
# through the same kind of transform used for training (longer max length).
example = InputExample("", sentence_lower, label='O')
to_tensors = InputExampleToTensors(tokenizer, max_seq_length=128, label_list=label_list)
input_ids, input_mask, segment_ids, label_id = to_tensors(example)

# Run on the GPU when there is one instead of hard-coding 'cuda', so the cell
# also works on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Add a batch dimension of 1 to each tensor.
# Assumes input_mask is a tensor like input_ids / segment_ids — TODO confirm
# against InputExampleToTensors.
tokens_tensor = input_ids.view(1, -1).to(device)
segments_tensors = segment_ids.view(1, -1).to(device)
mask_tensor = input_mask.view(1, -1).to(device)

# Pass the mask and segment ids BY KEYWORD. In `transformers` the second
# positional parameter of forward() is attention_mask, so the previous
# positional call fed the segment ids in as the attention mask and never used
# input_mask at all. no_grad() avoids building an autograd graph for inference.
with torch.no_grad():
    logits = model(tokens_tensor, attention_mask=mask_tensor, token_type_ids=segments_tensors)

# logits[0] holds the per-position label scores; argmax over the last axis
# gives the predicted label id for each position of the single batch item.
np_logits = logits[0].detach().cpu().numpy()
lst = np.argmax(np_logits, axis=2)[0].tolist()
# Drop the first position so predictions line up with the word pieces below
# (presumably the leading special token added by the transform — confirm).
lst = lst[1:]

splitinput = tokenizer.tokenize(sentence_lower)

# NOTE(review): these label ids are hard-coded; verify that they match the
# positions of the corresponding tags in label_list.
entity_names = {
    4: "PERSON",
    5: "ORG",
    6: "LOCATION",
    7: "WORK",
    8: "PRODUCT",
}

# zip() stops at the shorter sequence, so padding positions are ignored.
for num, word in zip(lst, splitinput):
    if num in entity_names:
        print(entity_names[num] + ": " + word)
    else:
        print(num, word)

In [None]:
# Print, for each entity type, the ratio of the trainer's "<type>_correct"
# counter to its total counter (presumably per-entity accuracy — confirm in
# NERTrainer). A zero total raises ZeroDivisionError, as before.
for tag, kind in (("PER", "per"), ("LOC", "loc"), ("ORG", "org")):
    n_correct = getattr(trainer, "total_" + kind + "_correct")
    n_total = getattr(trainer, "total_" + kind)
    print(tag + ": " + str(n_correct / n_total))


In [None]:
# Display the current trainer's labelDict attribute as the cell output.
trainer.labelDict

In [None]:
# Display the current trainer's val_f1_score_hist attribute (presumably the
# recorded validation F1 scores — confirm in NERTrainer).
trainer.val_f1_score_hist

In [None]:
def cluster(my_list, n):
    """Return the mean of each consecutive chunk of ``n`` items of ``my_list``.

    The list is cut into back-to-back slices of length ``n``; the last slice
    may be shorter and is averaged over the items it actually contains.
    An empty ``my_list`` yields an empty list.
    """
    # Number of chunks: ceil(len(my_list) / n) for positive n.
    n_chunks = (len(my_list) + n - 1) // n
    means = []
    for idx in range(n_chunks):
        chunk = my_list[idx * n:(idx + 1) * n]
        means.append(sum(chunk) / len(chunk))
    return means

In [None]:
import matplotlib.pyplot as plt

# Smooth the trainer's validation-score history by averaging consecutive
# windows, then plot the window means on a labelled figure so it can be read
# on its own.
window = 50  # number of history entries averaged into one plotted point
val = cluster(trainer.val_f1_score_hist, window)

fig, ax = plt.subplots()
ax.plot(val)
ax.set(
    title="Validation F1 (mean per window of " + str(window) + " entries)",
    xlabel="window index",
    ylabel="mean F1",
)
plt.show()