In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm


In [None]:
# Load the tab-separated token table from the working directory.
data = pd.read_csv("./out.csv", sep='\t')
# Forward-fill empty cells from the row above. DataFrame.ffill() is the
# supported spelling; the `method=` argument of fillna() is deprecated in
# recent pandas releases.
data = data.ffill()

In [None]:
# Distinct values of the "Tag" column, in set-iteration order (not sorted).
tags = [*set(data["Tag"].values)]
n_tags = len(tags)
n_tags

In [None]:
# Vocabulary: every distinct "Word" value, followed by the "ENDPAD" entry.
words = [*set(data["Word"].values), "ENDPAD"]
n_words = len(words)
print(n_words)

In [None]:
class SentenceGetter(object):
    """Group a token-level table into sentences of ``(word, pos, tag)`` tuples.

    ``data`` must be a DataFrame with "Sentence", "Word", "POS" and "Tag"
    columns; rows that share a "Sentence" value form one sentence.

    Attributes:
        n_sent: key that the next ``get_next()`` call looks up; starts at 1.
        data: the DataFrame passed to the constructor.
        empty: set to False in the constructor; not read by this class.
        grouped: result of the group-by, one list of tuples per "Sentence" value.
        sentences: every entry of ``grouped`` as a plain list, in the same order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Zip the three columns of one sentence group row by row.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence stored under ``n_sent`` and advance the cursor.

        Returns:
            The list of ``(word, pos, tag)`` tuples, or ``None`` when no
            sentence is stored under the current ``n_sent``.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            # Only a failed lookup means "no more sentences"; any other error
            # (or a KeyboardInterrupt) must propagate instead of being hidden.
            return None
        self.n_sent += 1
        return s

In [None]:
# Group the token table into sentences of (word, pos, tag) tuples.
getter = SentenceGetter(data)


In [None]:
# Fetch one sentence through the getter's cursor and show its tuples;
# get_next() returns None when the lookup fails.
sent = getter.get_next()
print(sent)

In [None]:
# Rebuild every sentence as one space-separated string of its words.
sentences = [
    " ".join(word for word, _pos, _tag in tagged)
    for tagged in getter.sentences
]
sentences[0]

In [None]:
# Per-sentence list of tags (third field of each tuple), one tag per word.
labels = [
    [tag for _word, _pos, tag in tagged]
    for tagged in getter.sentences
]
print(labels[0])

In [None]:
# Sort the distinct tags so the tag <-> index mapping is identical on every
# run. Iterating a bare set of strings may give a different order in each
# interpreter session, which would make saved weights disagree with a mapping
# rebuilt after a kernel restart.
tags_vals = sorted(set(data["Tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [None]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

In [None]:
MAX_LEN = 75  # length every token-id and tag sequence is padded/truncated to
bs = 32  # batch size used by both the training and the validation DataLoader

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices reported by torch.
n_gpu = torch.cuda.device_count()

In [None]:
# torch.cuda.get_device_name(0)  # uncomment to show the name of GPU 0
# Bare expression: it is not the last statement of the cell, so nothing is displayed.
tags_vals
# Local directory that both the tokenizer and the model are loaded from.
model_name = './swe-uncased_L-12_H-768_A-12'

In [None]:
# Load the BERT tokenizer from the local model directory.
# NOTE(review): the directory name says "uncased" while do_lower_case=False
# leaves the input casing untouched — presumably the text is expected to be
# lowercased beforehand (the inference cell lowercases its sentence by hand);
# confirm that this also holds for the training data.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)


In [None]:
# Run the BERT tokenizer over every sentence string.
tokenized_texts = list(map(tokenizer.tokenize, sentences))
print(tokenized_texts[0])

In [None]:
# Map tokens to vocabulary ids, then pad/truncate each sequence at its end to MAX_LEN.
input_ids = pad_sequences(
    list(map(tokenizer.convert_tokens_to_ids, tokenized_texts)),
    maxlen=MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
)

In [None]:
def _aligned_tag_ids(tagged_sentence):
    """Return one tag id per tokenizer piece for a sentence of (word, pos, tag) tuples.

    A word that the tokenizer splits into several pieces contributes its tag
    once per piece, so the result lines up position by position with the
    token ids of the same sentence. Raises KeyError for a tag missing from
    ``tag2idx``.
    """
    tag_ids = []
    for word, _pos, tag in tagged_sentence:
        n_pieces = len(tokenizer.tokenize(word))
        tag_ids.extend([tag2idx[tag]] * n_pieces)
    return tag_ids


# The labels come one per word, but the model inputs are one per tokenizer
# piece. Padding the per-word labels directly shifts every tag after the first
# word that gets split, so each word's tag is expanded to cover all its pieces.
# (Assumes the tokenizer splits on whitespace first, so tokenizing word by
# word yields the same pieces as tokenizing the joined sentence.)
tags = pad_sequences([_aligned_tag_ids(tagged) for tagged in getter.sentences],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

In [None]:
# 1.0 where the token id is positive, 0.0 elsewhere (the zero-valued padding).
attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in input_ids
]


In [None]:
# Split ids, tags and masks in ONE call so the three stay row-aligned by
# construction, instead of relying on two separate calls that happen to share
# a random_state (and splitting input_ids a second time only to discard it).
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                         random_state=2018, test_size=0.1)

In [None]:
# Convert each train/validation pair to torch tensors, rebinding the same names.
tr_inputs, val_inputs = torch.tensor(tr_inputs), torch.tensor(val_inputs)
tr_tags, val_tags = torch.tensor(tr_tags), torch.tensor(val_tags)
tr_masks, val_masks = torch.tensor(tr_masks), torch.tensor(val_masks)

In [None]:
# Each dataset row is (token ids, attention mask, tag ids), in that order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)

# Training batches are drawn in random order, validation batches in dataset order.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)

train_dataloader = DataLoader(dataset=train_data, sampler=train_sampler, batch_size=bs)
valid_dataloader = DataLoader(dataset=valid_data, sampler=valid_sampler, batch_size=bs)

In [None]:
# Alternative checkpoint, kept for reference:
#   model = BertForTokenClassification.from_pretrained("bert-base-multilingual-uncased", num_labels=len(tag2idx))
# Load the token-classification model from model_name with one output label per tag.
model = BertForTokenClassification.from_pretrained(model_name, num_labels=len(tag2idx))

In [None]:
# True: optimise every model parameter. False: optimise only the classifier head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these substrings get no weight decay.
    # 'LayerNorm.weight' is listed next to 'gamma'/'beta' because the layer-norm
    # scale may be exposed under either name depending on the library version
    # (TODO confirm against model.named_parameters()).
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # torch.optim.Adam reads the per-group key 'weight_decay'. An unknown key
    # such as 'weight_decay_rate' is kept in the group but never used, which
    # silently disabled weight decay altogether.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [None]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    """Fraction of positions whose highest-scoring class equals the label.

    ``preds`` holds per-class scores along axis 2; ``labels`` holds the class
    ids. Both are flattened before comparison, so every position counts equally.
    """
    predicted = np.argmax(preds, axis=2).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / expected.size

In [None]:
# Move the model to the device selected earlier (GPU when available, else CPU).
# The training loop sends its batches to `device`, so the model must live on
# the same one; a hard-coded 'cuda' fails on machines without a GPU.
model.to(device)

In [None]:
epochs = 1
max_grad_norm = 1.0
# One progress-bar tick per training batch, across all epochs.
pbar = tqdm(total=epochs*len(train_dataloader))

for _ in range(epochs):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass: with labels supplied, the call's result is used as the loss
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        pbar.update(1)
        pbar.set_description("Loss: " + str(loss.item()))
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters, then clear gradients for the next batch
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Two forward passes: the first (with labels) is used as the loss,
            # the second (without labels) as the per-token class scores.
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    # Flatten predictions and gold labels to tag names. Padded positions are
    # included in both lists, exactly as in the accuracy above.
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    # seqeval's signature is f1_score(y_true, y_pred): gold labels go first.
    print("F1-Score: {}".format(f1_score(valid_tags, pred_tags)))

# Release the progress bar once all epochs are done.
pbar.close()

In [None]:
# IPython shell escape: print the notebook's current working directory.
!pwd

In [None]:
# Persist only the weights (state_dict), not the model object itself.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
torch.save(model.state_dict(), '/home/jupyter/bert-ner-saved.pkl')

In [None]:
import numpy as np
from utils import BertDataset, NERProcessor, InputExampleToTensors, NERTrainer,InputExample,SwedishNERCorpusProcessor

sent_tokenizer = tokenizer
sentence_lower = 'jag vill gratulera jonas sjöstedt och välkomna honom som partiledarkollega'
sentence_lower = sentence_lower.lower()
print(sentence_lower)
example = InputExample("", sentence_lower, label='O')
# NOTE(review): `label_list` is not defined in any visible cell of this
# notebook; it has to exist in the kernel before this cell runs — confirm
# where it comes from and that its order matches the ids printed below.
to_tensors = InputExampleToTensors(tokenizer, max_seq_length=128, label_list=label_list)
# NOTE(review): this rebinds `input_ids`, which earlier cells use for the
# padded training matrix; re-running those cells afterwards will not work.
input_ids, input_mask, segment_ids, label_id = to_tensors(example)

# Run on the device selected earlier rather than a hard-coded 'cuda', so the
# cell also works on a CPU-only machine.
model.to(device)
tokens_tensor = input_ids.view(1,-1).to(device)
segments_tensors = segment_ids.view(1,-1).to(device)
model.eval()
# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(tokens_tensor, segments_tensors)
res = list(logits[0].argmax(-1))
np_logits = logits[0].detach().cpu().numpy()
# NOTE(review): taking argmax over axis 2 of logits[0] requires logits[0] to be
# 3-D, i.e. the model call must return a sequence whose first item has three
# dimensions — confirm against the model class actually loaded here.
lst = np.argmax(np_logits, axis=2)[0].tolist()
# Drop the first position — presumably the [CLS] slot added by
# InputExampleToTensors (TODO confirm) — so ids line up with `splitinput`.
lst = lst[1:]

splitinput = tokenizer.tokenize(sentence_lower)
# Label ids that are reported as entities, with the prefix printed for each.
# The numeric ids are hard-coded and presumably follow `label_list` order.
entity_prefix = {4: "PER: ", 5: "ORG: ", 6: "LOC: ", 7: "MISC: "}
for num, word in zip(lst, splitinput):
    if num in entity_prefix:
        print(entity_prefix[num] + word)
    print(num, word)