In [None]:
#!python main.py -i './docs/' -o './output/' -d 'cpu' -m './result_21_06_2021/multiclass_bert_model/' -s 5 -f 2

In [None]:
!wget https://petroles.puc-rio.ai/downloads/Models/NER/Multiclass/result_21_06_2021.zip

In [None]:
import zipfile, os

def extract_archives(zip_paths, target_dirs):
    """Extract each archive in ``zip_paths`` into the matching entry of ``target_dirs``.

    A target is considered done only when it exists AND is non-empty, so an empty
    directory left behind by an earlier failed run no longer blocks extraction.
    The target directory is created only once the archive is known to exist.

    Args:
        zip_paths: iterable of paths to ``.zip`` files.
        target_dirs: iterable of destination directories, paired by position.

    Returns:
        List of the target directories that were actually extracted by this call.
    """
    extracted = []
    for zip_path, target_dir in zip(zip_paths, target_dirs):
        if os.path.isdir(target_dir) and os.listdir(target_dir):
            # Already populated by a previous run.
            continue
        if not os.path.isfile(zip_path):
            print(f'Archive not found, nothing extracted: {zip_path}')
            continue
        os.makedirs(target_dir, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(target_dir)
        extracted.append(target_dir)
    return extracted


list_zips = ['./result_21_06_2021.zip']
list_folders = ['./result_21_06_2021']

extracted_folders = extract_archives(list_zips, list_folders)

In [None]:
# !pip install --user langdetect stanza transformers

In [1]:
from transformers import (BertConfig, BertForTokenClassification,
                                  BertTokenizer)
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from IPython.core.debugger import set_trace
import torch.nn.functional as F
from langdetect import detect
from datetime import datetime
from glob import glob
from tqdm import tqdm
import pandas as pd
import numpy as np
import stanza
import torch
import nltk
import json
import re
import os

import warnings
warnings.filterwarnings("ignore")


class BertNer(BertForTokenClassification):
    """Token-classification BERT that classifies only the positions flagged in ``valid_ids``.

    The encoder output of every position whose ``valid_ids`` entry is 1 is moved to
    the front of the sequence (in order); the remaining positions are zero-filled.
    The dropout and classifier layers are then applied to that compacted tensor.
    """

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, valid_ids=None):
        """Return per-position logits computed on the compacted encoder output.

        Args:
            input_ids: tensor of token ids, indexed as (batch, position).
            token_type_ids: forwarded to ``self.bert`` as its 2nd positional argument.
            attention_mask: forwarded to ``self.bert`` as its 3rd positional argument.
            valid_ids: tensor indexed as (batch, position); entries equal to 1 mark
                the positions to keep. When ``None`` every position is kept as is.

        Returns:
            The output of ``self.classifier`` over the compacted sequence.
        """
        # NOTE(review): in the `transformers` package BertModel.forward takes
        # (input_ids, attention_mask, token_type_ids, ...) positionally, i.e. the
        # opposite order of this call. Left unchanged on purpose: the checkpoint was
        # presumably trained through this same call - confirm against the training
        # code before swapping the arguments.
        sequence_output = self.bert(input_ids, token_type_ids, attention_mask, head_mask=None)[0]

        if valid_ids is None:
            valid_output = sequence_output
        else:
            # zeros_like keeps the device and dtype of the activations; the previous
            # code picked 'cuda' whenever a GPU existed, even for a model kept on CPU.
            valid_output = torch.zeros_like(sequence_output)
            keep = valid_ids.to(sequence_output.device) == 1
            for i in range(sequence_output.size(0)):
                kept = sequence_output[i][keep[i]]
                valid_output[i, :kept.size(0)] = kept

        sequence_output = self.dropout(valid_output)
        logits = self.classifier(sequence_output)
        return logits

class InputFeatures:
    """Plain record holding the model inputs built for one sentence.

    Stores, unchanged, the five values passed to the constructor: ``input_ids``,
    ``input_mask``, ``segment_ids``, ``valid_positions`` and ``tokens``.
    """

    def __init__(self, input_ids, input_mask, segment_ids, valid_positions, tokens):
        # Tuple assignment binds left to right, so attribute order is preserved.
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        self.valid_positions, self.tokens = valid_positions, tokens

class Ner:
    """Inference wrapper: stanza sentence/word splitting + a ``BertNer`` checkpoint.

    ``predict(text)`` returns a DataFrame with columns TEXT, START, END, LABEL where
    START/END are the ``start_char``/``end_char`` values reported by the stanza tokens.
    """

    def __init__(self,model_dir: str):
        """Load model, tokenizer and config from ``model_dir`` and build the stanza pipeline."""
        self.model , self.tokenizer, self.model_config = self.load_model(model_dir)
        self.max_seq_length = self.model_config["max_seq_length"]
        # Keys come from JSON (strings); predictions are looked up by integer id.
        self.label_map = {int(k):v for k,v in self.model_config["label_map"].items()}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.model.eval()

        try:
            self.nlp = self._build_pipeline()
        except Exception:
            # Building the pipeline failed (presumably the Portuguese resources are
            # not downloaded yet): fetch them and retry once.
            stanza.download('pt') # download Portuguese model
            self.nlp = self._build_pipeline()

    def _build_pipeline(self):
        """Create the Portuguese tokenize-only stanza pipeline for the current device."""
        extra = {'use_gpu': True} if self.device == 'cuda' else {}
        return stanza.Pipeline('pt',processors='tokenize', **extra)

    def load_model(self, model_dir: str, model_config: str = "model_config.json"):
        """Read ``model_config`` from ``model_dir`` and load model and tokenizer.

        Returns:
            Tuple ``(model, tokenizer, config_dict)``.
        """
        config_path = os.path.join(model_dir,model_config)
        # Context manager so the config file handle is closed deterministically.
        with open(config_path) as config_file:
            config = json.load(config_file)
        model = BertNer.from_pretrained(model_dir)
        # NOTE: model_config["do_lower"] is not forwarded to the tokenizer.
        tokenizer = BertTokenizer.from_pretrained(model_dir)
        return model, tokenizer, config

    def tokenize(self, sent):
        """Split every word of ``sent`` into word pieces.

        Args:
            sent: sequence of objects exposing a ``text`` attribute.

        Returns:
            ``(tokens, valid_positions)``: the flat word-piece list and a parallel
            0/1 list that is 1 on the first piece of each word.
        """
        unk_token = getattr(self.tokenizer, "unk_token", None) or "[UNK]"
        tokens = []
        valid_positions = []
        for word in sent:
            pieces = self.tokenizer.tokenize(word.text)
            if not pieces:
                # A word that yields no pieces would get no valid position and shift
                # every later label by one word; keep the slot with the unknown token.
                pieces = [unk_token]
            tokens.extend(pieces)
            valid_positions.extend([1] + [0] * (len(pieces) - 1))
        return tokens, valid_positions

    def get_sents(self, text:str, chunk_margin: int = 150):
        """Run stanza on ``text`` and return one token list per (sub-)sentence.

        Sentences with more than ``max_seq_length`` tokens are cut into consecutive
        chunks of ``max_seq_length - chunk_margin`` tokens, and the final partial
        chunk is kept. The previous condition ``(idx+1) % max_seq_length - 150 == 0``
        fired at most once per ``max_seq_length`` tokens and never flushed the
        remainder, so most tokens of a long sentence were silently dropped.

        Args:
            text: raw document text.
            chunk_margin: head-room (in positions) left for word-piece expansion and
                special tokens - assumed intent of the original constant 150.
        """
        doc = self.nlp(text)
        chunk_len = self.max_seq_length - chunk_margin
        if chunk_len < 1:
            # Margin does not fit in the model length: fall back to half the length.
            chunk_len = max(1, self.max_seq_length // 2)

        new_sents = []
        for sent in doc.sentences:
            sent_tokens = sent.tokens
            if len(sent_tokens) > self.max_seq_length:
                for start in range(0, len(sent_tokens), chunk_len):
                    new_sents.append(list(sent_tokens[start:start + chunk_len]))
            else:
                new_sents.append(sent_tokens)
        return new_sents

    def preprocess(self, text: str):
        """Turn ``text`` into a list of ``InputFeatures``, one per (sub-)sentence.

        Each sentence is wrapped in ``[CLS]`` / ``[SEP]`` (both marked valid),
        zero-padded up to ``max_seq_length`` and cut to that length. The sentence
        list is also stored on ``self.sents``.
        """
        print('Start stanza tokenizer...',end='     ')
        text_tokenized = self.get_sents(text)
        self.sents = text_tokenized
        print('Done!')

        limit = self.max_seq_length
        features = []
        for sent in text_tokenized:
            pieces, valid_positions = self.tokenize(sent)
            pieces = ["[CLS]"] + pieces + ["[SEP]"]
            valid_positions = [1] + valid_positions + [1]

            input_ids = self.tokenizer.convert_tokens_to_ids(pieces)
            padding = [0] * max(0, limit - len(input_ids))
            input_mask = [1] * len(input_ids) + padding
            segment_ids = [0] * len(input_ids) + padding

            features.append(
                InputFeatures(input_ids=(input_ids + padding)[:limit],
                              input_mask=input_mask[:limit],
                              segment_ids=segment_ids[:limit],
                              valid_positions=(valid_positions + padding)[:limit],
                              tokens= sent[:limit]))

        return features

    @staticmethod
    def _as_numpy(values):
        """Return ``values`` as a numpy array (tensors are detached and moved to CPU)."""
        if hasattr(values, "detach"):
            values = values.detach().cpu().numpy()
        return np.asarray(values)

    def get_predict(self, logits, logits_label, valid_ids):
        """Pair every valid position after the first with its label name and score.

        The model moves the valid positions to the front of each row, so the
        entries for a row are positions ``1 .. m`` where ``m`` is the number of
        ones in ``valid_ids[row][1:]`` (position 0 is skipped).

        Args:
            logits: per-label scores indexed as (row, position, label id).
            logits_label: predicted label ids indexed as (row, position).
            valid_ids: 0/1 flags indexed as (row, position).

        Returns:
            One list per row of ``(label_name, score_of_predicted_label)`` tuples.
        """
        scores = self._as_numpy(logits)
        label_ids = self._as_numpy(logits_label)
        valid = self._as_numpy(valid_ids)

        list_labels = []
        for row_scores, row_labels, row_valid in zip(scores, label_ids, valid):
            n_entries = int((row_valid[1:] == 1).sum())
            sentence = []
            for position in range(1, n_entries + 1):
                label_id = int(row_labels[position])
                sentence.append((self.label_map[label_id], float(row_scores[position][label_id])))
            list_labels.append(sentence)
        return list_labels

    def get_output(self, all_tokens, all_labels):
        """Merge consecutive same-type labels into entity spans.

        For every sentence, labels equal to ``'[SEP]'`` are dropped and the rest are
        zipped with the tokens. A run of adjacent non-``'O'`` labels sharing the same
        ``label[2:]`` suffix becomes one row
        ``(joined text, min start_char, max end_char, label[2:])``.

        Fixes over the previous implementation: no read past the end of the label
        list when an entity closes a sentence (this raised ``IndexError`` and made
        ``predict`` fall back to token-level output for the whole document), scan
        state is no longer shared between sentences, an entity that directly
        follows another one is no longer skipped, and the outside tag is compared
        against the letter ``'O'`` rather than the digit ``'0'``.

        NOTE(review): a '[SEP]' label predicted for a word in the middle of a
        sentence is removed too, which shifts the following labels by one word;
        behaviour kept from the original - confirm whether that is intended.
        """
        out = []
        for tokens, labels in zip(all_tokens,all_labels):
            list_label = [label[0] for label in labels if label[0]!='[SEP]']
            n_items = min(len(tokens), len(list_label))

            index = 0
            while index < n_items:
                label = list_label[index]
                if label == 'O':
                    index += 1
                    continue

                entity_type = label[2:]
                end = index + 1
                while (end < n_items
                       and list_label[end] != 'O'
                       and list_label[end][2:] == entity_type):
                    end += 1

                span = tokens[index:end]
                out.append((' '.join(str(word.text) for word in span),
                            min(word.start_char for word in span),
                            max(word.end_char for word in span),
                            entity_type))
                index = end

        return pd.DataFrame(out,columns=['TEXT','START','END','LABEL'])

    def get_output_2(self, all_tokens, all_labels):
        """Token-level fallback: one row per token whose label is not ``'O'``.

        Labels equal to ``'[SEP]'`` are dropped before zipping with the tokens; the
        LABEL column carries the full label string (no prefix stripping).
        """
        out = []
        for tokens, labels in zip(all_tokens,all_labels):
            list_label = [label[0] for label in labels if label[0]!='[SEP]']
            out.extend((word.text,word.start_char,word.end_char,label)
                       for word, label in zip(tokens,list_label)
                       if label != 'O')

        return pd.DataFrame(out,columns=['TEXT','START','END','LABEL'])

    def predict(self, text: str, batch_size=2):
        """Run NER over ``text`` and return the entity spans as a DataFrame.

        Args:
            text: raw document text.
            batch_size: number of (sub-)sentences per forward pass.

        Returns:
            DataFrame with columns TEXT, START, END, LABEL. If span merging fails
            the error is reported and the token-level ``get_output_2`` is used.
        """
        test_features = self.preprocess(text)

        all_input_ids = torch.tensor([f.input_ids for f in test_features],dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],dtype=torch.long)
        all_valid_positions = torch.tensor([f.valid_positions for f in test_features],dtype=torch.long)
        all_tokens = [f.tokens for f in test_features]

        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_valid_positions)
        test_dataloader = DataLoader(test_data, batch_size=batch_size)

        all_labels = []
        for batch in tqdm(test_dataloader, desc=f'Inference batch_size={batch_size}'):
            # Only the current batch is moved to the inference device.
            input_ids, input_mask, segment_ids, valid_ids = (t.to(self.device) for t in batch)
            with torch.no_grad():
                logits = self.model(input_ids, segment_ids, input_mask, valid_ids)
                probabilities = F.softmax(logits,dim=2)
            label_ids = torch.argmax(probabilities,dim=2)

            all_labels += self.get_predict(probabilities, label_ids, valid_ids)

        try:
            out = self.get_output(all_tokens, all_labels)
        except Exception as error:
            # Keep the best-effort fallback but make the failure visible.
            print(f'Span merging failed ({error!r}); falling back to token-level output.')
            out = self.get_output_2(all_tokens, all_labels)
        return out

In [2]:
# Read the whole document to annotate; the file is decoded as UTF-16.
with open('./teste.txt', mode='r', encoding='utf-16') as f:
    text = f.read()

In [3]:
# Load the fine-tuned checkpoint (model, tokenizer, config) and build the stanza pipeline.
# NOTE(review): this directory differs from './result_21_06_2021/', where the archive
# downloaded at the top of the notebook is extracted - confirm which path is intended.
model = Ner('../datasets/result_pos_tag_portuguese/dataset_docs/multiclass_bert_model/')

2022-02-17 10:55:52 INFO: Loading these models for language: pt (Portuguese):
| Processor | Package |
-----------------------
| tokenize  | bosque  |
| mwt       | bosque  |

2022-02-17 10:55:52 INFO: Use device: cpu
2022-02-17 10:55:52 INFO: Loading: tokenize
2022-02-17 10:55:52 INFO: Loading: mwt
2022-02-17 10:55:52 INFO: Done loading processors!


In [4]:
# Run NER over the whole document (default batch_size=2); the result is a DataFrame
# with the columns TEXT, START, END and LABEL.
output = model.predict(text)

Start stanza tokenizer...     Done!


Inference batch_size=2: 100%|██████████| 257/257 [16:51<00:00,  3.94s/it]


In [5]:
# Display the extracted entity spans.
output

Unnamed: 0,TEXT,START,END,LABEL
0,SIDIPLA,6351,6358,uniCRONO
1,SIDIPLA,8241,8248,LIT
2,estuarino,8971,8980,uniCRONO
3,salmão,18577,18583,LIT
4,arenque,19549,19556,LIT
5,ictioplâncton,28337,28350,LIT
6,Grande,31511,31517,CAMP
7,estuarino,31903,31912,BAC
8,miragaia,33074,33082,LIT
9,camarão,33279,33286,CAMP


In [6]:
# Persist the spans; with the default to_csv settings the DataFrame index is written
# as the first (unnamed) column of the file.
output.to_csv('teste.csv')

In [None]:
def spans_to_token_labels(tokens, spans):
    """Project predicted character spans back onto whitespace-joined tokens.

    ``tokens`` are assumed to have been joined with single spaces to build the text
    passed to ``model.predict``; ``spans`` is the DataFrame it returned (columns
    START, END, LABEL). A token receives the LABEL of the first span overlapping
    its character range, or ``'O'`` when no span overlaps it.
    """
    labels = []
    offset = 0
    for token in tokens:
        start, end = offset, offset + len(token)
        overlapping = spans[(spans['START'] < end) & (spans['END'] > start)]
        labels.append(overlapping['LABEL'].iloc[0] if len(overlapping) else 'O')
        offset = end + 1  # skip the single space used as separator
    return labels


data = pd.read_csv('../result_pos_tag_portuguese/dataset_sent/test.csv',sep='\t',index_col=0)
grupos = data.groupby('sentence')
grps = []
for key, group in grupos:
    try:
        # Work on a copy so adding a column does not write into a groupby slice.
        grp = group.copy()
        sentence_tokens = grp['0'].to_list()
        # Separate name: the previous code overwrote the notebook-level `text`.
        sentence_text = ' '.join(sentence_tokens)
        spans = model.predict(sentence_text,batch_size=1)
        # `predict` returns a DataFrame of spans, not per-token labels; the previous
        # `out[1][0][:-1]` indexing raised for every sentence and the bare `except`
        # hid it, leaving `grps` empty.
        grp['preds'] = spans_to_token_labels(sentence_tokens, spans)
        grps.append(grp)
        print(key)
    except Exception as error:
        # Still best-effort per sentence, but report what was skipped and why.
        print(f'Skipping sentence {key!r}: {error!r}')
        continue

In [None]:
# Stack the per-sentence frames collected in `grps` into a single evaluation frame.
# pd.concat raises ValueError when `grps` is empty (i.e. every sentence was skipped).
df1 = pd.concat(grps) 

In [None]:
# One 0/1 indicator column per entity type, set to 1 when the type name occurs in the tag.
# Column '1' is presumably the reference tag; 'preds' holds the model output.
for entity in ('BAC', 'LIT', 'CAMP', 'uniCRONO'):
    df1[entity] = df1['1'].apply(lambda tag: int(entity in tag))

for entity in ('BAC', 'LIT', 'CAMP', 'uniCRONO'):
    df1[entity + '_Pred'] = df1['preds'].apply(lambda tag: int(entity in tag))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score


entity_names = ['BAC', 'LIT', 'CAMP', 'uniCRONO']


def _macro_scores(metric):
    """Apply ``metric`` with average='macro' to each entity's indicator/prediction columns."""
    return [metric(df1[name], df1[name + '_Pred'], average='macro') for name in entity_names]


# Same order as `entity_names` in all three lists.
f1 = _macro_scores(f1_score)
precision = _macro_scores(precision_score)
recall = _macro_scores(recall_score)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
 
# Grouped bar chart: one group per entity type, one bar per metric.
entity_labels = ['BAC', 'LIT', 'CAMP', 'uniCRONO']
barWidth = 0.25

# plt.subplots returns (Figure, Axes); unpack it and draw on the Axes explicitly
# instead of binding the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(12, 8))

# x positions of the three bars inside each group
br1 = np.arange(len(f1))
br2 = br1 + barWidth
br3 = br2 + barWidth

for positions, scores, colour, metric_name in (
        (br1, f1, 'r', 'f1'),
        (br2, precision, 'g', 'precision'),
        (br3, recall, 'b', 'recall')):
    ax.bar(positions, scores, color=colour, width=barWidth,
           edgecolor='grey', label=metric_name)

ax.set_xlabel('Entidade', fontweight='bold', fontsize=15)
ax.set_ylabel('Score', fontweight='bold', fontsize=15)
# Ticks sit under the middle (precision) bar of every group.
ax.set_xticks(br2)
ax.set_xticklabels(entity_labels)

ax.legend()
ax.set_ylim([0, 1.2])
ax.grid(True, alpha=0.7, linestyle='--')
ax.set_yticks(np.arange(13) * 0.1)
plt.show()