In [1]:
from transformers import pipeline
from transformers import BertTokenizer, BertModel
from transformers import AlbertTokenizer, AlbertModel

from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel 

import torch

BERT(S) for Relation Extraction
GITHUB: https://github.com/plkmo/BERT-Relation-Extraction
POST: https://towardsdatascience.com/bert-s-for-relation-extraction-in-nlp-2c7c3ab487c4

### BERT

In [21]:
unmasker = pipeline('fill-mask', model='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
unmasker("Hello I'm a [MASK] model.")

[{'sequence': "hello i'm a fashion model.",
  'score': 0.10731082409620285,
  'token': 4827,
  'token_str': 'fashion'},
 {'sequence': "hello i'm a role model.",
  'score': 0.08774485439062119,
  'token': 2535,
  'token_str': 'role'},
 {'sequence': "hello i'm a new model.",
  'score': 0.05338380113244057,
  'token': 2047,
  'token_str': 'new'},
 {'sequence': "hello i'm a super model.",
  'score': 0.04667210206389427,
  'token': 3565,
  'token_str': 'super'},
 {'sequence': "hello i'm a fine model.",
  'score': 0.027095874771475792,
  'token': 2986,
  'token_str': 'fine'}]

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1386,  0.1583, -0.2967,  ..., -0.2709, -0.2844,  0.4581],
         [ 0.5364, -0.2327,  0.1754,  ...,  0.5540,  0.4981, -0.0024],
         [ 0.3002, -0.3475,  0.1208,  ..., -0.4562,  0.3288,  0.8773],
         ...,
         [ 0.3799,  0.1203,  0.8283,  ..., -0.8624, -0.5957,  0.0471],
         [-0.0252, -0.7177, -0.6950,  ...,  0.0757, -0.6668, -0.3401],
         [ 0.7535,  0.2391,  0.0717,  ...,  0.2467, -0.6458, -0.3213]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.9377, -0.5043, -0.9799,  0.9030,  0.9329, -0.2438,  0.8926,  0.2288,
         -0.9531, -1.0000, -0.8862,  0.9906,  0.9855,  0.7155,  0.9455, -0.8645,
         -0.6035, -0.6666,  0.3020, -0.1587,  0.7455,  1.0000, -0.4022,  0.4261,
          0.6151,  0.9996, -0.8773,  0.9594,  0.9585,  0.6950, -0.6718,  0.3325,
         -0.9954, -0.2268, -0.9658, -0.9951,  0.6127, -0.7670,  0.0873,  0.0824,
         -0.9518,  0.4713,  1.000

### ALBERT

In [25]:
unmasker = pipeline('fill-mask', model='albert-base-v2')
unmasker("Hello I'm a [MASK] model.")

[{'sequence': "hello i'm a modeling model.",
  'score': 0.05816109851002693,
  'token': 12807,
  'token_str': 'modeling'},
 {'sequence': "hello i'm a modelling model.",
  'score': 0.037488292902708054,
  'token': 23089,
  'token_str': 'modelling'},
 {'sequence': "hello i'm a model model.",
  'score': 0.03372497111558914,
  'token': 1061,
  'token_str': 'model'},
 {'sequence': "hello i'm a runway model.",
  'score': 0.01731342077255249,
  'token': 8014,
  'token_str': 'runway'},
 {'sequence': "hello i'm a lingerie model.",
  'score': 0.014405577443540096,
  'token': 29104,
  'token_str': 'lingerie'}]

In [26]:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained("albert-base-v2")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
output

BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 1.0633,  0.6634,  1.2338,  ..., -1.5131, -0.4445,  1.2011],
         [-0.2914, -0.5385, -1.6138,  ...,  0.2044,  2.1072, -0.3526],
         [ 0.3940,  0.8559, -0.5069,  ...,  0.8633,  0.4893,  0.2798],
         ...,
         [ 0.4754, -1.4797, -0.7564,  ...,  1.2648,  1.6309,  0.4099],
         [ 0.0298,  0.1406,  0.2338,  ..., -0.2372,  0.6055, -0.0437],
         [ 0.0726,  0.1270, -0.0512,  ..., -0.0985,  0.1229,  0.2115]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-1.3040e-01,  1.0424e-01,  3.9711e-01, -4.7384e-01, -1.3758e-02,
         -9.8533e-01, -5.0128e-02,  1.2259e-02, -5.6493e-02, -9.9441e-01,
          9.4768e-01, -1.4119e-01, -5.6679e-01, -8.5202e-01, -8.8070e-01,
          2.3836e-01, -1.9595e-01, -1.6008e-01,  9.9556e-01,  6.8591e-02,
         -6.8246e-01, -9.9791e-01,  9.9637e-01,  9.3427e-01,  9.4481e-01,
          1.9624e-01, -1.2115e-01, -9.9659e-01, -9.5589e-01,  1.7546e-01,
         -9.

### BERTimbau

In [28]:
model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

Some weights of BertForPreTraining were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
pipe = pipeline('fill-mask', model=model, tokenizer=tokenizer)

pipe('Tinha uma [MASK] no meio do caminho.')

The model 'BertForPreTraining' is not supported for fill-mask. Supported models are ['RemBertForMaskedLM', 'RoFormerForMaskedLM', 'BigBirdForMaskedLM', 'Wav2Vec2ForMaskedLM', 'ConvBertForMaskedLM', 'LayoutLMForMaskedLM', 'DistilBertForMaskedLM', 'AlbertForMaskedLM', 'BartForConditionalGeneration', 'MBartForConditionalGeneration', 'CamembertForMaskedLM', 'XLMRobertaForMaskedLM', 'LongformerForMaskedLM', 'RobertaForMaskedLM', 'SqueezeBertForMaskedLM', 'BertForMaskedLM', 'MegatronBertForMaskedLM', 'MobileBertForMaskedLM', 'FlaubertWithLMHeadModel', 'XLMWithLMHeadModel', 'ElectraForMaskedLM', 'ReformerForMaskedLM', 'FunnelForMaskedLM', 'MPNetForMaskedLM', 'TapasForMaskedLM', 'DebertaForMaskedLM', 'DebertaV2ForMaskedLM', 'IBertForMaskedLM'].


[{'sequence': 'Tinha uma pedra no meio do caminho.',
  'score': 0.14287783205509186,
  'token': 5028,
  'token_str': 'pedra'},
 {'sequence': 'Tinha uma árvore no meio do caminho.',
  'score': 0.06213398277759552,
  'token': 7411,
  'token_str': 'árvore'},
 {'sequence': 'Tinha uma estrada no meio do caminho.',
  'score': 0.055150069296360016,
  'token': 5675,
  'token_str': 'estrada'},
 {'sequence': 'Tinha uma casa no meio do caminho.',
  'score': 0.029918987303972244,
  'token': 1105,
  'token_str': 'casa'},
 {'sequence': 'Tinha uma cruz no meio do caminho.',
  'score': 0.025660423561930656,
  'token': 3466,
  'token_str': 'cruz'}]

In [30]:
model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
input_ids = tokenizer.encode('Tinha uma pedra no meio do caminho.', return_tensors='pt')

with torch.no_grad():
    outs = model(input_ids)
    encoded = outs[0][0, 1:-1]  # Ignore [CLS] and [SEP] special tokens

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
outs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2824, -0.1887,  0.5604,  ..., -0.6297,  0.0302, -0.5707],
         [-0.0398, -0.3057,  0.2431,  ..., -0.5420,  0.1857, -0.5775],
         [-0.2926, -0.1957,  0.7020,  ..., -0.2843,  0.0530, -0.4304],
         ...,
         [ 0.0662,  0.2845,  0.1871,  ..., -0.2542, -0.2933, -0.0661],
         [ 0.2761, -0.1657,  0.3288,  ..., -0.2102,  0.0029, -0.2009],
         [ 0.2825, -0.1884,  0.5614,  ..., -0.6288,  0.0294, -0.5712]]]), pooler_output=tensor([[-1.0484e-01, -2.8821e-02, -5.0372e-02,  1.9559e-01,  1.3728e-01,
         -8.5767e-02,  9.9934e-01, -3.7218e-02,  2.1358e-01, -2.0643e-01,
         -9.6026e-01,  9.3278e-03,  1.9185e-01,  2.1787e-01, -7.2317e-02,
          1.3728e-01, -1.2064e-01, -6.4236e-02,  9.8421e-01,  8.1832e-01,
         -1.9355e-02, -5.8726e-02,  2.2956e-01, -1.1758e-01,  5.8525e-02,
         -1.2762e-01,  9.6016e-02,  9.0655e-02, -1.5974e-01, -4.4946e-02,
         -1.3218e-03, -9.7839e-01, -

In [32]:
encoded

tensor([[-0.0398, -0.3057,  0.2431,  ..., -0.5420,  0.1857, -0.5775],
        [-0.2926, -0.1957,  0.7020,  ..., -0.2843,  0.0530, -0.4304],
        [ 0.2463, -0.1467,  0.5496,  ...,  0.3781, -0.2325, -0.5469],
        ...,
        [ 0.0662,  0.7817,  0.3486,  ..., -0.4131, -0.2852, -0.2819],
        [ 0.0662,  0.2845,  0.1871,  ..., -0.2542, -0.2933, -0.0661],
        [ 0.2761, -0.1657,  0.3288,  ..., -0.2102,  0.0029, -0.2009]])

### Código de preprocessamento do BERT Relation Extraction

https://github.com/plkmo/BERT-Relation-Extraction/blob/master/src/preprocessing_funcs.py
BERT-Relation-Extraction/src/preprocessing_funcs.py

In [2]:
import os
import re
import spacy
import math
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from misc import save_as_pickle, load_pickle, get_subject_objects
from tqdm import tqdm
import logging

In [3]:
tqdm.pandas(desc="prog_bar")
logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
                    datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
logger = logging.getLogger('__file__')

In [4]:
# preprocess sentences
def process_sent(sent):
    if sent not in [" ", "\n", ""]:
        sent = sent.strip("\n")            
        sent = re.sub('<[A-Z]+/*>', '', sent) # remove special tokens eg. <FIL/>, <S>
        sent = re.sub(r"[\*\"\n\\…\+\-\/\=\(\)‘•€\[\]\|♫:;—”“~`#]", " ", sent)
        sent = re.sub(' {2,}', ' ', sent) # remove extra spaces > 1
        sent = re.sub("^ +", "", sent) # remove space in front
        sent = re.sub(r"([\.\?,!]){2,}", r"\1", sent) # remove multiple puncs
        sent = re.sub(r" +([\.\?,!])", r"\1", sent) # remove extra spaces in front of punc
        #sent = re.sub(r"([A-Z]{2,})", lambda x: x.group(1).capitalize(), sent) # Replace all CAPS with capitalize
        return sent
    return

In [5]:

def process_textlines(text):
    text = [process_sent(sent) for sent in text]
    text = " ".join([t for t in text if t is not None])
    text = re.sub(' {2,}', ' ', text) # remove extra spaces > 1
    return text 

In [6]:
def create_pretraining_corpus(raw_text, nlp, window_size=40):
    '''
    Input: Chunk of raw text
    Output: modified corpus of triplets (relation statement, entity1, entity2)
    '''
    logger.info("Processing sentences...")
    sents_doc = nlp(raw_text)
    ents = sents_doc.ents # get entities
    
    logger.info("Processing relation statements by entities...")
    entities_of_interest = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", \
                            "WORK_OF_ART", "LAW", "LANGUAGE"]
    length_doc = len(sents_doc)
    D = []; ents_list = []
    for i in tqdm(range(len(ents))):
        e1 = ents[i]
        e1start = e1.start; e1end = e1.end
        if e1.label_ not in entities_of_interest:
            continue
        if re.search("[\d+]", e1.text): # entities should not contain numbers
            continue
        
        for j in range(1, len(ents) - i):
            e2 = ents[i + j]
            e2start = e2.start; e2end = e2.end
            if e2.label_ not in entities_of_interest:
                continue
            if re.search("[\d+]", e2.text): # entities should not contain numbers
                continue
            if e1.text.lower() == e2.text.lower(): # make sure e1 != e2
                continue
            
            if (1 <= (e2start - e1end) <= window_size): # check if next nearest entity within window_size
                # Find start of sentence
                punc_token = False
                start = e1start - 1
                if start > 0:
                    while not punc_token:
                        punc_token = sents_doc[start].is_punct
                        start -= 1
                        if start < 0:
                            break
                    left_r = start + 2 if start > 0 else 0
                else:
                    left_r = 0
                
                # Find end of sentence
                punc_token = False
                start = e2end
                if start < length_doc:
                    while not punc_token:
                        punc_token = sents_doc[start].is_punct
                        start += 1
                        if start == length_doc:
                            break
                    right_r = start if start < length_doc else length_doc
                else:
                    right_r = length_doc
                
                if (right_r - left_r) > window_size: # sentence should not be longer than window_size
                    continue
                
                x = [token.text for token in sents_doc[left_r:right_r]]
                
                ### empty strings check ###
                for token in x:
                    assert len(token) > 0
                assert len(e1.text) > 0
                assert len(e2.text) > 0
                assert e1start != e1end
                assert e2start != e2end
                assert (e2start - e1end) > 0
                
                r = (x, (e1start - left_r, e1end - left_r), (e2start - left_r, e2end - left_r))
                D.append((r, e1.text, e2.text))
                ents_list.append((e1.text, e2.text))
                #print(e1.text,",", e2.text)
    print("Processed dataset samples from named entity extraction:")
    samples_D_idx = np.random.choice([idx for idx in range(len(D))],\
                                      size=min(3, len(D)),\
                                      replace=False)
    for idx in samples_D_idx:
        print(D[idx], '\n')
    ref_D = len(D)
    
    logger.info("Processing relation statements by dependency tree parsing...")
    doc_sents = [s for s in sents_doc.sents]
    for sent_ in tqdm(doc_sents, total=len(doc_sents)):
        if len(sent_) > (window_size + 1):
            continue
        
        left_r = sent_[0].i
        pairs = get_subject_objects(sent_)
        
        if len(pairs) > 0:
            for pair in pairs:
                e1, e2 = pair[0], pair[1]
                
                if (len(e1) > 3) or (len(e2) > 3): # don't want entities that are too long
                    continue
                
                e1text, e2text = " ".join(w.text for w in e1) if isinstance(e1, list) else e1.text,\
                                    " ".join(w.text for w in e2) if isinstance(e2, list) else e2.text
                e1start, e1end = e1[0].i if isinstance(e1, list) else e1.i, e1[-1].i + 1 if isinstance(e1, list) else e1.i + 1
                e2start, e2end = e2[0].i if isinstance(e2, list) else e2.i, e2[-1].i + 1 if isinstance(e2, list) else e2.i + 1
                if (e1end < e2start) and ((e1text, e2text) not in ents_list):
                    assert e1start != e1end
                    assert e2start != e2end
                    assert (e2start - e1end) > 0
                    r = ([w.text for w in sent_], (e1start - left_r, e1end - left_r), (e2start - left_r, e2end - left_r))
                    D.append((r, e1text, e2text))
                    ents_list.append((e1text, e2text))
    
    print("Processed dataset samples from dependency tree parsing:")
    if (len(D) - ref_D) > 0:
        samples_D_idx = np.random.choice([idx for idx in range(ref_D, len(D))],\
                                          size=min(3,(len(D) - ref_D)),\
                                          replace=False)
        for idx in samples_D_idx:
            print(D[idx], '\n')
    return D

In [7]:
# Extract raw text in English
with open('cnn.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
    
nlp = spacy.load("en_core_web_lg")

In [8]:
create_pretraining_corpus(raw_text[:10000], nlp, window_size=40)

09/10/2021 04:48:35 PM [INFO]: Processing sentences...
09/10/2021 04:48:36 PM [INFO]: Processing relation statements by entities...
100%|███████████████████████████████████████████████████████████████████████████████| 218/218 [00:00<00:00, 645.99it/s]
09/10/2021 04:48:36 PM [INFO]: Processing relation statements by dependency tree parsing...


Processed dataset samples from named entity extraction:
((['House', 'Speaker', 'John', 'Boehner', 'Majority', 'Leader', 'Eric', 'Cantor', 'Majority', 'Whip', 'Kevin', 'McCarthy', 'and', 'Conference', 'Chair', 'Cathy', 'McMorris', 'Rodgers', 'issued', 'a', 'statement', 'Saturday', 'praising', 'the', 'president', '.'], (10, 12), (13, 14)), 'Kevin McCarthy', 'Conference') 

((['What', 'will', 'happen', 'if', 'they', 'vote', 'no?It', "'s", 'unclear', '.', 'A', 'senior', 'administration', 'official', 'told', 'CNN', 'that', 'Obama', 'has', 'the', 'authority', 'to', 'act', 'without', 'Congress', '--'], (6, 7), (15, 16)), 'no?It', 'CNN') 

((['CNN', ')', '--', 'Usain', 'Bolt', 'rounded', 'off', 'the', 'world', 'championships', 'Sunday', 'by', 'claiming', 'his', 'third', 'gold', 'in', 'Moscow', 'as', 'he', 'anchored', 'Jamaica', 'to', 'victory', 'in', 'the', 'men', "'s", '4x100', 'm', 'relay', '.'], (0, 1), (3, 5)), 'CNN', 'Usain Bolt') 



100%|████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 9751.93it/s]


Processed dataset samples from dependency tree parsing:
((['Some', 'U.S.', 'lawmakers', 'have', 'called', 'for', 'immediate', 'action', 'while', 'others', 'warn', 'of', 'stepping', 'into', 'what', 'could', 'become', 'a', 'quagmire', '.'], (0, 3), (5, 8)), 'Some U.S. lawmakers', 'for immediate action') 

((['The', 'inspectors', 'will', 'share', 'their', 'findings', 'with', 'U.N.', 'Secretary', '-', 'General', 'Ban', 'Ki', '-', 'moon', 'Ban', 'who', 'has', 'said', 'he', 'wants', 'to', 'wait', 'until', 'the', 'U.N.', 'team', "'s", 'final', 'report', 'is', 'completed', 'before', 'presenting', 'it', 'to', 'the', 'U.N.', 'Security', 'Council', '.'], (0, 2), (4, 6)), 'The inspectors', 'their findings') 

((['There', 'are', 'key', 'questions', 'looming', 'over', 'the', 'debate', ':', 'What', 'did', 'U.N.', 'weapons', 'inspectors', 'find', 'in', 'Syria', '?'], (11, 14), (15, 17)), 'U.N. weapons inspectors', 'in Syria') 



[((['U.S.',
    'President',
    'Barack',
    'Obama',
    'wants',
    'lawmakers',
    'to',
    'weigh',
    'in',
    'on',
    'whether',
    'to',
    'use',
    'military',
    'force',
    'in',
    'Syria',
    '.'],
   (0, 1),
   (2, 4)),
  'U.S.',
  'Barack Obama'),
 ((['U.S.',
    'President',
    'Barack',
    'Obama',
    'wants',
    'lawmakers',
    'to',
    'weigh',
    'in',
    'on',
    'whether',
    'to',
    'use',
    'military',
    'force',
    'in',
    'Syria',
    '.'],
   (0, 1),
   (16, 17)),
  'U.S.',
  'Syria'),
 ((['U.S.',
    'President',
    'Barack',
    'Obama',
    'wants',
    'lawmakers',
    'to',
    'weigh',
    'in',
    'on',
    'whether',
    'to',
    'use',
    'military',
    'force',
    'in',
    'Syria',
    '.'],
   (2, 4),
   (16, 17)),
  'Barack Obama',
  'Syria'),
 ((['Obama',
    'sent',
    'a',
    'letter',
    'to',
    'the',
    'heads',
    'of',
    'the',
    'House',
    'and',
    'Senate',
    'on',
    'Saturday'

In [9]:
# Extract raw text in Portuguese
with open('corpusPublico(sem IBICT)-SemProcessamento.txt', 'r', encoding='utf-8') as f:
    raw_text_pt = f.read()
    
nlp_pt = spacy.load("pt_core_news_lg")

In [23]:
len(raw_text_pt)

526158850

In [21]:
corpus_pt = create_pretraining_corpus(raw_text_pt[:1000000], nlp_pt, window_size=40)

09/10/2021 04:54:01 PM [INFO]: Processing sentences...
09/10/2021 04:55:01 PM [INFO]: Processing relation statements by entities...
100%|██████████████████████████████████████████████████████████████████████████████| 6319/6319 [03:18<00:00, 31.91it/s]
09/10/2021 04:58:19 PM [INFO]: Processing relation statements by dependency tree parsing...


Processed dataset samples from named entity extraction:
((['SE', 'E', ' \n', 'Santo', '-', 'ES', 'GasBrasiliano', '-', 'SP', 'Gasmig', '-', 'MG', 'Gas', 'Natural', 'Sul', '-', 'SP', 'MSGas', '-', 'MS', 'PBGás', '-', 'PB', 'Potigás', '-', 'RN', 'Rongás', '-'], (0, 1), (25, 26)), 'SE', 'RN') 

((['Fundamentos', 'Econômicos', 'Evolução', 'Histórica', 'e', 'Organização', 'Industrial', ' \n', 'Rio', 'de', 'Janeiro', ':'], (0, 4), (8, 11)), 'Fundamentos Econômicos Evolução Histórica', 'Rio de Janeiro') 

((['RJ', 'Cegás', '-', 'CE', 'Cigás', '-', 'AM', 'Comgás', '-', 'SP', 'Compagás', '-', 'PR', 'Copergás', '-', 'PE', 'Emsergás', '-'], (1, 2), (16, 17)), 'Cegás', 'Emsergás') 



100%|███████████████████████████████████████████████████████████████████████████| 9664/9664 [00:00<00:00, 67928.85it/s]

Processed dataset samples from dependency tree parsing:
((['NUMBER', '>', 'Note-se', 'que', 'p2', '<'], (0, 2), (4, 6)), 'NUMBER >', 'p2 <') 






In [22]:
len(corpus_pt)

426

### Código original  
https://github.com/plkmo/BERT-Relation-Extraction/blob/master/src/preprocessing_funcs.py  
BERT-Relation-Extraction/src/preprocessing_funcs.py  

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 18:12:22 2019
@author: weetee
"""
import os
import re
import spacy
import math
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from .misc import save_as_pickle, load_pickle, get_subject_objects
from tqdm import tqdm
import logging

tqdm.pandas(desc="prog_bar")
logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
                    datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
logger = logging.getLogger('__file__')


def process_sent(sent):
    if sent not in [" ", "\n", ""]:
        sent = sent.strip("\n")            
        sent = re.sub('<[A-Z]+/*>', '', sent) # remove special tokens eg. <FIL/>, <S>
        sent = re.sub(r"[\*\"\n\\…\+\-\/\=\(\)‘•€\[\]\|♫:;—”“~`#]", " ", sent)
        sent = re.sub(' {2,}', ' ', sent) # remove extra spaces > 1
        sent = re.sub("^ +", "", sent) # remove space in front
        sent = re.sub(r"([\.\?,!]){2,}", r"\1", sent) # remove multiple puncs
        sent = re.sub(r" +([\.\?,!])", r"\1", sent) # remove extra spaces in front of punc
        #sent = re.sub(r"([A-Z]{2,})", lambda x: x.group(1).capitalize(), sent) # Replace all CAPS with capitalize
        return sent
    return

def process_textlines(text):
    text = [process_sent(sent) for sent in text]
    text = " ".join([t for t in text if t is not None])
    text = re.sub(' {2,}', ' ', text) # remove extra spaces > 1
    return text    

def create_pretraining_corpus(raw_text, nlp, window_size=40):
    '''
    Input: Chunk of raw text
    Output: modified corpus of triplets (relation statement, entity1, entity2)
    '''
    logger.info("Processing sentences...")
    sents_doc = nlp(raw_text)
    ents = sents_doc.ents # get entities
    
    logger.info("Processing relation statements by entities...")
    entities_of_interest = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", \
                            "WORK_OF_ART", "LAW", "LANGUAGE"]
    length_doc = len(sents_doc)
    D = []; ents_list = []
    for i in tqdm(range(len(ents))):
        e1 = ents[i]
        e1start = e1.start; e1end = e1.end
        if e1.label_ not in entities_of_interest:
            continue
        if re.search("[\d+]", e1.text): # entities should not contain numbers
            continue
        
        for j in range(1, len(ents) - i):
            e2 = ents[i + j]
            e2start = e2.start; e2end = e2.end
            if e2.label_ not in entities_of_interest:
                continue
            if re.search("[\d+]", e2.text): # entities should not contain numbers
                continue
            if e1.text.lower() == e2.text.lower(): # make sure e1 != e2
                continue
            
            if (1 <= (e2start - e1end) <= window_size): # check if next nearest entity within window_size
                # Find start of sentence
                punc_token = False
                start = e1start - 1
                if start > 0:
                    while not punc_token:
                        punc_token = sents_doc[start].is_punct
                        start -= 1
                        if start < 0:
                            break
                    left_r = start + 2 if start > 0 else 0
                else:
                    left_r = 0
                
                # Find end of sentence
                punc_token = False
                start = e2end
                if start < length_doc:
                    while not punc_token:
                        punc_token = sents_doc[start].is_punct
                        start += 1
                        if start == length_doc:
                            break
                    right_r = start if start < length_doc else length_doc
                else:
                    right_r = length_doc
                
                if (right_r - left_r) > window_size: # sentence should not be longer than window_size
                    continue
                
                x = [token.text for token in sents_doc[left_r:right_r]]
                
                ### empty strings check ###
                for token in x:
                    assert len(token) > 0
                assert len(e1.text) > 0
                assert len(e2.text) > 0
                assert e1start != e1end
                assert e2start != e2end
                assert (e2start - e1end) > 0
                
                r = (x, (e1start - left_r, e1end - left_r), (e2start - left_r, e2end - left_r))
                D.append((r, e1.text, e2.text))
                ents_list.append((e1.text, e2.text))
                #print(e1.text,",", e2.text)
    print("Processed dataset samples from named entity extraction:")
    samples_D_idx = np.random.choice([idx for idx in range(len(D))],\
                                      size=min(3, len(D)),\
                                      replace=False)
    for idx in samples_D_idx:
        print(D[idx], '\n')
    ref_D = len(D)
    
    logger.info("Processing relation statements by dependency tree parsing...")
    doc_sents = [s for s in sents_doc.sents]
    for sent_ in tqdm(doc_sents, total=len(doc_sents)):
        if len(sent_) > (window_size + 1):
            continue
        
        left_r = sent_[0].i
        pairs = get_subject_objects(sent_)
        
        if len(pairs) > 0:
            for pair in pairs:
                e1, e2 = pair[0], pair[1]
                
                if (len(e1) > 3) or (len(e2) > 3): # don't want entities that are too long
                    continue
                
                e1text, e2text = " ".join(w.text for w in e1) if isinstance(e1, list) else e1.text,\
                                    " ".join(w.text for w in e2) if isinstance(e2, list) else e2.text
                e1start, e1end = e1[0].i if isinstance(e1, list) else e1.i, e1[-1].i + 1 if isinstance(e1, list) else e1.i + 1
                e2start, e2end = e2[0].i if isinstance(e2, list) else e2.i, e2[-1].i + 1 if isinstance(e2, list) else e2.i + 1
                if (e1end < e2start) and ((e1text, e2text) not in ents_list):
                    assert e1start != e1end
                    assert e2start != e2end
                    assert (e2start - e1end) > 0
                    r = ([w.text for w in sent_], (e1start - left_r, e1end - left_r), (e2start - left_r, e2end - left_r))
                    D.append((r, e1text, e2text))
                    ents_list.append((e1text, e2text))
    
    print("Processed dataset samples from dependency tree parsing:")
    if (len(D) - ref_D) > 0:
        samples_D_idx = np.random.choice([idx for idx in range(ref_D, len(D))],\
                                          size=min(3,(len(D) - ref_D)),\
                                          replace=False)
        for idx in samples_D_idx:
            print(D[idx], '\n')
    return D

class pretrain_dataset(Dataset):
    def __init__(self, args, D, batch_size=None):
        self.internal_batching = True
        self.batch_size = batch_size # batch_size cannot be None if internal_batching == True
        self.alpha = 0.7
        self.mask_probability = 0.15
        
        self.df = pd.DataFrame(D, columns=['r','e1','e2'])
        self.e1s = list(self.df['e1'].unique())
        self.e2s = list(self.df['e2'].unique())
        
        if args.model_no == 0:
            from .model.BERT.tokenization_bert import BertTokenizer as Tokenizer
            model = args.model_size #'bert-base-uncased'
            lower_case = True
            model_name = 'BERT'
        elif args.model_no == 1:
            from .model.ALBERT.tokenization_albert import AlbertTokenizer as Tokenizer
            model = args.model_size #'albert-base-v2'
            lower_case = False
            model_name = 'ALBERT'
        elif args.model_no == 2:
            from .model.BERT.tokenization_bert import BertTokenizer as Tokenizer
            model = 'bert-base-uncased'
            lower_case = False
            model_name = 'BioBERT'
        
        tokenizer_path = './data/%s_tokenizer.pkl' % (model_name)
        if os.path.isfile(tokenizer_path):
            self.tokenizer = load_pickle('%s_tokenizer.pkl' % (model_name))
            logger.info("Loaded tokenizer from saved path.")
        else:
            if args.model_no == 2:
                self.tokenizer = Tokenizer(vocab_file='./additional_models/biobert_v1.1_pubmed/vocab.txt',
                                           do_lower_case=False)
            else:
                self.tokenizer = Tokenizer.from_pretrained(model, do_lower_case=False)
            self.tokenizer.add_tokens(['[E1]', '[/E1]', '[E2]', '[/E2]', '[BLANK]'])
            save_as_pickle("%s_tokenizer.pkl" % (model_name), self.tokenizer)
            logger.info("Saved %s tokenizer at ./data/%s_tokenizer.pkl" % (model_name, model_name))
        
        e1_id = self.tokenizer.convert_tokens_to_ids('[E1]')
        e2_id = self.tokenizer.convert_tokens_to_ids('[E2]')
        assert e1_id != e2_id != 1
            
        self.cls_token = self.tokenizer.cls_token
        self.sep_token = self.tokenizer.sep_token
        self.E1_token_id = self.tokenizer.encode("[E1]")[1:-1][0]
        self.E1s_token_id = self.tokenizer.encode("[/E1]")[1:-1][0]
        self.E2_token_id = self.tokenizer.encode("[E2]")[1:-1][0]
        self.E2s_token_id = self.tokenizer.encode("[/E2]")[1:-1][0]
        self.PS = Pad_Sequence(seq_pad_value=self.tokenizer.pad_token_id,\
                               label_pad_value=self.tokenizer.pad_token_id,\
                               label2_pad_value=-1,\
                               label3_pad_value=-1,\
                               label4_pad_value=-1)
        
    def put_blanks(self, D):
        blank_e1 = np.random.uniform()
        blank_e2 = np.random.uniform()
        if blank_e1 >= self.alpha:
            r, e1, e2 = D
            D = (r, "[BLANK]", e2)
        
        if blank_e2 >= self.alpha:
            r, e1, e2 = D
            D = (r, e1, "[BLANK]")
        return D
        
    def tokenize(self, D):
        (x, s1, s2), e1, e2 = D
        x = [w.lower() for w in x if x != '[BLANK]'] # we are using uncased model
        
        ### Include random masks for MLM training
        forbidden_idxs = [i for i in range(s1[0], s1[1])] + [i for i in range(s2[0], s2[1])]
        pool_idxs = [i for i in range(len(x)) if i not in forbidden_idxs]
        masked_idxs = np.random.choice(pool_idxs,\
                                        size=round(self.mask_probability*len(pool_idxs)),\
                                        replace=False)
        masked_for_pred = [token.lower() for idx, token in enumerate(x) if (idx in masked_idxs)]
        #masked_for_pred = [w.lower() for w in masked_for_pred] # we are using uncased model
        x = [token if (idx not in masked_idxs) else self.tokenizer.mask_token \
             for idx, token in enumerate(x)]

        ### replace x spans with '[BLANK]' if e is '[BLANK]'
        if (e1 == '[BLANK]') and (e2 != '[BLANK]'):
            x = [self.cls_token] + x[:s1[0]] + ['[E1]' ,'[BLANK]', '[/E1]'] + \
                x[s1[1]:s2[0]] + ['[E2]'] + x[s2[0]:s2[1]] + ['[/E2]'] + x[s2[1]:] + [self.sep_token]
        
        elif (e1 == '[BLANK]') and (e2 == '[BLANK]'):
            x = [self.cls_token] + x[:s1[0]] + ['[E1]' ,'[BLANK]', '[/E1]'] + \
                x[s1[1]:s2[0]] + ['[E2]', '[BLANK]', '[/E2]'] + x[s2[1]:] + [self.sep_token]
        
        elif (e1 != '[BLANK]') and (e2 == '[BLANK]'):
            x = [self.cls_token] + x[:s1[0]] + ['[E1]'] + x[s1[0]:s1[1]] + ['[/E1]'] + \
                x[s1[1]:s2[0]] + ['[E2]', '[BLANK]', '[/E2]'] + x[s2[1]:] + [self.sep_token]
        
        elif (e1 != '[BLANK]') and (e2 != '[BLANK]'):
            x = [self.cls_token] + x[:s1[0]] + ['[E1]'] + x[s1[0]:s1[1]] + ['[/E1]'] + \
                x[s1[1]:s2[0]] + ['[E2]'] + x[s2[0]:s2[1]] + ['[/E2]'] + x[s2[1]:] + [self.sep_token]

        e1_e2_start = ([i for i, e in enumerate(x) if e == '[E1]'][0],\
                        [i for i, e in enumerate(x) if e == '[E2]'][0])
        
        x = self.tokenizer.convert_tokens_to_ids(x)
        masked_for_pred = self.tokenizer.convert_tokens_to_ids(masked_for_pred)
        '''
        e1 = [e for idx, e in enumerate(x) if idx in [i for i in\
              range(x.index(self.E1_token_id) + 1, x.index(self.E1s_token_id))]]
        e2 = [e for idx, e in enumerate(x) if idx in [i for i in\
              range(x.index(self.E2_token_id) + 1, x.index(self.E2s_token_id))]]
        '''
        return x, masked_for_pred, e1_e2_start #, e1, e2
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        ### implements standard batching
        if not self.internal_batching:
            r, e1, e2 = self.df.iloc[idx]
            x, masked_for_pred, e1_e2_start = self.tokenize(self.put_blanks((r, e1, e2)))
            x = torch.tensor(x)
            masked_for_pred = torch.tensor(masked_for_pred)
            e1_e2_start = torch.tensor(e1_e2_start)
            #e1, e2 = torch.tensor(e1), torch.tensor(e2)
            return x, masked_for_pred, e1_e2_start, e1, e2
        
        ### implements noise contrastive estimation
        else:
            ### get positive samples
            r, e1, e2 = self.df.iloc[idx] # positive sample
            pool = self.df[((self.df['e1'] == e1) & (self.df['e2'] == e2))].index
            pool = pool.append(self.df[((self.df['e1'] == e2) & (self.df['e2'] == e1))].index)
            pos_idxs = np.random.choice(pool, \
                                        size=min(int(self.batch_size//2), len(pool)), replace=False)
            ### get negative samples
            '''
            choose from option: 
            1) sampling uniformly from all negatives
            2) sampling uniformly from negatives that share e1 or e2
            '''
            if np.random.uniform() > 0.5:   
                pool = self.df[((self.df['e1'] != e1) | (self.df['e2'] != e2))].index
                neg_idxs = np.random.choice(pool, \
                                            size=min(int(self.batch_size//2), len(pool)), replace=False)
                Q = 1/len(pool)
            
            else:
                if np.random.uniform() > 0.5: # share e1 but not e2
                    pool = self.df[((self.df['e1'] == e1) & (self.df['e2'] != e2))].index
                    if len(pool) > 0:
                        neg_idxs = np.random.choice(pool, \
                                                    size=min(int(self.batch_size//2), len(pool)), replace=False)
                    else:
                        neg_idxs = []

                else: # share e2 but not e1
                    pool = self.df[((self.df['e1'] != e1) & (self.df['e2'] == e2))].index
                    if len(pool) > 0:
                        neg_idxs = np.random.choice(pool, \
                                                    size=min(int(self.batch_size//2), len(pool)), replace=False)
                    else:
                        neg_idxs = []
                        
                if len(neg_idxs) == 0: # if empty, sample from all negatives
                    pool = self.df[((self.df['e1'] != e1) | (self.df['e2'] != e2))].index
                    neg_idxs = np.random.choice(pool, \
                                            size=min(int(self.batch_size//2), len(pool)), replace=False)
                Q = 1/len(pool)
            
            batch = []
            ## process positive sample
            pos_df = self.df.loc[pos_idxs]
            for idx, row in pos_df.iterrows():
                r, e1, e2 = row[0], row[1], row[2]
                x, masked_for_pred, e1_e2_start = self.tokenize(self.put_blanks((r, e1, e2)))
                x = torch.LongTensor(x)
                masked_for_pred = torch.LongTensor(masked_for_pred)
                e1_e2_start = torch.tensor(e1_e2_start)
                #e1, e2 = torch.tensor(e1), torch.tensor(e2)
                batch.append((x, masked_for_pred, e1_e2_start, torch.FloatTensor([1.0]),\
                              torch.LongTensor([1])))
            
            ## process negative samples
            negs_df = self.df.loc[neg_idxs]
            for idx, row in negs_df.iterrows():
                r, e1, e2 = row[0], row[1], row[2]
                x, masked_for_pred, e1_e2_start = self.tokenize(self.put_blanks((r, e1, e2)))
                x = torch.LongTensor(x)
                masked_for_pred = torch.LongTensor(masked_for_pred)
                e1_e2_start = torch.tensor(e1_e2_start)
                #e1, e2 = torch.tensor(e1), torch.tensor(e2)
                batch.append((x, masked_for_pred, e1_e2_start, torch.FloatTensor([Q]), torch.LongTensor([0])))
            batch = self.PS(batch)
            return batch
    
class Pad_Sequence():
    """
    collate_fn for dataloader to collate sequences of different lengths into a fixed length batch
    Returns padded x sequence, y sequence, x lengths and y lengths of batch
    """
    def __init__(self, seq_pad_value, label_pad_value=1, label2_pad_value=-1,\
                 label3_pad_value=-1, label4_pad_value=-1):
        self.seq_pad_value = seq_pad_value
        self.label_pad_value = label_pad_value
        self.label2_pad_value = label2_pad_value
        self.label3_pad_value = label3_pad_value
        self.label4_pad_value = label4_pad_value
        
    def __call__(self, batch):
        sorted_batch = sorted(batch, key=lambda x: x[0].shape[0], reverse=True)
        seqs = [x[0] for x in sorted_batch]
        seqs_padded = pad_sequence(seqs, batch_first=True, padding_value=self.seq_pad_value)
        x_lengths = torch.LongTensor([len(x) for x in seqs])
        
        labels = list(map(lambda x: x[1], sorted_batch))
        labels_padded = pad_sequence(labels, batch_first=True, padding_value=self.label_pad_value)
        y_lengths = torch.LongTensor([len(x) for x in labels])
        
        labels2 = list(map(lambda x: x[2], sorted_batch))
        labels2_padded = pad_sequence(labels2, batch_first=True, padding_value=self.label2_pad_value)
        y2_lengths = torch.LongTensor([len(x) for x in labels2])
        
        labels3 = list(map(lambda x: x[3], sorted_batch))
        labels3_padded = pad_sequence(labels3, batch_first=True, padding_value=self.label3_pad_value)
        y3_lengths = torch.LongTensor([len(x) for x in labels3])
        
        labels4 = list(map(lambda x: x[4], sorted_batch))
        labels4_padded = pad_sequence(labels4, batch_first=True, padding_value=self.label4_pad_value)
        y4_lengths = torch.LongTensor([len(x) for x in labels4])
        return seqs_padded, labels_padded, labels2_padded, labels3_padded, labels4_padded,\
                x_lengths, y_lengths, y2_lengths, y3_lengths, y4_lengths

def load_dataloaders(args, max_length=50000):
    
    if not os.path.isfile("./data/D.pkl"):
        logger.info("Loading pre-training data...")
        with open(args.pretrain_data, "r", encoding="utf8") as f:
            text = f.readlines()
        
        #text = text[:1500] # restrict size for testing
        text = process_textlines(text)
        
        logger.info("Length of text (characters): %d" % len(text))
        num_chunks = math.ceil(len(text)/max_length)
        logger.info("Splitting into %d max length chunks of size %d" % (num_chunks, max_length))
        text_chunks = (text[i*max_length:(i*max_length + max_length)] for i in range(num_chunks))
        
        D = []
        logger.info("Loading Spacy NLP...")
        nlp = spacy.load("en_core_web_lg")
        
        for text_chunk in tqdm(text_chunks, total=num_chunks):
            D.extend(create_pretraining_corpus(text_chunk, nlp, window_size=40))
            
        logger.info("Total number of relation statements in pre-training corpus: %d" % len(D))
        save_as_pickle("D.pkl", D)
        logger.info("Saved pre-training corpus to %s" % "./data/D.pkl")
    else:
        logger.info("Loaded pre-training data from saved file")
        D = load_pickle("D.pkl")
        
    train_set = pretrain_dataset(args, D, batch_size=args.batch_size)
    train_length = len(train_set)
    '''
    # if using fixed batching
    PS = Pad_Sequence(seq_pad_value=train_set.tokenizer.pad_token_id,\
                      label_pad_value=train_set.tokenizer.pad_token_id,\
                      label2_pad_value=-1,\
                      label3_pad_value=-1,\
                      label4_pad_value=-1)
    train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, \
                              num_workers=0, collate_fn=PS, pin_memory=False)
    '''
    return train_set