In [2]:
import pandas as pd

In [3]:
DATA_SET_PATH = "/opt/luciapp/data/temp/data_set.xlsx"
df = pd.read_excel(DATA_SET_PATH)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Paragraphs,Entity,Relationship,output
0,0,This FIRST AMENDMENT TO ASSET PURCHASE AGREEME...,"DEXTERA SURIGCAL, INC.,--AESCULAP, INC.,",FIRST AMENDMENT TO ASSET PURCHASE AGREEMENT,O Relationship Relationship Relationship Relat...
1,1,"THIS ASSET PURCHASE AGREEMENT (""Agreement"") is...","GREEN ARC SUPPLY, L.L.C.,--MAGNEGAS CORPORATION,",ASSET PURCHASE AGREEMENT,O Relationship Relationship Relationship O O O...
2,2,This MORTGAGE ASSET PURCHASE AGREEMENT (this A...,"TPG RE Finance Trust CLO Loan Seller, LLC,--TP...",MORTGAGE ASSET PURCHASE AGREEMENT,O Relationship Relationship Relationship Relat...
3,3,"THIS ASSET PURCHASE AGREEMENT, dated as of Feb...","Allscripts Healthcare, LLC,--PF2 EIS LLC,--All...","ASSET PURCHASE AGREEMENT,",O Relationship Relationship Relationship O O O...
4,4,"This Consulting Agreement (this ""Agreement"") i...","Green Energy Management Services Holdings, Inc...",Consulting Agreement,O Relationship Relationship O O O O O O O O O ...


In [5]:
df = df.drop('Unnamed: 0', axis=1)

In [6]:
# Check the first row
for col in df.columns:
    print(col)
    print(df.iloc[0][col])

Paragraphs
This FIRST AMENDMENT TO ASSET PURCHASE AGREEMENT (this "Amendment") is made and entered into as of January __, 2018, by and between (i) DEXTERA SURIGCAL, INC., a Delaware corporation formerly known as "Cardica, Inc." ("Seller") and (ii) AESCULAP, INC., a California corporation ("Buyer"). Seller and Buyer are sometimes referred to herein individually each as a "Party" and collectively as the "Parties."
Entity
DEXTERA SURIGCAL, INC.,--AESCULAP, INC.,
Relationship
FIRST AMENDMENT TO ASSET PURCHASE AGREEMENT
output
O Relationship Relationship Relationship Relationship Relationship Relationship O O O O O O O O O O O O O O O O B-Org B-Org B-Org O O O O O O O O O O O B-Org B-Org O O O O O O O O O O O O O O O O O O O O O O


In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [42]:
unique_tags = list()
for row in df.output:
    for tag in row.split(' '):
        if tag not in unique_tags:
            unique_tags.append(tag)
print(unique_tags)

['O', 'Relationship', 'B-Org']


In [7]:
sentence = df.iloc[0]['Paragraphs']
doc = nlp(sentence)

In [8]:
for ent in doc.ents:
    print(ent.text, ent.label_)

January DATE
_, 2018 DATE
Delaware GPE
Cardica, ORG
Seller WORK_OF_ART
California GPE
Buyer WORK_OF_ART
herein ORG


In [10]:
entity = df.iloc[0]['Entity']
relationship = df.iloc[0]['Relationship']

In [19]:
list_of_entity = entity.split(',')
for word in list_of_entity:
    start_in = sentence.find(word)
    print(f"{word} in {start_in} - {start_in + len(word) - 1}")

DEXTERA SURIGCAL in 136 - 151
 INC. in 153 - 157
--AESCULAP in -1 - 8
 INC. in 153 - 157
 in 0 - -1


In [43]:
# Load the NER data
ner_data = pd.read_csv('/opt/luciapp/data/temp/entity-annotated-corpus/ner_dataset.csv', encoding='latin').fillna(method='ffill')

In [44]:
ner_data.head(n=20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [45]:
ner_data.POS.unique()

array(['NNS', 'IN', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'CC',
       'JJ', '.', 'VBD', 'WP', '``', 'CD', 'PRP', 'VBZ', 'POS', 'VBG',
       'RB', ',', 'WRB', 'PRP$', 'MD', 'WDT', 'JJR', ':', 'JJS', 'WP$',
       'RP', 'PDT', 'NNPS', 'EX', 'RBS', 'LRB', 'RRB', '$', 'RBR', ';',
       'UH', 'FW'], dtype=object)

In [46]:
ner_data.Tag.unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [47]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [49]:
# Get full document data struce
getter = SentenceGetter(ner_data)

In [50]:
# Get sentence data
sentences = [[s[0] for s in sent] for sent in getter.sentences]
sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [51]:
# Get pos data
poses = [[s[1] for s in sent] for sent in getter.sentences]
print(poses[0])

['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']


In [52]:
# Get tag labels data
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [63]:
tags_vals = list(set(ner_data["Tag"].values))

In [64]:
# Add X  label for word piece support
# Add [CLS] and [SEP] as BERT need
tags_vals.append('X')
tags_vals.append('[CLS]')
tags_vals.append('[SEP]')

In [65]:
tags_vals = set(tags_vals)

In [66]:
tags_vals

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O',
 'X',
 '[CLS]',
 '[SEP]'}

In [67]:
# Recommend to set it by manual define, good for reusing
tag2idx={'B-art': 14,
 'B-eve': 16,
 'B-geo': 0,
 'B-gpe': 13,
 'B-nat': 12,
 'B-org': 10,
 'B-per': 4,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 7,
 'I-geo': 15,
 'I-gpe': 8,
 'I-nat': 11,
 'I-org': 3,
 'I-per': 6,
 'I-tim': 1,
 'X':17,
 'O': 9,
 '[CLS]':18,
 '[SEP]':19}

In [68]:
tag2idx

{'B-art': 14,
 'B-eve': 16,
 'B-geo': 0,
 'B-gpe': 13,
 'B-nat': 12,
 'B-org': 10,
 'B-per': 4,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 7,
 'I-geo': 15,
 'I-gpe': 8,
 'I-nat': 11,
 'I-org': 3,
 'I-per': 6,
 'I-tim': 1,
 'X': 17,
 'O': 9,
 '[CLS]': 18,
 '[SEP]': 19}

In [69]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

Make training data

Make raw data into trainable data for BERT, including:

Set gpu environment

Load tokenizer and tokenize

Set 3 embedding, token embedding, mask word embedding, segmentation embedding

Split data set into train and validate, then send them to dataloader

In [77]:
# Import the required libraries for loading and fine-tuning
import torch
from transformers import BertTokenizer, BertConfig

In [72]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [73]:
n_gpu

0

In [74]:
# Load tokenizer
vocabulary = "/opt/luciapp/data/temp/models/bert-base-cased/vocab.txt"
max_len = 45 # setting the max len to 45 from 512

In [78]:
# load tokenizer, with manual file address or pretrained address
tokenizer=BertTokenizer(vocab_file=vocabulary,do_lower_case=False)

## Tokenizer text

In hunggieface for bert, when come across OOV, will word piece the word

We need to adjust the labels base on the tokenize result, “##abc” need to set label "X"

Need to set "[CLS]" at front and "[SEP]" at the end, as what the paper do, BERT indexer should add [CLS] and [SEP] tokens


In [82]:
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list, label in zip(sentences, labels):
    temp_lable = []
    temp_token = []
    
    # Add CLS at the begin
    temp_lable.append('[CLS]')
    temp_token.append('[CLS]')
    
    for word, lab in zip(word_list, label):
        token_list = tokenizer.tokenize(word)
        for m, token in enumerate(token_list):
            temp_token.append(token)
            if m == 0:
                temp_lable.append(lab)
            else:
                temp_lable.append('X')
                
    # Add SEP at the end
    temp_lable.append('[SEP]')
    temp_token.append('[SEP]')
    
    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_lable)
    
    if 5 > i_inc:
        print("No.%d, len%d"%(i_inc,len(temp_token)))
        print("texts:%s"%(" ".join(temp_token)))
        print("No.%d, len%d"%(i_inc,len(temp_lable)))
        print("labels:%s"%(" ".join(temp_lable)))
    i_inc += 1

No.0, len28
texts:[CLS] Thousands of demons ##tra ##tors have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country . [SEP]
No.0, len28
labels:[CLS] O O O X X O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O [SEP]
No.1, len29
texts:[CLS] Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an I ##A ##EA surveillance system begins functioning . [SEP]
No.1, len29
labels:[CLS] B-gpe O O O O O O O O O O O O O O B-tim O O O B-org X X O O O O O [SEP]
No.2, len44
texts:[CLS] He ##lic ##op ##ter guns ##hips Saturday pounded militant hide ##outs in the Or ##ak ##zai tribal region , where many Taliban militants are believed to have fled to avoid an earlier military offensive in nearby South W ##azi ##rist ##an . [SEP]
No.2, len44
labels:[CLS] O X X X O X B-tim O O O X O O B-geo X X O O O O O B-org O O O O O O O O O O O O O O B-geo I-geo X X X O [SEP]
No.3, len16
texts:[CLS] They lef

## Set token embedding

In [83]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [92]:
# Make text token into id
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")
print(input_ids[0])

[  101 26159  1104  8568  4487  5067  1138  9639  1194  1498  1106  5641
  1103  1594  1107  5008  1105  4555  1103 10602  1104  1418  2830  1121
  1115  1583   119   102     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0]


In [94]:
# Make label into id, pad with "O" meaning others
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                     maxlen=max_len, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
print(tags[0])

[18  9  9  9 17 17  9  9  9  0  9  9  9  9  9  0  9  9  9  9  9 13  9  9
  9  9  9 19  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9]


### Set mask word embedding

In [95]:
# For fine tune of predict, with token mask is 1,pad token is 0
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
attention_masks[0];

### Set segment embedding(Seem like for sequance tagging task, it's not necessary to make this embedding)

In [97]:
# Since only one sentence, all the segment set to 0
segment_ids = [[0] * len(input_id) for input_id in input_ids]
segment_ids[0];

### Split data into train and validate

In [99]:
from sklearn.model_selection import train_test_split
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(
    input_ids,
    tags,
    attention_masks,
    segment_ids,
    random_state=4,
    test_size=0.3
)

In [100]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(33571, 14388, 33571, 14388)

### Set data to tensor

In [102]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

  """Entry point for launching an IPython kernel.


### Put data into data loader 

In [104]:
# Set batch num
batch_num = 32

In [105]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [107]:
# Only set token embedding, attention embedding, no segment embedding
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make the batch trainig better for the last one
train_dataloader = DataLoader(train_data, batch_size=batch_num, sampler=train_sampler, drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

## Train Model

In [108]:
model_file_path = "/opt/luciapp/data/temp/models/bert-base-cased"
model_file_name = 'bert-base-cased-pytorch_model.bin'
config_file_name = 'bert-base-cased-config.json'
from transformers import BertForTokenClassification, AdamW

In [111]:
# will load config and weight with from_pretrained()
model = BertForTokenClassification.from_pretrained(model_file_path,num_labels=len(tag2idx))


In [115]:
# Model
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis