In [1]:
# Install the extra packages this notebook imports: seqeval (classification reports)
# and flair (Sentence / SequenceTagger). NOTE(review): versions are unpinned, so a
# fresh run may resolve different releases than the ones that produced the outputs below.
!pip install seqeval -q
!pip install flair -q

[?25l[K     |███████▌                        | 10 kB 33.4 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 34.9 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 37.6 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 28.9 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.7 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 319 kB 12.8 MB/s 
[K     |████████████████████████████████| 19.7 MB 74.2 MB/s 
[K     |████████████████████████████████| 48 kB 6.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 788 kB 38.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 53.5 MB/s 
[K     |████████████████████████████████| 2.9 MB 50.4 MB/s 
[K     |████████████████████████████████| 56 kB 6

In [2]:
import pandas as pd
import io
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import LukeTokenizer, LukeForEntitySpanClassification
import timeit
import ast

import unicodedata

from google.colab import files

# Opens Colab's upload widget; the recorded run uploaded retrain_processed.csv and
# processed_df.csv (the latter is read back with pd.read_csv further down).
uploaded = files.upload()

import numpy as np
import seqeval.metrics
import spacy
import torch
from tqdm import tqdm, trange

Saving retrain_processed.csv to retrain_processed.csv
Saving processed_df.csv to processed_df.csv


In [3]:
class FlairModel:
    """Wrap Flair's ``ner-english-large`` tagger and emit one IOB2 tag per space-separated word."""

    def __init__(self):
        # load tagger
        self.tagger = SequenceTagger.load("flair/ner-english-large")

    def get_entity_list(self, input_string):
        """Tag ``input_string`` and return an IOB2 label for each word.

        Args:
            input_string: Text to tag. Words are the pieces produced by
                ``input_string.split(" ")``.

        Returns:
            A list of labels ("B-<TAG>", "I-<TAG>" or "O"). Every recognised
            entity contributes one "B-" label plus one "I-" label per space
            inside its own text; every other word contributes "O".

        Notes:
            Entities are located by replacing the first remaining occurrence of
            their text in the input, so an entity whose text also occurs earlier
            in the sentence can be attached to that earlier occurrence.
        """
        sentence = Sentence(input_string)
        # predict NER tags
        self.tagger.predict(sentence)

        named_entities = sentence.to_dict(tag_type='ner')["entities"]
        total_entities = []
        total_text = []

        for entity in named_entities:
            current_text = str(entity["text"])

            # Only the first five characters of the label's string form are
            # inspected; they are collapsed to a bare tag name when one matches.
            current_entity = str(entity["labels"][0])[:5]
            for known_tag in ("ORG", "MISC", "PER", "LOC"):
                if known_tag in current_entity:
                    current_entity = known_tag

            # The number of I- labels must come from THIS entity's text (the
            # previous code always measured the first entity in the sentence).
            extra_words = current_text.count(" ")
            total_text.append(current_text)
            total_entities.append(
                ["B-" + current_entity] + ["I-" + current_entity] * extra_words
            )

        # Swap each entity's text for the space-free repr of its label list so
        # that it survives the split(" ") below as a single token.
        copy_string = input_string
        for current_text, entities in zip(total_text, total_entities):
            copy_string = copy_string.replace(
                current_text, str(entities).replace(" ", ""), 1
            )

        entity_list = []
        for token in copy_string.split(" "):
            closing = token.find("]")
            if token.startswith("['B-") and closing != -1:
                # Parse only up to the closing bracket: punctuation glued to the
                # entity (e.g. "Peletz.") would otherwise break literal_eval.
                entity_list.extend(
                    tag.strip() for tag in ast.literal_eval(token[:closing + 1])
                )
            else:
                entity_list.append("O")

        return entity_list

In [4]:
class LukeModel:
    """Wrap the LUKE CoNLL-2003 span classifier and emit one IOB2 tag per space-separated word."""

    def __init__(self):
        self.tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
        self.model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

    def get_entity_list(self, input_text):
        """Classify every contiguous word span of ``input_text`` and return IOB2 labels.

        Args:
            input_text: Text to tag. It is stripped, then split on single spaces.

        Returns:
            A list with one label per word ("B-<TAG>", "I-<TAG>" or "O").
            Candidate spans are visited in order of start word, then end word;
            a span predicted as class 0 is ignored, and a span that overlaps
            words already labelled by an earlier span is skipped.
        """
        input_text = input_text.strip()
        split_text = input_text.split(" ")

        # Character offsets of each word inside input_text (words are joined by
        # exactly one space, so the next word starts one past the previous end).
        word_start_positions = []
        word_end_positions = []
        cursor = 0
        for word in split_text:
            word_start_positions.append(cursor)
            cursor += len(word)
            word_end_positions.append(cursor)
            cursor += 1

        # Every contiguous word span, as character offsets for the tokenizer and
        # as (first_word, one_past_last_word) indices for labelling afterwards.
        entity_spans = []
        word_spans = []
        for first, start_pos in enumerate(word_start_positions):
            for last in range(first, len(split_text)):
                entity_spans.append((start_pos, word_end_positions[last]))
                word_spans.append((first, last + 1))

        inputs = self.tokenizer(input_text, entity_spans=entity_spans, return_tensors="pt")
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits

        predicted_class_indices = logits.argmax(-1).squeeze().tolist()
        if isinstance(predicted_class_indices, int):
            # A single candidate span squeezes down to a bare int.
            predicted_class_indices = [predicted_class_indices]

        # Label by word index rather than by substituting entity text back into
        # the sentence: text substitution can hit an earlier, unrelated
        # occurrence of the same characters and produce unparsable tokens.
        entity_list = ["O"] * len(split_text)
        for (first, end), predicted_class_idx in zip(word_spans, predicted_class_indices):
            if predicted_class_idx == 0:
                continue
            if any(tag != "O" for tag in entity_list[first:end]):
                continue
            current_entity = str(self.model.config.id2label[predicted_class_idx])
            entity_list[first] = "B-" + current_entity
            entity_list[first + 1:end] = ["I-" + current_entity] * (end - first - 1)

        return entity_list


In [5]:
# luke_model = LukeModel()
# flair_model = FlairModel()

In [6]:
# luke_model.get_entity_list("hello I'm David Peletz")

In [7]:
# flair_model.get_entity_list("hello I'm David Peletz")

In [8]:
# Download the testb set of the CoNLL-2003 dataset into the working directory as
# `eng.testb` (the path the commented-out load_documents("eng.testb") call refers to).
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb

--2021-10-21 15:00:44--  https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 748096 (731K) [text/plain]
Saving to: ‘eng.testb’


2021-10-21 15:00:45 (17.2 MB/s) - ‘eng.testb’ saved [748096/748096]



In [9]:
# Load the fine-tuned LUKE span classifier and switch it to inference mode
model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
model.eval()
# Use the GPU when one is attached; fall back to CPU instead of raising on a
# CPU-only runtime.
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

Downloading:   0%|          | 0.00/877 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 were not used when initializing LukeForEntitySpanClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/33.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

In [10]:
def load_documents(dataset_file):
    """Read a CoNLL-style file into a list of documents.

    Lines starting with "-DOCSTART" separate documents, blank lines separate
    sentences, and every other line is split on single spaces: the first field
    is the word and the last field is its label.

    Args:
        dataset_file: Path of the file to read.

    Returns:
        A list of dicts with keys ``words`` (list of str), ``labels`` (list of
        str, parallel to ``words``) and ``sentence_boundaries`` (increasing word
        offsets; consecutive pairs delimit one sentence). The boundaries of every
        returned document start at 0 and end at ``len(words)``, even when the
        file lacks the blank line before the first or after the last sentence.
    """
    def build_document(doc_words, doc_labels, doc_boundaries):
        # Close the boundary list so the first and last sentences are never lost.
        boundaries = list(doc_boundaries)
        if not boundaries or boundaries[0] != 0:
            boundaries.insert(0, 0)
        if boundaries[-1] != len(doc_words):
            boundaries.append(len(doc_words))
        return dict(
            words=doc_words,
            labels=doc_labels,
            sentence_boundaries=boundaries
        )

    documents = []
    words = []
    labels = []
    sentence_boundaries = []
    with open(dataset_file) as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("-DOCSTART"):
                if words:
                    documents.append(build_document(words, labels, sentence_boundaries))
                    words = []
                    labels = []
                    sentence_boundaries = []
                continue

            if not line:
                # Consecutive blank lines must not record the same offset twice.
                if not sentence_boundaries or len(words) != sentence_boundaries[-1]:
                    sentence_boundaries.append(len(words))
            else:
                items = line.split(" ")
                words.append(items[0])
                labels.append(items[-1])

    if words:
        documents.append(build_document(words, labels, sentence_boundaries))

    return documents


def load_examples(documents):
    """Build one span-classification example per sentence of every document.

    Reads the module-level ``tokenizer`` to measure each word in subwords and
    ``is_punctuation`` to decide how words are joined back into text.

    Args:
        documents: Iterable of dicts with ``words`` and ``sentence_boundaries``
            keys, as produced by ``load_documents``.

    Returns:
        A list of dicts with keys ``text`` (the sentence plus surrounding
        context), ``words`` (the sentence's words), ``entity_spans`` (character
        offsets into ``text`` for each candidate span) and
        ``original_word_spans`` (the matching half-open word-index ranges).
    """
    max_token_length = 510
    max_mention_length = 30

    def glues_to_previous(token):
        # Tokens starting with an apostrophe and lone punctuation marks are
        # written without a space before them.
        return token[0] == "'" or (len(token) == 1 and is_punctuation(token))

    def append_word(buffer, token):
        # Returns the extended buffer (ending in one space) together with the
        # character offsets the token occupies inside it.
        if glues_to_previous(token):
            buffer = buffer.rstrip()
        begin = len(buffer)
        buffer += token
        finish = len(buffer)
        return buffer + " ", begin, finish

    examples = []
    for document in tqdm(documents):
        words = document["words"]
        subword_lengths = [len(tokenizer.tokenize(w)) for w in words]
        total_subword_length = sum(subword_lengths)
        sentence_boundaries = document["sentence_boundaries"]

        for sentence_start, sentence_end in zip(sentence_boundaries, sentence_boundaries[1:]):
            if total_subword_length <= max_token_length:
                # The whole document fits: use all of it as context.
                context_start, context_end = 0, len(words)
            else:
                # Otherwise grow the window around the sentence, one word to
                # the left and then one to the right per round, and stop at the
                # first word that would exceed the subword budget.
                context_start, context_end = sentence_start, sentence_end
                cur_length = sum(subword_lengths[context_start:context_end])
                while True:
                    if context_start > 0:
                        left_cost = subword_lengths[context_start - 1]
                        if cur_length + left_cost > max_token_length:
                            break
                        cur_length += left_cost
                        context_start -= 1
                    if context_end < len(words):
                        right_cost = subword_lengths[context_end]
                        if cur_length + right_cost > max_token_length:
                            break
                        cur_length += right_cost
                        context_end += 1

            # Left context, then the sentence itself (recording offsets), then
            # the right context.
            text = ""
            for word in words[context_start:sentence_start]:
                text, _, _ = append_word(text, word)

            sentence_words = words[sentence_start:sentence_end]
            sentence_subword_lengths = subword_lengths[sentence_start:sentence_end]

            word_start_char_positions = []
            word_end_char_positions = []
            for word in sentence_words:
                text, begin, finish = append_word(text, word)
                word_start_char_positions.append(begin)
                word_end_char_positions.append(finish)

            for word in words[sentence_end:context_end]:
                text, _, _ = append_word(text, word)
            text = text.rstrip()

            # Enumerate candidate spans; the length test sums the subwords of
            # words word_start .. word_end - 1.
            entity_spans = []
            original_word_spans = []
            word_count = len(sentence_words)
            for word_start in range(word_count):
                for word_end in range(word_start, word_count):
                    if sum(sentence_subword_lengths[word_start:word_end]) > max_mention_length:
                        continue
                    entity_spans.append(
                        (word_start_char_positions[word_start], word_end_char_positions[word_end])
                    )
                    original_word_spans.append((word_start, word_end + 1))

            examples.append({
                "text": text,
                "words": sentence_words,
                "entity_spans": entity_spans,
                "original_word_spans": original_word_spans,
            })

    return examples


def is_punctuation(char):
    """Return True when the single character ``char`` counts as punctuation.

    A character qualifies if its code point lies in one of the ASCII symbol
    ranges 33-47, 58-64, 91-96 or 123-126, or if its Unicode general category
    starts with "P".
    """
    code_point = ord(char)
    ascii_symbol_ranges = ((33, 47), (58, 64), (91, 96), (123, 126))
    if any(low <= code_point <= high for low, high in ascii_symbol_ranges):
        return True
    return unicodedata.category(char).startswith("P")

In [11]:
# test_documents = load_documents("eng.testb")
# test_examples = load_examples(test_documents)

In [12]:
# batch_size = 2
# all_logits = []

# for batch_start_idx in trange(0, len(test_examples), batch_size):
#     batch_examples = test_examples[batch_start_idx:batch_start_idx + batch_size]
#     texts = [example["text"] for example in batch_examples]
#     entity_spans = [example["entity_spans"] for example in batch_examples]

#     inputs = tokenizer(texts, entity_spans=entity_spans, return_tensors="pt", padding=True)
#     inputs = inputs.to("cuda")
#     with torch.no_grad():
#         outputs = model(**inputs)
#     all_logits.extend(outputs.logits.tolist())

In [13]:
# final_labels = [label for document in test_documents for label in document["labels"]]
# final_predictions = []
# for example_index, example in enumerate(test_examples):
#     logits = all_logits[example_index]
#     max_logits = np.max(logits, axis=1)
#     max_indices = np.argmax(logits, axis=1)
#     original_spans = example["original_word_spans"]
#     predictions = []
#     for logit, index, span in zip(max_logits, max_indices, original_spans):
#         if index != 0:  # the span is not NIL
#             predictions.append((logit, span, model.config.id2label[index]))

#     # construct an IOB2 label sequence
#     predicted_sequence = ["O"] * len(example["words"])
#     for _, span, label in sorted(predictions, key=lambda o: o[0], reverse=True):
#         if all([o == "O" for o in predicted_sequence[span[0] : span[1]]]):
#             predicted_sequence[span[0]] = "B-" + label
#             if span[1] - span[0] > 1:
#                 predicted_sequence[span[0] + 1 : span[1]] = ["I-" + label] * (span[1] - span[0] - 1)

#     final_predictions += predicted_sequence

In [14]:
# print(seqeval.metrics.classification_report([final_labels], [final_predictions], digits=4)) 

In [15]:
def get_entity_list_luke(input_text):
    """Classify every contiguous word span of ``input_text`` with LUKE and return IOB2 labels.

    Uses the module-level ``tokenizer`` and ``model``, looked up when the
    function is called. ``model`` must therefore still be the LUKE span
    classifier at that point; a later cell in this notebook rebinds the same
    name to a NERDA model.

    Args:
        input_text: Text to tag. It is stripped, then split on single spaces.

    Returns:
        A list with one label per word ("B-<TAG>", "I-<TAG>" or "O"). Candidate
        spans are visited in order of start word, then end word; a span
        predicted as class 0 is ignored, and a span that overlaps words already
        labelled by an earlier span is skipped.
    """
    input_text = input_text.strip()
    split_text = input_text.split(" ")

    # Character offsets of each word inside input_text (words are joined by
    # exactly one space, so the next word starts one past the previous end).
    word_start_positions = []
    word_end_positions = []
    cursor = 0
    for word in split_text:
        word_start_positions.append(cursor)
        cursor += len(word)
        word_end_positions.append(cursor)
        cursor += 1

    # Every contiguous word span, as character offsets for the tokenizer and as
    # (first_word, one_past_last_word) indices for labelling afterwards.
    entity_spans = []
    word_spans = []
    for first, start_pos in enumerate(word_start_positions):
        for last in range(first, len(split_text)):
            entity_spans.append((start_pos, word_end_positions[last]))
            word_spans.append((first, last + 1))

    inputs = tokenizer(input_text, entity_spans=entity_spans, return_tensors="pt")
    # The model may have been moved to the GPU; its inputs must live on the
    # same device.
    inputs = inputs.to(model.device)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits

    predicted_class_indices = logits.argmax(-1).squeeze().tolist()
    if isinstance(predicted_class_indices, int):
        # A single candidate span squeezes down to a bare int.
        predicted_class_indices = [predicted_class_indices]

    # Label by word index rather than by substituting entity text back into the
    # sentence: text substitution can hit an earlier, unrelated occurrence of
    # the same characters and produce unparsable tokens.
    entity_list = ["O"] * len(split_text)
    for (first, end), predicted_class_idx in zip(word_spans, predicted_class_indices):
        if predicted_class_idx == 0:
            continue
        if any(tag != "O" for tag in entity_list[first:end]):
            continue
        current_entity = str(model.config.id2label[predicted_class_idx])
        entity_list[first] = "B-" + current_entity
        entity_list[first + 1:end] = ["I-" + current_entity] * (end - first - 1)

    return entity_list


In [16]:
def generate_labels(input_text):
  """Build the gold IOB2 labels for a name, optionally followed by "went to the store".

  The text is split on single spaces, exactly like ``get_sentence_from_name``,
  and one label is produced per resulting token so labels and tokens always
  line up. When the text contains the phrase "went to the store", the last
  four tokens are treated as that phrase and labelled "O"; every remaining
  leading token belongs to the name (at least one token is always kept as the
  name).

  Args:
    input_text: The name or templated sentence; coerced with ``str``.

  Returns:
    A list starting with "B-PER", followed by one "I-PER" per further name
    token and one "O" per template token.
  """
  input_text = str(input_text)
  template = "went to the store"
  tokens = input_text.split(" ")

  if len(tokens) > 1 and template in input_text:
    name_length = max(len(tokens) - len(template.split(" ")), 1)
  else:
    name_length = len(tokens)

  return ["B-PER"] + ["I-PER"] * (name_length - 1) + ["O"] * (len(tokens) - name_length)

def get_sentence_from_name(input_name):
  """Coerce ``input_name`` to ``str`` and split it on single spaces into a token list."""
  return str(input_name).split(" ")

In [17]:
def generate_entities(input_string):
  """Run the module-level ``model`` on one input and return its prediction.

  ``input_string`` is wrapped in a one-element list for ``model.predict`` and
  the first (only) entry of the result is returned.
  """
  batch_predictions = model.predict([input_string])
  return batch_predictions[0]

# Build Base NERDA Model:

In [18]:
# Install NERDA, which the following cells import for the CoNLL data helpers and the
# fine-tuning wrapper. NOTE(review): the version is unpinned.
!pip install nerda -q

  Building wheel for progressbar (setup.py) ... [?25l[?25hdone


In [19]:
# Fetch the CoNLL-2003 data through NERDA's helpers (the recorded run read it from
# https://data.deepai.org/conll2003.zip) and load the 'train' and 'valid' splits that
# are passed to the NERDA constructor below.
from NERDA.datasets import get_conll_data, download_conll_data 
download_conll_data()
training = get_conll_data('train')
validation = get_conll_data('valid')


Reading https://data.deepai.org/conll2003.zip


In [20]:
# Entity tags the NERDA model is asked to predict; the outside tag 'O' is passed
# separately (tag_outside) when the model is constructed.
tag_scheme = [
'B-PER',
'I-PER',
'B-ORG',
'I-ORG',
'B-LOC',
'I-LOC',
'B-MISC',
'I-MISC'
]

# Checkpoint name handed to NERDA as its transformer; the same checkpoint is loaded
# directly as a LUKE span classifier earlier in the notebook.
transformer = 'studio-ousia/luke-large-finetuned-conll-2003'

# hyperparameters for network
dropout = 0.1

# hyperparameters for training
# (earlier, larger configuration kept for reference; not used)
# training_hyperparameters = {
# 'epochs' : 5,
# 'warmup_steps' : 2500,
# 'train_batch_size': 2048,
# 'learning_rate': 1e-5
# }

# Configuration actually used for the training run recorded below (2 epochs).
training_hyperparameters = {
'epochs' : 2,
'warmup_steps' : 500,                                                   
'train_batch_size': 13,                                         
'learning_rate': 1e-5
}

In [21]:
from NERDA.models import NERDA
# NOTE: this rebinds the module-level name `model`, which until here referred to the
# LukeForEntitySpanClassification instance. generate_entities (model.predict) relies on
# the NERDA object, while get_entity_list_luke reads the same global and therefore only
# works if it is called before this cell runs.
model = NERDA(
dataset_training = training,
dataset_validation = validation,
tag_scheme = tag_scheme, 
tag_outside = 'O',
transformer = transformer,
dropout = dropout,
hyperparameters = training_hyperparameters
)

Device automatically set to: cuda


Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 were not used when initializing LukeModel: ['classifier.bias', 'luke.embeddings.position_ids', 'classifier.weight']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Runs NERDA's fine-tuning loop (`model` is the NERDA wrapper at this point). The
# recorded run took about ten minutes per epoch for two epochs.
model.train()


 Epoch 1 / 2


100%|██████████| 1080/1080 [10:01<00:00,  1.80it/s]
100%|██████████| 407/407 [00:48<00:00,  8.43it/s]


Train Loss = 0.2576782017478426 Valid Loss = 0.05563149609373017

 Epoch 2 / 2


100%|██████████| 1080/1080 [10:00<00:00,  1.80it/s]
100%|██████████| 407/407 [00:48<00:00,  8.38it/s]

Train Loss = 0.04309534241867907 Valid Loss = 0.04744180275179636





'Model trained successfully'

In [23]:
# Load the uploaded test names, then derive the gold labels and the token list for
# each entry of the "Name" column.
processed_test_df = pd.read_csv("processed_df.csv", index_col=0)
processed_test_df["tags_list"] = processed_test_df["Name"].apply(generate_labels)
processed_test_df["sentences"] = processed_test_df["Name"].apply(get_sentence_from_name)

In [24]:
# One frame per value of the "Race" column, re-indexed from zero.
processed_white_df = processed_test_df[processed_test_df["Race"] == "White"].reset_index(drop=True)
processed_black_df = processed_test_df[processed_test_df["Race"] == "Black"].reset_index(drop=True)
processed_api_df = processed_test_df[processed_test_df["Race"] == "API"].reset_index(drop=True)
processed_hispanic_df = processed_test_df[processed_test_df["Race"] == "Hispanic"].reset_index(drop=True)

# Matching {"sentences", "tags"} dictionaries consumed by the evaluation cells.
processed_test_dict_w = dict(
    sentences=list(processed_white_df["sentences"]),
    tags=list(processed_white_df["tags_list"]),
)
processed_test_dict_b = dict(
    sentences=list(processed_black_df["sentences"]),
    tags=list(processed_black_df["tags_list"]),
)
processed_test_dict_a = dict(
    sentences=list(processed_api_df["sentences"]),
    tags=list(processed_api_df["tags_list"]),
)
processed_test_dict_h = dict(
    sentences=list(processed_hispanic_df["sentences"]),
    tags=list(processed_hispanic_df["tags_list"]),
)

In [25]:
# Gold labels vs. model predictions for the "White" subset.
curated_test_labels_w = processed_test_dict_w["tags"]
curated_pred_labels_w = [
    generate_entities(sentence) for sentence in processed_test_dict_w["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels_w, curated_pred_labels_w, digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC     0.0000    0.0000    0.0000         0
         ORG     0.0000    0.0000    0.0000         0
         PER     0.9975    0.9767    0.9870      8480

   micro avg     0.9906    0.9767    0.9836      8480
   macro avg     0.3325    0.3256    0.3290      8480
weighted avg     0.9975    0.9767    0.9870      8480



In [26]:
# Gold labels vs. model predictions for the "Black" subset.
curated_test_labels_b = processed_test_dict_b["tags"]
curated_pred_labels_b = [
    generate_entities(sentence) for sentence in processed_test_dict_b["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels_b, curated_pred_labels_b, digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC     0.0000    0.0000    0.0000         0
        MISC     0.0000    0.0000    0.0000         0
         ORG     0.0000    0.0000    0.0000         0
         PER     0.9329    0.6417    0.7604      8476

   micro avg     0.7128    0.6417    0.6754      8476
   macro avg     0.2332    0.1604    0.1901      8476
weighted avg     0.9329    0.6417    0.7604      8476



In [27]:
# Gold labels vs. model predictions for the "API" subset.
curated_test_labels_a = processed_test_dict_a["tags"]
curated_pred_labels_a = [
    generate_entities(sentence) for sentence in processed_test_dict_a["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels_a, curated_pred_labels_a, digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        MISC     0.0000    0.0000    0.0000         0
         ORG     0.0000    0.0000    0.0000         0
         PER     0.8567    0.7213    0.7832      8468

   micro avg     0.8361    0.7213    0.7745      8468
   macro avg     0.2856    0.2404    0.2611      8468
weighted avg     0.8567    0.7213    0.7832      8468



In [28]:
# Gold labels vs. model predictions for the "Hispanic" subset.
curated_test_labels_h = processed_test_dict_h["tags"]
curated_pred_labels_h = [
    generate_entities(sentence) for sentence in processed_test_dict_h["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels_h, curated_pred_labels_h, digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC     0.0000    0.0000    0.0000         0
        MISC     0.0000    0.0000    0.0000         0
         ORG     0.0000    0.0000    0.0000         0
         PER     0.9711    0.8290    0.8945      8480

   micro avg     0.8634    0.8290    0.8459      8480
   macro avg     0.2428    0.2073    0.2236      8480
weighted avg     0.9711    0.8290    0.8945      8480



In [29]:
# Same evaluation over the full, unsplit test frame.
processed_test_dict = dict(
    sentences=list(processed_test_df["sentences"]),
    tags=list(processed_test_df["tags_list"]),
)
curated_test_labels = processed_test_dict["tags"]
curated_pred_labels = [
    generate_entities(sentence) for sentence in processed_test_dict["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels, curated_pred_labels, digits=4))

Exception ignored in: <function _after_fork at 0x7f7790ca47a0>
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 1375, in _after_fork
    thread._reset_internal_locks(False)
  File "/usr/lib/python3.7/threading.py", line 811, in _reset_internal_locks
    self._started._reset_internal_locks()
  File "/usr/lib/python3.7/threading.py", line 505, in _reset_internal_locks
    self._cond.__init__(Lock())
  File "/usr/lib/python3.7/threading.py", line 228, in __init__
    except AttributeError:
KeyboardInterrupt: 


KeyboardInterrupt: ignored