# Library Installs and Imports

In [1]:
!pip install nerda -q
!pip install seqeval -q
!pip install flair -q

[K     |████████████████████████████████| 2.9 MB 11.0 MB/s 
[K     |████████████████████████████████| 895 kB 49.3 MB/s 
[K     |████████████████████████████████| 56 kB 5.7 MB/s 
[K     |████████████████████████████████| 596 kB 52.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 58.2 MB/s 
[?25h  Building wheel for progressbar (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 319 kB 8.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 981 kB 46.9 MB/s 
[K     |████████████████████████████████| 788 kB 49.9 MB/s 
[K     |████████████████████████████████| 64 kB 3.4 MB/s 
[K     |████████████████████████████████| 48 kB 6.5 MB/s 
[K     |███████████████████████

In [2]:
from NERDA.datasets import get_conll_data, download_conll_data 
from google.colab import files
import pandas as pd
import ast
import unicodedata

import numpy as np
import seqeval.metrics
import spacy
import torch
from tqdm import tqdm, trange
from flair.data import Sentence
from flair.models import SequenceTagger
import timeit
from sklearn.model_selection import train_test_split 

# Upload the curated test set through the Colab upload dialog. Later cells read
# "processed_df.csv" from the working directory, so it must be among the uploads.
uploaded = files.upload()

# Download the CoNLL-2003 data through NERDA and load its train/valid/test splits.
download_conll_data()
training = get_conll_data('train')
validation = get_conll_data('valid')
testing = get_conll_data('test')

Saving processed_df.csv to processed_df.csv
Saving retrain_processed.csv to retrain_processed.csv
Reading https://data.deepai.org/conll2003.zip


In [3]:
# Download the testb set of the CoNLL-2003 dataset
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb

--2021-10-24 12:22:04--  https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 748096 (731K) [text/plain]
Saving to: ‘eng.testb’


2021-10-24 12:22:05 (26.9 MB/s) - ‘eng.testb’ saved [748096/748096]



In [7]:
def generate_labels(input_text):
  """Build the gold BIO tag list for a curated test entry.

  The entry is either a bare person name ("Jane Doe") or a name followed by the
  template phrase "went to the store". One tag is produced per space-separated
  token so the result lines up with ``str(input_text).split(" ")``.

  Args:
    input_text: The entry; converted with ``str()`` first, so non-string values
      (e.g. NaN) are treated as a single-token name.

  Returns:
    A list of tags: "B-PER" for the first name token, "I-PER" for each further
    name token, and four "O" tags for the template phrase when it is present.
  """
  input_text = str(input_text)
  space_count = input_text.count(" ")

  if space_count == 0:
    return ["B-PER"]

  if "went to the store" in input_text:
    # The template contributes four trailing non-name tokens; everything before
    # it is treated as the name (assumes the phrase ends the text). There is
    # always at least one name tag, as before.
    name_length = max(1, space_count - 3)
    return ["B-PER"] + ["I-PER"] * (name_length - 1) + ["O"] * 4

  # Bare name: previously capped at two tags, which left names with three or
  # more tokens with fewer tags than tokens.
  return ["B-PER"] + ["I-PER"] * space_count

def get_sentence_from_name(input_name):
  """Return the tokens of ``input_name`` (converted with ``str()``), split on single spaces."""
  tokens = str(input_name).split(" ")
  return tokens

# Curated Test Data Manipulation

In [9]:
# Load the curated names and derive, per row, the gold tag list and the token list.
processed_test_df = pd.read_csv("processed_df.csv", index_col=0)
processed_test_df["tags_list"] = processed_test_df["Name"].apply(generate_labels)
processed_test_df["sentences"] = processed_test_df["Name"].apply(get_sentence_from_name)


def _as_eval_dict(frame):
    """Pack a frame's token lists and tag lists into the {"sentences", "tags"} dict used for evaluation."""
    return {"sentences": list(frame["sentences"]), "tags": list(frame["tags_list"])}


def _rows_for_race(race):
    """Return the rows of processed_test_df whose "Race" equals ``race``, with a fresh 0..n-1 index."""
    return processed_test_df.loc[processed_test_df["Race"] == race].reset_index(drop=True)


processed_test_dict = _as_eval_dict(processed_test_df)

# One subset per value of the "Race" column.
processed_white_df = _rows_for_race("White")
processed_black_df = _rows_for_race("Black")
processed_api_df = _rows_for_race("API")
processed_hispanic_df = _rows_for_race("Hispanic")

processed_test_dict_w = _as_eval_dict(processed_white_df)
processed_test_dict_b = _as_eval_dict(processed_black_df)
processed_test_dict_a = _as_eval_dict(processed_api_df)
processed_test_dict_h = _as_eval_dict(processed_hispanic_df)

# Code for Baseline Flair NER

In [10]:
# Load the pretrained Flair sequence tagger "flair/ner-english-large". The
# module-level `tagger` is the object generate_entities_flair_baseline() calls
# predict() on.
tagger = SequenceTagger.load("flair/ner-english-large")

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

2021-10-24 12:23:31,673 loading file /root/.flair/models/ner-english-large/07301f59bb8cb113803be316267f06ddf9243cdbba92a4c8067ef92442d2c574.554244d3476d97501a766a98078421817b14654496b86f2f7bd139dc502a4f29


Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/513 [00:00<?, ?B/s]

In [11]:
def generate_entities_flair_baseline(input_string):
    """Tag one tokenised sentence with the global Flair ``tagger`` and return BIO labels.

    The tokens are joined with single spaces and tagged. Each predicted entity's
    text is then replaced in the joined string by the literal form of its label
    list (e.g. "['B-PER','I-PER']"). Finally the string is split on spaces again:
    substituted chunks expand to their labels and every other chunk becomes "O".

    Args:
        input_string: Iterable of token strings (a token list, despite the name).

    Returns:
        Flat list of label strings ("B-<TYPE>", "I-<TYPE>" or "O").

    NOTE(review): the substitution is purely textual. ``str.replace`` hits the
    first occurrence of the entity text anywhere in the string, including inside
    another token or inside an already substituted label list, so the output is
    not guaranteed to have one label per input token. Confirm on the data used.
    """
    input_string = " ".join(input_string)
    sentence = Sentence(input_string)
    # predict NER tags
    tagger.predict(sentence)

    named_entities = sentence.to_dict(tag_type='ner')["entities"]

    entity_texts = []
    entity_label_lists = []
    for entity in named_entities:
        # Only the first five characters of the top label's string form are
        # inspected; they are collapsed to the bare type when one is recognised,
        # otherwise the five-character prefix is kept as is.
        entity_type = str(entity["labels"][0])[:5]
        for known_type in ("ORG", "MISC", "PER", "LOC"):
            if known_type in entity_type:
                entity_type = known_type

        # One "B-" label for the entity plus one "I-" label per space in its text.
        inner_space_count = entity["text"].count(" ")
        entity_texts.append(str(entity["text"]))
        entity_label_lists.append(
            ["B-" + entity_type] + ["I-" + entity_type] * inner_space_count
        )

    # Replace each entity's text (first occurrence only) with its label list,
    # written without spaces so it survives the split below as a single chunk.
    copy_string = input_string
    for entity_text, label_list in zip(entity_texts, entity_label_lists):
        copy_string = copy_string.replace(entity_text, str(label_list).replace(" ", ""), 1)

    # Drop a period glued directly onto a substituted label list.
    copy_string = copy_string.replace("'].", "']")

    entity_list = []
    for chunk in copy_string.split(" "):
        if chunk[0:4] == "['B-":
            # Cut off anything trailing the closing bracket before parsing.
            if chunk[-1] != "]":
                chunk = chunk[:chunk.index("]") + 1]
            entity_list.extend(label.strip() for label in ast.literal_eval(chunk))
        else:
            entity_list.append("O")
    return entity_list

In [12]:
def get_named_entities_flair_baseline(input_row, index):
  """Predict Flair BIO labels for one document, sentence by sentence.

  Args:
    input_row: Mapping with "words" (the document's token list) and
      "sentence_boundaries" (word indices used as sentence end points).
    index: Unused; kept so callers that pass the document position keep working.

  Returns:
    Flat list with the predicted labels of all sentences, in document order.
  """
  long_sentence_threshold = 120

  def tag_tokens(tokens):
    # Sentences with 120 or more tokens are tagged as two halves and the two
    # predictions are concatenated.
    if len(tokens) >= long_sentence_threshold:
      midpoint = len(tokens) // 2
      return (generate_entities_flair_baseline(tokens[:midpoint])
              + generate_entities_flair_baseline(tokens[midpoint:]))
    return generate_entities_flair_baseline(tokens)

  words = input_row["words"]
  total_labels = []
  start = 0

  for boundary in input_row["sentence_boundaries"]:
    # A boundary of 0 is skipped; every other boundary closes the sentence that
    # began at `start`.
    if boundary != 0:
      total_labels.extend(tag_tokens(words[start:boundary]))
      start = boundary

  # Words after the last recorded boundary (e.g. a final sentence not followed
  # by a blank line in the source file) used to be dropped, leaving fewer
  # predictions than gold labels. Tag them as one last sentence.
  if start < len(words):
    total_labels.extend(tag_tokens(words[start:]))

  return total_labels

# Cleaning of CoNLL-2003 Test Data

In [14]:
# Load the tokenizer.
# LukeTokenizer was used here without being imported in any visible cell, so a
# fresh kernel failed with NameError. It is provided by Hugging Face transformers.
from transformers import LukeTokenizer

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

def load_documents(dataset_file):
    """Parse a CoNLL-style file into one dict per document.

    Lines starting with "-DOCSTART" separate documents, blank lines separate
    sentences, and every other line is split on single spaces: the first field
    is the word and the last field is its label.

    Args:
        dataset_file: Path of the file to read.

    Returns:
        A list of dicts with keys "words", "labels" and "sentence_boundaries".
        "sentence_boundaries" holds the word count at each blank line
        (consecutive duplicates are skipped) and always ends with ``len(words)``.
    """
    documents = []
    words, labels, sentence_boundaries = [], [], []

    with open(dataset_file) as f:
        for line in f:
            line = line.rstrip()

            if line.startswith("-DOCSTART"):
                # A new document starts: store the previous one, if it has words.
                if words:
                    _close_document(documents, words, labels, sentence_boundaries)
                    words, labels, sentence_boundaries = [], [], []
                continue

            if not line:
                # Blank line = sentence break; record it once per position.
                if not sentence_boundaries or len(words) != sentence_boundaries[-1]:
                    sentence_boundaries.append(len(words))
            else:
                items = line.split(" ")
                words.append(items[0])
                labels.append(items[-1])

    _close_document(documents, words, labels, sentence_boundaries)
    return documents


def _close_document(documents, words, labels, sentence_boundaries):
    """Append the accumulated document to ``documents``; does nothing when it has no words."""
    if not words:
        return
    # If the last sentence was not followed by a blank line, no boundary was
    # recorded for it and code that walks the boundaries would lose that
    # sentence. Close it here.
    if not sentence_boundaries or sentence_boundaries[-1] != len(words):
        sentence_boundaries.append(len(words))
    documents.append(dict(
        words=words,
        labels=labels,
        sentence_boundaries=sentence_boundaries
    ))


def load_examples(documents):
    """Build one example per sentence: detokenised context text plus candidate entity spans.

    Args:
        documents: Iterable of dicts with "words" and "sentence_boundaries"
            keys, as produced by load_documents().

    Returns:
        A list of dicts, one per consecutive pair of sentence boundaries, with keys
        "text" (the sentence together with its surrounding context as one string),
        "words" (the sentence's words), "entity_spans" (character (start, end)
        offsets into "text" for each candidate span) and "original_word_spans"
        (the matching (start, end_exclusive) word indices within the sentence).

    Uses the module-level ``tokenizer`` to count subwords per word.
    """
    examples = []
    # Subword budget for the context window built around each sentence.
    max_token_length = 510
    # Upper bound used when filtering candidate spans (see the span loop below).
    max_mention_length = 30

    for document in tqdm(documents):
        words = document["words"]
        subword_lengths = [len(tokenizer.tokenize(w)) for w in words]
        total_subword_length = sum(subword_lengths)
        sentence_boundaries = document["sentence_boundaries"]

        # Each consecutive pair of boundaries delimits one sentence.
        for i in range(len(sentence_boundaries) - 1):
            sentence_start, sentence_end = sentence_boundaries[i:i+2]
            if total_subword_length <= max_token_length:
                # if the total sequence length of the document is shorter than the
                # maximum token length, we simply use all words to build the sequence
                context_start = 0
                context_end = len(words)
            else:
                # if the total sequence length is longer than the maximum length, we add
                # the surrounding words of the target sentence to the sequence until it
                # reaches the maximum length
                context_start = sentence_start
                context_end = sentence_end
                cur_length = sum(subword_lengths[context_start:context_end])
                # Each pass tries to add one word on the left, then one on the
                # right; the loop ends as soon as a word would exceed the budget.
                # NOTE(review): there is no exit when context_start == 0 and
                # context_end == len(words) at the same time (e.g. a
                # single-sentence document over the budget) — confirm this
                # cannot occur.
                while True:
                    if context_start > 0:
                        if cur_length + subword_lengths[context_start - 1] <= max_token_length:
                            cur_length += subword_lengths[context_start - 1]
                            context_start -= 1
                        else:
                            break
                    if context_end < len(words):
                        if cur_length + subword_lengths[context_end] <= max_token_length:
                            cur_length += subword_lengths[context_end]
                            context_end += 1
                        else:
                            break

            # Left context: words are joined with spaces, except that the space
            # before a word starting with "'" or a single punctuation character
            # is removed.
            text = ""
            for word in words[context_start:sentence_start]:
                if word[0] == "'" or (len(word) == 1 and is_punctuation(word)):
                    text = text.rstrip()
                text += word
                text += " "

            sentence_words = words[sentence_start:sentence_end]
            sentence_subword_lengths = subword_lengths[sentence_start:sentence_end]

            # Target sentence: same joining rule, while recording where each
            # word starts and ends inside `text`.
            word_start_char_positions = []
            word_end_char_positions = []
            for word in sentence_words:
                if word[0] == "'" or (len(word) == 1 and is_punctuation(word)):
                    text = text.rstrip()
                word_start_char_positions.append(len(text))
                text += word
                word_end_char_positions.append(len(text))
                text += " "

            # Right context, then drop the final trailing space.
            for word in words[sentence_end:context_end]:
                if word[0] == "'" or (len(word) == 1 and is_punctuation(word)):
                    text = text.rstrip()
                text += word
                text += " "
            text = text.rstrip()

            # Enumerate every (word_start, word_end) pair with word_end >= word_start
            # as a candidate span.
            # NOTE(review): the length filter sums subwords over
            # [word_start:word_end], which leaves out the span's last word —
            # confirm this is intended.
            entity_spans = []
            original_word_spans = []
            for word_start in range(len(sentence_words)):
                for word_end in range(word_start, len(sentence_words)):
                    if sum(sentence_subword_lengths[word_start:word_end]) <= max_mention_length:
                        entity_spans.append(
                            (word_start_char_positions[word_start], word_end_char_positions[word_end])
                        )
                        original_word_spans.append(
                            (word_start, word_end + 1)
                        )

            examples.append(dict(
                text=text,
                words=sentence_words,
                entity_spans=entity_spans,
                original_word_spans=original_word_spans,
            ))

    return examples


def is_punctuation(char):
    """Return True when ``char`` is an ASCII symbol/punctuation character or has a Unicode "P*" category."""
    code_point = ord(char)
    # The four ASCII ranges of non-alphanumeric printable characters.
    ascii_symbol_ranges = ((33, 47), (58, 64), (91, 96), (123, 126))
    if any(low <= code_point <= high for low, high in ascii_symbol_ranges):
        return True
    return unicodedata.category(char).startswith("P")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/33.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/877 [00:00<?, ?B/s]

In [15]:
# Parse the downloaded eng.testb file into per-document dicts, then build the
# per-sentence examples (text, words, candidate entity spans) from them.
test_documents = load_documents("eng.testb")
test_examples = load_examples(test_documents)

100%|██████████| 231/231 [00:03<00:00, 63.01it/s]


# Baseline Flair Evaluation

## CoNLL-2003 Test Data:

In [16]:
# Timed run over the CoNLL-2003 test documents: gold labels, Flair predictions, report.
start = timeit.default_timer()
test_labels_flair = [document["labels"] for document in test_documents]
pred_labels_flair = [
    get_named_entities_flair_baseline(document, position)
    for position, document in enumerate(test_documents)
]
report = seqeval.metrics.classification_report(test_labels_flair, pred_labels_flair, digits=4)
print(report)
stop = timeit.default_timer()
elapsed = stop - start
print('Flair Runtime: {} seconds'.format(elapsed))

              precision    recall  f1-score   support

         LOC     0.9402    0.8770    0.9075      1666
        MISC     0.8342    0.8759    0.8546       701
         ORG     0.8521    0.9302    0.8894      1647
         PER     0.9621    0.9669    0.9645      1602

   micro avg     0.9049    0.9181    0.9114      5616
   macro avg     0.8971    0.9125    0.9040      5616
weighted avg     0.9074    0.9181    0.9118      5616

Flair Runtime: 59.69918525400004 seconds


## Primarily White Names from Curated Test Data: 

In [17]:
# Timed run over the curated entries whose "Race" is "White".
start = timeit.default_timer()
curated_test_labels_w = processed_test_dict_w["tags"]
curated_pred_labels_w = [
    generate_entities_flair_baseline(tokens)
    for tokens in processed_test_dict_w["sentences"]
]
report = seqeval.metrics.classification_report(curated_test_labels_w, curated_pred_labels_w, digits=4)
print(report)
stop = timeit.default_timer()
elapsed = stop - start
print('Flair Runtime: {} seconds'.format(elapsed))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC     0.0000    0.0000    0.0000         0
         ORG     0.0000    0.0000    0.0000         0
         PER     0.9941    0.9709    0.9823      8480

   micro avg     0.9907    0.9709    0.9807      8480
   macro avg     0.3314    0.3236    0.3274      8480
weighted avg     0.9941    0.9709    0.9823      8480

Flair Runtime: 129.270667451 seconds


## Primarily Black / African American Names from Curated Test Data: 

In [18]:
# Timed run over the curated entries whose "Race" is "Black".
start = timeit.default_timer()
curated_test_labels_b = processed_test_dict_b["tags"]
curated_pred_labels_b = [
    generate_entities_flair_baseline(tokens)
    for tokens in processed_test_dict_b["sentences"]
]
report = seqeval.metrics.classification_report(curated_test_labels_b, curated_pred_labels_b, digits=4)
print(report)
stop = timeit.default_timer()
elapsed = stop - start
print('Flair Runtime: {} seconds'.format(elapsed))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC     0.0000    0.0000    0.0000         0
        MISC     0.0000    0.0000    0.0000         0
         ORG     0.0000    0.0000    0.0000         0
         PER     0.9770    0.7429    0.8440      8476

   micro avg     0.8718    0.7429    0.8022      8476
   macro avg     0.2443    0.1857    0.2110      8476
weighted avg     0.9770    0.7429    0.8440      8476

Flair Runtime: 128.3547469350001 seconds


## Primarily Asian or Native Hawaiian or Other Pacific Islander Names from Curated Test Data: 

In [19]:
# Timed run over the curated entries whose "Race" is "API".
start = timeit.default_timer()
curated_test_labels_a = processed_test_dict_a["tags"]
curated_pred_labels_a = [
    generate_entities_flair_baseline(tokens)
    for tokens in processed_test_dict_a["sentences"]
]
report = seqeval.metrics.classification_report(curated_test_labels_a, curated_pred_labels_a, digits=4)
print(report)
stop = timeit.default_timer()
elapsed = stop - start
print('Flair Runtime: {} seconds'.format(elapsed))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC     0.0000    0.0000    0.0000         0
         ORG     0.0000    0.0000    0.0000         0
         PER     0.9567    0.7300    0.8281      8468

   micro avg     0.9119    0.7300    0.8109      8468
   macro avg     0.3189    0.2433    0.2760      8468
weighted avg     0.9567    0.7300    0.8281      8468

Flair Runtime: 127.977466542 seconds


## Primarily Hispanic / Latino Names from Curated Test Data: 

In [20]:
# Timed run over the curated entries whose "Race" is "Hispanic".
start = timeit.default_timer()
curated_test_labels_h = processed_test_dict_h["tags"]
curated_pred_labels_h = [
    generate_entities_flair_baseline(tokens)
    for tokens in processed_test_dict_h["sentences"]
]
report = seqeval.metrics.classification_report(curated_test_labels_h, curated_pred_labels_h, digits=4)
print(report)
stop = timeit.default_timer()
elapsed = stop - start
print('Flair Runtime: {} seconds'.format(elapsed))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         LOC     0.0000    0.0000    0.0000         0
        MISC     0.0000    0.0000    0.0000         0
         ORG     0.0000    0.0000    0.0000         0
         PER     0.9955    0.9580    0.9764      8480

   micro avg     0.9815    0.9580    0.9696      8480
   macro avg     0.2489    0.2395    0.2441      8480
weighted avg     0.9955    0.9580    0.9764      8480

Flair Runtime: 128.78846559599992 seconds
