In [None]:
!pip install nerda -q
!pip install seqeval -q
!pip install nerda -q
!pip install flair -q


In [None]:
from NERDA.datasets import get_conll_data, download_conll_data 
from google.colab import files
import pandas as pd
import ast
import unicodedata

import numpy as np
import seqeval.metrics
import spacy
import torch
from tqdm import tqdm, trange
from transformers import LukeTokenizer, LukeForEntitySpanClassification
from flair.data import Sentence
from flair.models import SequenceTagger
import timeit
from sklearn.model_selection import train_test_split 

# Open the Colab upload dialog; the returned mapping of uploaded files is kept in `uploaded`.
# NOTE(review): presumably this is how processed_df.csv (read in a later cell) gets
# into the working directory -- confirm.
uploaded = files.upload()

# Fetch the CoNLL data through NERDA, then load the three splits in a fixed order.
download_conll_data()
training, validation, testing = (
    get_conll_data(split_name) for split_name in ('train', 'valid', 'test')
)

In [None]:
# Download the raw `eng.testb` file (the testb split of the CoNLL-2003 English
# corpus) from the synalp/NER GitHub repository using wget.
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb

In [None]:
# Read the curated evaluation table and derive, from its "Name" column, the gold
# tag sequence and the sentence for every row.
# NOTE(review): generate_labels and get_sentence_from_name are not defined in the
# visible part of the notebook; they must already exist when this cell runs.
processed_test_df = pd.read_csv("processed_df.csv", index_col=0)
processed_test_df["tags_list"] = processed_test_df["Name"].apply(generate_labels)
processed_test_df["sentences"] = processed_test_df["Name"].apply(get_sentence_from_name)

# Plain-list view of the two derived columns, in the shape the evaluation cells use.
processed_test_dict = {
    "sentences": list(processed_test_df["sentences"]),
    "tags": list(processed_test_df["tags_list"]),
}

In [None]:
# Load the pretrained Flair sequence tagger "flair/ner-english-large".
# `tagger` is a module-level object: generate_entities_flair_baseline reads it as a
# global, so this cell has to run before any prediction cell.
tagger = SequenceTagger.load("flair/ner-english-large")

In [None]:
def _rows_for_race(frame, race):
    """Return the rows of `frame` whose "Race" column equals `race`, re-indexed from 0."""
    return frame.loc[frame["Race"] == race].reset_index(drop=True)


def _sentences_and_tags(frame):
    """Return the "sentences" and "tags_list" columns of `frame` as plain lists in a dict."""
    return {"sentences": list(frame["sentences"]), "tags": list(frame["tags_list"])}


# One sub-frame per value of the "Race" column that is evaluated separately below.
processed_white_df = _rows_for_race(processed_test_df, "White")
processed_black_df = _rows_for_race(processed_test_df, "Black")
processed_api_df = _rows_for_race(processed_test_df, "API")
processed_hispanic_df = _rows_for_race(processed_test_df, "Hispanic")

# Matching {"sentences": [...], "tags": [...]} dictionaries for the evaluation cells.
processed_test_dict_w = _sentences_and_tags(processed_white_df)
processed_test_dict_b = _sentences_and_tags(processed_black_df)
processed_test_dict_a = _sentences_and_tags(processed_api_df)
processed_test_dict_h = _sentences_and_tags(processed_hispanic_df)

In [None]:
def _flair_label_to_type(label):
    """Reduce one Flair label (dict, label-like object or string) to an entity type."""
    if isinstance(label, dict):
        raw = str(label.get("value", ""))
    else:
        # Label-like objects expose `.value`; anything else is used via its str().
        raw = str(getattr(label, "value", label))

    # Unknown label: keep its first whitespace-separated piece (drops a trailing score).
    pieces = raw.split()
    entity_type = pieces[0] if pieces else raw

    # Same precedence as the original chain of `if`s: the last matching name wins.
    for known in ("ORG", "MISC", "PER", "LOC"):
        if known in raw:
            entity_type = known
    return entity_type


def generate_entities_flair_baseline(input_string):
    """Tag a pre-tokenised sentence with the global Flair `tagger`.

    Despite its name, `input_string` is a sequence of word tokens: the tokens are
    joined with single spaces, the joined text is tagged, and every predicted
    entity is projected back onto the tokens it overlaps.

    Args:
        input_string: sequence of word strings forming one sentence.

    Returns:
        A list with exactly one tag per input token: "B-<TYPE>" for the first
        token of an entity, "I-<TYPE>" for the following ones, "O" otherwise.
        An empty input yields an empty list.

    Notes:
        Entities are aligned through character offsets rather than by replacing
        the entity text in the joined string. Text replacement tags the wrong
        token whenever the entity text also occurs earlier in the sentence
        (e.g. entity "Wash" after the word "Washington") and can return a list
        whose length differs from the input.
    """
    tokens = list(input_string)
    if not tokens:
        return []

    text = " ".join(tokens)
    sentence = Sentence(text)
    # predict NER tags
    tagger.predict(sentence)
    named_entities = sentence.to_dict(tag_type='ner')["entities"]

    # Character span [start, end) of every token inside `text`.
    token_spans = []
    cursor = 0
    for token in tokens:
        token_spans.append((cursor, cursor + len(token)))
        cursor += len(token) + 1  # +1 for the joining space

    tags = ["O"] * len(tokens)
    search_from = 0
    for entity in named_entities:
        labels = entity.get("labels") or []
        if not labels:
            continue
        entity_type = _flair_label_to_type(labels[0])

        # assumes entity dicts carry "start_pos"/"end_pos" character offsets into
        # `text` -- TODO confirm for the installed flair version. When they are
        # missing, fall back to locating the entity text after the previous one.
        begin = entity.get("start_pos")
        end = entity.get("end_pos")
        if begin is None or end is None:
            surface = str(entity.get("text", ""))
            begin = text.find(surface, search_from) if surface else -1
            if begin < 0:
                continue
            end = begin + len(surface)
        search_from = end

        is_first = True
        for position, (token_begin, token_end) in enumerate(token_spans):
            # A token belongs to the entity when the two character ranges overlap.
            if token_begin < end and begin < token_end:
                tags[position] = ("B-" if is_first else "I-") + entity_type
                is_first = False
    return tags

In [None]:
def get_named_entities_flair_baseline(input_row, index):
    """Predict tags for one document, sentence by sentence.

    Args:
        input_row: mapping with "words" (the document's tokens) and
            "sentence_boundaries" (token offsets at which sentences end).
        index: accepted for call-site compatibility; not used here.

    Returns:
        The concatenated tag lists produced by generate_entities_flair_baseline
        for every sentence. Tokens after the last boundary are not processed.
    """
    tokens = input_row["words"]
    boundaries = input_row["sentence_boundaries"]
    document_tags = []
    previous_boundary = 0

    for boundary in boundaries:
        # A boundary of 0 does not delimit any sentence.
        if boundary == 0:
            continue

        segment = tokens[previous_boundary:boundary]
        if len(segment) < 120:
            segment_tags = generate_entities_flair_baseline(segment)
        else:
            # Sentences of 120+ tokens are tagged as two halves.
            half = len(segment) // 2
            head_tags = generate_entities_flair_baseline(segment[:half])
            tail_tags = generate_entities_flair_baseline(segment[half:])
            segment_tags = head_tags + tail_tags

        document_tags.extend(segment_tags)
        previous_boundary = boundary

    return document_tags

In [None]:
# Score the Flair baseline on `test_documents` and time the whole run
# (prediction plus building and printing the report).
# NOTE(review): test_documents is not defined in the visible part of the notebook.
start = timeit.default_timer()
test_labels_flair = [document["labels"] for document in test_documents]
pred_labels_flair = [
    get_named_entities_flair_baseline(document, position)
    for position, document in enumerate(test_documents)
]
print(seqeval.metrics.classification_report(test_labels_flair, pred_labels_flair, digits=4))
stop = timeit.default_timer()
print('Flair Runtime: {} seconds'.format(stop - start))

In [None]:
# Flair baseline on the curated sentences whose "Race" value is "White".
start = timeit.default_timer()
curated_test_labels_w = processed_test_dict_w["tags"]
curated_pred_labels_w = [
    generate_entities_flair_baseline(sentence_tokens)
    for sentence_tokens in processed_test_dict_w["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels_w, curated_pred_labels_w, digits=4))
stop = timeit.default_timer()
print('Flair Runtime: {} seconds'.format(stop - start))

In [None]:
# Flair baseline on the curated sentences whose "Race" value is "Black".
start = timeit.default_timer()
curated_test_labels_b = processed_test_dict_b["tags"]
curated_pred_labels_b = [
    generate_entities_flair_baseline(sentence_tokens)
    for sentence_tokens in processed_test_dict_b["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels_b, curated_pred_labels_b, digits=4))
stop = timeit.default_timer()
print('Flair Runtime: {} seconds'.format(stop - start))

In [None]:
# Flair baseline on the curated sentences whose "Race" value is "API".
start = timeit.default_timer()
curated_test_labels_a = processed_test_dict_a["tags"]
curated_pred_labels_a = [
    generate_entities_flair_baseline(sentence_tokens)
    for sentence_tokens in processed_test_dict_a["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels_a, curated_pred_labels_a, digits=4))
stop = timeit.default_timer()
print('Flair Runtime: {} seconds'.format(stop - start))

In [None]:
# Flair baseline on the curated sentences whose "Race" value is "Hispanic".
start = timeit.default_timer()
curated_test_labels_h = processed_test_dict_h["tags"]
curated_pred_labels_h = [
    generate_entities_flair_baseline(sentence_tokens)
    for sentence_tokens in processed_test_dict_h["sentences"]
]
print(seqeval.metrics.classification_report(curated_test_labels_h, curated_pred_labels_h, digits=4))
stop = timeit.default_timer()
print('Flair Runtime: {} seconds'.format(stop - start))