In [1]:
from google.colab import files
import pandas as pd
import io
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import LukeTokenizer, LukeForEntitySpanClassification
import timeit

# Prompt for a file through Colab's upload widget. The result is indexed by
# file name in the next cell and its value is wrapped in io.BytesIO there.
uploaded = files.upload()


Saving processed_df.csv to processed_df (2).csv


In [8]:
# Parse the uploaded CSV straight from memory; the file's first column is
# used as the DataFrame index.
df = pd.read_csv(
    io.BytesIO(uploaded['processed_df.csv']),
    index_col=0,
)

Unnamed: 0,Name,Race
0,Scott,White
1,Kathleen,White
2,Matthew,White
3,Jeffrey,White
4,Mark,White
...,...,...
16947,mario tejeda,Hispanic
16948,mario tavarez,Hispanic
16949,mario taveras,Hispanic
16950,mario villatoro,Hispanic


In [9]:
# Load the pretrained Flair NER tagger once at module level so that
# get_flair_result can reuse the same instance on every call.
tagger = SequenceTagger.load("flair/ner-english")

def get_flair_result(input_string):
  """Run the module-level Flair tagger on ``input_string`` and render its spans.

  Returns a string made of ``str(span)`` for every span reported under the
  'ner' label, each followed by a newline. The string is empty when the
  tagger reports no spans.
  """
  tagged = Sentence(input_string)
  # predict() is called for its effect on `tagged`; its spans are read below.
  tagger.predict(tagged)
  return "".join(str(span) + "\n" for span in tagged.get_spans('ner'))


2021-10-11 16:25:49,168 loading file /root/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4


In [10]:
# Tokenizer and model must come from the same checkpoint, so name it once.
LUKE_CHECKPOINT = "studio-ousia/luke-large-finetuned-conll-2003"
tokenizer = LukeTokenizer.from_pretrained(LUKE_CHECKPOINT)
model = LukeForEntitySpanClassification.from_pretrained(LUKE_CHECKPOINT)

Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 were not used when initializing LukeForEntitySpanClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
def get_num_entities(input_text):
  """Count tagged words in a newline-separated result string.

  Each line contributes its number of space-separated pieces minus one,
  which equals the number of single spaces on that line. For a line shaped
  like "<words> <label>" this is the number of words before the label.
  Empty lines contribute zero.
  """
  return sum(line.count(" ") for line in input_text.split("\n"))

def get_true_num_entities(input_text):
  """Return how many pieces ``input_text`` splits into on single spaces.

  This is always one more than the number of space characters, so an empty
  string yields 1 and consecutive spaces each add an (empty) piece.
  """
  return input_text.count(" ") + 1

def all_person_entities(input_text):
  """Tell whether every non-empty line of ``input_text`` ends in "PER".

  Lines are separated by newlines; empty lines are ignored, so an empty
  string (no lines to check) yields True.
  """
  return all(
      line[-3:] == "PER"
      for line in input_text.split("\n")
      if len(line) > 0
  )

def get_luke_entity(input_text):
  """Label every contiguous word span of ``input_text`` with the LUKE model.

  The text is split on single spaces and every character span that starts at
  a word start and ends at the end of that word or any later word is passed
  to the module-level ``tokenizer`` and ``model`` as a candidate entity.

  Args:
    input_text: The string to tag.

  Returns:
    A string with one "<span text> <label>" line (newline-terminated) for
    each candidate span whose predicted class index is non-zero, in span
    enumeration order. Empty when no span gets a non-zero class.
  """
  # Local import: the notebook's import cell does not import torch directly.
  import torch

  # Character offsets of each space-separated word.
  word_start_positions = []
  word_end_positions = []
  cursor = 0
  for word in input_text.split(" "):
    word_start_positions.append(cursor)
    word_end_positions.append(cursor + len(word))
    cursor += len(word) + 1  # step over the single separating space

  # All (start, end) pairs covering one or more consecutive words.
  entity_spans = [
      (start_pos, end_pos)
      for i, start_pos in enumerate(word_start_positions)
      for end_pos in word_end_positions[i:]
  ]

  inputs = tokenizer(input_text, entity_spans=entity_spans, return_tensors="pt")
  # Pure inference: skip autograd bookkeeping so no graph is kept per call.
  with torch.no_grad():
    outputs = model(**inputs)

  # Flatten instead of squeeze() so a single span still yields a list
  # rather than a bare int.
  predicted_class_indices = outputs.logits.argmax(-1).reshape(-1).tolist()

  return "".join(
      input_text[span[0]:span[1]] + " " + str(model.config.id2label[predicted_class_idx]) + "\n"
      for span, predicted_class_idx in zip(entity_spans, predicted_class_indices)
      if predicted_class_idx != 0
  )

In [12]:
# Time the LUKE pass over every name.
start = timeit.default_timer()
df["luke"] = df["Name"].apply(get_luke_entity)
stop = timeit.default_timer()
print('LUKE Runtime: {} seconds'.format(stop - start))

# Time the Flair pass over the same names.
start = timeit.default_timer()
df["flair_results"] = df["Name"].apply(get_flair_result)
stop = timeit.default_timer()
print('Flair Runtime: {} seconds'.format(stop - start))

# Summary columns: tagged-word count from the LUKE output, word count of the
# raw name, and whether every LUKE line ends in "PER".
df["num_luke_entities"] = df["luke"].apply(get_num_entities)
df["true_num_entities"] = df["Name"].apply(get_true_num_entities)
df["all_person_entities"] = df["luke"].apply(all_person_entities)

LUKE Runtime: 6116.61287732 seconds
Flair Runtime: 3572.6969832800005 seconds


In [13]:
# Sentence templates; each has one `{}` placeholder to be filled via str.format.
example_sentences = [
    "I spoke with {}",
    "{} went to the store",
]

In [17]:
# df.to_csv("df_with_ner.csv")
# files.download('df_with_ner.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# df.loc[(df["all_person_entities"]!= True) | (df["true_num_entities"] != df["num_luke_entities"])]["Race"].value_counts()

In [15]:
# !pip install datasets
# from datasets import load_dataset
# dataset = load_dataset(
#    'conll2003')



# Scratch code (the cells below are fully commented out and kept for reference only)

In [None]:
# # load tagger
# tagger = SequenceTagger.load("flair/ner-english")

# # make example sentence
# sentence = Sentence("George Washington went to Washington")

# # predict NER tags
# tagger.predict(sentence)

# # print sentence
# print(sentence)

# # print predicted NER spans
# print('The following NER tags are found:')
# # iterate over entities and print
# for entity in sentence.get_spans('ner'):
#     print(entity)


In [None]:

# text = "Beyoncé lives in Los Angeles"

# # List all possible entity spans in the text
# word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
# word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
# entity_spans = []
# for i, start_pos in enumerate(word_start_positions):
#   for end_pos in word_end_positions[i:]:
#     entity_spans.append((start_pos, end_pos))

# inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
# outputs = model(**inputs)
# logits = outputs.logits
# predicted_class_indices = logits.argmax(-1).squeeze().tolist()
# for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
#   if predicted_class_idx != 0:
#     print(text[span[0]:span[1]], model.config.id2label[predicted_class_idx])
