In [1]:
!pip install flair -q
!pip install transformers -q

In [2]:
from google.colab import files
import pandas as pd
import io
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import LukeTokenizer, LukeForEntitySpanClassification
import timeit

uploaded = files.upload()


Saving processed_df.csv to processed_df (1).csv


In [3]:
df = pd.read_csv(io.BytesIO(uploaded['processed_df.csv']), index_col=0)

In [4]:
# load tagger
tagger = SequenceTagger.load("flair/ner-english-large")

2021-10-18 18:32:38,920 loading file /root/.flair/models/ner-english-large/07301f59bb8cb113803be316267f06ddf9243cdbba92a4c8067ef92442d2c574.554244d3476d97501a766a98078421817b14654496b86f2f7bd139dc502a4f29


In [42]:

def get_flair_result(input_string):
  # make example sentence
  sentence = Sentence(input_string)

  # predict NER tags
  tagger.predict(sentence)
  total_string = ""
  for entity in sentence.get_spans('ner'):
      total_string += str(entity)
      total_string += "\n"
  return total_string


In [6]:
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 were not used when initializing LukeForEntitySpanClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def get_num_entities(input_text):
  split_text = input_text.split("\n")
  n_entities = 0
  for i in split_text:
    split_i = i.split(" ")
    n_entities += (len(split_i) - 1)
  return n_entities

def get_true_num_entities(input_text):
  return len((input_text).split(" "))

def all_person_entities(input_text):
  split_text = input_text.split("\n")
  for i in split_text:
    if len(i) > 0 and i[-3:] != "PER":
      return False

  return True

def get_luke_entity(input_text):
  split_text = input_text.split(" ")
  word_start_positions = [0]
  word_end_positions = [len(split_text[0])]
  for i in split_text[1:]:
    word_start_positions.append(word_end_positions[-1] + 1)
    word_end_positions.append(len(i) + word_start_positions[-1])

  entity_spans = []
  for i, start_pos in enumerate(word_start_positions):
    for end_pos in word_end_positions[i:]:
      entity_spans.append((start_pos, end_pos))

  inputs = tokenizer(input_text, entity_spans=entity_spans, return_tensors="pt")
  outputs = model(**inputs)
  logits = outputs.logits
  predicted_class_indices = logits.argmax(-1).squeeze().tolist()
  if type(predicted_class_indices) == int:
    predicted_class_indices = [predicted_class_indices]
  total_string = ""
  for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
    if predicted_class_idx != 0:
      total_string += (input_text[span[0]:span[1]] + " " + str(model.config.id2label[predicted_class_idx]) + "\n")
  return total_string

In [8]:
test_df = df[-100:]
start = timeit.default_timer()
test_df["luke"] = test_df["Name"].apply(lambda x: get_luke_entity(x))
stop = timeit.default_timer()
print('LUKE Runtime: {} seconds'.format(stop - start))

start = timeit.default_timer()
test_df["flair_results"] = test_df["Name"].apply(lambda x: get_flair_result(x))
stop = timeit.default_timer()
print('Flair Runtime: {} seconds'.format(stop - start))

test_df["num_luke_entities"] = test_df["luke"].apply(lambda x: get_num_entities(x))
test_df["true_num_entities"] = test_df["Name"].apply(lambda x: get_true_num_entities(x))
test_df["all_person_entities"] = test_df["luke"].apply(lambda x: all_person_entities(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


LUKE Runtime: 66.99858215800009 seconds
{'text': 'mario lucatero', 'start_pos': 0, 'end_pos': 14, 'labels': [PER (1.0)]}
{'text': 'mario escutia', 'start_pos': 0, 'end_pos': 13, 'labels': [PER (1.0)]}
{'text': 'mario bahena', 'start_pos': 0, 'end_pos': 12, 'labels': [PER (1.0)]}
{'text': 'mario uriostegui', 'start_pos': 0, 'end_pos': 16, 'labels': [PER (1.0)]}
{'text': 'mario ruvalcaba', 'start_pos': 0, 'end_pos': 15, 'labels': [PER (1.0)]}
{'text': 'mario vences', 'start_pos': 0, 'end_pos': 12, 'labels': [PER (1.0)]}
{'text': 'mario plascencia', 'start_pos': 0, 'end_pos': 16, 'labels': [PER (1.0)]}
{'text': 'mario jimenes', 'start_pos': 0, 'end_pos': 13, 'labels': [PER (1.0)]}
{'text': 'mario tinajero', 'start_pos': 0, 'end_pos': 14, 'labels': [PER (1.0)]}
{'text': 'mario chairez', 'start_pos': 0, 'end_pos': 13, 'labels': [PER (1.0)]}
{'text': 'mario guadarrama', 'start_pos': 0, 'end_pos': 16, 'labels': [PER (1.0)]}
{'text': 'mario buenrostro', 'start_pos': 0, 'end_pos': 16, 'labels':

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http

['O',
 'B-PER',
 'E-PER',
 'O',
 'O',
 'O',
 'O',
 'S-PER',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'E-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'S-PER']

In [49]:
def get_flair_entities(input_string):
  sentence = Sentence(input_string)

  # predict NER tags
  tagger.predict(sentence)
  sentence_length = len(sentence)
  values = ["O"] * len(sentence)
  total_string = ""
  tagged_string = sentence.to_tagged_string()
  true_index = 0
  count_entities = 0
  for word in (tagged_string.split(" ")): 
    if word[0] == "<" and word[-1] == ">":
      entity_type = word[1:-1]
      if entity_type =="S-PER":
        entity_type = "B-PER"
      if entity_type =="E-PER":
        entity_type = "I-PER"

      values[true_index-1-count_entities] = entity_type
      count_entities += 1
    true_index += 1
  return values


In [51]:
get_flair_entities("hello George Johnson. I ran into Steve yesterday. Steven Minninger Peletz was smart. thanks again")

['O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O']

In [10]:
test_df

Unnamed: 0,Name,Race,luke,flair_results,num_luke_entities,true_num_entities,all_person_entities
25328,mario lucatero went to the store.,Hispanic,,"Span [1,2]: ""mario lucatero"" [− Labels: PER ...",0,6,True
25329,mario escutia went to the store.,Hispanic,,"Span [1,2]: ""mario escutia"" [− Labels: PER (...",0,6,True
25330,mario bahena went to the store.,Hispanic,,"Span [1,2]: ""mario bahena"" [− Labels: PER (1...",0,6,True
25331,mario uriostegui went to the store.,Hispanic,,"Span [1,2]: ""mario uriostegui"" [− Labels: PE...",0,6,True
25332,mario ruvalcaba went to the store.,Hispanic,,"Span [1,2]: ""mario ruvalcaba"" [− Labels: PER...",0,6,True
...,...,...,...,...,...,...,...
25423,mario tejeda went to the store.,Hispanic,,"Span [1,2]: ""mario tejeda"" [− Labels: PER (1...",0,6,True
25424,mario tavarez went to the store.,Hispanic,,"Span [1,2]: ""mario tavarez"" [− Labels: PER (...",0,6,True
25425,mario taveras went to the store.,Hispanic,,"Span [1,2]: ""mario taveras"" [− Labels: PER (...",0,6,True
25426,mario villatoro went to the store.,Hispanic,,"Span [1,2]: ""mario villatoro"" [− Labels: PER...",0,6,True


In [11]:
# start = timeit.default_timer()
# df["luke"] = df["Name"].apply(lambda x: get_luke_entity(x))
# stop = timeit.default_timer()
# print('LUKE Runtime: {} seconds'.format(stop - start))

# start = timeit.default_timer()
# df["flair_results"] = df["Name"].apply(lambda x: get_flair_result(x))
# stop = timeit.default_timer()
# print('Flair Runtime: {} seconds'.format(stop - start))

# df["num_luke_entities"] = df["luke"].apply(lambda x: get_num_entities(x))
# df["true_num_entities"] = df["Name"].apply(lambda x: get_true_num_entities(x))
# df["all_person_entities"] = df["luke"].apply(lambda x: all_person_entities(x))

In [12]:
example_sentences = ["I spoke with {}", "{} went to the store"]

In [13]:
# df.to_csv("df_with_ner.csv")
# files.download('df_with_ner.csv')


In [14]:
# df.loc[(df["all_person_entities"]!= True) | (df["true_num_entities"] != df["num_luke_entities"])]["Race"].value_counts()

In [15]:
# !pip install datasets
# from datasets import load_dataset
# dataset = load_dataset(
#    'conll2003')



# Scratch Code: 

In [16]:
# # load tagger
# tagger = SequenceTagger.load("flair/ner-english")

# # make example sentence
# sentence = Sentence("George Washington went to Washington")

# # predict NER tags
# tagger.predict(sentence)

# # print sentence
# print(sentence)

# # print predicted NER spans
# print('The following NER tags are found:')
# # iterate over entities and print
# for entity in sentence.get_spans('ner'):
#     print(entity)


In [17]:

# text = "Beyoncé lives in Los Angeles"

# # List all possible entity spans in the text
# word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
# word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
# entity_spans = []
# for i, start_pos in enumerate(word_start_positions):
#   for end_pos in word_end_positions[i:]:
#     entity_spans.append((start_pos, end_pos))

# inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
# outputs = model(**inputs)
# logits = outputs.logits
# predicted_class_indices = logits.argmax(-1).squeeze().tolist()
# for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
#   if predicted_class_idx != 0:
#     print(text[span[0]:span[1]], model.config.id2label[predicted_class_idx])
