In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
example = """The site is located in the Peninsular Ranges
geomorphic province. The Peninsular Ranges are a
northwest-
southwest oriented complex of blocks separated by
similarly trending faults.  They extend from the
Transverse Ranges and the Los Angeles Basin south to
the Mexican border and beyond to the tip of Baja
California and are bounded on the east by the
Colorado Desert and the Gulf of California.  The
Peninsular
Ranges contain minor Jurassic and extensive
Cretaceous igneous rocks associated with the Nevadan
plutonism.  Marine Cretaceous sedimentary rocks are
well represented, and post-Cretaceous rocks form a
restricted veneer of volcanic, marine, and nonmarine
sediments."""
print(example)



The site is located in the Peninsular Ranges
geomorphic province. The Peninsular Ranges are a
northwest-
southwest oriented complex of blocks separated by
similarly trending faults.  They extend from the
Transverse Ranges and the Los Angeles Basin south to
the Mexican border and beyond to the tip of Baja
California and are bounded on the east by the
Colorado Desert and the Gulf of California.  The
Peninsular
Ranges contain minor Jurassic and extensive
Cretaceous igneous rocks associated with the Nevadan
plutonism.  Marine Cretaceous sedimentary rocks are
well represented, and post-Cretaceous rocks form a
restricted veneer of volcanic, marine, and nonmarine
sediments.


In [67]:
ner_results = nlp(example)
startEntity = False
flagged_text = example
offset = 0
startflag = "[FLAG]"
endflag = "[/FLAG]"
for index, json in enumerate(ner_results):
    if json["entity"].startswith("B"):
        if index != 0:
            end = ner_results[index-1]["end"] + offset
            print(f"Start: {start}, end: {end}")
            
            flagged_text = flagged_text[:start] + startflag + flagged_text[start:end] + endflag + flagged_text[end:]
            offset += len(startflag) + len(endflag)
            start = json["start"] + offset
            
        elif index == 0:
            start = json["start"]

Start: 27, end: 44
Start: 83, end: 100
Start: 230, end: 247
Start: 269, end: 286
Start: 313, end: 320
Start: 366, end: 381
Start: 430, end: 445
Start: 467, end: 485
Start: 505, end: 522
Start: 550, end: 558
Start: 586, end: 596
Start: 644, end: 651
Start: 677, end: 694


In [68]:
print(flagged_text)

The site is located in the [FLAG]Peninsular Ranges[/FLAG]
geomorphic province. The [FLAG]Peninsular Ranges[/FLAG] are a
northwest-
southwest oriented complex of blocks separated by
similarly trending faults.  They extend from the
[FLAG]Transverse Ranges[/FLAG] and the [FLAG]Los Angeles Basin[/FLAG] south to
the [FLAG]Mexican[/FLAG] border and beyond to the tip of [FLAG]Baja
California[/FLAG] and are bounded on the east by the
[FLAG]Colorado Desert[/FLAG] and the [FLAG]Gulf of California[/FLAG].  The
[FLAG]Peninsular
Ranges[/FLAG] contain minor [FLAG]Jurassic[/FLAG] and extensive
[FLAG]Cretaceous[/FLAG] igneous rocks associated with the [FLAG]Nevadan[/FLAG]
plutonism.  [FLAG]Marine Cretaceous[/FLAG] sedimentary rocks are
well represented, and post-Cretaceous rocks form a
restricted veneer of volcanic, marine, and nonmarine
sediments.


In [47]:
for n in ner_results:
    print(n)

{'entity': 'B-LOC', 'score': 0.9130331, 'index': 7, 'word': 'Peninsula', 'start': 27, 'end': 36}
{'entity': 'I-LOC', 'score': 0.84307855, 'index': 8, 'word': '##r', 'start': 36, 'end': 37}
{'entity': 'I-LOC', 'score': 0.9842153, 'index': 9, 'word': 'Range', 'start': 38, 'end': 43}
{'entity': 'I-LOC', 'score': 0.97417194, 'index': 10, 'word': '##s', 'start': 43, 'end': 44}
{'entity': 'B-LOC', 'score': 0.843311, 'index': 17, 'word': 'Peninsula', 'start': 70, 'end': 79}
{'entity': 'I-LOC', 'score': 0.58908993, 'index': 18, 'word': '##r', 'start': 79, 'end': 80}
{'entity': 'I-LOC', 'score': 0.96282834, 'index': 19, 'word': 'Range', 'start': 81, 'end': 86}
{'entity': 'I-LOC', 'score': 0.95143247, 'index': 20, 'word': '##s', 'start': 86, 'end': 87}
{'entity': 'B-LOC', 'score': 0.9655876, 'index': 41, 'word': 'Trans', 'start': 204, 'end': 209}
{'entity': 'I-LOC', 'score': 0.9807289, 'index': 42, 'word': '##verse', 'start': 209, 'end': 214}
{'entity': 'I-LOC', 'score': 0.9941742, 'index': 43, 

In [3]:
print("\uf0e4")


