# Validation and Evaluation of the rule-based NER model

In [1]:
import spacy
from spacy import displacy
from spacy.lang.en import English

In [2]:
# reads in text file
with open('10_fulltexts.txt', 'r', encoding='utf-8') as f:
    text = f.read()

## Creating list with NER results of spacy model

In [3]:
# loads the model, nlp on text 
nlp = spacy.load('en_core_web_trf')
doc = nlp(str(text))



In [8]:
# checks whether species are found as POS PRPON 
for tok in doc:
    if tok.pos_=="PROPN":
        print(tok.text, tok.ent_type_) # write about in thesis, spacy found part of species names but doesn tag them as entity

ANI 
ANIb 
ANIm 
MUMmer 
Aliidiomarina 
Idiomarinaceae 
Alteromonadales 
Idiomarina 
Pseudidiomarina 
List 
Standing 
Nomenclature 
Pseudidiomarina 
Idiomarina 
Pseudidiomarina 
Aliidiomarina 
Pseudidiomarina 
Aliidiomarina 
Aliidiomarina 
Aliidiomarina 
Aliidiomarina 
Aliidiomarina 
Aliidiomarina 
Spirulina 
Aliidiomarina 
Aliidiomarina 
T 
Hutong LOC
Qagan LOC
Lake LOC
Ordos LOC
Inner GPE
Mongolia GPE
Autonomous GPE
Region GPE
China 
July DATE
LN 
NaCl 
Difco 
NaOH 
LN 
LN 
LN 
IM 
T 
Takara PRODUCT
JM109 
BioMed ORG
Witon ORG
Information ORG
Technology ORG
Co. ORG
Ltd. ORG
Beijing GPE
PR 
China GPE
EzBioCloud 
mega 
IM 
IM PRODUCT
T 
sanyensis 
IM 
T 
Aliidiomarina 
AIT1 
T 
GYP-17 
T 
AK5 
T 
Figs 
Pseudidiomarina 
Idiomarina 
S2 
Aliidiomarina 
Pseudidiomarina 
Idiomarina 
FastTree 
JTT+CAT 
IM PRODUCT
T 
Aliidiomarina 
IM 
T 
Biomarker ORG
Technologies ORG
Beijing GPE
PR 
China GPE
DNA 
CTAB 
Qubit 
Invitrogen ORG
Illumina ORG
NovaSeq 
SPAdes 
VJWL00000000 
GenBank 
NCBI ORG
NCBI

In [9]:
# tests, if model is able to find species labeled as PRODUCT
for i in doc.ents:
    if i.label_=='PRODUCT':
        print(i.text, i.label_)

Takara PRODUCT
IM 1326 PRODUCT
IM 1326 PRODUCT
the Genome-to-Genome Distance Calculator PRODUCT
IM 1326T PRODUCT
2216E PRODUCT
1326 PRODUCT
BX 51 PRODUCT
IM 1326 PRODUCT
IM 1326 PRODUCT
IM 1326 PRODUCT
DP3N28-2 PRODUCT
the Type (Strain) Genome Server PRODUCT
DP3N28-2 PRODUCT
HiSeq X-ten PRODUCT
Illumina PRODUCT
DP3N28-2 PRODUCT
CX43 PRODUCT
G2 F30 PRODUCT
DP3N28-2 PRODUCT
OD600 PRODUCT
DP3N28-2 PRODUCT
1.7290 PRODUCT
DP3N28-2 PRODUCT
Microbial ID system PRODUCT
Biolog GEN III MicroPlate assays, α-hydroxybutyric acid, l-arginine, l-lactic acid, β-hydroxy-d,l-butyric acid, d-fucose, acetoacetic acid and trehalose are oxidized. PRODUCT
Difco PRODUCT
TIANamp Bacteria DNA Kit PRODUCT
GenBank PRODUCT
N. pedocola JCM31011T PRODUCT
N. pedocola JCM31011 PRODUCT
N. pedocola JCM31011T PRODUCT
T. fusca DSM 15424 PRODUCT
T. fusca DSM 15424 PRODUCT
GC6890 PRODUCT
Agilent PRODUCT
18199 PRODUCT
31011 PRODUCT
15444 PRODUCT
GasPak PRODUCT
GasPak PRODUCT
5700 A PRODUCT
3370 A PRODUCT
GasPak PRODUCT


In [11]:
# creates list with found Entities labeled as PRODUCT
ner_spacy = []
for e in doc.ents:
    if e.label_=='PRODUCT':
        ner_spacy.append((e.start_char, e.end_char, e.text))

## Creating list with NER results of scispacy model

In [29]:
nlp_sci = spacy.load('en_ner_bionlp13cg_md')
doc_sci = nlp_sci(str(text))

In [45]:
# creates list with found Entities labeled as ORGANISM
ner_scispacy = []
for e in doc_sci.ents:
    if e.label_=='ORGANISM':
        ner_scispacy.append((e.start_char, e.end_char, e.text))

In [35]:
ner_scispacy

[(2354, 2359, 'brine'),
 (2768, 2780, 'fish peptone'),
 (3905, 3927, 'Escherichia coli JM109'),
 (16240, 16261, 'reagent sulfuric acid'),
 (24231, 24233, 'MA'),
 (25211, 25218, 'MP tree'),
 (27903, 27905, 'MA'),
 (60853, 60876, 'Streptococcus pharyngis'),
 (64581, 64614, 'Pseudomonas nosocomialis sp. nov.'),
 (71141, 71152, 'Holdeman (7'),
 (71373, 71380, 'Tissier'),
 (71543, 71551, 'Holdeman'),
 (72191, 72201, '2-week-old'),
 (72949, 72961, 'beef extract'),
 (73429, 73441, 'beef extract'),
 (73635, 73646, 'horse blood'),
 (73794, 73802, 'Holdeman'),
 (75139, 75152, 'Kovac reagent'),
 (76184, 76194, 'PVA B3/ 63'),
 (77585, 77596, 'horse blood'),
 (78898, 78909, 'Holdeman (5'),
 (79509, 79516, 'Tissier')]

In [36]:
# figures out, which spans are found, if label is ORGANISM
ner_scispacy = []
for e in doc_sci.ents:
    if e.label_=='ORGANISM':
        print(e.text,"-->",e.label_,"\n") #keep it for appendix, scispacy doesnt recognize species names and ORGANISM

brine --> ORGANISM 

fish peptone --> ORGANISM 

Escherichia coli JM109 --> ORGANISM 

reagent sulfuric acid --> ORGANISM 

MA --> ORGANISM 

MP tree --> ORGANISM 

MA --> ORGANISM 

Streptococcus pharyngis --> ORGANISM 

Pseudomonas nosocomialis sp. nov. --> ORGANISM 

Holdeman (7 --> ORGANISM 

Tissier --> ORGANISM 

Holdeman --> ORGANISM 

2-week-old --> ORGANISM 

beef extract --> ORGANISM 

beef extract --> ORGANISM 

horse blood --> ORGANISM 

Holdeman --> ORGANISM 

Kovac reagent --> ORGANISM 

PVA B3/ 63 --> ORGANISM 

horse blood --> ORGANISM 

Holdeman (5 --> ORGANISM 

Tissier --> ORGANISM 



## Creating list with NER results of custom model

In [15]:
# creates pipeline and adds custom entity ruler for species
nlp_custom = English()
ruler = nlp_custom.add_pipe("entity_ruler")
ruler.from_disk("./entity_ruler_species") 

doc_custom = nlp_custom(text)

In [16]:
# creates list with found entities labeled as species
ner_custom = []
for e in doc_custom.ents:
    if e.label_=='SPECIES':
        ner_custom.append((e.start_char, e.end_char, e.text))

In [19]:
# saves pipeline to disk
nlp_custom.to_disk('rule-based_NER_model')

## Creating list with NER annotations to test against

In [46]:
# copied and pasted start char, end char and labels of all entities from '10_fulltexts_annotation.jsonl' 
list_annotated_species = [[1092,1117,"SPECIES"],[1119,1151,"SPECIES"],[1153,1176,"SPECIES"],[1178,1198,"SPECIES"],[1200,1223,"SPECIES"],[1225,1248,"SPECIES"],[1250,1268,"SPECIES"],[1270,1289,"SPECIES"],[1291,1312,"SPECIES"],[1317,1340,"SPECIES"],[1461,1480,"SPECIES"],[3905,3921,"SPECIES"],[4742,4754,"SPECIES"],[4756,4768,"SPECIES"],[4770,4782,"SPECIES"],[4784,4805,"SPECIES"],[4807,4817,"SPECIES"],[4819,4828,"SPECIES"],[4833,4847,"SPECIES"],[5090,5104,"SPECIES"],[5112,5124,"SPECIES"],[5138,5159,"SPECIES"],[8052,8066,"SPECIES"],[8447,8459,"SPECIES"],[9002,9014,"SPECIES"],[9562,9576,"SPECIES"],[9681,9702,"SPECIES"],[9918,9932,"SPECIES"],[9952,9973,"SPECIES"],[14975,14989,"SPECIES"],[15009,15030,"SPECIES"],[15419,15433,"SPECIES"],[15507,15528,"SPECIES"],[17446,17483,"SPECIES"],[17497,17525,"SPECIES"],[20179,20193,"SPECIES"],[20213,20234,"SPECIES"],[20250,20262,"SPECIES"],[20275,20287,"SPECIES"],[20297,20306,"SPECIES"],[20318,20330,"SPECIES"],[20342,20349,"SPECIES"],[20364,20372,"SPECIES"],[20387,20397,"SPECIES"],[20410,20422,"SPECIES"],[20732,20746,"SPECIES"],[20767,20788,"SPECIES"],[20965,20977,"SPECIES"],[20987,20999,"SPECIES"],[21006,21015,"SPECIES"],[21024,21036,"SPECIES"],[21045,21052,"SPECIES"],[21064,21072,"SPECIES"],[21083,21093,"SPECIES"],[21106,21118,"SPECIES"],[21358,21372,"SPECIES"],[21392,21413,"SPECIES"],[21429,21441,"SPECIES"],[21466,21475,"SPECIES"],[21487,21499,"SPECIES"],[21511,21518,"SPECIES"],[21533,21541,"SPECIES"],[21556,21566,"SPECIES"],[21579,21591,"SPECIES"],[21962,21977,"SPECIES"],[22261,22276,"SPECIES"],[22278,22301,"SPECIES"],[22303,22323,"SPECIES"],[22325,22344,"SPECIES"],[22349,22374,"SPECIES"],[24085,24092,"SPECIES"],[24323,24330,"SPECIES"],[25720,25735,"SPECIES"],[25756,25780,"SPECIES"],[25799,25826,"SPECIES"],[25844,25863,"SPECIES"],[25880,25901,"SPECIES"],[25923,25945,"SPECIES"],[26058,26065,"SPECIES"],[26351,26358,"SPECIES"],[27207,27214,"SPECIES"],[28406,28413,"SPECIES"],[30086,30093,"SPECIES"],[30385,30392,"SPECIES"],[30810,30817,"SPECIES"],[30890,30897,"SPECIES"],[30992,30999,"SPECIES"],[31155,31162,"SPECIES"],[31378,31385,"SPECIES"],[31789,31796,"SPECIES"],[32778,32785,"SPECIES"],[32925,32932,"SPECIES"],[33200,33207,"SPECIES"],[33330,33337,"SPECIES"],[33536,33565,"SPECIES"],[33579,33599,"SPECIES"],[36578,36592,"SPECIES"],[36607,36614,"SPECIES"],[36913,36928,"SPECIES"],[36954,36961,"SPECIES"],[38408,38423,"SPECIES"],[38430,38451,"SPECIES"],[40670,40687,"SPECIES"],[40706,40727,"SPECIES"],[40748,40767,"SPECIES"],[40789,40808,"SPECIES"],[40826,40839,"SPECIES"],[40860,40877,"SPECIES"],[40894,40912,"SPECIES"],[40933,40950,"SPECIES"],[40968,40989,"SPECIES"],[41014,41040,"SPECIES"],[41058,41073,"SPECIES"],[41096,41116,"SPECIES"],[41142,41163,"SPECIES"],[41198,41222,"SPECIES"],[41453,41464,"SPECIES"],[41604,41624,"SPECIES"],[41642,41659,"SPECIES"],[41682,41700,"SPECIES"],[41723,41743,"SPECIES"],[41762,41785,"SPECIES"],[41806,41827,"SPECIES"],[41855,41881,"SPECIES"],[44000,44011,"SPECIES"],[44115,44126,"SPECIES"],[44201,44212,"SPECIES"],[44390,44401,"SPECIES"],[44490,44501,"SPECIES"],[44558,44569,"SPECIES"],[45431,45442,"SPECIES"],[45537,45548,"SPECIES"],[45648,45659,"SPECIES"],[45866,45877,"SPECIES"],[47087,47095,"SPECIES"],[47617,47625,"SPECIES"],[47716,47724,"SPECIES"],[47739,47750,"SPECIES"],[47893,47901,"SPECIES"],[47917,47928,"SPECIES"],[51639,51669,"SPECIES"],[51674,51707,"SPECIES"],[51722,51743,"SPECIES"],[54880,54904,"SPECIES"],[58487,58498,"SPECIES"],[58514,58525,"SPECIES"],[58541,58554,"SPECIES"],[58570,58585,"SPECIES"],[58601,58621,"SPECIES"],[58637,58651,"SPECIES"],[58668,58680,"SPECIES"],[58696,58703,"SPECIES"],[58721,58730,"SPECIES"],[58749,58764,"SPECIES"],[58782,58797,"SPECIES"],[58815,58826,"SPECIES"],[58842,58855,"SPECIES"],[58952,58963,"SPECIES"],[59218,59226,"SPECIES"],[59242,59253,"SPECIES"],[59270,59281,"SPECIES"],[59298,59307,"SPECIES"],[59323,59337,"SPECIES"],[59353,59365,"SPECIES"],[59382,59399,"SPECIES"],[59499,59507,"SPECIES"],[59741,59752,"SPECIES"],[59768,59779,"SPECIES"],[59795,59808,"SPECIES"],[59824,59839,"SPECIES"],[59855,59875,"SPECIES"],[59891,59905,"SPECIES"],[59922,59934,"SPECIES"],[59950,59957,"SPECIES"],[59975,59984,"SPECIES"],[60003,60018,"SPECIES"],[60036,60051,"SPECIES"],[60069,60080,"SPECIES"],[60096,60109,"SPECIES"],[60190,60201,"SPECIES"],[60436,60444,"SPECIES"],[60460,60471,"SPECIES"],[60488,60499,"SPECIES"],[60516,60525,"SPECIES"],[60541,60555,"SPECIES"],[60571,60583,"SPECIES"],[60600,60617,"SPECIES"],[60691,60699,"SPECIES"],[60853,60876,"SPECIES"],[61067,61110,"SPECIES"],[61157,61197,"SPECIES"],[61258,61288,"SPECIES"],[63544,63564,"SPECIES"],[63967,63990,"SPECIES"],[64581,64614,"SPECIES"],[70722,70757,"SPECIES"],[71002,71015,"SPECIES"],[71209,71222,"SPECIES"],[71498,71522,"SPECIES"],[80122,80135,"SPECIES"],[80328,80341,"SPECIES"],[81110,81146,"SPECIES"],[81154,81167,"SPECIES"]]

In [47]:
# pick only the start and end chars from list to compare with models
annotated_species = [tuple(x[0:2]) for x in list_annotated_species]

In [48]:
len(annotated_species)

203

## Evaluate the models

In [63]:
# counts true positives and false positives
def ner_tp_fp_counter(ner_results):
    
    # calcutaes True Positives
    counter_tp = 0
    for ent in ner_results:
        if ent[0:2] in annotated_species:
            counter_tp = counter_tp + 1
    
    # calcutaes False Positives
    counter_fp = 0
    for ent in ner_results:
        if ent[0:2] not in annotated_species:
            counter_fp = counter_fp + 1
            
     
    return counter_tp, counter_fp

In [64]:
# calculates recision, recall and f1-score
def calculate_pre_rec_f1(ner_results):
    tp, fp = ner_tp_fp_counter(ner_results)
    tn = 0
    fn = 0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1score = 2 * precision * recall / (precision + recall)
    
    return precision, recall, f1score

In [80]:
ner_tp_fp_counter(ner_spacy)

(0, 44)

In [81]:
ner_tp_fp_counter(ner_scispacy)

(2, 20)

In [82]:
ner_tp_fp_counter(ner_custom)

(57, 3)

In [83]:
calculate_pre_rec_f1(ner_custom) # mention precision only in thesis because of tn and fn = 0

(0.95, 1.0, 0.9743589743589743)

In [84]:
calculate_pre_rec_f1(ner_scispacy)

(0.09090909090909091, 1.0, 0.16666666666666669)

In [85]:
calculate_pre_rec_f1(ner_spacy)

ZeroDivisionError: division by zero

In [73]:
# check which Fales Positives was found by custom entity ruler
for ent in ner_custom:
    if ent[0:2] not in annotated_species:
        print(ent)

(67554, 67569, 'Candidatus List')
(67688, 67703, 'Candidatus List')
(81347, 81371, 'Fusobacterium perfoetens')
