In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
import json
import jsonlines

In [2]:
NER = ["EFFECT"]
set = "test"

In [3]:
l = []
for ner in NER:
	test_sentences = pd.read_json(f"../NER/{ner}/test.json", lines=True)
	test_output = pd.read_json(f"../NER_output/{ner}/test_outputs.json")
	merged = pd.merge(test_sentences, test_output, left_index=True, right_index=True)
	merged.loc[:,"ner"] = ner
	l.append(merged)

df = pd.concat(l)

In [4]:
df = df.reset_index().drop(columns=["index"])

In [5]:
df

Unnamed: 0,tokens,ner_tags,id,predictions,label_ids,word_ids,ner
0,"[V., cholerae, V52, was, previously, found, to...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,"[[-2.378746032714843, -2.319230318069458, 5.14...","[-100, 2, -100, 2, 2, -100, 2, 2, 2, 2, 2, 2, ...","[None, 0, 0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10,...",EFFECT
1,"[In, order, to, identify, novel, endophytic, a...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,"[[-2.545095205307007, -2.5518798828125, 5.3851...","[-100, 2, 2, 2, 2, 2, 2, -100, -100, 2, -100, ...","[None, 0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, ...",EFFECT
2,"[PCR, targeting, the, partial, groEL, gene, fr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",2,"[[-2.704145431518554, -2.229583501815796, 4.97...","[-100, 2, 2, 2, 2, 2, -100, 2, 2, 2, 2, -100, ...","[None, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 8, 9, ...",EFFECT
3,"[Here, we, present, ,, for, the, first, time, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",3,"[[-2.3269846439361572, -1.946883916854858, 4.6...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",EFFECT
4,"[P., aeruginosa, MPAO1, was, grown, planktonic...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",4,"[[-2.251236915588379, -2.402707815170288, 4.99...","[-100, 2, -100, 2, 2, -100, 2, 2, 2, -100, 2, ...","[None, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, ...",EFFECT
...,...,...,...,...,...,...,...
619,"[Calderihabitans, maritimus, KKC1, is, a, ther...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",619,"[[-2.157610416412353, -1.377763867378234, 3.79...","[-100, 2, -100, -100, -100, -100, 2, -100, -10...","[None, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 4, ...",EFFECT
620,"[The, side, chains, of, the, residues, Y60, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",620,"[[-2.486406564712524, -2.085476398468017, 4.81...","[-100, 2, 2, 2, 2, 2, 2, 2, -100, 2, 2, -100, ...","[None, 0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10,...",EFFECT
621,"[Phylogenetic, analysis, based, on, 16S, rRNA,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",621,"[[-1.891680121421814, -0.525259137153625, 2.85...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -100, -10...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 10,...",EFFECT
622,"[Enterobacter, group, on, the, other, hand, wa...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",622,"[[-2.501919031143188, -2.979371786117553, 6.06...","[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",EFFECT


In [6]:
label_ids = df[["label_ids"]]

In [7]:
annotations = []
for j in range(len(df)):
	for i in zip(df.loc[j,"tokens"],df.loc[j,"ner_tags"]):
		if i[1] != "O":
			annotations.append({"label":i[1],"text":i[0]})

### Predict sentences again

In [8]:
ner = "EFFECT"

In [9]:
path= f"../NER_output/{ner}"

tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=512)
model = AutoModelForTokenClassification.from_pretrained(path)

nlp = pipeline(task='ner', model=model, tokenizer=tokenizer,
               aggregation_strategy="average", device=0)


In [10]:
# Specify the path to the JSONL file
jsonl_file = f"../NER/{ner}/test.jsonls"

predictions = []
# Open the JSONL file
with jsonlines.open(jsonl_file) as reader:
    # Iterate over each line in the file
    for line in reader:
        for l in line:
            # Get the value from the "text" field
            text = l["data"]["text"]
            # Pass the value to the nlp() function
            print(text)
            predict = nlp(text)
            predictions.append(predict)
            print(predict)

V. cholerae V52 was previously found to be highly virulent against several Gram-negative bacteria, including E. coli and Salmonella enterica serovar Typhimurium, due to the type VI secretion system (T6SS) ( 54 ).
[]
In order to identify novel endophytic actinobacteria, SUK 42 was isolated from the internal tissue of stem from the plant, Antidesma neurocarpum Miq.
[]
PCR targeting the partial groEL gene fragment of HKU75 ^T yielded DNA products with lengths of approximately 600 bp in DNA samples extracted from the two oral swabs of the guinea pigs.
[]
Here we present, for the first time, a detailed developmental expression pattern of ecdysteroid biosynthesis and ecdysone response genes, highlighting the major differences in gene expression between male and female development in the mealybug Planococcus kraunhiae.
[]
P. aeruginosa MPAO1 was grown planktonically in SCFM1 shaking at 250 rpm at 37 °C overnight.
[]
An individual colony of SBW25 on an M9 agar plate was inoculated into 10 ml o



[]
The moisture content was measured to be 1.8%, and they were oven-dried at 110 °C for six hours then ground to small size before calcining.
[]
We analysed genetic diversity within coding sequences in all four Hymenoscyphus species.
[]
It was shown that this Thauera strain was able to degrade toluene under both aerobic and anaerobic conditions, and in the presence of oxygen it used a toluene-dioxygenase (tod) enzyme for initial activation of the aromatic ring (Shinoda et al.
[{'entity_group': 'I', 'score': 0.6687876, 'word': 'activation of the aromatic ring', 'start': 196, 'end': 227}]
However, that UMA51804 was isolated from the same marmoset host adds a measure of uncertainty to this hypothesis.
[]
One of the metabolites that are overabundant in HT115 compared with OP50 is GABA.
[]
The status of the tumor-suppressor gene, p53, is null in PC3 cells but mutated in DU145 cells, and contributes to differential cell fates [33-35].
[{'entity_group': 'I', 'score': 0.7474953, 'word': 'cell 

In [11]:
predictions = [p for p in predictions if p]

In [12]:
predictions

[[{'entity_group': 'I',
   'score': 0.66726655,
   'word': 'regulatory signals',
   'start': 127,
   'end': 145},
  {'entity_group': 'I',
   'score': 0.89935106,
   'word': 'maintenance of healthy gastrointestinal microbiota, protecting gut mucosal barrier function,',
   'start': 158,
   'end': 250},
  {'entity_group': 'I',
   'score': 0.90674835,
   'word': 'prevention of colitis',
   'start': 255,
   'end': 276}],
 [{'entity_group': 'I',
   'score': 0.6687876,
   'word': 'activation of the aromatic ring',
   'start': 196,
   'end': 227}],
 [{'entity_group': 'I',
   'score': 0.7474953,
   'word': 'cell fates',
   'start': 127,
   'end': 137}],
 [{'entity_group': 'I',
   'score': 0.6077973,
   'word': 'expression',
   'start': 128,
   'end': 138}],
 [{'entity_group': 'I',
   'score': 0.5642765,
   'word': 'plant growth - promoting attributes',
   'start': 15,
   'end': 48},
  {'entity_group': 'I',
   'score': 0.64026797,
   'word': 'nitrogen fixation',
   'start': 86,
   'end': 103},
 

In [13]:
annotations

[{'label': 'B', 'text': 'biosynthesis'},
 {'label': 'B', 'text': 'response'},
 {'label': 'B', 'text': 'initiates'},
 {'label': 'I', 'text': 'regulatory'},
 {'label': 'I', 'text': 'signals'},
 {'label': 'B', 'text': 'healthy'},
 {'label': 'I', 'text': 'gastrointestinal'},
 {'label': 'I', 'text': 'microbiota'},
 {'label': 'B', 'text': 'gut'},
 {'label': 'I', 'text': 'mucosal'},
 {'label': 'I', 'text': 'barrier'},
 {'label': 'I', 'text': 'function'},
 {'label': 'B', 'text': 'N'},
 {'label': 'I', 'text': 'sufficiency'},
 {'label': 'B', 'text': 'tumor-suppressor'},
 {'label': 'B', 'text': 'toxic'},
 {'label': 'B', 'text': 'negative'},
 {'label': 'I', 'text': 'phototaxis'},
 {'label': 'I', 'text': 'to'},
 {'label': 'I', 'text': 'UV'},
 {'label': 'I', 'text': 'light'},
 {'label': 'B', 'text': 'plant'},
 {'label': 'I', 'text': 'growth-promoting'},
 {'label': 'B', 'text': 'rate'},
 {'label': 'I', 'text': 'of'},
 {'label': 'I', 'text': 'adsorption'},
 {'label': 'B', 'text': 'preventing'},
 {'lab

In [14]:
for prediction in predictions:
    pred = prediction[0]["word"]
    count = 0
    for annotation in annotations:
        ann = annotation["text"]
        if ann in pred:
            count +=1
    if count <2:
        print(pred)

smoky
lipid
promotion
copper
spore
probiotic
colonize
promiscuity
proteolysis
deficiency
- specific


In [15]:
ners =["COMPOUND", "DISEASE", "EFFECT","ISOLATE","MEDIUM","METABOLITE","ORGANISM",
            "PHENOTYPE","PROTEIN","SPECIES","STRAIN"]
for ner in ners:
    test_sentences = pd.read_json(f"../NER/{ner}/test.json", lines=True)
    test_output = pd.read_json(f"../NER_output/{ner}/test_outputs.json")
    merged = pd.merge(test_sentences, test_output, left_index=True, right_index=True)
    merged.loc[:,"ner"] = ner
    label_ids = merged[["label_ids"]]

    annotations = []
    for j in range(len(merged)):
    	for i in zip(merged.loc[j,"tokens"],merged.loc[j,"ner_tags"]):
    		if i[1] != "O":
    			annotations.append({"label":i[1],"text":i[0]})
    
    path= f"../NER_output/{ner}"
    
    tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=512)
    model = AutoModelForTokenClassification.from_pretrained(path)
    
    nlp = pipeline(task='ner', model=model, tokenizer=tokenizer,
                   aggregation_strategy="average", device=0)

    # Specify the path to the JSONL file
    jsonl_file = f"../NER/{ner}/test.jsonls"
    
    predictions = []
    # Open the JSONL file
    with jsonlines.open(jsonl_file) as reader:
        # Iterate over each line in the file
        for line in reader:
            for l in line:
    
                # Get the value from the "text" field
                text = l["data"]["text"]
                # Pass the value to the nlp() function
                predict = nlp(text)
                predictions.append(predict)
    predictions = [p for p in predictions if p]
    print("-----")
    print(ner)
    print("-----")
    for prediction in predictions:
        pred = prediction[0]["word"]
        pred.replace(" - ","-")
        count = 0
        for annotation in annotations:
            ann = annotation["text"].lower()
            if ann in pred:
                count +=1
        if count <1:
            print(pred)



-----
COMPOUND
-----
n
n
methyl
mn
mn
mg
dye
dh
v
1
h
h
h




-----
DISEASE
-----
anthrax
melanoma
visceral
urinary
diarrhea
sti
malaria
cystic
obese
snl
musculoskeletal
blast
global
illnesses
small
dengue




-----
EFFECT
-----
smoky




-----
ISOLATE
-----
diffuse
vermisources




-----
MEDIUM
-----
mineral
m
congo
luria - bertani
spent
gpy
alpha
luria - bertani
^




-----
METABOLITE
-----
rs
3
α
α
α
3
α




-----
ORGANISM
-----
f
b
s
h
pepper
b
te
m
b
sk
hek
ribes
fumigatus
eugenia farias
h
m
108t
h
root
m
t
pomegranate
a427
shrimp
j2
moorea
t
bees
s
1
pomegranate
daf
pepper
a549
d
vero
s2
ants
g
71
wb
bt
sp
m
u
t
b
e
spore
b
s
wheat
o
o
mda
s




-----
PHENOTYPE
-----
extra
gram
gram
gram
gram
co
non
gram
cold
free
antarctic
gram
gram
gram
n
gram
gram
phototrophic
gram
non
dehalogenide
alpha - hemolytic
non
stx
gram




-----
PROTEIN
-----
rls
p
sp
toll
s
3
89
l
α




-----
SPECIES
-----
v
b
b
d
m
b
b
x
h
a
r
h
d
v
m
m
k
l
a
b
p
t
a
b
b
p
f
a
p
o
z
p
h
a
p
l
r
p
y
l
p
l
b
p
t
p
p
p
r




-----
STRAIN
-----
k
g
c
fo
l
l2
l
v
c
l
n
c
n
y
l
k
n
n
l
fi
l
v
n2
l
x
l
z
f
l
c
fo
v
yjn - 5
l
l
‐
l
fo
w
l
c
c
c
k
l
k
f
n
