# Pretrained spaCy model NER test on samples of data with botanical names


In [7]:
%%capture
!pip install -U spacy<3.0.0
!pip install -U scispacy==0.3.0
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_scibert-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_ner_craft_md-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_ner_jnlpba_md-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_ner_bc5cdr_md-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_ner_bionlp13cg_md-0.3.0.tar.gz



In [8]:
import os
import spacy
import scispacy
from spacy import displacy
# nlp = spacy.load("en_core_web_sm",  disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])


In [9]:
ginseng_sentences_data_filepath = "../data/1-analysis/ginseng-sentences/"

ginseng_sentences_filenames = os.listdir(ginseng_sentences_data_filepath)

ginseng_sentences_filenames

['26850342-1.txt', '26850342-0.txt', '31885119-0.txt', '31880255-0.txt']

In [10]:
def run_ner_model(model: str, data_filepath: str, data_filenames: str) -> None:
    nlp = spacy.load(model)

    for _ in data_filenames:
        with open(os.path.join(data_filepath, _), "r") as f:
            text = f.read()
            doc = nlp(text)
            displacy.render(doc, style="ent")
            print("\n")

    # This produces the entity, label and position
    ent_bc = {}
    for x in doc.ents:
        ent_bc[x.text] = x.label_
        print(x.text, x.label_, x.start_char, x.end_char)


In [11]:
nlp = spacy.load("en_core_sci_sm")

In [12]:
run_ner_model("en_core_sci_sm", ginseng_sentences_data_filepath, ginseng_sentences_filenames)















Medicinal species ENTITY 0 17
garlic ENTITY 26 32
Allium sativum ENTITY 34 48
celery ENTITY 51 57
Apium graveolens ENTITY 59 75
Black Cumin ENTITY 78 89
Nigella sativa ENTITY 91 105
Ginseng ENTITY 111 118
Panax ENTITY 120 125
therapeutically ENTITY 157 172
plant derivatives ENTITY 178 195
controlling ENTITY 200 211
hypertension ENTITY 212 224
Asteraceae ENTITY 231 241
Apiaceae ENTITY 243 251
Rosaceae ENTITY 256 264
botanical families ENTITY 279 297


In [13]:
run_ner_model("en_core_sci_md", ginseng_sentences_data_filepath,ginseng_sentences_filenames)















Medicinal species ENTITY 0 17
garlic ENTITY 26 32
Allium sativum ENTITY 34 48
celery ENTITY 51 57
Apium graveolens ENTITY 59 75
Black Cumin ENTITY 78 89
Nigella sativa ENTITY 91 105
Ginseng ENTITY 111 118
Panax ENTITY 120 125
therapeutically ENTITY 157 172
plant derivatives ENTITY 178 195
controlling ENTITY 200 211
hypertension ENTITY 212 224
Asteraceae ENTITY 231 241
Apiaceae ENTITY 243 251
Rosaceae ENTITY 256 264
botanical families ENTITY 279 297


In [14]:
run_ner_model("en_core_sci_lg", ginseng_sentences_data_filepath, ginseng_sentences_filenames)















Medicinal species ENTITY 0 17
garlic ENTITY 26 32
Allium sativum ENTITY 34 48
celery ENTITY 51 57
Apium graveolens ENTITY 59 75
Black Cumin ENTITY 78 89
Nigella sativa ENTITY 91 105
Ginseng ENTITY 111 118
Panax ENTITY 120 125
therapeutically ENTITY 157 172
plant derivatives ENTITY 178 195
controlling ENTITY 200 211
hypertension ENTITY 212 224
Asteraceae ENTITY 231 241
Apiaceae ENTITY 243 251
Rosaceae ENTITY 256 264
botanical families ENTITY 279 297


In [15]:
run_ner_model("en_core_sci_scibert", ginseng_sentences_data_filepath, ginseng_sentences_filenames)

OSError: [E050] Can't find model 'en_core_sci_scibert'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [None]:
run_ner_model("en_ner_craft_md", ginseng_sentences_data_filepath, ginseng_sentences_filenames)



















In [None]:
run_ner_model("en_ner_jnlpba_md", ginseng_sentences_data_filepath, ginseng_sentences_filenames)



















In [None]:
run_ner_model("en_ner_bc5cdr_md", ginseng_sentences_data_filepath, ginseng_sentences_filenames)

















In [None]:
run_ner_model("en_ner_bionlp13cg_md", ginseng_sentences_data_filepath, ginseng_sentences_filenames)















