# Test pretrained scispaCy models on hand-annotated PubMed abstracts

## 1: Setup 

### 1.1: Load models and packages

In [1]:
%%capture
!pip install -U spacy<3.0.0
!pip install -U scispacy==0.3.0
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz


In [26]:
import os
import json
from pathlib import Path
import pandas as pd
import spacy
import scispacy
from spacy import displacy


### 1.2: Load text examples

In [27]:
# JSON Lines file holding the hand-annotated PubMed abstracts used as test input.
hand_annotated_pubmed_data_filename = (
    "../../../data/hand_annotated_abstracts/test/"
    "part-00000-3f29966d-3707-4973-a811-6c53c50a0735-c000.json"
)
# Directory that receives one results CSV per model.
output_filepath = "../../../data/hand_annotated_abstracts/results/"

### 1.3: Define shared functions

In [55]:
def get_ner_model_results_from_json(
    model: str, input_filename: str, output_filepath: str, render: bool = False
) -> None:
    """Run a spaCy NER model over a JSON Lines file and save its entities to CSV.

    Every non-blank line of ``input_filename`` is parsed as one JSON object
    that must provide the keys ``"id"``, ``"pmid"`` and ``"data"``; the value
    of ``"data"`` is the text passed to the model. One row is produced per
    entity in ``doc.ents``. Rows are sorted by ``id`` and then by entity start
    offset and written, without the DataFrame index, to
    ``<output_filepath>/<model>.csv``.

    Args:
        model: Name handed to ``spacy.load``; also used as the CSV file stem.
        input_filename: Path of the JSON Lines input file.
        output_filepath: Directory for the CSV; created (with parents) if it
            does not exist.
        render: When true, display each document's entities with displaCy and
            print a separator after it.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"id"``, ``"pmid"`` or ``"data"``.
    """
    nlp = spacy.load(model)

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding, and skip blank lines (e.g. a trailing empty line) that
    # would otherwise make json.loads fail.
    with open(input_filename, "r", encoding="utf-8") as f:
        contents = [json.loads(json_line) for json_line in f if json_line.strip()]

    # The file is fully read at this point, so it is not held open while the
    # (potentially slow) model runs.
    result_rows = []
    for corpus in contents:
        # Columns shared by every entity row of this abstract.
        result = (corpus["id"], corpus["pmid"])

        doc = nlp(corpus["data"])

        if render:
            displacy.render(doc, style="ent")
            print("\n")

        # One row per entity: text, label and character offsets.
        for ent in doc.ents:
            result_rows.append(
                result + (ent.text, ent.label_, ent.start_char, ent.end_char)
            )

    result_cols = [
        "id",
        "pmid",
        "test_ent_text",
        "test_ent_label",
        "test_ent_start",
        "test_ent_end",
    ]
    results_df = pd.DataFrame.from_records(result_rows, columns=result_cols)
    results_df = results_df.sort_values(["id", "test_ent_start"])

    output_dir = Path(output_filepath)
    output_dir.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(output_dir / f"{model}.csv", index=False)

## 2: NER Model test runs

Each run below writes the entities extracted by the model to a CSV file named `<model>.csv` in the results directory defined in section 1.2.

### 2.1: `en_core_sci_md`

In [56]:
# Extract entities with en_core_sci_md and write them to en_core_sci_md.csv.
get_ner_model_results_from_json(
    model="en_core_sci_md",
    input_filename=hand_annotated_pubmed_data_filename,
    output_filepath=output_filepath,
    render=False,
)

### 2.2: `en_core_sci_lg`

In [57]:
# Extract entities with en_core_sci_lg and write them to en_core_sci_lg.csv.
get_ner_model_results_from_json(
    model="en_core_sci_lg",
    input_filename=hand_annotated_pubmed_data_filename,
    output_filepath=output_filepath,
    render=False,
)

## 3: Analysis of NER Models

### 3.1: `en_core_sci_md`

In [60]:
# Read back the CSV written by the en_core_sci_md run and preview its first rows.
en_core_sci_md_results_df = pd.read_csv(Path(output_filepath) / "en_core_sci_md.csv")
en_core_sci_md_results_df.head(20)

Unnamed: 0,id,pmid,test_ent_text,test_ent_label,test_ent_start,test_ent_end
0,24,33081197,Biochemical,ENTITY,4,15
1,24,33081197,Genetic Basis,ENTITY,20,33
2,24,33081197,Biosynthesis,ENTITY,42,54
3,24,33081197,Bioactive Compounds,ENTITY,58,77
4,24,33081197,Largest Medicinal Crops,ENTITY,124,147
5,24,33081197,Europe,ENTITY,151,157
6,24,33081197,Hypericum,ENTITY,162,171
7,24,33081197,Saint John's Wort,ENTITY,208,225
8,24,33081197,SJW,ENTITY,227,230
9,24,33081197,medicinal plant,ENTITY,249,264


In [67]:
# Number of result rows per entity label (test_ent_label) for en_core_sci_md
en_core_sci_md_results_df.value_counts("test_ent_label")

test_ent_label
ENTITY    1002
dtype: int64

In [64]:
# Row count of the results table, i.e. how many entities en_core_sci_md returned in total
en_core_sci_md_results_df.shape[0]

1002

In [69]:
# Per abstract id: number of entity rows ("count") and number of distinct
# entity labels ("nunique"). Both aggregations run on test_ent_label, so
# "nunique" counts distinct labels, not distinct entity texts.
en_core_sci_md_results_df[["id", "test_ent_label"]].groupby("id").agg(["count", "nunique"])

Unnamed: 0_level_0,test_ent_label,test_ent_label
Unnamed: 0_level_1,count,nunique
id,Unnamed: 1_level_2,Unnamed: 2_level_2
24,42,1
25,137,1
26,18,1
27,130,1
28,84,1
29,19,1
30,14,1
31,81,1
32,91,1
33,33,1


### 3.2: `en_core_sci_lg`

In [61]:
# Read back the CSV written by the en_core_sci_lg run and preview its first rows.
en_core_sci_lg_results_df = pd.read_csv(Path(output_filepath) / "en_core_sci_lg.csv")
en_core_sci_lg_results_df.head(20)

Unnamed: 0,id,pmid,test_ent_text,test_ent_label,test_ent_start,test_ent_end
0,24,33081197,Biochemical,ENTITY,4,15
1,24,33081197,Genetic Basis,ENTITY,20,33
2,24,33081197,Biosynthesis,ENTITY,42,54
3,24,33081197,Bioactive Compounds,ENTITY,58,77
4,24,33081197,Hypericum,ENTITY,84,93
5,24,33081197,Medicinal Crops,ENTITY,132,147
6,24,33081197,Europe,ENTITY,151,157
7,24,33081197,Hypericum,ENTITY,162,171
8,24,33081197,L.,ENTITY,187,189
9,24,33081197,Saint John's Wort,ENTITY,208,225


In [62]:
# Number of result rows per entity label (test_ent_label) for en_core_sci_lg
en_core_sci_lg_results_df.value_counts("test_ent_label")

test_ent_label
ENTITY    1003
dtype: int64

In [66]:
# Row count of the results table, i.e. how many entities en_core_sci_lg returned in total
en_core_sci_lg_results_df.shape[0]

1003

In [70]:
# Per abstract id: number of entity rows ("count") and number of distinct
# entity labels ("nunique"). Both aggregations run on test_ent_label, so
# "nunique" counts distinct labels, not distinct entity texts.
en_core_sci_lg_results_df[["id", "test_ent_label"]].groupby("id").agg(["count", "nunique"])

Unnamed: 0_level_0,test_ent_label,test_ent_label
Unnamed: 0_level_1,count,nunique
id,Unnamed: 1_level_2,Unnamed: 2_level_2
24,45,1
25,130,1
26,18,1
27,131,1
28,85,1
29,20,1
30,15,1
31,80,1
32,90,1
33,31,1


In [None]:
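# TODO: compare each model's entities against the original set
# (presumably the hand annotations -- confirm):
#   - rows where test_ent_start matches the ent_start of the original set
#   - rows where test_ent_end matches the ent_end of the original set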

## Conclusions