# Test pretrained scispacy models on PubMed hand-annotated abstracts

## 1: Setup 

### 1.1: Load models and packages

In [1]:
%%capture
!pip install -U spacy<3.0.0
!pip install -U scispacy==0.3.0
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz


In [2]:
import os
import json
from pathlib import Path
import pandas as pd
import spacy
import scispacy
from spacy import displacy


### 1.2: Load text examples

In [8]:
# Input file: read line by line, one JSON record per line, by
# get_ner_model_results_from_json. Relative paths resolve against the
# notebook's working directory.
hand_annotated_pubmed_data_filename = "../../../data/hand_annotated_abstracts/test/part-00000-c89cb926-5be3-4c52-a27a-cf9d031475b9-c000.json"
# Output directory: each model run writes <model>.csv here, and the analysis
# cells read those files back.
output_filepath = "../../../data/hand_annotated_abstracts/results/"

### 1.3: Define shared functions

In [9]:
def get_ner_model_results_from_json(
    model: str, input_filename: str, output_filepath: str, render: bool = False
) -> None:
    """Run a spaCy NER model over a JSON Lines file and save its entities as CSV.

    Every non-blank line of ``input_filename`` is parsed as a JSON object that
    must provide the keys ``"id"``, ``"pmid"`` and ``"data"`` (the text to
    process). One output row is produced per entity found by the model. Rows
    are sorted by ``id`` and entity start offset and written, without the
    DataFrame index, to ``<output_filepath>/<model>.csv``. The output
    directory is created if it does not exist.

    Args:
        model: Name passed to ``spacy.load``; also used as the CSV file stem.
        input_filename: Path of the JSON Lines input file (read as UTF-8).
        output_filepath: Directory in which the CSV file is written.
        render: When true, render each document's entities with displaCy
            followed by a blank separator line.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record lacks ``"id"``, ``"pmid"`` or ``"data"``.
    """
    nlp = spacy.load(model)

    # Load all records first so the file handle is released before the model
    # runs. The encoding is explicit so decoding does not depend on the
    # platform's default locale; blank lines are skipped instead of crashing
    # json.loads.
    with open(input_filename, "r", encoding="utf-8") as f:
        contents = [json.loads(json_line) for json_line in f if json_line.strip()]

    result_rows = []
    for corpus in contents:
        # Columns shared by every entity row of this record.
        result = (corpus["id"], corpus["pmid"])

        doc = nlp(corpus["data"])

        if render:
            displacy.render(doc, style="ent")
            print("\n")

        # One row per entity: text, label and character offsets in the text.
        result_rows.extend(
            result + (ent.text, ent.label_, ent.start_char, ent.end_char)
            for ent in doc.ents
        )

    result_cols = [
        "id",
        "pmid",
        "test_ent_text",
        "test_ent_label",
        "test_ent_start",
        "test_ent_end",
    ]
    results_df = pd.DataFrame.from_records(result_rows, columns=result_cols)
    results_df = results_df.sort_values(["id", "test_ent_start"])

    output_dir = Path(output_filepath)
    output_dir.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(output_dir / f"{model}.csv", index=False)

## 2: NER Model test runs

Each run below writes that model's entity predictions to a CSV file named `<model>.csv` in the results directory of this repo.

### 2.1: `en_core_sci_md`

In [10]:
# Run en_core_sci_md over the input file and write its entities to CSV
get_ner_model_results_from_json(
    model="en_core_sci_md",
    input_filename=hand_annotated_pubmed_data_filename,
    output_filepath=output_filepath,
    render=False,
)

### 2.2: `en_core_sci_lg`

In [11]:
# Run en_core_sci_lg over the input file and write its entities to CSV
get_ner_model_results_from_json(
    model="en_core_sci_lg",
    input_filename=hand_annotated_pubmed_data_filename,
    output_filepath=output_filepath,
    render=False,
)

## 3: Analysis of NER Models

### 3.1: `en_core_sci_md`

In [12]:
# Read this model's results CSV from the output directory and preview the first rows
en_core_sci_md_results_df = pd.read_csv(Path(output_filepath) / "en_core_sci_md.csv")
en_core_sci_md_results_df.head(20)

Unnamed: 0,id,pmid,test_ent_text,test_ent_label,test_ent_start,test_ent_end
0,23,33413787,Efficacy,ENTITY,0,8
1,23,33413787,Safety,ENTITY,13,19
2,23,33413787,Centella Asiatica,ENTITY,23,40
3,23,33413787,Wrinkles,ENTITY,54,62
4,23,33413787,Systematic Review,ENTITY,66,83
5,23,33413787,Published Data,ENTITY,87,101
6,23,33413787,Network Meta-Analysis,ENTITY,106,127
7,23,33413787,Centella asiatica,ENTITY,129,146
8,23,33413787,applications,ENTITY,156,168
9,23,33413787,cosmetics,ENTITY,172,181


In [13]:
# Split of entity labels: number of result rows for each distinct test_ent_label
en_core_sci_md_results_df.value_counts("test_ent_label")

test_ent_label
ENTITY    856
dtype: int64

In [14]:
# Total number of entities returned (row count of the results frame)
en_core_sci_md_results_df.shape[0]

856

In [15]:
# Per abstract (id): number of entities (count) and number of distinct entity
# labels (nunique).
# NOTE(review): nunique is computed over test_ent_label, so it counts distinct
# labels, not distinct entity texts. Aggregate test_ent_text instead if the
# number of unique entities per abstract is what is wanted.
en_core_sci_md_results_df[["id", "test_ent_label"]].groupby("id").agg(["count", "nunique"])

Unnamed: 0_level_0,test_ent_label,test_ent_label
Unnamed: 0_level_1,count,nunique
id,Unnamed: 1_level_2,Unnamed: 2_level_2
23,81,1
24,20,1
25,88,1
26,97,1
27,101,1
28,19,1
29,14,1
30,21,1
31,60,1
32,44,1


### 3.2: `en_core_sci_lg`

In [16]:
# Read this model's results CSV from the output directory and preview the first rows
en_core_sci_lg_results_df = pd.read_csv(Path(output_filepath) / "en_core_sci_lg.csv")
en_core_sci_lg_results_df.head(20)

Unnamed: 0,id,pmid,test_ent_text,test_ent_label,test_ent_start,test_ent_end
0,23,33413787,Efficacy,ENTITY,0,8
1,23,33413787,Safety,ENTITY,13,19
2,23,33413787,Centella Asiatica,ENTITY,23,40
3,23,33413787,Wrinkles,ENTITY,54,62
4,23,33413787,Systematic Review,ENTITY,66,83
5,23,33413787,Published Data,ENTITY,87,101
6,23,33413787,Network Meta-Analysis,ENTITY,106,127
7,23,33413787,Centella asiatica,ENTITY,129,146
8,23,33413787,cosmetics,ENTITY,172,181
9,23,33413787,wrinkle,ENTITY,193,200


In [17]:
# Split of entity labels: number of result rows for each distinct test_ent_label
en_core_sci_lg_results_df.value_counts("test_ent_label")

test_ent_label
ENTITY    843
dtype: int64

In [18]:
# Total number of entities returned (row count of the results frame)
en_core_sci_lg_results_df.shape[0]

843

In [19]:
# Per abstract (id): number of entities (count) and number of distinct entity
# labels (nunique).
# NOTE(review): nunique is computed over test_ent_label, so it counts distinct
# labels, not distinct entity texts. Aggregate test_ent_text instead if the
# number of unique entities per abstract is what is wanted.
en_core_sci_lg_results_df[["id", "test_ent_label"]].groupby("id").agg(["count", "nunique"])

Unnamed: 0_level_0,test_ent_label,test_ent_label
Unnamed: 0_level_1,count,nunique
id,Unnamed: 1_level_2,Unnamed: 2_level_2
23,85,1
24,18,1
25,87,1
26,99,1
27,97,1
28,20,1
29,15,1
30,20,1
31,62,1
32,45,1


In [None]:
# TODO: compare the model output against the original set:
#   - rows where test_ent_start matches the ent_start of the original set
#   - rows where test_ent_end matches the ent_end of the original set

## 4: Conclusions