# Pretrained scispacy model test on PubMed hand-annotated abstracts

## 1: Setup 

### 1.1: Load models and packages

In [1]:
%%capture
!pip install -U spacy<3.0.0
!pip install -U scispacy==0.3.0
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz


In [26]:
import os
import json
from pathlib import Path
import pandas as pd
import spacy
import scispacy
from spacy import displacy


### 1.2: Load text examples

In [27]:
hand_annotated_pubmed_data_filename = "../../../data/hand_annotated_abstracts/test/part-00000-3f29966d-3707-4973-a811-6c53c50a0735-c000.json"
output_filepath = Path("../../../data/hand_annotated_abstracts/results/")

### 1.3: Define shared functions

In [46]:
def get_ner_model_results_from_json(
    model: str, input_filename: str, output_filepath: str, render: bool = False
) -> None:
    nlp = spacy.load(model)

    with open(input_filename, "r") as f:
        contents = [json.loads(json_line) for json_line in list(f)]

        result_rows = []
        for corpus in contents:
            # Create results tuple for every entity in each abstract.
            # Here's the boilerplate of every result in this abstract or corpus.
            result = (corpus["id"], corpus["pmid"])

            doc = nlp(corpus["data"])

            if render:
                displacy.render(doc, style="ent")
                print("\n")

            # This produces the entity, label and position
            for x in doc.ents:
                _entities = (
                    x.text,
                    x.label_,
                    x.start_char,
                    x.end_char,
                )

            result_rows.append(result + _entities)

    result_cols = [
        "id",
        "pmid",
        "test_ent_text",
        "test_ent_label",
        "test_ent_start",
        "test_ent_end",
    ]
    results_df = pd.DataFrame.from_records(result_rows, columns=result_cols)
    output_filepath.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(Path(f"{output_filepath}/{model}.csv") , index=False)

## 2: NER Model test runs

These will produce results to CSV in this repo.

### 2.1: `en_core_sci_md`

In [47]:
get_ner_model_results_from_json(model="en_core_sci_md", input_filename=hand_annotated_pubmed_data_filename, output_filepath=output_filepath, render=False)

### 2.2: `en_core_sci_lg`

In [48]:
get_ner_model_results_from_json(model="en_core_sci_lg", input_filename=hand_annotated_pubmed_data_filename, output_filepath=output_filepath, render=False)

## 3: Analysis of NER Models

### 3.1: `en_core_sci_md`

In [49]:
# Load the results of this model
en_core_sci_md_results_df = pd.read_csv(os.path.join(output_filepath, "en_core_sci_md.csv"))
en_core_sci_md_results_df.head(10)

Unnamed: 0,id,pmid,test_ent_text,test_ent_label,test_ent_start,test_ent_end
0,24,33081197,medical industries,ENTITY,1200,1218
1,25,33383372,secretion,ENTITY,2412,2421
2,26,32101119,co-administered,ENTITY,370,385
3,27,32588370,diabetes,ENTITY,1936,1944
4,28,32668965,IRAK1-TAK1 signaling pathways,ENTITY,1811,1840
5,29,26219274,medicinal products,ENTITY,405,423
6,30,24188229,women,ENTITY,246,251
7,31,21688071,evaluation,ENTITY,1607,1617
8,32,33961968,investigation,ENTITY,2022,2035
9,33,30380351,grease,ENTITY,807,813


In [None]:
Where the ent_start matches the ent_start of the original set
Where the ent_end matches the ent_end of the original set

## Conclusions