# Test and analyse pretrained scispacy models on PubMed hand-annotated abstracts

## 1: Setup 

### 1.1: Load models and packages

In [181]:
%%capture
!pip install -U spacy<3.0.0
!pip install -U scispacy==0.3.0
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz


In [182]:
import os
import json
from pathlib import Path
import pandas as pd
import spacy
import scispacy
from spacy import displacy


### 1.2: Set up test and results filepaths

In [183]:
# JSON-lines file with the abstracts the models are run on.
hand_annotated_pubmed_data_filename = (
    "../../../data/hand_annotated_abstracts/test/"
    "part-00000-c89cb926-5be3-4c52-a27a-cf9d031475b9-c000.json"
)
# CSV with the hand-annotated (expected) entities.
expected_filename = (
    "../../../data/hand_annotated_abstracts/answers/"
    "hand_annotated_answers.csv"
)
# Directory the per-model result CSVs are written to and read back from.
returned_filepath = "../../../data/hand_annotated_abstracts/results/"

### 1.3: Define shared functions

In [184]:
def get_ner_model_results_from_json(
    model: str, input_filename: str, output_filepath: str, render: bool = False
) -> None:
    """Run a spaCy NER model over a JSON-lines file and save its entities as CSV.

    Every non-blank line of ``input_filename`` is parsed as a JSON object that
    must provide the keys ``"id"``, ``"pmid"`` and ``"data"`` (the text passed
    to the model). One row is produced per entity the model returns, and the
    rows are written to ``<output_filepath>/<model>.csv`` sorted by ``id`` and
    entity start offset.

    Args:
        model: Name of the spaCy model to load; also used as the stem of the
            output file name.
        input_filename: Path of the JSON-lines input file.
        output_filepath: Directory for the CSV; created if it does not exist.
        render: When True, render each processed document with displaCy's
            entity visualiser.
    """
    nlp = spacy.load(model)

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding, and skip blank lines (e.g. a trailing newline) that
    # would otherwise make json.loads fail.
    with open(input_filename, "r", encoding="utf-8") as f:
        contents = [json.loads(json_line) for json_line in f if json_line.strip()]

    result_rows = []
    for corpus in contents:
        # Create a result tuple for every entity in each abstract.
        # Here's the boilerplate shared by every result in this abstract or corpus.
        result = (corpus["id"], corpus["pmid"])

        doc = nlp(corpus["data"])

        if render:
            displacy.render(doc, style="ent")
            print("\n")

        # One row per entity: its text, label and character span.
        for ent in doc.ents:
            result_rows.append(
                result + (ent.text, ent.label_, ent.start_char, ent.end_char)
            )

    result_cols = [
        "id",
        "pmid",
        "test_ent_text",
        "test_ent_label",
        "test_ent_start",
        "test_ent_end",
    ]
    results_df = pd.DataFrame.from_records(result_rows, columns=result_cols)
    results_df = results_df.sort_values(["id", "test_ent_start"], ascending=[True, True])

    output_dir = Path(output_filepath)
    output_dir.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(output_dir / f"{model}.csv", index=False)

## 2: NER Model test runs

Each run writes the entities returned by the model to a CSV file, named after the model, in the results directory of this repo.

### 2.1: `en_core_sci_md`

In [218]:
# Run en_core_sci_md over the hand-annotated abstracts and save its entities to CSV.
get_ner_model_results_from_json(
    model="en_core_sci_md",
    input_filename=hand_annotated_pubmed_data_filename,
    output_filepath=returned_filepath,
    render=False,
)

### 2.2: `en_core_sci_lg`

In [219]:
# Run en_core_sci_lg over the hand-annotated abstracts and save its entities to CSV.
get_ner_model_results_from_json(
    model="en_core_sci_lg",
    input_filename=hand_annotated_pubmed_data_filename,
    output_filepath=returned_filepath,
    render=False,
)

## 3: Analysis of NER Models

### 3.1: Load `en_core_sci_md`

In [None]:
# Read back the CSV written by the en_core_sci_md run and preview the first rows.
en_core_sci_md_results_df = pd.read_csv(Path(returned_filepath) / "en_core_sci_md.csv")
en_core_sci_md_results_df.head(10)

Unnamed: 0,id,pmid,test_ent_text,test_ent_label,test_ent_start,test_ent_end
0,23,33413787,Efficacy,ENTITY,0,8
1,23,33413787,Safety,ENTITY,13,19
2,23,33413787,Centella Asiatica,ENTITY,23,40
3,23,33413787,Wrinkles,ENTITY,54,62
4,23,33413787,Systematic Review,ENTITY,66,83
5,23,33413787,Published Data,ENTITY,87,101
6,23,33413787,Network Meta-Analysis,ENTITY,106,127
7,23,33413787,Centella asiatica,ENTITY,129,146
8,23,33413787,applications,ENTITY,156,168
9,23,33413787,cosmetics,ENTITY,172,181


In [None]:
# Number of returned entities per entity label
en_core_sci_md_results_df.value_counts("test_ent_label")

test_ent_label
ENTITY    856
dtype: int64

In [None]:
# Total number of entities returned by the model (one row per entity)
len(en_core_sci_md_results_df)

856

In [None]:
# Per abstract: number of entities returned and number of distinct entity labels
en_core_sci_md_results_df[["id", "test_ent_label"]].groupby("id").agg(["count", "nunique"])

Unnamed: 0_level_0,test_ent_label,test_ent_label
Unnamed: 0_level_1,count,nunique
id,Unnamed: 1_level_2,Unnamed: 2_level_2
23,81,1
24,20,1
25,88,1
26,97,1
27,101,1
28,19,1
29,14,1
30,21,1
31,60,1
32,44,1


In [None]:
# Keep only the columns needed for the comparison (this drops "pmid") and
# strip the "test_" prefix from the entity columns.
columns = {
    "id": "id",
    "test_ent_text": "ent_text",
    "test_ent_label": "ent_label",
    "test_ent_start": "ent_start",
    "test_ent_end": "ent_end",
}

en_core_sci_md_results_df = en_core_sci_md_results_df[list(columns)].rename(columns=columns)

### 3.2: Load `en_core_sci_lg`

In [None]:
# Read back the CSV written by the en_core_sci_lg run and preview the first rows.
en_core_sci_lg_results_df = pd.read_csv(Path(returned_filepath) / "en_core_sci_lg.csv")
en_core_sci_lg_results_df.head(10)

Unnamed: 0,id,pmid,test_ent_text,test_ent_label,test_ent_start,test_ent_end
0,23,33413787,Efficacy,ENTITY,0,8
1,23,33413787,Safety,ENTITY,13,19
2,23,33413787,Centella Asiatica,ENTITY,23,40
3,23,33413787,Wrinkles,ENTITY,54,62
4,23,33413787,Systematic Review,ENTITY,66,83
5,23,33413787,Published Data,ENTITY,87,101
6,23,33413787,Network Meta-Analysis,ENTITY,106,127
7,23,33413787,Centella asiatica,ENTITY,129,146
8,23,33413787,cosmetics,ENTITY,172,181
9,23,33413787,wrinkle,ENTITY,193,200


In [None]:
# Number of returned entities per entity label
en_core_sci_lg_results_df.value_counts("test_ent_label")

test_ent_label
ENTITY    843
dtype: int64

In [None]:
# Total number of entities returned by the model (one row per entity)
len(en_core_sci_lg_results_df)

843

In [None]:
# Per abstract: number of entities returned and number of distinct entity labels
en_core_sci_lg_results_df[["id", "test_ent_label"]].groupby("id").agg(["count", "nunique"])

Unnamed: 0_level_0,test_ent_label,test_ent_label
Unnamed: 0_level_1,count,nunique
id,Unnamed: 1_level_2,Unnamed: 2_level_2
23,85,1
24,18,1
25,87,1
26,99,1
27,97,1
28,20,1
29,15,1
30,20,1
31,62,1
32,45,1


In [None]:
# Keep only the columns needed for the comparison (this drops "pmid") and
# strip the "test_" prefix from the entity columns.
columns = {
    "id": "id",
    "test_ent_text": "ent_text",
    "test_ent_label": "ent_label",
    "test_ent_start": "ent_start",
    "test_ent_end": "ent_end",
}

en_core_sci_lg_results_df = en_core_sci_lg_results_df[list(columns)].rename(columns=columns)

### 3.3 Load expected file and join to test results

In [None]:
# Load the hand-annotated answers, keep only the columns used for the
# comparison and strip the "hand_" prefix from the entity columns.
columns = {
    "id": "id",
    "hand_ent_text": "ent_text",
    "hand_ent_label": "ent_label",
    "hand_ent_start": "ent_start",
    "hand_ent_end": "ent_end",
}

expected_df = pd.read_csv(expected_filename)
expected_df = expected_df[list(columns)].rename(columns=columns)

expected_df.head(5)

Unnamed: 0,id,ent_text,ent_label,ent_start,ent_end
0,23,Centella Asiatica (L.) Urb.,scientific,23,50
1,23,Centella asiatica,common,129,146
2,24,Centella asiatica (L.) Urban,scientific,168,196
3,25,Centella asiatica,common,40,57
4,25,Centella asiatica (L.) Urb.,scientific,110,137


In [None]:
# Outer-join en_core_sci_md results with the hand annotations on the exact
# entity (abstract id, text and character span); the clashing "ent_label"
# columns become "ent_label_sci_md" and "ent_label_hand".
all_results_df = en_core_sci_md_results_df.merge(
    expected_df,
    on=["id", "ent_text", "ent_start", "ent_end"],
    how="outer",
    suffixes=("_sci_md", "_hand"),
)

# Add the en_core_sci_lg results. Its "ent_label" no longer clashes with any
# column on the right, so no suffix is applied and it is renamed explicitly.
all_results_df = en_core_sci_lg_results_df.merge(
    all_results_df,
    on=["id", "ent_text", "ent_start", "ent_end"],
    how="outer",
    suffixes=("_sci_lg", None),
).rename(columns={"ent_label": "ent_label_sci_lg"})

# Rearrange the columns for reading
columns = [
    "id",
    "ent_text",
    "ent_label_sci_md",
    "ent_label_sci_lg",
    "ent_label_hand",
    "ent_start",
    "ent_end",
]
all_results_df = all_results_df[columns]
all_results_df.head(10)

Unnamed: 0,id,ent_text,ent_label_sci_md,ent_label_sci_lg,ent_label_hand,ent_start,ent_end
0,23,Efficacy,ENTITY,ENTITY,,0,8
1,23,Safety,ENTITY,ENTITY,,13,19
2,23,Centella Asiatica,ENTITY,ENTITY,,23,40
3,23,Wrinkles,ENTITY,ENTITY,,54,62
4,23,Systematic Review,ENTITY,ENTITY,,66,83
5,23,Published Data,ENTITY,ENTITY,,87,101
6,23,Network Meta-Analysis,ENTITY,ENTITY,,106,127
7,23,Centella asiatica,ENTITY,ENTITY,common,129,146
8,23,cosmetics,ENTITY,ENTITY,,172,181
9,23,wrinkle,,ENTITY,,193,200


In [217]:
# Outer-join en_core_sci_md results with the hand annotations on the exact
# entity (abstract id, text and character span). Both frames carry an
# "ent_label" column, so the suffixes yield "ent_label_sci_md" and
# "ent_label_hand".
en_core_sci_md_results_and_expected_df = en_core_sci_md_results_df.merge(
    expected_df,
    on=["id", "ent_text", "ent_start", "ent_end"],
    how="outer",
    suffixes=("_sci_md", "_hand"),
)

# Rearrange the columns for reading
columns = [
    "id",
    "ent_text",
    "ent_label_sci_md",
    "ent_label_hand",
    "ent_start",
    "ent_end",
]
en_core_sci_md_results_and_expected_df = en_core_sci_md_results_and_expected_df[columns]
en_core_sci_md_results_and_expected_df.head(10)

Unnamed: 0,id,ent_text,ent_label_sci_md,ent_label_hand,ent_start,ent_end
0,23,Efficacy,ENTITY,,0,8
1,23,Safety,ENTITY,,13,19
2,23,Centella Asiatica,ENTITY,,23,40
3,23,Wrinkles,ENTITY,,54,62
4,23,Systematic Review,ENTITY,,66,83
5,23,Published Data,ENTITY,,87,101
6,23,Network Meta-Analysis,ENTITY,,106,127
7,23,Centella asiatica,ENTITY,common,129,146
8,23,applications,ENTITY,,156,168
9,23,cosmetics,ENTITY,,172,181


In [None]:
# Outer-join en_core_sci_lg results with the hand annotations on the exact
# entity (abstract id, text and character span). Both frames carry an
# "ent_label" column, so the suffixes yield "ent_label_sci_lg" and
# "ent_label_hand".
en_core_sci_lg_results_and_expected_df = en_core_sci_lg_results_df.merge(
    expected_df,
    on=["id", "ent_text", "ent_start", "ent_end"],
    how="outer",
    suffixes=("_sci_lg", "_hand"),
)

# Rearrange the columns for reading
columns = [
    "id",
    "ent_text",
    "ent_label_sci_lg",
    "ent_label_hand",
    "ent_start",
    "ent_end",
]
en_core_sci_lg_results_and_expected_df = en_core_sci_lg_results_and_expected_df[columns]
en_core_sci_lg_results_and_expected_df.head(20)

Unnamed: 0,id,ent_text,ent_label_sci_lg,ent_label_hand,ent_start,ent_end
0,23,Efficacy,ENTITY,,0,8
1,23,Safety,ENTITY,,13,19
2,23,Centella Asiatica,ENTITY,,23,40
3,23,Wrinkles,ENTITY,,54,62
4,23,Systematic Review,ENTITY,,66,83
5,23,Published Data,ENTITY,,87,101
6,23,Network Meta-Analysis,ENTITY,,106,127
7,23,Centella asiatica,ENTITY,common,129,146
8,23,cosmetics,ENTITY,,172,181
9,23,wrinkle,ENTITY,,193,200


### 3.4 Analysis of all results

In [None]:
# Common calculations
def calculate_precision(tp: int, fp: int) -> float:
    """Return precision, ``tp / (tp + fp)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The precision, or 0.0 when there are no predicted positives
        (``tp + fp == 0``) instead of raising ZeroDivisionError.
    """
    predicted = tp + fp
    if predicted == 0:
        return 0.0
    return tp / predicted

def calculate_recall(tp: int, fn: int) -> float:
    """Return recall, ``tp / (tp + fn)``.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The recall, or 0.0 when there are no expected positives
        (``tp + fn == 0``) instead of raising ZeroDivisionError.
    """
    expected = tp + fn
    if expected == 0:
        return 0.0
    return tp / expected

def calculate_f1(precision: float, recall: float) -> float:
    """Return the F1-score, the harmonic mean of precision and recall.

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or 0.0 when
        ``precision + recall == 0`` instead of raising ZeroDivisionError.
    """
    total = precision + recall
    if total == 0:
        return 0.0
    return (2 * precision * recall) / total

#### 3.4.1 Get metrics for `en_core_sci_md`
Get TP, FP, FN, and precision, recall and F1-score for `en_core_sci_md`

In [None]:
# Boolean row masks over en_core_sci_md_results_and_expected_df:
#   TP - the row has both a model label and a hand label
#   FP - the row has a model label but no hand label
#   FN - the row has a hand label but no model label
true_postive = (
    en_core_sci_md_results_and_expected_df["ent_label_sci_md"].notna()
    & en_core_sci_md_results_and_expected_df["ent_label_hand"].notna()
)
false_postive = (
    en_core_sci_md_results_and_expected_df["ent_label_sci_md"].notna()
    & en_core_sci_md_results_and_expected_df["ent_label_hand"].isna()
)
false_negative = (
    en_core_sci_md_results_and_expected_df["ent_label_sci_md"].isna()
    & en_core_sci_md_results_and_expected_df["ent_label_hand"].notna()
)

In [None]:
# Number of true positives for en_core_sci_md
en_core_sci_md_results_tp = en_core_sci_md_results_and_expected_df.loc[true_postive].shape[0]

In [None]:
# Number of false positives for en_core_sci_md
en_core_sci_md_results_fp = en_core_sci_md_results_and_expected_df.loc[false_postive].shape[0]

In [None]:
# Number of false negatives for en_core_sci_md
en_core_sci_md_results_fn = en_core_sci_md_results_and_expected_df.loc[false_negative].shape[0]

In [None]:
# Precision of en_core_sci_md: TP / (TP + FP)
calculate_precision(tp=en_core_sci_md_results_tp, fp=en_core_sci_md_results_fp)

0.026869158878504672

In [None]:
# Recall of en_core_sci_md: TP / (TP + FN)
calculate_recall(tp=en_core_sci_md_results_tp, fn=en_core_sci_md_results_fn)

0.42592592592592593

In [None]:
# F1-score of en_core_sci_md from its precision and recall
calculate_f1(
    precision=calculate_precision(
        tp=en_core_sci_md_results_tp,
        fp=en_core_sci_md_results_fp,
    ),
    recall=calculate_recall(
        tp=en_core_sci_md_results_tp,
        fn=en_core_sci_md_results_fn,
    ),
)

0.05054945054945055

#### 3.4.2 Get metrics for `en_core_sci_lg`
Get TP, FP, FN, and precision, recall and F1-score for `en_core_sci_lg`

In [187]:
# Boolean row masks over en_core_sci_lg_results_and_expected_df:
#   TP - the row has both a model label and a hand label
#   FP - the row has a model label but no hand label
#   FN - the row has a hand label but no model label
true_postive = (
    en_core_sci_lg_results_and_expected_df["ent_label_sci_lg"].notna()
    & en_core_sci_lg_results_and_expected_df["ent_label_hand"].notna()
)
false_postive = (
    en_core_sci_lg_results_and_expected_df["ent_label_sci_lg"].notna()
    & en_core_sci_lg_results_and_expected_df["ent_label_hand"].isna()
)
false_negative = (
    en_core_sci_lg_results_and_expected_df["ent_label_sci_lg"].isna()
    & en_core_sci_lg_results_and_expected_df["ent_label_hand"].notna()
)

In [188]:
# Number of true positives for en_core_sci_lg
en_core_sci_lg_results_tp = en_core_sci_lg_results_and_expected_df.loc[true_postive].shape[0]

In [189]:
# Number of false positives for en_core_sci_lg
en_core_sci_lg_results_fp = en_core_sci_lg_results_and_expected_df.loc[false_postive].shape[0]

In [222]:
# Number of false negatives for en_core_sci_lg
en_core_sci_lg_results_fn = en_core_sci_lg_results_and_expected_df.loc[false_negative].shape[0]

In [None]:
# Precision of en_core_sci_lg: TP / (TP + FP)
calculate_precision(tp=en_core_sci_lg_results_tp, fp=en_core_sci_lg_results_fp)

0.033214709371293

In [None]:
# Recall of en_core_sci_lg: TP / (TP + FN)
calculate_recall(tp=en_core_sci_lg_results_tp, fn=en_core_sci_lg_results_fn)

0.5185185185185185

In [None]:
# F1-score of en_core_sci_lg from its precision and recall
calculate_f1(
    precision=calculate_precision(
        tp=en_core_sci_lg_results_tp,
        fp=en_core_sci_lg_results_fp,
    ),
    recall=calculate_recall(
        tp=en_core_sci_lg_results_tp,
        fn=en_core_sci_lg_results_fn,
    ),
)

0.06243032329988851

## 4 Further analysis

### 4.1 `en_core_sci_md`

In [192]:
# Hits (TP) and misses (FN) of en_core_sci_md, split by the hand-annotated
# entity type ("common" or "scientific").
en_core_sci_md_tp_common = (
    en_core_sci_md_results_and_expected_df["ent_label_sci_md"].notna()
    & (en_core_sci_md_results_and_expected_df["ent_label_hand"] == "common")
)
en_core_sci_md_tp_scientific = (
    en_core_sci_md_results_and_expected_df["ent_label_sci_md"].notna()
    & (en_core_sci_md_results_and_expected_df["ent_label_hand"] == "scientific")
)

en_core_sci_md_fn_common = (
    en_core_sci_md_results_and_expected_df["ent_label_sci_md"].isna()
    & (en_core_sci_md_results_and_expected_df["ent_label_hand"] == "common")
)
en_core_sci_md_fn_scientific = (
    en_core_sci_md_results_and_expected_df["ent_label_sci_md"].isna()
    & (en_core_sci_md_results_and_expected_df["ent_label_hand"] == "scientific")
)


In [196]:
# Hand-annotated "common" entities that en_core_sci_md also returned
en_core_sci_md_results_and_expected_df.loc[en_core_sci_md_tp_common].shape[0]

14

In [197]:
# Hand-annotated "scientific" entities that en_core_sci_md also returned
en_core_sci_md_results_and_expected_df.loc[en_core_sci_md_tp_scientific].shape[0]

9

In [203]:
# Hand-annotated "common" entities that en_core_sci_md missed
en_core_sci_md_results_and_expected_df.loc[en_core_sci_md_fn_common].shape[0]

16

In [199]:
# Hand-annotated "scientific" entities that en_core_sci_md missed
en_core_sci_md_results_and_expected_df.loc[en_core_sci_md_fn_scientific].shape[0]

15

### 4.2 `en_core_sci_lg`

In [209]:
# Hits (TP) and misses (FN) of en_core_sci_lg, split by the hand-annotated
# entity type ("common" or "scientific").
en_core_sci_lg_tp_common = (
    en_core_sci_lg_results_and_expected_df["ent_label_sci_lg"].notna()
    & (en_core_sci_lg_results_and_expected_df["ent_label_hand"] == "common")
)
en_core_sci_lg_tp_scientific = (
    en_core_sci_lg_results_and_expected_df["ent_label_sci_lg"].notna()
    & (en_core_sci_lg_results_and_expected_df["ent_label_hand"] == "scientific")
)

en_core_sci_lg_fn_common = (
    en_core_sci_lg_results_and_expected_df["ent_label_sci_lg"].isna()
    & (en_core_sci_lg_results_and_expected_df["ent_label_hand"] == "common")
)
en_core_sci_lg_fn_scientific = (
    en_core_sci_lg_results_and_expected_df["ent_label_sci_lg"].isna()
    & (en_core_sci_lg_results_and_expected_df["ent_label_hand"] == "scientific")
)


In [210]:
# Hand-annotated "common" entities that en_core_sci_lg also returned
en_core_sci_lg_results_and_expected_df.loc[en_core_sci_lg_tp_common].shape[0]

18

In [211]:
# Hand-annotated "scientific" entities that en_core_sci_lg also returned
en_core_sci_lg_results_and_expected_df.loc[en_core_sci_lg_tp_scientific].shape[0]

10

In [212]:
# Hand-annotated "common" entities that en_core_sci_lg missed
en_core_sci_lg_results_and_expected_df.loc[en_core_sci_lg_fn_common].shape[0]

12

In [213]:
# Hand-annotated "scientific" entities that en_core_sci_lg missed
en_core_sci_lg_results_and_expected_df.loc[en_core_sci_lg_fn_scientific].shape[0]

14