# Process hand-annotated abstracts for analysis

## 1: Setup 

### 1.1: Load packages and helper functions

In [1]:
import os
import json
from pathlib import Path
import pandas as pd


In [40]:

# Add some functions I've already written for the NER pipeline
from helpers.entity_replacement.functions import return_phrase_at_location, PhraseLocation

### 1.2: Load hand-annotated abstracts file

In [41]:
# Input: file of hand-annotated abstracts, parsed as one JSON object per line
# by get_labels_from_json_abstracts.
hand_annotated_pubmed_data_filename = "../../../data/hand_annotated_abstracts/test/part-00000-3f29966d-3707-4973-a811-6c53c50a0735-c000.json"
# Output: directory that receives the reformatted hand_annotated_answers.csv.
output_filepath = "../../../data/hand_annotated_abstracts/answers"

### 1.3: Define some loading and reformatting functions

In [54]:
def get_labels_from_json_abstracts(
    input_filename: str, output_filepath: str) -> None:
    """Flatten hand-annotated abstracts into a CSV with one row per entity.

    Every non-blank line of ``input_filename`` is parsed as a JSON object
    that must provide the keys ``id``, ``pmid``, ``data`` and ``label``.
    Each item of ``label`` is indexed as ``[start, end, label_name]``; one
    output row is emitted per item, so an abstract without labels
    contributes no rows.

    The rows are sorted by ``id`` and then ``hand_ent_start`` and written,
    without the DataFrame index, to ``hand_annotated_answers.csv`` inside
    ``output_filepath``.

    Args:
        input_filename: Path of the JSON Lines file of annotated abstracts.
        output_filepath: Directory for the CSV; it is created (including
            missing parents) when it does not exist yet.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    # Read with an explicit encoding so that non-ASCII characters in the
    # abstracts do not depend on the platform's locale default.
    with open(input_filename, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) are not valid JSON
        # documents, so skip them instead of failing the whole run.
        contents = [json.loads(json_line) for json_line in f if json_line.strip()]

    result_rows = []
    for corpus in contents:
        # Columns shared by every entity row of this abstract.
        result = (corpus["id"], corpus["pmid"], corpus["data"])

        for lb in corpus["label"]:
            _phrase_location = PhraseLocation(start=int(lb[0]), end=int(lb[1]))
            _entities = (
                # NOTE(review): presumably the substring of the abstract
                # covered by the location - confirm against the helper.
                return_phrase_at_location(loc=_phrase_location, corpus=corpus["data"]),
                lb[2],
                int(_phrase_location.start),
                int(_phrase_location.end),
            )
            result_rows.append(result + _entities)

    result_cols = [
        "id",
        "pmid",
        "corpus",
        "hand_ent_text",
        "hand_ent_label",
        "hand_ent_start",
        "hand_ent_end",
    ]
    results_df = pd.DataFrame.from_records(result_rows, columns=result_cols)
    # Ascending order is the default for both sort keys.
    results_df = results_df.sort_values(["id", "hand_ent_start"])

    # Use a separate name rather than rebinding the ``str`` parameter.
    output_dir = Path(output_filepath)
    output_dir.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(output_dir / "hand_annotated_answers.csv", index=False)

## 2: Create reformatted data for analysis

In [55]:
# Reformat the raw annotation file; this writes hand_annotated_answers.csv into output_filepath.
get_labels_from_json_abstracts(input_filename=hand_annotated_pubmed_data_filename, output_filepath=output_filepath)

## 3: Analysis of annotation answers in the test data

In [56]:
# Load the reformatted answers CSV written by get_labels_from_json_abstracts
# and preview its first 10 rows.
hand_annotated_answers_df = pd.read_csv(os.path.join(output_filepath, "hand_annotated_answers.csv"))
hand_annotated_answers_df.head(10)

Unnamed: 0,id,pmid,corpus,hand_ent_text,hand_ent_label,hand_ent_start,hand_ent_end
0,24,33081197,The Biochemical and Genetic Basis for the Bios...,<i>Hypericum Perforatum</i> L.,scientific,81,111
1,24,33081197,The Biochemical and Genetic Basis for the Bios...,<i>Hypericum perforatum</i> L.,scientific,159,189
2,24,33081197,The Biochemical and Genetic Basis for the Bios...,Saint John's Wort,common,208,225
3,25,33383372,Improvements in estrogen deficiency-induced hy...,Hypericum perforatum L.,scientific,68,91
4,25,33383372,Improvements in estrogen deficiency-induced hy...,Hypericum perforatum L.,scientific,189,212
5,26,32101119,Expression of Escherichia coli Heat-Labile Ent...,Centella,common,68,76
6,26,32101119,Expression of Escherichia coli Heat-Labile Ent...,Centella asiatica (L.) Urban,scientific,78,106
7,27,32588370,Oral administration of Centella asiatica (L.) ...,Centella asiatica (L.) Urb,scientific,23,49
8,27,32588370,Oral administration of Centella asiatica (L.) ...,Centella asiatica,common,174,191
9,28,32668965,Inhibition of Inflammatory Responses by <i>Cen...,Centella asiatica,common,43,60


In [62]:
# Split of entity labels: how many annotated entities carry each label
hand_annotated_answers_df.value_counts("hand_ent_label")

hand_ent_label
common            26
scientific        22
pharmaceutical     1
dtype: int64

In [61]:
# Total number of annotated entities (the answers table has one row per entity)
len(hand_annotated_answers_df)

49

In [63]:
# Number of annotated test abstracts, i.e. distinct `id` values
hand_annotated_answers_df["id"].nunique()

16

In [69]:
# For each abstract: number of annotated entities ("count") and number of
# distinct entity labels ("nunique"). Note that nunique is taken over
# hand_ent_label, so it counts label types, not distinct entity texts.
hand_annotated_answers_df[["id", "hand_ent_label"]].groupby("id").agg(["count", "nunique"])

Unnamed: 0_level_0,hand_ent_label,hand_ent_label
Unnamed: 0_level_1,count,nunique
id,Unnamed: 1_level_2,Unnamed: 2_level_2
24,3,2
25,2,1
26,2,2
27,2,2
28,2,2
29,2,2
30,3,2
31,3,2
32,10,2
33,5,3
