# Process hand-annotated abstracts for analysis

## 1: Setup 

### 1.1: Load packages and helper functions

In [4]:
import os
import json
from pathlib import Path
import pandas as pd


In [5]:

# Add some functions I've already written for the NER pipeline
from helpers.entity_replacement.functions import return_phrase_at_location, PhraseLocation

### 1.2: Load hand-annotated abstracts file

In [6]:
# Input: JSON Lines export of the hand-annotated test abstracts (one JSON object per line).
hand_annotated_pubmed_data_filename = (
    "../../../data/hand_annotated_abstracts/test/"
    "part-00000-c89cb926-5be3-4c52-a27a-cf9d031475b9-c000.json"
)
# Output: directory that receives the reformatted answers CSV.
output_filepath = "../../../data/hand_annotated_abstracts/answers"

### 1.3: Define some loading and reformatting functions

In [7]:
def get_labels_from_json_abstracts(
    input_filename: str, output_filepath: str) -> None:
    """Flatten hand-annotated abstracts (JSON Lines) into one CSV row per entity.

    Every non-blank line of ``input_filename`` is parsed as a JSON object that
    must provide the keys ``id``, ``pmid``, ``data`` (the abstract text) and
    ``label`` (a sequence whose items hold start offset, end offset and label
    name at positions 0, 1 and 2). For each label item the annotated phrase is
    looked up in the abstract text with ``return_phrase_at_location`` and one
    output row is produced.

    Rows are sorted by ``id`` and then ``hand_ent_start`` and written, without
    the DataFrame index, to ``<output_filepath>/hand_annotated_answers.csv``.
    The output directory is created if it does not exist.

    Args:
        input_filename: Path to the JSON Lines file of annotated abstracts.
        output_filepath: Directory in which the CSV file is written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    # Read with an explicit encoding so the result does not depend on the
    # platform default (abstract text routinely contains non-ASCII characters).
    # NOTE(review): assumes the export is UTF-8, the standard JSON encoding.
    with open(input_filename, "r", encoding="utf-8") as f:
        # Blank lines are skipped instead of crashing json.loads.
        contents = [json.loads(json_line) for json_line in f if json_line.strip()]

    # The file is fully parsed at this point, so the handle is already closed
    # while the rows are built.
    result_rows = []
    for corpus in contents:
        # Columns shared by every entity row of this abstract.
        result = (corpus["id"], corpus["pmid"], corpus["data"])

        # One row per annotation: entity text, label, start and end offsets.
        for lb in corpus["label"]:
            _phrase_location = PhraseLocation(start=int(lb[0]), end=int(lb[1]))
            _entities = (
                return_phrase_at_location(loc=_phrase_location, corpus=corpus["data"]),
                lb[2],
                int(_phrase_location.start),
                int(_phrase_location.end),
            )

            result_rows.append(result + _entities)

    result_cols = [
        "id",
        "pmid",
        "corpus",
        "hand_ent_text",
        "hand_ent_label",
        "hand_ent_start",
        "hand_ent_end",
    ]
    results_df = pd.DataFrame.from_records(result_rows, columns=result_cols)
    # Ascending order on both keys (the sort_values default).
    results_df = results_df.sort_values(["id", "hand_ent_start"])

    # Use a separate name rather than rebinding the str parameter to a Path.
    output_dir = Path(output_filepath)
    output_dir.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(output_dir / "hand_annotated_answers.csv", index=False)

## 2: Create reformatted data for analysis

In [8]:
# Side effect only: writes hand_annotated_answers.csv into `output_filepath`.
get_labels_from_json_abstracts(
    input_filename=hand_annotated_pubmed_data_filename,
    output_filepath=output_filepath,
)

## 3: Analysis of annotation answers in the test data

In [9]:
# Read back the CSV produced in section 2 and preview its first ten rows.
answers_csv_path = Path(output_filepath) / "hand_annotated_answers.csv"
hand_annotated_answers_df = pd.read_csv(answers_csv_path)
hand_annotated_answers_df.head(10)

Unnamed: 0,id,pmid,corpus,hand_ent_text,hand_ent_label,hand_ent_start,hand_ent_end
0,23,33413787,Efficacy and Safety of Centella Asiatica (L.) ...,Centella Asiatica (L.) Urb.,scientific,23,50
1,23,33413787,Efficacy and Safety of Centella Asiatica (L.) ...,Centella asiatica,common,129,146
2,24,32281154,Integration of mass spectral fingerprinting an...,Centella asiatica (L.) Urban,scientific,168,196
3,25,32668965,Inhibition of Inflammatory Responses by Centel...,Centella asiatica,common,40,57
4,25,32668965,Inhibition of Inflammatory Responses by Centel...,Centella asiatica (L.) Urb.,scientific,110,137
5,26,32948004,"Optimization of Light Intensity, Temperature, ...",St. John's Wort,common,120,135
6,26,32948004,"Optimization of Light Intensity, Temperature, ...",St. John's wort,common,137,152
7,26,32948004,"Optimization of Light Intensity, Temperature, ...",Hypericum perforatum L.,scientific,154,177
8,26,32948004,"Optimization of Light Intensity, Temperature, ...",St. John's wort,common,375,390
9,26,32948004,"Optimization of Light Intensity, Temperature, ...",St John's wort,common,611,625


In [10]:
# Split of entity labels: number of annotated entities (rows) per
# `hand_ent_label` value, most frequent label first.
hand_annotated_answers_df.value_counts("hand_ent_label")

hand_ent_label
common        30
scientific    24
dtype: int64

In [11]:
# Total number of annotated entities (one row per entity in the answers table)
hand_annotated_answers_df.shape[0]

54

In [12]:
# Number of annotated test abstracts, i.e. distinct values in the `id` column
abstract_ids = hand_annotated_answers_df["id"]
abstract_ids.nunique()

15

In [13]:
# Per abstract: how many entities were annotated ("count") and how many
# distinct label types they use ("nunique" is taken over `hand_ent_label`,
# not over the entity text, so it counts label types rather than unique entities).
(
    hand_annotated_answers_df[["id", "hand_ent_label"]]
    .groupby("id")
    .agg(["count", "nunique"])
)

Unnamed: 0_level_0,hand_ent_label,hand_ent_label
Unnamed: 0_level_1,count,nunique
id,Unnamed: 1_level_2,Unnamed: 2_level_2
23,2,2
24,1,1
25,2,2
26,6,2
27,13,2
28,2,2
29,3,2
30,4,2
31,3,2
32,3,2
