In [None]:
!pip install -U pandas
!pip install -U scikit-learn
!pip install -U spacy
!pip install -U scispacy
!pip install -U https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz

In [1]:
import pandas as pd
import random

In [2]:
RESULT_PATH = "../DATASET/results"

# RE Evaluation

In [3]:
results = pd.read_csv(f"{RESULT_PATH}/raw_result/gena_data_raw.csv")
results.head(5)

Unnamed: 0,PMID,Sentence,E1,Type_E1,Relation,E2,Type_E2
0,31196764,"Flibanserin, a multifunctional serotonin recep...",Flibanserin,CHEMICAL,be approved in united states and canada for tr...,acquired generalized hypoactive sexual desire ...,MENTAL_HEALTH
1,31196764,"Flibanserin, a multifunctional serotonin recep...",Flibanserin,CHEMICAL,be approved in united states and canada for tr...,HSDD,DISEASE
2,31196764,"Flibanserin, a multifunctional serotonin recep...",multifunctional serotonin receptor agonist,CHEMICAL,be approved in united states and canada for tr...,acquired generalized hypoactive sexual desire ...,MENTAL_HEALTH
3,31196764,"Flibanserin, a multifunctional serotonin recep...",multifunctional serotonin receptor agonist,CHEMICAL,be approved in united states and canada for tr...,HSDD,DISEASE
4,31196764,"Flibanserin, a multifunctional serotonin recep...",multifunctional serotonin receptor agonist,BIOCHEMICAL,be approved in united states and canada for tr...,acquired generalized hypoactive sexual desire ...,MENTAL_HEALTH


In [7]:
list_id_papers = results['PMID'].unique().tolist()
sample_id = random.sample(list_id_papers, 250)
print("Numbers of ID in test sample:", len(sample_id))

Numbers of ID in test sample: 250


In [8]:
sample_test = results.sample(frac=1)[results['PMID'].isin(sample_id)]
sample_test.drop_duplicates(subset=['PMID'], inplace=True)
sample_test = sample_test.sample(n=250)
print("Size of test sample:", len(sample_test))
sample_test['E1_Result'] = [1]*len(sample_test)
sample_test['Relation_Result'] = [1]*len(sample_test)
sample_test['E2_Result'] = [1]*len(sample_test)

Size of test sample: 250


  sample_test = results.sample(frac=1)[results['PMID'].isin(sample_id)]


In [9]:
# Save to excel file
sample_test.to_excel(f"{RESULT_PATH}/evaluation/sample_test_final_R01.xlsx", index=False)
sample_test.to_excel(f"{RESULT_PATH}/evaluation/sample_test_final_R02.xlsx", index=False)

------------------------------------------------

In [4]:
result_1 = pd.read_excel(f"{RESULT_PATH}/evaluation/sample_test_final_R01.xlsx")
result_2 = pd.read_excel(f"{RESULT_PATH}/evaluation/sample_test_final_R02.xlsx")

In [5]:
len(result_1), len(result_2)

(250, 250)

In [19]:
import sklearn
from sklearn.metrics import cohen_kappa_score

print("Precision of Entities by Annotator 1: ", (sum(result_1['E1_Result']) + sum(result_1['E2_Result']))/(2.0*len(result_1)))
print("Precision of Entities by Annotator 2: ", (sum(result_2['E1_Result']) + sum(result_2['E2_Result']))/(2.0*len(result_2)))

print("IAA of Entities: ", cohen_kappa_score(result_1['E1_Result'].tolist() + result_1['E2_Result'].tolist(), result_2['E1_Result'].tolist() + result_2['E2_Result'].tolist()))

Precision of Entities by Annotator 1:  0.966
Precision of Entities by Annotator 2:  0.97
IAA of Entities:  0.8063266623628147


In [20]:
print("Precision of Relations by Annotator 1:", result_1['Relation_Result'].mean())
print("Precision of Relations by Annotator 2:", result_2['Relation_Result'].mean())

print("IAA of Entities: ", cohen_kappa_score(result_1['Relation_Result'].tolist(), result_2['Relation_Result']))

Precision of Relations by Annotator 1: 0.836
Precision of Relations by Annotator 2: 0.852
IAA of Entities:  0.8178063889226285


-----------------

# NER Evaluation

In [9]:
import pandas as pd
import random
import spacy
import scispacy

In [10]:
# Load 2 models
origin_nlp = spacy.load('en_ner_bc5cdr_md')
gena_nlp = spacy.load(f'{RESULT_PATH}/model/en_gena_sm')

In [11]:
results = pd.read_csv(f"{RESULT_PATH}/raw_result/gena_data_raw.csv")
random.seed(20)
pmids_100_random = random.sample(list(results['Sentence'].unique()), 100)
results_100 = results[results['Sentence'].isin(list(pmids_100_random))]
results_100.head(5)

Unnamed: 0,PMID,Sentence,E1,Type_E1,Relation,E2,Type_E2
87,14578199,N-methyl-D-aspartate receptor-induced toxicity...,acid/kainate receptor-type neurotoxicity,CHEMICAL,mediated by,voltage sensitive calcium channels,CHEMICAL
464,16389753,Enhanced intake of poly-unsaturated fatty acid...,poly-unsaturated fatty acids,NUTRITION,play important role in reversal of,related weight loss,DISEASE
465,16389753,Enhanced intake of poly-unsaturated fatty acid...,poly-unsaturated fatty acids,CHEMICAL,play important role in reversal of,related weight loss,DISEASE
481,15460168,n-3 long chain polyunsaturated fatty acids (n-...,n-3 long chain polyunsaturated fatty acids,NUTRITION,be present in mammal tissues from endogenous s...,n-3,CHEMICAL
482,15460168,n-3 long chain polyunsaturated fatty acids (n-...,n-3 long chain polyunsaturated fatty acids,NUTRITION,be present in mammal tissues from endogenous s...,desaturation,DISEASE


In [7]:
gena_results_ner_100 = []
used_pmid = {}
for idx, row in results_100.iterrows():
    gena_results_ner_100.append((row['PMID'], row['Sentence'], row['E1'], row['Type_E1']))
    gena_results_ner_100.append((row['PMID'], row['Sentence'], row['E2'], row['Type_E2']))
gena_results_ner_100 = pd.DataFrame(gena_results_ner_100, columns=['PMID', 'Sentence', 'Named-Entity', 'Label'])
gena_results_ner_100['Result'] = [1]*len(gena_results_ner_100)
gena_results_ner_100.drop_duplicates().reset_index(drop=True).to_csv(f"{RESULT_PATH}/evaluation/results_ner_100_gena.csv", index=False)

In [8]:
origins_100 = results_100[['PMID', 'Sentence']].drop_duplicates().reset_index(drop=True)
results_ner_100_origin = []
for idx, row in origins_100.iterrows():
    doc = origin_nlp(row['Sentence'])
    for ent in doc.ents:
        results_ner_100_origin.append((row['PMID'], row['Sentence'], ent.text, ent.label_))
results_ner_100_origin = pd.DataFrame(results_ner_100_origin, columns=['PMID', 'Sentence', 'Named-Entity', 'Label'])
results_ner_100_origin['Result'] = [1]*len(results_ner_100_origin)
results_ner_100_origin.drop_duplicates().reset_index(drop=True).to_csv(f"{RESULT_PATH}/evaluation/results_ner_100_origin.csv", index=False)

-------------

In [12]:
origin_results = pd.read_csv(f"{RESULT_PATH}/evaluation/results_ner_100_origin.csv")
gena_results = pd.read_csv(f"{RESULT_PATH}/evaluation/results_ner_100_gena.csv")

In [13]:
total_entities = origin_results['Result'].count()
precision = origin_results['Result'].mean()
print(f"Model origin has found {total_entities} entities with precision {precision:0.2f}")

Model origin has found 343 entities with precision 0.85


In [14]:
total_entities = gena_results['Result'].count()
precision = gena_results['Result'].mean()
print(f"Model gena has found {total_entities} entities with precision {precision:0.2f}")

Model gena has found 311 entities with precision 0.95


---------

# Mapping ID Evaluation

In [4]:
mapping_results = pd.read_excel(f"{RESULT_PATH}/evaluation/mapping_result.xlsx")
count_1 = len(mapping_results[mapping_results["Result"] == 1])
count_05 = len(mapping_results[mapping_results["Result"] == 0.5])
count_0 = len(mapping_results[mapping_results["Result"] == 0])
print(f"There are: {count_1} exact result; {count_05} related result and {count_0} wrong result")

There are: 80 exact result; 9 related result and 11 wrong result
