# Evaluate the entire processing of a set of ten protocols

In [None]:
import pandas as pd

In [None]:
# Spreadsheet of test protocols with reviewer ("Rev_*") and earlier AI ("AI_*") columns.
testing_workbook = "dsm_string_testing_v01.xlsx"
df_testing = pd.read_excel(testing_workbook)

In [None]:
# Annotation table; later cells look up num_subjects, has_sap and file per NCT id.
annotations_csv = "../data/ctgov/annotations/all_annotations.csv"
df_annotations = pd.read_csv(annotations_csv)

In [None]:
# Index the annotations by NCT id so that protocols can be looked up by label.
df_annotations = df_annotations.set_index("nct_id")

In [None]:
import os
# Remember the notebook's starting directory: the working directory is changed
# in a later cell, and the results spreadsheet is written back to this location.
cwd = os.getcwd()

In [None]:
# Switch to the front-end directory; the `util` import and the relative
# "models/..." paths used afterwards presumably resolve from there.
front_end_dir = '../front_end'
os.chdir(front_end_dir)

In [None]:
from util.protocol_master_processor import MasterProcessor

# Model artefacts, listed in the exact positional order expected by the
# MasterProcessor constructor call (do not reorder).
model_paths = [
    "models/condition_classifier.pkl.bz2",
    "models/phase_rf_classifier.pkl.bz2",
    "models/spacy-textcat-phase-04-model-best",
    "models/sap_classifier_document_level.pkl.bz2",
    "models/sap_classifier.pkl.bz2",
    "models/effect_estimate_classifier.pkl.bz2",
    "models/num_subjects_classifier.pkl.bz2",
    "models/subjects_classifier_document_level.pkl.bz2",
    "models/arms_classifier_document_level.pkl.bz2",
    "models/spacy-textcat-arms-21-model-best",
    "models/spacy-textcat-international-11-model-best",
    "models/spacy-textcat-country-16-model-best",
    "models/international_classifier.pkl.bz2",
    "models/country_ensemble_model.pkl.bz2",
    "models/simulation_classifier.pkl.bz2",
]

master_processor = MasterProcessor(*model_paths)

In [None]:
# Attach the annotated subject count and SAP flag to every test protocol.
# Protocols that do not appear in the annotations index get None for both.
annotated_ids = df_annotations.index
ctgov_n = [
    df_annotations.num_subjects[nct] if nct in annotated_ids else None
    for nct in df_testing.Protocol
]
ctgov_sap = [
    df_annotations.has_sap[nct] if nct in annotated_ids else None
    for nct in df_testing.Protocol
]
df_testing["CTGov_N"] = ctgov_n
df_testing["CTGov_SAP"] = ctgov_sap

In [None]:
import json

# Directory holding one "<file>.json" dump per protocol; each JSON file is
# loaded as that protocol's `pages` object.
# NOTE(review): machine-specific absolute path — adjust for your environment.
JSON_DIR = "/media/thomas/642d0db5-2c98-4156-b591-1a3572c5868c/data_open/read_ctgov_data/json/"

# file_to_page: annotation file name -> loaded pages (kept for optional caching).
# texts: loaded pages in the same row order as df_testing.
file_to_page = {}
texts = []
for nct in df_testing.Protocol:
    # Without an annotation row there is no file name to build the path from;
    # fail with a clear message instead of a TypeError on None + str.
    if nct not in df_annotations.index:
        raise KeyError(
            f"Protocol {nct!r} is not present in the annotations index, "
            "so its JSON page file cannot be located"
        )
    file_name = df_annotations.file[nct]
    print (file_name)
    with open(JSON_DIR + file_name + ".json", "r", encoding="utf-8") as f:
        pages = json.load(f)
    texts.append(pages)
    file_to_page[file_name] = pages
df_testing["pages"] = texts

In [None]:
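# Optional: uncomment to save the loaded pages (file_to_page) as a
# bz2-compressed pickle under demo_data/.
# import bz2, pickle as pkl
# with bz2.open("demo_data/demo_protocols.pkl.bz2", "wb") as f:
#     pkl.dump(file_to_page, f)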

In [None]:
# One list per predicted attribute, filled in df_testing row order.
pred_cond, pred_phase, pred_arms, pred_sap = [], [], [], []
pred_subjects, pred_sim, pred_effect, pred_countries = [], [], [], []

for protocol_pages in df_testing["pages"]:
    # process_protocol returns nine results; the first (tokenised pages) is not
    # used here, the rest are indexed by "prediction" below.
    (
        _tokenised,
        condition_out,
        phase_out,
        sap_out,
        effect_out,
        subjects_out,
        arms_out,
        country_out,
        simulation_out,
    ) = master_processor.process_protocol(protocol_pages)

    # Pair each result with the list that collects its "prediction" value.
    for collected, result in (
        (pred_cond, condition_out),
        (pred_phase, phase_out),
        (pred_arms, arms_out),
        (pred_sap, sap_out),
        (pred_subjects, subjects_out),
        (pred_sim, simulation_out),
        (pred_effect, effect_out),
        (pred_countries, country_out),
    ):
        collected.append(result["prediction"])

In [None]:
# Display the Indication column of the test sheet.
indication_column = "Indication"
df_testing[indication_column]

In [None]:
# Store every list of fresh predictions as a "new_ai_*" column of df_testing.
for column_name, predictions in (
    ("new_ai_indication", pred_cond),
    ("new_ai_phase", pred_phase),
    ("new_ai_arms", pred_arms),
    ("new_ai_sap", pred_sap),
    ("new_ai_n", pred_subjects),
    ("new_ai_effect", pred_effect),
    ("new_ai_sim", pred_sim),
    ("new_ai_countries", pred_countries),
):
    df_testing[column_name] = predictions

In [None]:
# Side-by-side view of the phase columns per protocol.
phase_columns = ["Protocol", "AI_phase", "Rev_phase", "new_ai_phase"]
df_testing[phase_columns]

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
import re

# Rev_phase's "1/2" is rewritten to "0.5" before it is compared with AI_phase.
# NOTE(review): this mapping is not applied in the comparison against
# new_ai_phase — confirm that this asymmetry is intended.
phase_map = {"1/2":"0.5"}

rev_phase = df_testing.Rev_phase.apply(str)
# Drop only a trailing ".0" (e.g. "2.0" -> "2"); anchoring with "$" prevents
# the pattern from removing ".0" in the middle of a value such as "1.05".
new_phase = df_testing.new_ai_phase.apply(str).apply(lambda x : re.sub(r'\.0$', '', x))

# Displays (accuracy of AI_phase, accuracy of new_ai_phase) against Rev_phase.
(
    accuracy_score(rev_phase.apply(lambda x : phase_map.get(x, x)), df_testing.AI_phase.apply(str)),
    accuracy_score(rev_phase, new_phase),
)

In [None]:
# Side-by-side view of the effect-estimate columns.
effect_columns = ["AI_effect", "Rev_effect", "new_ai_effect"]
df_testing[effect_columns]

In [None]:
# Binarise the text columns: 1 when the cell contains "yes", otherwise 0.
effect_truth = df_testing.Rev_effect.str.contains("yes").apply(int)
effect_old_pred = df_testing.AI_effect.str.contains("yes").apply(int)

# Displays (accuracy of AI_effect, accuracy of new_ai_effect) against Rev_effect.
(
    accuracy_score(effect_truth, effect_old_pred),
    accuracy_score(effect_truth, df_testing.new_ai_effect),
)

In [None]:
def _sap_label(raw_value):
    """Turn a Rev_SAP string into 1 if its leading number exceeds 1, else 0.

    Everything from the first "p" onward is removed (the pattern needs at least
    one character after that "p") and the remainder is parsed as a float.
    """
    leading_number = float(re.sub(r'p.+', '', raw_value))
    return int(leading_number > 1)


clean_sap_gt = df_testing.Rev_SAP.apply(_sap_label)

In [None]:
# Side-by-side view of the SAP columns.
sap_columns = ["AI_SAP", "Rev_SAP", "new_ai_sap"]
df_testing[sap_columns]

In [None]:
# Encode the AI_SAP yes/no answers as 1/0 so they are comparable with clean_sap_gt.
sap_old_pred = df_testing.AI_SAP.map({"yes":1,"no":0})

# Displays (accuracy of AI_SAP, accuracy of new_ai_sap) against clean_sap_gt.
(
    accuracy_score(clean_sap_gt, sap_old_pred),
    accuracy_score(clean_sap_gt, df_testing.new_ai_sap),
)

In [None]:
# Side-by-side view of the number-of-subjects columns.
n_columns = ["AI_N", "Rev_N", "new_ai_n"]
df_testing[n_columns]

In [None]:
# new_ai_n is cast to int before the exact-match comparison with Rev_N.
n_truth = df_testing.Rev_N
n_new_pred = df_testing.new_ai_n.apply(int)

# Displays (accuracy of AI_N, accuracy of new_ai_n) against Rev_N.
(
    accuracy_score(n_truth, df_testing.AI_N),
    accuracy_score(n_truth, n_new_pred),
)

In [None]:
# Side-by-side view of the simulation columns.
sim_columns = ["AI_sim", "Rev_sim", "new_ai_sim"]
df_testing[sim_columns]

In [None]:
# Accuracy of the new arms prediction against the Arms_TW column.
arms_truth = df_testing["Arms_TW"]
arms_pred = df_testing["new_ai_arms"]
accuracy_score(arms_truth, arms_pred)

In [None]:
# Side-by-side view of the arms columns.
arms_columns = ["Arms_TW", "new_ai_arms"]
df_testing[arms_columns]

In [None]:
# Side-by-side view of the countries columns.
country_columns = ["Countries_TW", "new_ai_countries"]
df_testing[country_columns]

In [None]:
# Bring both sides to the same form: a comma-joined string of sorted entries.
# Countries_TW is split on "," first; new_ai_countries is sorted directly.
countries_truth = df_testing.Countries_TW.apply(
    lambda x : ",".join(sorted(x.split(",")))
)
countries_pred = df_testing.new_ai_countries.apply(
    lambda x : ",".join(sorted(x))
)
accuracy_score(countries_truth, countries_pred)

In [None]:
# Write the augmented sheet into the directory saved in `cwd`, without the index column.
output_path = cwd + "/output.xlsx"
df_testing.to_excel(output_path, index=False)