In [None]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict
import pandas as pd
import imodelsx.llm
import numpy as np
import paper_setup
import paper_parsing
import prompts
import openai
import eval
# Read the OpenAI key inside a context manager so the file handle is closed
# immediately instead of being left for the garbage collector.
# NOTE(review): the absolute, user-specific path makes the notebook non-portable;
# consider reading the key from an environment variable instead.
with open('/home/chansingh/.OPENAI_KEY') as key_file:
    openai.api_key = key_file.read().strip()
# presumably the delay (in seconds) imodelsx waits before repeating an LLM call -- TODO confirm
imodelsx.llm.LLM_CONFIG['LLM_REPEAT_DELAY'] = 1

### Load df with groundtruth values and paper ids

In [None]:
# Prerequisite: the papers must first be downloaded manually from
# https://drive.google.com/drive/folders/1OUXtsddxEAOl3tKEZegBQSwArUecb-6J into ../papers
# Disabled helper call, kept for reference (it needs `df`, which is only defined below):
# paper_setup.download_open_source_papers(df)
# Load the dataframe and `ids_with_paper` (presumably the ids of rows that have a paper -- TODO confirm).
df, ids_with_paper = paper_setup.download_gsheet()

### Extract info from the pdfs -- add values to the following columns:
- num_male, num_female, num_total, num_male_evidence_span, num_female_evidence_span, num_total_evidence_span
- num_white, num_black, num_latino, num_asian, race_evidence_span

In [None]:
# Extract text from the pdfs (creates file num.txt for each file num.pdf)
paper_setup.extract_texts_from_pdf(ids_with_paper, papers_dir=paper_setup.papers_dir)

# LLM used for the extraction (alternative model: gpt-3.5-turbo-0613)
llm = imodelsx.llm.get_llm("gpt-4-0613")

# Gender and race are extracted in two separate passes, gender first.
# The combined variant, prompts.get_prompts_gender_and_race(), is left disabled.
for get_prompts in (prompts.get_prompts_gender, prompts.get_prompts_race):
    properties, functions, content_str = get_prompts()
    print('attempting to add', properties.keys())
    paper_parsing.add_columns_based_on_properties(
        df, ids_with_paper, properties, functions, content_str, llm
    )

### Evaluate
Evaluates whether each extracted number is within 1 of the human-labeled value

In [32]:
# process counts (convert percentages to nums if conditions are correct)
def process_gender_counts(row):
    """Return the (male, female) counts for one row, converting percentages when possible.

    The conversion only happens when both `num_male` and `num_female` contain a
    number and are percentages, and `num_total` contains a number that is not a
    percentage. In every other case the two values are returned untouched.

    Note: `eval` here is the project module imported by the notebook, not the builtin.

    Parameters
    ----------
    row : mapping with the keys 'num_male', 'num_female' and 'num_total'

    Returns
    -------
    tuple (male, female)
    """
    male, female, total = row['num_male'], row['num_female'], row['num_total']

    def _is_numeric_percentage(value):
        return eval.str_contains_number(value) and eval.str_is_percentage(value)

    # Guard 1: both gender fields must be percentages
    if not (_is_numeric_percentage(male) and _is_numeric_percentage(female)):
        return male, female
    # Guard 2: the total must be an absolute number
    if not eval.str_contains_number(total) or eval.str_is_percentage(total):
        return male, female

    male_pct = eval.percentage_to_num(male)
    # The female percentage is parsed but its value is not used below.
    # NOTE(review): the female count is the complement of the male count, which only
    # matches the reported female percentage when the two sum to 100 -- confirm intended.
    eval.percentage_to_num(female)
    total_count = int(total)

    male_count = round(male_pct * total_count / 100)
    return male_count, total_count - male_count

# Run the row-wise conversion once; every entry is a (male, female) pair.
gender_count_processed = df.apply(process_gender_counts, axis=1)
# Unpack the pairs back into their columns, stored as strings.
for position, column in enumerate(['num_male', 'num_female']):
    df[column] = gender_count_processed.map(lambda pair, i=position: pair[i]).astype(str)

In [33]:
print("total n", len(ids_with_paper))

# Columns that are scored against their human-labeled "<name>_corrected" counterpart.
targets = ("num_male", "num_female", "num_total")
d = defaultdict(list)
for k in (col for col in df.columns if col in targets):
    labels = df[k + "_corrected"]
    # keep rows that have a label and whose label is not the "Unk" placeholder
    idxs_with_labels = labels.notnull() & ~(labels == "Unk")
    gt = labels[idxs_with_labels].astype(int)
    pred = df[k].apply(eval.cast_int)[idxs_with_labels].astype(int)
    # number of labeled rows whose prediction is within 1 of the label
    # (stored under "recall" although it is a count, not a ratio)
    n_within_one = (np.abs(gt - pred) <= 1).sum()
    # number of values in the predicted column that contain a number
    n_with_number = df[k].apply(eval.str_contains_number).sum()
    for field, value in (
        ("target", k),
        ("recall", n_within_one),
        ("n_labeled", len(gt)),
        ("n_predicted_num", n_with_number),
    ):
        d[field].append(value)
d = pd.DataFrame.from_dict(d).round(2)
d

total n 184


Unnamed: 0,target,recall,n_labeled,n_predicted_num
0,num_male,69,85,101
1,num_female,69,85,101
2,num_total,84,105,146


In [36]:
# Run the race-keyword check; the returned frame is read back through its
# 'paper_contains_race_keywords' column on the next line.
df = paper_parsing.check_race_keywords(df, ids_with_paper)
# A bare expression in the middle of a cell is evaluated and discarded (only the
# last expression of a cell is displayed), so the count is printed explicitly.
print('papers containing race keywords:', df['paper_contains_race_keywords'].sum())

# convert columns to int
# (presumably eval.int_or_empty leaves missing values blank -- TODO confirm)
cols_int = ['ref_year', 'found_paper (0=no, 1=yes)', 'num_total_corrected']
for c in cols_int:
    df[c] = df[c].apply(eval.int_or_empty)

# Write the frame to disk without the index column.
df.to_csv('../data/main.csv', index=False)

100%|██████████| 184/184 [00:00<00:00, 1118.92it/s]


In [37]:
# Preview the first rows of df (shown because it is the cell's last expression)
df.head()

Unnamed: 0,id,full_title_en,short_description_en,ref_text,ref_href,ref_year,ref_href_corrected,"found_paper (0=no, 1=yes)",num_male,num_female,...,num_black,num_latino,num_asian,evidence_span_race,num_white_corrected,num_black_corrected,num_latino_corrected,num_asian_corrected,paper_contains_race_keywords,paper_contains_keywords
0,19,Absolute Neutrophil Count (ANC),Neutropenia (after chemotherapy).,"Al-Gwaiz LA, Babay HH. The diagnostic value of...",https://www.ncbi.nlm.nih.gov/pubmed/17709921,2007,,1,55.0,50.0,...,,,,,Unk,Unk,Unk,Unk,0.0,
1,23,APGAR Score,Assesses neonates 1 & 5 mins postpartum.,Apgar V. A proposal for a new method of evalua...,http://www.ncbi.nlm.nih.gov/pubmed/13083014,1953,,1,,,...,,,,,,,,,0.0,
2,25,Basal Energy Expenditure,Estimates minimum caloric requirements.,"Harris J, Benedict F. A biometric study of bas...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,1919,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,1,136.0,103.0,...,103.0,94.0,,"Measurements on 136 men, 103 women and 94 new-...",,,,,0.0,
3,31,Calcium Correction for Hypoalbuminemia,Corrects Ca for hypoalbuminemia.,"Payne RB, Little AJ, Williams RB, Milner JR. I...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,1973,,1,,,...,0.0,0.0,0.0,,,,,,1.0,
4,33,PSI/PORT Score: Pneumonia Severity Index for CAP,"Inpatient risk of VTE, need for anticoagulation.","Barbar S, Noventa F, Rossetto V, Ferrari A, Br...",https://www.ncbi.nlm.nih.gov/pubmed/20738765,2010,,1,365.0,346.0,...,,,,,,,,,0.0,
