In [None]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict
import pandas as pd
import imodelsx.llm
import numpy as np
import paper_parsing
import prompts
import openai
# Read the OpenAI API key from a local key file. The context manager closes the
# file handle right after reading instead of leaving it open until garbage collection.
# NOTE(review): the path is absolute and user-specific; adjust it when running elsewhere.
with open('/home/chansingh/.OPENAI_KEY') as key_file:
    openai.api_key = key_file.read().strip()

### Load papers and extract text

In [None]:
# Prerequisite: the papers must first be downloaded manually from
# https://drive.google.com/drive/folders/1OUXtsddxEAOl3tKEZegBQSwArUecb-6J into ../papers
# (disabled alternative: pubmed.download_open_source_papers(df))
# Load the sheet as `df` together with `ids_with_paper`; both are passed to the
# extraction calls in later cells.
# NOTE(review): presumably `ids_with_paper` holds the ids of rows that have a PDF -- confirm in paper_parsing.
df, ids_with_paper = paper_parsing.download_gsheet()

### Extract info from the text -- add values to the following columns:
- num_male, num_female, num_total, num_male_evidence_span, num_female_evidence_span, num_total_evidence_span
- num_white, num_black, num_latino, num_asian, race_evidence_span

In [None]:
# Convert each downloaded PDF to plain text (creates num.txt next to num.pdf).
paper_parsing.extract_texts_from_pdf(ids_with_paper, papers_dir='../papers')

# LLM used for the extraction ("gpt-3.5-turbo-0613" is an alternative checkpoint).
llm = imodelsx.llm.get_llm("gpt-4-0613")
# NOTE(review): presumably the delay before a failed LLM call is retried -- confirm in imodelsx.
imodelsx.llm.LLM_CONFIG['LLM_REPEAT_DELAY'] = 10

# Run each prompt set in turn: fetch its properties / functions / content string,
# then hand them to paper_parsing together with `df` and the LLM.
# (prompts.get_prompts_gender_and_race() is a combined prompt set that is currently not used.)
# The return value of add_columns_based_on_properties is not used here, so `df`
# is presumably updated in place -- verify against paper_parsing.
for get_prompts in (prompts.get_prompts_gender, prompts.get_prompts_demographics):
    properties, functions, content_str = get_prompts()
    print('attempting to add', properties.keys())
    paper_parsing.add_columns_based_on_properties(
        df, ids_with_paper, properties, functions, content_str, llm
    )

### Evaluate
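For each extracted `num_*` column that has a human-labeled `*_corrected` counterpart, reports the fraction of labeled papers whose extracted number is within 1 of the human-labeled value (papers with a missing or 'Unk' label are skipped).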

In [None]:
# Score every extracted `num_*` column against its human-labeled
# `<column>_corrected` counterpart. A prediction counts as correct when it
# differs from the label by at most 1.
print('total n', len(ids_with_paper))
rows = []
for col in df.columns:
    if not col.startswith('num_'):
        continue
    label_col = col + '_corrected'
    if label_col not in df.columns:
        continue

    labels = df[label_col]
    # Only rows with a usable label are scored: not missing and not marked 'Unk'.
    has_label = labels.notnull() & ~(labels == 'Unk')
    truth = labels[has_label].astype(int)
    guess = df[col].apply(paper_parsing.cast_int)[has_label].astype(int)

    rows.append({
        'target': col,
        'acc': ((truth - guess).abs() <= 1).mean(),
        'n_labeled': len(truth),
    })

# One row per evaluated column: target name, accuracy, number of labeled rows.
d = pd.DataFrame(rows).round(2)
d

In [None]:
# Write `df` to ../data/main.csv; index=False leaves the row index out of the file.
df.to_csv('../data/main.csv', index=False)