In [None]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict
import pandas as pd
import imodelsx.llm
import numpy as np
import paper_setup
import paper_parsing
import prompts
import openai
import eval
# Read the OpenAI API key from a local key file. The context manager closes the
# file handle right away instead of leaving it open until garbage collection.
# NOTE(review): this is an absolute, user-specific path -- other users must edit it.
with open('/home/chansingh/.OPENAI_KEY') as key_file:
    openai.api_key = key_file.read().strip()
# Set the LLM_REPEAT_DELAY entry of imodelsx's LLM config to 1
# (presumably the wait between repeated LLM calls -- confirm in imodelsx).
imodelsx.llm.LLM_CONFIG['LLM_REPEAT_DELAY'] = 1

### Load df with groundtruth values and paper ids

In [None]:
# Prerequisite: the papers need to first be downloaded from
# https://drive.google.com/drive/folders/1OUXtsddxEAOl3tKEZegBQSwArUecb-6J into ../papers
# (disabled alternative call: paper_setup.download_open_source_papers(df))
# Load the dataframe together with the paper ids that the cells below iterate over.
# NOTE(review): presumably ids_with_paper lists the ids that have a paper available -- confirm.
df, ids_with_paper = paper_setup.download_gsheet()

### Extract info from the pdfs -- add values to the following columns:
- num_male, num_female, num_total, num_male_evidence_span, num_female_evidence_span, num_total_evidence_span
- num_white, num_black, num_latino, num_asian, race_evidence_span

In [None]:
# Turn each pdf into text (create file num.txt for each file num.pdf)
paper_setup.extract_texts_from_pdf(ids_with_paper, papers_dir=paper_setup.papers_dir)

# LLM used to answer the extraction prompts (alternative: gpt-3.5-turbo-0613)
llm = imodelsx.llm.get_llm("gpt-4-0613")

# Gender and race are extracted in two separate passes, each with its own prompt.
# A single combined prompt is available via prompts.get_prompts_gender_and_race().
for build_prompts in (prompts.get_prompts_gender, prompts.get_prompts_race):
    properties, functions, content_str = build_prompts()
    print('attempting to add', properties.keys())
    paper_parsing.add_columns_based_on_properties(
        df, ids_with_paper, properties, functions, content_str, llm
    )

### Evaluate
Evaluates whether each extracted number is within 1 of the human-labeled value

In [None]:
# process counts (convert percentages to nums if conditions are correct)
def process_gender_counts(row):
    """Return ``(num_male, num_female)`` for one row, as counts where possible.

    The conversion happens only when both gender fields contain a number and
    are percentages, and the total contains a number and is not a percentage.
    In that case the male count is the rounded share of the total and the
    female count is the remainder (total minus male). Otherwise the two
    gender values are returned exactly as they appear in the row.
    """
    male, female, total = row['num_male'], row['num_female'], row['num_total']

    def _is_numeric_percentage(value):
        return eval.str_contains_number(value) and eval.str_is_percentage(value)

    convertible = (
        _is_numeric_percentage(male)
        and _is_numeric_percentage(female)
        and eval.str_contains_number(total)
        and not eval.str_is_percentage(total)
    )
    if not convertible:
        return male, female

    male_pct = eval.percentage_to_num(male)
    # The female percentage is parsed but its value is not used: the female
    # count is derived from the total so that the two counts add up to it.
    eval.percentage_to_num(female)
    n_total = int(total)
    male_count = round(male_pct * n_total / 100)
    return male_count, n_total - male_count

# Compute the (male, female) pair for every row, then write each element back
# into its column as a string.
gender_count_processed = df.apply(process_gender_counts, axis=1)
for position, column in enumerate(['num_male', 'num_female']):
    df[column] = gender_count_processed.apply(
        lambda pair, i=position: pair[i]
    ).astype(str)

In [None]:
print("total n", len(ids_with_paper))

# One summary record per evaluated target column, in dataframe column order.
target_columns = ("num_male", "num_female", "num_total")
records = []
for column in (c for c in df.columns if c in target_columns):
    labels = df[column + "_corrected"]
    # Keep only rows that carry a usable human label (not missing, not "Unk").
    has_label = labels.notnull() & ~(labels == "Unk")
    truth = labels[has_label].astype(int)
    predicted = df[column].apply(eval.cast_int)[has_label].astype(int)
    records.append(
        {
            "target": column,
            # Number of labeled rows whose prediction is within 1 of the label.
            "recall": (np.abs(truth - predicted) <= 1).sum(),
            "n_labeled": len(truth),
            # Number of extracted values which contain a number.
            "n_predicted_num": df[column].apply(eval.str_contains_number).sum(),
        }
    )
d = pd.DataFrame(records).round(2)
d

In [None]:
# Flag the papers that contain race keywords, then persist the full table.
df = paper_parsing.check_race_keywords(df, ids_with_paper)
df.to_csv('../data/main.csv', index=False)
# Keep the count as the cell's last expression: a notebook only displays the
# final expression of a cell, so a bare expression placed earlier is discarded.
df['paper_contains_race_keywords'].sum()