In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict
import pandas as pd
import imodelsx.llm
import numpy as np
import paper_setup
import paper_parsing
import openai
import eval
# Read the OpenAI API key from a local key file. The context manager closes the
# file handle as soon as the key has been read (the bare open(...).read() left it open).
# NOTE(review): this is an absolute, user-specific path -- consider an environment
# variable or a configurable location so the notebook runs on other machines.
with open('/home/chansingh/.OPENAI_KEY') as key_file:
    openai.api_key = key_file.read().strip()
# Set the LLM_REPEAT_DELAY entry of imodelsx's LLM config to 1
# (units are not visible here -- presumably seconds; confirm in imodelsx.llm).
imodelsx.llm.LLM_CONFIG['LLM_REPEAT_DELAY'] = 1

### Load the DataFrame with ground-truth values and paper IDs

In [2]:
# Optional setup step (left disabled): paper_setup.download_open_source_papers(df)
# NOTE(review): that call takes `df`, so if re-enabled it has to run after the load below.
# The papers must first be downloaded from https://drive.google.com/drive/folders/1OUXtsddxEAOl3tKEZegBQSwArUecb-6J into ../papers
# Load the labeled sheet; returns the DataFrame `df` and `ids_with_paper`, both used by the cells below.
df, ids_with_paper = paper_setup.download_gsheet()

# Optional (left disabled): export the rows whose paper was not found to missing_papers.csv
# cols = ['id', 'ref_text', 'ref_href']
# idx = df['found_paper (0=no, 1=yes)'] == '0'
# df[idx][cols].to_csv('missing_papers.csv', index=False)

### Extract info from the pdfs -- add values to the following columns:
- num_male, num_female, num_total, num_male_evidence_span, num_female_evidence_span, num_total_evidence_span
- num_white, num_black, num_latino, num_asian, race_evidence_span

In [None]:
# Run the number extraction for the papers in `ids_with_paper`.
# The return value is not captured, so the new columns are presumably written into
# `df` in place -- confirm in paper_parsing.
# NOTE(review): extract_texts=False presumably skips re-extracting text from the PDFs -- confirm.
paper_parsing.extract_nums_and_add_to_df(df, ids_with_paper, extract_texts=False)

### Evaluate
Evaluates whether each extracted number is within 1 of the human-labeled value

In [3]:
# Post-process the gender counts row by row. Each per-row result is indexed below
# with [0] for the male count and [1] for the female count.
gender_count_processed = df.apply(eval.process_gender_counts, axis=1)
male_counts = gender_count_processed.apply(lambda counts: counts[0])
female_counts = gender_count_processed.apply(lambda counts: counts[1])

# Store the processed counts back on df as strings.
df["num_female"] = female_counts.astype(str)
df["num_male"] = male_counts.astype(str)

# Report how many papers are available, then show the metrics table
# (the call is the cell's last expression, so the notebook renders its result).
print("total n", len(ids_with_paper))
metric_columns = ["num_male", "num_female", "num_total"]
eval.compute_metrics(df, columns_without_corrected=metric_columns)

total n 537


Unnamed: 0,target,recall,n_labeled,n_predicted_num
0,num_male,192,221,254
1,num_female,192,221,257
2,num_total,219,259,384


### Final processing and save

In [4]:
# Add the "paper_contains_race_keywords" column for the papers we have.
df = paper_parsing.check_race_keywords(df, ids_with_paper)
# A bare expression in the middle of a cell is evaluated but never displayed,
# so print the column sum explicitly to make it visible in the cell output.
print("paper_contains_race_keywords sum:", df["paper_contains_race_keywords"].sum())

# Convert columns to int via eval.int_or_neg1
# (presumably maps values that cannot be parsed to -1 -- confirm in eval).
cols_int = ["ref_year", "found_paper (0=no, 1=yes)", "paper_contains_race_keywords"]
for c in cols_int:
    df[c] = df[c].apply(eval.int_or_neg1)

# Sort descending on every key: found-paper flag first, then the race-keyword flag,
# then year, then id.
sort_keys = ["found_paper (0=no, 1=yes)", "paper_contains_race_keywords", "ref_year", "id"]
df = df.sort_values(by=sort_keys, ascending=False)

# Persist the final table without the index column.
df.to_csv("../data/main.csv", index=False)

100%|██████████| 537/537 [00:00<00:00, 1172.82it/s]
