In [None]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict

import dvu
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import os.path
from tqdm import tqdm
import pathlib
import imodelsx.llm
import json
import requests
import numpy as np
import openai
import pubmed
import paper_parsing
import prompts
# Load the OpenAI API key.
# The key file is read through pathlib so the file handle is closed right away
# (a bare open(...).read() leaves closing to the garbage collector). If the key
# file does not exist on this machine, fall back to the OPENAI_API_KEY
# environment variable so the notebook is not tied to one home directory.
_openai_key_file = pathlib.Path('/home/chansingh/.OPENAI_KEY')
if _openai_key_file.is_file():
    openai.api_key = _openai_key_file.read_text().strip()
elif os.environ.get('OPENAI_API_KEY'):
    openai.api_key = os.environ['OPENAI_API_KEY'].strip()
else:
    # Same exception type the original open() call raised for a missing file.
    raise FileNotFoundError(
        f'No OpenAI key found: {_openai_key_file} does not exist and '
        'the OPENAI_API_KEY environment variable is not set'
    )

# Plot styling: reset matplotlib to its defaults, then apply the dvu style.
plt.style.use('default')
dvu.set_style()

# Load the main table and refresh its reference links.
df = pd.read_csv('../data/main.csv')
df["ref_href"] = pubmed.get_updated_refs(df)
# pubmed.download_open_source_papers(df)

# Ids of rows flagged as having a paper, and ids of the PDFs actually on disk
# (file names are expected to be "<id>.pdf").
found_mask = df["found_paper (0=no, 1=yes)"] > 0
ids_with_paper = df.loc[found_mask, "id"].astype(int).values
pdf_names = [name for name in os.listdir("../papers") if name.endswith(".pdf")]
ids_found = sorted(int(name.replace(".pdf", "")) for name in pdf_names)

# Cross-check the flagged ids against the files that were found.
df = paper_parsing.check_papers(df, ids_with_paper, ids_found)

# Extract text from the PDFs of the flagged papers.
paper_parsing.extract_texts_from_pdf(ids_with_paper)

### Ask questions about the text

In [None]:
# Model used for this run; the alternatives are kept for quick switching.
# llm = imodelsx.llm.get_llm("gpt-3.5-turbo-0613")
# llm = imodelsx.llm.get_llm("gpt-4-32k-0613")
llm = imodelsx.llm.get_llm("gpt-4-0613")

# Prompt set for this run (gender); the demographics set is the alternative.
# properties, functions, content_str = prompts.get_prompts_demographics()
properties, functions, content_str = prompts.get_prompts_gender()

# Single user message carrying the prompt text.
messages = [dict(role="user", content=content_str)]

In [None]:
# Make sure every property to be extracted has a column to write into.
for k in properties.keys():
    if k not in df.columns:
        df.loc[:, k] = None

# Columns holding quoted evidence; a value is kept only if it can be verified
# against the paper text.
evidence_span_keys = ["num_male_evidence_span", "num_female_evidence_span"]

# NOTE(review): `llm`, `functions` and `messages` from the previous cell are not
# passed to call_on_subsets here -- confirm the helper obtains them itself.

# Run the extraction on every paper whose text is available.
for paper_id in tqdm(ids_with_paper):  # not named `id`: that shadows the builtin
    # `i` is an index *label*, so it must be used with .loc (label-based).
    # Using .iloc (position-based) picks the wrong row whenever df does not
    # have a default 0..n-1 index.
    i = df[df.id == paper_id].index[0]
    row = df.loc[i]
    paper_file = join("../papers", str(int(row.id)) + ".txt")

    try:
        real_input = pathlib.Path(paper_file).read_text()
        args = paper_parsing.call_on_subsets(real_input)

        # print(json.dumps(args, indent=2))
        if args is not None:
            for k in properties.keys():
                if k in args:
                    df.loc[i, k] = paper_parsing.rename_to_none(args[k])

                    # remove spans if they are not actually contained in the text
                    if k in evidence_span_keys:
                        if not paper_parsing.check_evidence(args[k], real_input):
                            df.loc[i, k] = None
    except Exception as e:
        # Best effort: report the failing paper and continue with the rest.
        print(row.id, e)
print("completed!")

In [None]:
# Compare extracted counts with the manually corrected values, using only rows
# that have a corrected value other than 'Unk'.
# NOTE(review): `cast_int` is not defined or imported in the visible cells --
# confirm where it comes from before a fresh-kernel run.
for k in ['num_male', 'num_female']:
    corrected = df[k + '_corrected']
    idxs = corrected.notnull() & (corrected != 'Unk')
    gt = corrected[idxs].astype(int)
    pred = df[k].apply(cast_int)[idxs].astype(int)
    acc = (gt == pred).mean()
    print(f'{k} acc={acc:0.2f} n={len(gt)}')

In [None]:
# Write the table back to the CSV it was loaded from (overwrites the file).
df.to_csv(path_or_buf='../data/main.csv', index=False)

# Look at gender ratios

In [None]:
# Keep rows where both corrected counts are present and not marked 'Unk'.
male_corrected = df['num_male_corrected']
female_corrected = df['num_female_corrected']
male_known = male_corrected.notnull() & (male_corrected != 'Unk')
female_known = female_corrected.notnull() & (female_corrected != 'Unk')
idxs = male_known & female_known

male = male_corrected[idxs].astype(int)
female = female_corrected[idxs].astype(int)

In [None]:
# Per-study male/female ratio as a plain array. Nothing is dropped here:
# a zero female count yields inf, which is still present in `ratios`.
ratio_series = male / female
ratios = ratio_series.values
print(sorted(ratios))

In [None]:
# Histogram of log10(male / female) per study, split at parity (ratio == 1).
plt.figure(figsize=(4, 2), dpi=300)

# Studies with no female participants have an infinite ratio; count them
# directly rather than by subtraction so other filtered rows are not miscounted.
n_all_men = np.isinf(ratios).sum()

# Keep finite ratios only. np.isfinite also drops NaN (0 / 0), which
# ~np.isinf would keep and which would turn the mean into NaN.
r = ratios[np.isfinite(ratios)]

# A ratio of 0 (no male participants) has log10 == -inf, which makes plt.hist
# fail on a non-finite range, so only positive ratios are log-transformed.
logr = np.log10(r[r > 0])

# 'frac>0' refers to log-ratio > 0, i.e. ratio > 1; the value printed is a count.
print('mean', r.mean(), 'frac>0', (r > 1).sum(), '/', len(r), 'excluding', n_all_men, 'all-men studies')
plt.hist(logr[logr < 0], color='pink') #, bins=100)
plt.hist(logr[logr >= 0], color='C0') #, bins=100)
plt.axvline(0, color='black', ls='--')

# Relabel the log-scale ticks as powers of ten.
ticks = plt.xticks()[0]
plt.xticks(ticks, [f'$10^{{{t}}}$' for t in ticks])
plt.xlabel('Ratio (male / female)')
plt.ylabel('Count')
plt.show()