In [None]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict

import dvu
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import os.path
import fitz
from tqdm import tqdm
import pathlib
import imodelsx.llm
import json
import requests
import numpy as np
import openai
import prompts
# OpenAI credentials: prefer the OPENAI_API_KEY environment variable and fall
# back to the key file this notebook has always used. Path.read_text() closes
# the file after reading, unlike the previous bare open(...).read().
openai.api_key = (
    os.environ.get('OPENAI_API_KEY')
    or pathlib.Path('/home/chansingh/.OPENAI_KEY').read_text()
).strip()
plt.style.use('default')
dvu.set_style()

# Load the main table and keep only the rows that have an id.
df = pd.read_csv('../data/main.csv')
df = df[df['id'].notna()]

# extract text from pdfs
# Ids of rows whose "found_paper" flag is 1 or 2. The column header only
# documents 0 and 1; what 2 stands for is not visible here -- TODO confirm.
found_paper = df["found_paper (0=no, 1=yes)"]
ids_with_paper = df[(found_paper == 1) | (found_paper == 2)].id

# d = pd.read_csv('../data/mini.csv')
def extract_texts_from_pdf(ids, papers_dir='../papers'):
    """Write a plain-text copy next to every paper PDF found in ``papers_dir``.

    For each id the file ``<papers_dir>/<id>.pdf`` is opened with ``fitz``; the
    text of all pages is joined with form-feed characters (``chr(12)``), every
    hyphen that is immediately followed by a newline is removed together with
    that newline, and the result is written UTF-8 encoded to
    ``<papers_dir>/<id>.txt``. Ids without a matching PDF are skipped.

    Parameters
    ----------
    ids : iterable
        Paper ids. Integral floats (e.g. ``12.0``, which is what a numeric
        column containing missing values yields) are rendered as ``"12"`` so
        the file names agree with the ``str(int(row.id))`` naming used when the
        text files are read back later in this notebook. Every other value is
        rendered with ``str``.
    papers_dir : str
        Directory that holds the PDFs and receives the text files.
    """
    for paper_id in tqdm(ids):
        if isinstance(paper_id, float) and paper_id.is_integer():
            stem = str(int(paper_id))
        else:
            stem = str(paper_id)

        paper_file = join(papers_dir, stem + ".pdf")
        if not pathlib.Path(paper_file).exists():
            continue

        with fitz.open(paper_file) as doc:  # open document
            text = chr(12).join([page.get_text() for page in doc])
        # undo hyphenation at line breaks
        text = text.replace('-\n', '')
        pathlib.Path(join(papers_dir, stem + ".txt")).write_bytes(text.encode())


extract_texts_from_pdf(ids_with_paper)

### Ask questions about the text

In [None]:
# Checkpoint used for every extraction call in this notebook.
# Alternatives kept for quick switching: "gpt-3.5-turbo-0613", "gpt-4-32k-0613".
LLM_CHECKPOINT = "gpt-4-0613"
llm = imodelsx.llm.get_llm(LLM_CHECKPOINT)

In [None]:
# Prompt pieces for the gender-count extraction task:
#   properties  - its keys become the result columns added to `df`
#   functions   - passed to the llm call as `functions=`
#   content_str - template that is filled via `.format(input=...)`
# Swap in prompts.get_prompts_demographics() to use the demographics prompt instead.
prompt_parts = prompts.get_prompts_gender()
properties, functions, content_str = prompt_parts

# Single user message; its content is replaced with the formatted prompt on each call.
messages = [dict(role="user", content=content_str)]

In [None]:
# Two small hand-written inputs for trying the prompt manually.
# They share every sentence except the race/ethnicity line in the second one.
_toy_intro = "This study was about treating diabetes. It was a very difficult study."
_toy_counts = "One hundred and five patients, 55 males and 50 females were included."
_toy_setting = (
    "The study took 200 days to complete. "
    "The study was conducted in the United States."
)
_toy_race = "Ten of the patients were white, 20 were asian, and the rest were black."
_toy_institution = (
    "The study was conducted by the University of California, San Francisco."
)

# example with answer: One hundred and five patients, 55 males and 50 females
toy_input1 = "\n".join([_toy_intro, _toy_counts, _toy_setting, _toy_institution])

# example with answer: One hundred and five patients, 55 males and 50 females, 10 white, 75 black
toy_input2 = "\n".join(
    [_toy_intro, _toy_counts, _toy_setting, _toy_race, _toy_institution]
)

# messages[0]['content'] = content_str.format(input=toy_input1)
# msg = llm(messages, functions=functions, return_str=False, temperature=0.0)
# args = json.loads(msg.get('function_call')['arguments'])
# print(json.dumps(args, indent=2))

# messages[0]['content'] = content_str.format(input=toy_input2)
# msg = llm(messages, functions=functions, return_str=False, temperature=0.0)
# args = json.loads(msg.get('function_call')['arguments'])
# print(json.dumps(args, indent=2))

In [21]:
def rename_to_none(x: str):
    """Map placeholder answers to None and pass everything else through.

    Parameters
    ----------
    x
        A value taken from the parsed function-call arguments. Usually a
        string, but any JSON value (number, list, dict, None) is accepted.

    Returns
    -------
    None if ``x`` is one of the strings ``""``, ``"unknown"`` or ``"N/A"``;
    otherwise ``x`` unchanged.
    """
    # The isinstance guard keeps unhashable JSON values (lists, dicts) from
    # raising TypeError in the set-membership test.
    if isinstance(x, str) and x in {"", "unknown", "N/A"}:
        return None
    return x


def call_on_subsets(x: str, subset_len_tokens=4750, max_calls=3):
    """Query the LLM on consecutive windows of ``x`` until one yields arguments.

    The text is cut into non-overlapping windows of ``subset_len_tokens * 4``
    characters (tokens are approximated as 4 characters each). Windows are
    tried in order; the first response that carries a ``function_call`` with
    parseable JSON arguments is returned.

    Relies on the module-level ``llm``, ``functions``, ``content_str`` and
    ``messages``; ``messages[0]["content"]`` is overwritten with the formatted
    prompt on every call.

    Parameters
    ----------
    x : str
        Full input text.
    subset_len_tokens : int
        Approximate window length in tokens.
    max_calls : int
        Maximum number of windows (and therefore LLM calls) to try.

    Returns
    -------
    The decoded function-call arguments of the first successful window, or
    None if no window produced usable arguments.
    """
    subset_len_chars = subset_len_tokens * 4

    for subset_num in range(max_calls):
        # every window after the first must have at least half a window of text left
        if subset_num > 0 and len(x) < (subset_num + 0.5) * subset_len_chars:
            break

        start = subset_num * subset_len_chars
        subset = x[start : start + subset_len_chars]

        messages[0]["content"] = content_str.format(input=subset)
        msg = llm(messages, functions=functions, return_str=False, temperature=0.0)
        if msg is None or msg.get("function_call") is None:
            continue

        try:
            return json.loads(msg.get("function_call")["arguments"])
        except json.JSONDecodeError:
            # The model is not guaranteed to emit valid JSON; treat a malformed
            # payload as "no answer for this window" and try the next one
            # instead of aborting the whole run.
            continue

    return None


def check_evidence(ev: str, real_input: str):
    """Tell whether the evidence span ``ev`` really occurs in ``real_input``.

    The comparison ignores all whitespace and letter case, so spans that were
    re-wrapped or re-capitalized still match.

    Parameters
    ----------
    ev : str
        Evidence span to look for. Anything that is not a string (including
        None) counts as "no evidence".
    real_input : str
        Text that the span is supposed to be quoted from.

    Returns
    -------
    bool
        True only if ``ev`` contains at least one non-whitespace character and
        its whitespace-free, lower-cased form is a substring of the
        whitespace-free, lower-cased ``real_input``.
    """
    if not isinstance(ev, str):
        return False

    # remove all whitespace
    ev_compact = "".join(ev.split()).lower()
    if not ev_compact:
        # An empty needle is a substring of every text, so it must not count
        # as verified evidence.
        return False

    text_compact = "".join(real_input.split()).lower()
    return ev_compact in text_compact


# initialize: one result column per extracted property, reset to None
for k in properties.keys():
    df.loc[:, k] = None
# df["approx_tokens"] = None

# run loop
ids_missing_text = []
for paper_id in tqdm(ids_with_paper):
    # `i` is an index *label*. `df` was filtered after loading, so labels need
    # not equal row positions; address the row only through `.loc`.
    i = df[df.id == paper_id].index[0]
    paper_file = join("../papers", str(int(paper_id)) + ".txt")
    if not pathlib.Path(paper_file).exists():
        # no text was extracted for this paper; report it after the loop
        ids_missing_text.append(paper_id)
        continue
    # the text files are written as UTF-8 bytes, so decode them the same way
    real_input = pathlib.Path(paper_file).read_text(encoding="utf-8")
    # gpt4 has 8k token window (some of it is functions, etc.)
    # approx_tokens = (len(real_input) / 4)
    # df.loc[i, "approx_tokens"] = approx_tokens

    args = call_on_subsets(real_input)

    # print(json.dumps(args, indent=2))
    if args is not None:
        for k in properties.keys():
            if k in args:
                # set the value at row label i and column k to the value of args[k]
                df.loc[i, k] = rename_to_none(args[k])

                # remove spans if they are not actually contained in the text
                if k in ["num_male_evidence_span", "num_female_evidence_span"]:
                    if not check_evidence(args[k], real_input):
                        df.loc[i, k] = None

if ids_missing_text:
    print(f"skipped {len(ids_missing_text)} ids without a text file: {ids_missing_text}")
print("completed!")

In [None]:
def cast_int(x):
    """Convert ``x`` to int, returning -1 when it cannot be converted.

    Parameters
    ----------
    x
        Any value, e.g. an int, a float, a numeric string or None.

    Returns
    -------
    int
        ``int(x)`` on success, otherwise the sentinel -1.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError:
        # non-numeric strings and NaN; OverflowError: +/-inf.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return -1


# Exact-match accuracy of the extracted counts against the `*_corrected` columns.
for k in ('num_male', 'num_female'):
    corrected = df[f'{k}_corrected']
    # score only rows whose corrected value is present and not 'Unk'
    idxs = corrected.notnull() & (corrected != 'Unk')
    gt = corrected[idxs].astype(int)
    # cast_int turns predictions that are not integers into -1
    pred = df.loc[idxs, k].apply(cast_int).astype(int)
    acc = (gt == pred).mean()
    print(f'{k} acc={acc:0.2f} n={len(gt)}')

In [None]:
# Persist the table, including the extracted columns, without the index.
# NOTE: this writes to the same path the table was loaded from.
df.to_csv(path_or_buf='../data/main.csv', index=False)

# Look at gender ratios

In [None]:
# Keep only rows where both corrected counts are present and not 'Unk'.
male_corrected = df['num_male_corrected']
female_corrected = df['num_female_corrected']
idxs = (
    male_corrected.notnull()
    & (male_corrected != 'Unk')
    & female_corrected.notnull()
    & (female_corrected != 'Unk')
)
male = male_corrected[idxs].astype(int)
female = female_corrected[idxs].astype(int)

In [None]:
# Male-to-female ratio per study.
ratios = (male / female).values
# Keep only finite ratios: a study with zero females gives inf, and one with
# zero males and zero females gives NaN, which would make the ordering
# produced by sorted() unreliable.
ratios = ratios[np.isfinite(ratios)]
sorted(ratios)