In [None]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict

import dvu
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import os.path
import fitz
from tqdm import tqdm
import pathlib
import imodelsx.llm
import json
import requests
import numpy as np
import openai
# Read the OpenAI key inside a context manager so the file handle is closed
# immediately instead of being left open until garbage collection.
# NOTE(review): the key location is an absolute, user-specific path; consider
# an environment variable before sharing this notebook.
with open('/home/chansingh/.OPENAI_KEY') as key_file:
    openai.api_key = key_file.read().strip()

# Start from matplotlib's default style, then call dvu's style setup.
plt.style.use('default')
dvu.set_style()

# Load the main spreadsheet and drop every row whose `id` is missing.
df = pd.read_csv('../data/main_updated.csv').dropna(subset=['id'])

# extract text from pdfs
# Keep only the papers that were found (column legend: 1 = yes, 2 = low-qual).
FOUND_COL = "found_paper (0=no, 1=yes, 2=low-qual)"
# Explicit copy: later cells add columns to `d`, which would otherwise be a
# filtered slice of `df` and trigger pandas' SettingWithCopyWarning.
d = df[df[FOUND_COL].isin([1, 2])].copy()
d = d.reset_index(drop=True)  # positional index 0..n-1
for _, row in tqdm(d.iterrows(), total=d.shape[0]):
    # Normalise the id to an int so the file stem matches the one used when the
    # .txt files are read back (str(int(row.id))); a float id would otherwise
    # yield names such as "12.0.pdf" / "12.0.txt".
    paper_stem = str(int(row["id"]))
    paper_file = join("../papers", paper_stem + ".pdf")
    if not pathlib.Path(paper_file).exists():
        continue  # no PDF on disk for this paper, so no .txt is produced
    with fitz.open(paper_file) as doc:  # open document
        # pages are concatenated with a form feed (chr(12)) between them
        text = chr(12).join(page.get_text() for page in doc)
    # drop hyphen + newline pairs (presumably words hyphenated across lines)
    text = text.replace('-\n', '')
    pathlib.Path(join("../papers", paper_stem + ".txt")).write_bytes(
        text.encode("utf-8")
    )

### Ask questions about the text

In [None]:
# Chat model used by the extraction cells below. Alternative checkpoints are
# kept commented out so they can be swapped in by hand.
# llm = imodelsx.llm.get_llm("gpt-3.5-turbo-0613")
llm = imodelsx.llm.get_llm("gpt-4-0613")
# llm = imodelsx.llm.get_llm("gpt-4-32k-0613")

In [None]:
# JSON-schema fields the model is asked to fill in. Each demographic group is a
# tuple of (field suffix, wording used in the description). Within a group every
# count field comes before the matching evidence-span fields, which fixes the
# key order: gender counts, gender spans, race counts, race spans.
properties = {
    field: {"type": "string", "description": description}
    for group in (
        (("male", "male"), ("female", "female")),
        (
            ("white", "white/caucasian"),
            ("black", "black/african american"),
            ("latino", "latino"),
        ),
    )
    for field, description in (
        [
            (f"num_{suffix}", f"The number of {wording} patients in the study")
            for suffix, wording in group
        ]
        + [
            (
                f"num_{suffix}_evidence_span",
                f"The long text span in the input that includes evidence for num_{suffix}.",
            )
            for suffix, _ in group
        ]
    )
}

# Function-calling spec: only the gender counts and their evidence spans are
# required; the race fields are optional.
functions = [
    dict(
        name="extract_patient_nums",
        description="Get the number of patients in this study for each gender and race.",
        parameters=dict(
            type="object",
            properties=properties,
            required=[
                "num_male",
                "num_female",
                "num_male_evidence_span",
                "num_female_evidence_span",
            ],
        ),
    ),
]

# Prompt template; `{input}` is replaced with the paper text before each call.
content_str = (
    "### QUESTION: How many male and female patients were in this study?\n"
    "\n"
    "###  STUDY: {input}"
)

# Single user message whose content is overwritten for every paper.
messages = [dict(role="user", content=content_str)]

In [None]:
# Toy study whose expected answer is 55 males and 50 females (105 patients).
toy_input1 = "\n".join(
    (
        "This study was about treating diabetes. It was a very difficult study.",
        "One hundred and five patients, 55 males and 50 females were included.",
        "The study took 200 days to complete. The study was conducted in the United States.",
        "The study was conducted by the University of California, San Francisco.",
    )
)

# Same toy study plus a race breakdown: 10 white, 20 asian and the remaining
# 75 black.
toy_input2 = "\n".join(
    (
        "This study was about treating diabetes. It was a very difficult study.",
        "One hundred and five patients, 55 males and 50 females were included.",
        "The study took 200 days to complete. The study was conducted in the United States.",
        "Ten of the patients were white, 20 were asian, and the rest were black.",
        "The study was conducted by the University of California, San Francisco.",
    )
)

# Uncomment to smoke-test the function-calling prompt on both toy inputs:
# for toy_input in (toy_input1, toy_input2):
#     messages[0]['content'] = content_str.format(input=toy_input)
#     msg = llm(messages, functions=functions, return_str=False, temperature=0.0)
#     args = json.loads(msg.get('function_call')['arguments'])
#     print(json.dumps(args, indent=2))

In [None]:
# Rough size accounting: one token is approximated as 4 characters, and at most
# 5000 approximate tokens of paper text are sent to the model.
CHARS_PER_TOKEN = 4
MAX_INPUT_CHARS = 5000 * CHARS_PER_TOKEN

# One (initially empty) output column per schema field, plus the size estimate.
for k in properties.keys():
    d[k] = None
d["approx_tokens"] = None

for i in range(d.shape[0]):
    row = d.iloc[i]
    paper_file = pathlib.Path(join("../papers", str(int(row["id"])) + ".txt"))
    # A .txt only exists when the PDF was present at extraction time. Skip the
    # row (its columns stay None) instead of letting FileNotFoundError abort
    # the whole run.
    if not paper_file.exists():
        print("skipping paper", row["id"], "- no extracted text at", paper_file)
        continue

    # The .txt files are written as UTF-8 bytes, so decode them the same way.
    real_input = paper_file.read_text(encoding="utf-8")
    approx_tokens = len(real_input) / CHARS_PER_TOKEN
    print(
        "approx tokens", approx_tokens
    )  # gpt4 has 8k token window (some of it is functions, etc.)
    d.loc[d.index[i], "approx_tokens"] = approx_tokens  # size before truncation
    real_input = real_input[:MAX_INPUT_CHARS]

    messages[0]["content"] = content_str.format(input=real_input)
    try:
        msg = llm(messages, functions=functions, return_str=False, temperature=0.0)
        function_call = msg.get("function_call")
        if function_call is None:
            # The model answered in plain text instead of calling the function.
            print("paper", row["id"], "- reply contained no function call")
            continue
        args = json.loads(function_call["arguments"])
        print(json.dumps(args, indent=2))
        for k in properties.keys():
            if k in args:
                # set the value at row number i and column k to the value of args[k]
                d.loc[d.index[i], k] = args[k]

    except Exception as e:
        # Best effort: report which paper failed and move on to the next one.
        print("paper", row["id"], "failed:", repr(e))
print('completed!')

In [None]:
def check_evidence(ev: str, real_input: str) -> bool:
    """Return whether the evidence span `ev` occurs in `real_input`.

    Both strings are compared case-insensitively with all whitespace removed,
    so differences in line breaks or spacing do not matter.

    Params
    ------
    ev
        Evidence span to look for. None, other non-string values (e.g. NaN)
        and empty / whitespace-only strings count as "no evidence".
    real_input
        Text in which the evidence should appear.

    Returns
    -------
    True if the normalised `ev` is a substring of the normalised `real_input`,
    otherwise False.
    """
    # None or any non-string value (e.g. a float NaN) carries no evidence
    if not isinstance(ev, str):
        return False

    # remove all whitespace
    needle = "".join(ev.split()).lower()
    # an empty needle is a substring of every text; do not report it as found
    if not needle:
        return False

    haystack = "".join(real_input.split()).lower()
    return needle in haystack

# check evidence
# For every paper, report whether each extracted gender evidence span actually
# appears in the paper text.
for i in range(d.shape[0]):
    row = d.iloc[i]
    paper_file = pathlib.Path(join("../papers", str(int(row.id)) + ".txt"))
    # Papers with no extracted .txt have nothing to check against; skip them
    # instead of raising FileNotFoundError.
    if not paper_file.exists():
        continue
    # The .txt files are written as UTF-8 bytes, so decode them the same way.
    real_input = paper_file.read_text(encoding="utf-8")
    for k in ["num_male_evidence_span", "num_female_evidence_span"]:
        ev = row[k]
        if ev is not None:
            print(row.id, 'contained', check_evidence(ev, real_input))
            print('\t', ev)

In [None]:
# Save the paper id together with the extracted gender counts and their
# evidence spans; the index is not written.
d.to_csv(
    "../data/gender_counts.csv",
    columns=[
        "id",
        "num_male",
        "num_female",
        "num_male_evidence_span",
        "num_female_evidence_span",
    ],
    index=False,
)