In [1]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict

import dvu
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import os.path
import fitz
from tqdm import tqdm
import pathlib
import imodelsx.llm
import json
import requests
import numpy as np
import openai
openai.api_key = open('/home/chansingh/.OPENAI_KEY').read().strip()
plt.style.use('default')
dvu.set_style()

df = pd.read_csv('../data/main_updated.csv')
df = df[df['id'].notna()]

### Extract text from pdfs

In [2]:
d = df[
    (df["found_paper (0=no, 1=yes, 2=low-qual)"] == 1)
    | (df["found_paper (0=no, 1=yes, 2=low-qual)"] == 2)
]
for i, row in tqdm(d.iterrows()):
    paper_file = join("../papers", str(row.id) + ".pdf")
    if pathlib.Path(paper_file).exists():
        with fitz.open(paper_file) as doc:  # open document
            text = chr(12).join([page.get_text() for page in doc])
            pathlib.Path(join("../papers", str(row.id) + ".txt")).write_bytes(
                text.encode()
            )

0it [00:00, ?it/s]

16it [00:00, 23.45it/s]


### Ask questions about the text

In [8]:
# llm = imodelsx.llm.get_llm("gpt-3.5-turbo-0613")
llm = imodelsx.llm.get_llm("gpt-4-0613")
# llm = imodelsx.llm.get_llm("gpt-4-32k-0613")
# gpt-4-32k-0613

In [9]:
properties = {
    "num_male": {
        "type": "string",
        "description": "The number of male patients in the study",
    },
    "num_female": {
        "type": "string",
        "description": "The number of female patients in the study",
    },
    "num_male_evidence_span": {
        "type": "string",
        "description": "The long text span in the input that includes evidence for num_male.",
    },
    "num_female_evidence_span": {
        "type": "string",
        "description": "The long text span in the input that includes evidence for num_female.",
    },
    "num_white": {
        "type": "string",
        "description": "The number of white/caucasian patients in the study",
    },
    "num_black": {
        "type": "string",
        "description": "The number of black/african american patients in the study",
    },
    "num_latino": {
        "type": "string",
        "description": "The number of latino patients in the study",
    },
    "num_white_evidence_span": {
        "type": "string",
        "description": "The long text span in the input that includes evidence for num_white.",
    },
    "num_black_evidence_span": {
        "type": "string",
        "description": "The long text span in the input that includes evidence for num_black.",
    },
    "num_latino_evidence_span": {
        "type": "string",
        "description": "The long text span in the input that includes evidence for num_latino.",
    },
}

functions = [
    {
        "name": "extract_patient_nums",
        "description": "Get the number of patients in this study for each gender and race.",
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": [
                "num_male",
                "num_female",
                "num_male_evidence_span",
                "num_female_evidence_span",
            ],
        },
    },
]
content_str = """### QUESTION: How many male and female patients were in this study?

###  STUDY: {input}"""
messages = [
    {
        "role": "user",
        "content": content_str,
    }
]

In [10]:
# example with answer: One hundred and five patients, 55 males and 50 females
toy_input1 = """This study was about treating diabetes. It was a very difficult study.
One hundred and five patients, 55 males and 50 females were included.
The study took 200 days to complete. The study was conducted in the United States.
The study was conducted by the University of California, San Francisco."""

# example with answer: One hundred and five patients, 55 males and 50 females, 10 white, 75 black
toy_input2 = """This study was about treating diabetes. It was a very difficult study.
One hundred and five patients, 55 males and 50 females were included.
The study took 200 days to complete. The study was conducted in the United States.
Ten of the patients were white, 20 were asian, and the rest were black.
The study was conducted by the University of California, San Francisco."""

# messages[0]['content'] = content_str.format(input=toy_input1)
# msg = llm(messages, functions=functions, return_str=False, temperature=0.0)
# args = json.loads(msg.get('function_call')['arguments'])
# print(json.dumps(args, indent=2))

messages[0]['content'] = content_str.format(input=toy_input2)
msg = llm(messages, functions=functions, return_str=False, temperature=0.0)
args = json.loads(msg.get('function_call')['arguments'])
print(json.dumps(args, indent=2))

not cached
'choices'
{
  "num_male": "55",
  "num_female": "50",
  "num_male_evidence_span": "One hundred and five patients, 55 males and 50 females were included.",
  "num_female_evidence_span": "One hundred and five patients, 55 males and 50 females were included.",
  "num_white": "10",
  "num_black": "75",
  "num_white_evidence_span": "Ten of the patients were white, 20 were asian, and the rest were black.",
  "num_black_evidence_span": "Ten of the patients were white, 20 were asian, and the rest were black."
}


In [11]:
for k in properties.keys():
    d[k] = None
d['approx_tokens'] = None
for i in range(d.shape[0]):
    row = d.iloc[i]
    paper_file = join("../papers", str(int(row.id)) + ".txt")
    # example with answer: One hundred and five patients, 55 males and 50 females
    real_input = pathlib.Path(paper_file).read_text()
    approx_tokens = len(real_input) / 4
    print('approx tokens', approx_tokens) # gpt4 has 8k token window (some of it is functions, etc.)
    real_input = real_input[:6000 * 4]
    d.loc[d.index[i], 'approx_tokens'] = approx_tokens

    # if approx_tokens < 6000:
    messages[0]['content'] = content_str.format(input=real_input)
    try:
        msg = llm(messages, functions=functions, return_str=False, temperature=0.0)
        args = json.loads(msg.get('function_call')['arguments'])
        print(json.dumps(args, indent=2))
        for k in properties.keys():
            if k in args:
                # set the value at row number i and column k to the value of args[k]
                d.loc[d.index[i], k] = args[k]

    except:
        pass

approx tokens 3790.25
not cached


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[k] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[k] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[k] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:/

'choices'
{
  "num_male": "55",
  "num_female": "50",
  "num_male_evidence_span": "One hundred and five patients, 55 males and 50 females ranging in age from 2 days to 93 years, mean age of 34 years, with bacterial infections documented by culture, were studied.",
  "num_female_evidence_span": "One hundred and five patients, 55 males and 50 females ranging in age from 2 days to 93 years, mean age of 34 years, with bacterial infections documented by culture, were studied."
}
approx tokens 5680.0
not cached
'choices'
{
  "num_male": "",
  "num_female": "",
  "num_male_evidence_span": "",
  "num_female_evidence_span": ""
}
approx tokens 2964.5
not cached
'choices'
{
  "num_male": "136",
  "num_female": "103",
  "num_male_evidence_span": "Measurements on 136 men, 103 women and 94 new-born infants have been",
  "num_female_evidence_span": "Measurements on 136 men, 103 women and 94 new-born infants have been"
}
approx tokens 5702.0
not cached


In [None]:
d

Unnamed: 0,id,full_title_en,short_description_en,ref_text,ref_href,ref_year,ref_href_corrected,"found_paper (0=no, 1=yes, 2=low-qual)"
0,19,Absolute Neutrophil Count (ANC),Neutropenia (after chemotherapy).,"Al-Gwaiz LA, Babay HH. The diagnostic value of...",https://www.ncbi.nlm.nih.gov/pubmed/17709921,2007.0,,1.0
1,23,APGAR Score,Assesses neonates 1 & 5 mins postpartum.,Apgar V. A proposal for a new method of evalua...,http://www.ncbi.nlm.nih.gov/pubmed/13083014,1953.0,,1.0
2,25,Basal Energy Expenditure,Estimates minimum caloric requirements.,"Harris J, Benedict F. A biometric study of bas...",,1919.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,2.0
5,31,Calcium Correction for Hypoalbuminemia,Corrects Ca for hypoalbuminemia.,"Payne RB, Little AJ, Williams RB, Milner JR. I...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,1973.0,,2.0
6,33,PSI/PORT Score: Pneumonia Severity Index for CAP,"Inpatient risk of VTE, need for anticoagulation.","Barbar S, Noventa F, Rossetto V, Ferrari A, Br...",https://www.ncbi.nlm.nih.gov/pubmed/20738765,2010.0,,1.0
7,38,Framingham Risk Score for Hard Coronary Heart ...,Heart attack risk.,"Wilson PW, et. al. Prediction of Coronary Hear...",http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,1998.0,https://pubmed.ncbi.nlm.nih.gov/9603539/,1.0
8,40,CHADS<sub>2</sub> Score for Atrial Fibrillatio...,Stroke risk in AFib.,"Gage BF, Waterman AD, Shannon W, Boechler M, R...",http://www.ncbi.nlm.nih.gov/pubmed/11401607,2001.0,,1.0
9,43,Creatinine Clearance (Cockcroft-Gault Equation),Estimates creatinine clearance (kidney function).,"Cockcroft DW, Gault MH. Prediction of creatini...",http://www.ncbi.nlm.nih.gov/pubmed/1244564,1976.0,,2.0
11,50,Sodium Correction for Hyperglycemia,CML survival.,"Sokal JE, Cox EB, Baccarani M, Tura S, Gomez G...",http://www.ncbi.nlm.nih.gov/pubmed/6584184,1984.0,,2.0
12,56,Maddrey's Discriminant Function for Alcoholic ...,Fluid maintenance.,"Holiday MA, Segar WE. The maintenance need for...",https://www.ncbi.nlm.nih.gov/pubmed/13431307,1957.0,,1.0
