In [1]:
import os, asyncio, json, tqdm, dotenv
from typing import Tuple, Union, Dict, Iterable, Callable

import langchain
import numpy as np
import pandas as pd
from langchain import llm_cache
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.schema import Document
from langchain.document_loaders import CSVLoader
from langchain.prompts import PromptTemplate
from langchain.cache import InMemoryCache

# Load variables from a local .env file into the process environment
# (presumably where the OpenAI credentials live -- confirm).
dotenv.load_dotenv()

# Cache LLM calls (faster when repeating queries and prompts).
# The cache has to be installed as an attribute of the `langchain` module:
# rebinding the imported name `llm_cache` would only change a notebook-local
# variable and leave langchain's global cache unset.
langchain.llm_cache = InMemoryCache()
# Keep the notebook-level name pointing at the cache that is actually active.
llm_cache = langchain.llm_cache

# Name of the chat model used for summarization; read back when the LLM is built.
os.environ["PRETRAINED_SUMMARY_MODEL_NAME"]='gpt-3.5-turbo-16k'

In [2]:
# Location of the raw study export (the rows link to clinicaltrials.gov).
DATA_PATH = "data/ctg-studies.csv"

# Read the whole export into a DataFrame and preview its first three rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=3)

Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Study Design,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents
0,NCT02659943,T Cells Expressing a Fully-human AntiCD19 Chim...,https://beta.clinicaltrials.gov/study/NCT02659943,,COMPLETED,Background:\n\nThe immune system fights infect...,YES,"Lymphoma, B-Cell|Lymphoma, Non-hodgkins",BIOLOGICAL: Anti-cluster of differentiation 19...,Percentage of Enrolled Participants Who Actual...,...,Allocation: NON_RANDOMIZED|Intervention Model:...,160054|16-C-0054,2016-01-21,2018-06-12,2022-12-31,2016-01-21,2021-02-22,2023-04-19,"National Institutes of Health Clinical Center,...",Study Protocol and Statistical Analysis Plan|I...
1,NCT03486197,Neutron Radiation Therapy and Pembrolizumab in...,https://beta.clinicaltrials.gov/study/NCT03486197,,ACTIVE_NOT_RECRUITING,This phase II trial studies how well neutron r...,NO,Urothelial Carcinoma,OTHER: Laboratory Biomarker Analysis|BIOLOGICA...,Overall response rate per immune-modified resp...,...,Allocation: NA|Intervention Model: SINGLE_GROU...,9940|NCI-2018-00412|9940|RG3118000,2019-01-17,2023-11-01,2024-05-01,2018-04-03,,2023-02-17,Fred Hutch/University of Washington Cancer Con...,Informed Consent Form
2,NCT02073097,"Carfilzomib, Rituximab, and Combination Chemot...",https://beta.clinicaltrials.gov/study/NCT02073097,,ACTIVE_NOT_RECRUITING,This phase I/II trial studies the side effects...,NO,Contiguous Stage II Adult Diffuse Large Cell L...,DRUG: Carfilzomib|BIOLOGICAL: Rituximab|DRUG: ...,"Recommended Phase II dose (Phase I), Highest d...",...,Allocation: NA|Intervention Model: SINGLE_GROU...,CASE3413,2015-01-28,2021-04-26,2022-12,2014-02-27,,2022-01-13,"Roswell Park Comprehensive Cancer Center, Buff...",Study Protocol and Statistical Analysis Plan|I...


In [3]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['NCT Number', 'Study Title', 'Study URL', 'Acronym', 'Study Status',
       'Brief Summary', 'Study Results', 'Conditions', 'Interventions',
       'Primary Outcome Measures', 'Secondary Outcome Measures',
       'Other Outcome Measures', 'Sponsor', 'Collaborators', 'Sex', 'Age',
       'Phases', 'Enrollment', 'Funder Type', 'Study Type', 'Study Design',
       'Other IDs', 'Start Date', 'Primary Completion Date', 'Completion Date',
       'First Posted', 'Results First Posted', 'Last Update Posted',
       'Locations', 'Study Documents'],
      dtype='object')

In [4]:
# Destination of the cleaned copy that is later handed to the CSV document loader.
PROCESSED_DATA_PATH  = "data/processed-ctg-studies.csv"

# Substitute the literal string "unknown" for every NaN cell, then persist the
# result without the DataFrame index column.
df = df.replace(to_replace=np.nan, value="unknown")
df.to_csv(path_or_buf=PROCESSED_DATA_PATH, index=False)

In [5]:
# Splitter configured for chunks of size 4,000 with an overlap of 200
# (units follow the splitter's length function -- characters by default, TODO confirm).
doc_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=4_000)
# Loader over the cleaned CSV written in the previous step.
doc_loader = CSVLoader(PROCESSED_DATA_PATH, encoding="utf8")

# Load the documents and split them in a single step.
docs = doc_splitter.split_documents(doc_loader.load())
print(f"number of documents: {len(docs)}")
docs[0]

number of documents: 1884


Document(page_content='NCT Number: NCT02659943\nStudy Title: T Cells Expressing a Fully-human AntiCD19 Chimeric Antigen Receptor for Treating B-cell Malignancies\nStudy URL: https://beta.clinicaltrials.gov/study/NCT02659943\nAcronym: unknown\nStudy Status: COMPLETED\nBrief Summary: Background:\n\nThe immune system fights infection and can affect cancer cells. T cells are white blood cells that are a major part of the immune system. T cells can destroy tumors. Researchers want to try to manipulate the immune system to better recognize and kill tumor cells.\n\nObjective:\n\nTo test the safety of giving T cells expressing a novel fully-human anti-cluster of differentiation 19 (CD19) chimeric antigen receptor (CAR) to people with advanced B-cell cancer.\n\nEligibility:\n\nPeople ages 18-73 with a B-cell cancer that has not been controlled by other therapies.\n\nDesign:\n\nParticipants will be screened with:\n\nPhysical exam\n\nBlood and urine tests\n\nHeart tests\n\nBone marrow sample take

In [6]:
class DataQueue:
    """Minimal list-backed container that collects result entries.

    Entries are kept in ``self.data`` in the order they were enqueued. The
    list is a public attribute: other cells read it and reassign it directly.
    """

    def __init__(self):
        # Backing storage for the entries, in insertion order.
        self.data = []

    def __len__(self):
        """Return the number of entries currently stored."""
        return len(self.data)

    def enqueue(self, entry: Dict[str, Union[int, str]]):
        """Append ``entry`` to the end of the queue."""
        self.data.append(entry)

    def dequeue(self, idx: int = 0):
        """Remove and return the entry at position ``idx``.

        Args:
            idx: Position of the entry to remove. Defaults to 0 (the oldest
                entry), which gives first-in-first-out behaviour.

        Returns:
            The removed entry.

        Raises:
            IndexError: If the queue is empty or ``idx`` is out of range.
        """
        return self.data.pop(idx)

    def __repr__(self):
        """Return the repr of the underlying list of entries."""
        return f"{self.data}"

    def __getitem__(self, idx: int):
        """Return the entry at ``idx`` so that ``queue[idx]`` works."""
        return self.data[idx]

    # Alias for the original, misspelled method name; kept so that any
    # explicit call to it keeps working.
    __getiitem__ = __getitem__

In [7]:
def summarize(summary_chain: BaseCombineDocumentsChain, doc_id: int, document: Document, queue: DataQueue):
    """Summarize a single document and record the outcome on ``queue``.

    Args:
        summary_chain: Chain whose ``run`` method is called with a one-element
            list containing ``document``.
        doc_id: Identifier stored under the ``"id"`` key of the entry.
        document: Document to summarize; its ``page_content`` is stored
            alongside the summary.
        queue: Collector that receives the resulting entry via ``enqueue``.

    Returns:
        None. The result is delivered through ``queue``.
    """
    generated = summary_chain.run([document])
    queue.enqueue(
        {
            "id": doc_id,
            "document": document.page_content,
            "summary": generated,
        }
    )


async def summary_coroutine(f: Callable, args: Tuple, semaphore: asyncio.Semaphore):
    """Run the blocking call ``f(*args)`` in the loop's default executor.

    Args:
        f: Synchronous callable to execute off the event loop.
        args: Positional arguments unpacked into ``f``.
        semaphore: Held for the whole duration of the call, so it bounds how
            many of these coroutines run ``f`` at the same time.

    Returns:
        None. The return value of ``f`` is discarded.
    """
    async with semaphore:
        loop = asyncio.get_running_loop()
        # Passing None selects the event loop's default executor.
        await loop.run_in_executor(None, f, *args)


async def main(
    docs: Iterable[Document], 
    summary_chain: BaseCombineDocumentsChain, 
    queue: DataQueue, 
    n_concurrency: int=10):
    """Summarize every document concurrently, collecting results on ``queue``.

    One coroutine is created per document. Each runs ``summarize`` through
    ``summary_coroutine`` and shares a single semaphore, so at most
    ``n_concurrency`` summaries are in flight at once. Progress is reported
    with a tqdm bar as coroutines complete.

    Args:
        docs: Documents to summarize; a document's position in the iteration
            becomes its ``id``.
        summary_chain: Chain handed to ``summarize`` for every document.
        queue: Collector that receives one entry per summarized document.
        n_concurrency: Maximum number of simultaneous summaries; must be >= 1.

    Raises:
        ValueError: If ``n_concurrency`` is less than 1.
    """
    if n_concurrency < 1:
        # A semaphore initialised to 0 can never be acquired, so every
        # coroutine would wait forever; fail fast instead.
        raise ValueError(f"n_concurrency must be >= 1, got {n_concurrency}")

    semaphore = asyncio.Semaphore(value=n_concurrency)
    tasks = [
        summary_coroutine(summarize, args=(summary_chain, i, doc, queue), semaphore=semaphore)
        for i, doc in enumerate(docs)
    ]

    # `total` lets tqdm show a percentage and an ETA rather than a bare counter.
    for finished in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        # Awaiting re-raises any exception from the underlying coroutine.
        await finished

    

In [8]:
llm = ChatOpenAI(model=os.environ["PRETRAINED_SUMMARY_MODEL_NAME"], temperature=0.2)
summary_chain = load_summarize_chain(llm, chain_type="map_reduce")
queue = DataQueue()

await main(docs, summary_chain, n_concurrency=20, queue=queue)

559it [05:48,  1.17it/s]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.co

In [11]:
# Output file for the document/summary pairs.
JSON_DATA_PATH = "data/doc_summary_pair.json"

# Entries are enqueued as summaries finish, not in document order, so sort
# them by id before saving.
queue.data = sorted(queue.data, key=lambda entry: entry["id"])

# The `with` block closes the file on exit (even on error), so no explicit
# close() call is needed afterwards. The encoding is pinned so the file does
# not depend on the platform default.
with open(JSON_DATA_PATH, "w", encoding="utf-8") as f:
    json.dump(queue.data, f, indent=4)