In [4]:
# Import necessary libraries
import os
import requests
import json
import openai
import asyncio
import nltk
from IPython.display import display, clear_output, FileLink
import ipywidgets as widgets
from tqdm.notebook import tqdm
import logging
import pandas as pd
import time
import tiktoken
import numpy as np
import io
from openai import AsyncOpenAI

# Prompt for the OpenAI API key without echoing it into the notebook output.
# getpass is imported here because the cell's import list does not provide it.
from getpass import getpass

openai_api_key = getpass("Enter your OpenAI API Key: ")

# The synchronous module-level API (openai.chat.completions.create) reads the
# key from openai.api_key.
openai.api_key = openai_api_key

# Build the shared async client only once the key is known, and hand the key
# to it explicitly. An empty entry is passed as None so the client falls back
# to the OPENAI_API_KEY environment variable, as a bare AsyncOpenAI() would.
client = AsyncOpenAI(api_key=openai_api_key or None)

# Running per-model token counters. The API helpers add to these after each
# call and the submit handler reads them to estimate cost.
token_usage = {
    chat_model: {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
    for chat_model in ('gpt-4', 'gpt-3.5-turbo')
}
# The embedding model is tracked by total tokens only.
token_usage['text-embedding-ada-002'] = {'total_tokens': 0}

# Apply nest_asyncio to allow nested event loops in Jupyter Notebook.
# The submit handler further down calls loop.run_until_complete(); the patch
# is what lets that call be issued from inside the notebook.
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    # Package missing: install it on the fly, then retry the import.
    # NOTE: the `!pip ...` line is IPython shell syntax, so this cell is only
    # valid under IPython/Jupyter, not as a plain Python script.
    !pip install nest_asyncio
    import nest_asyncio
    nest_asyncio.apply()

# Fetch the NLTK resources this notebook relies on, suppressing progress output
for _nltk_resource in ('punkt', 'wordnet'):
    nltk.download(_nltk_resource, quiet=True)

# Send INFO-and-above log records to an append-mode log file
logging.basicConfig(
    filename='clinical_trials.log',
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Helper that strips backslash escapes and surrounding whitespace from extracted text
def clean_text(text):
    """
    Return `text` with every backslash removed and outer whitespace trimmed.

    Escaped angle brackets ('\\>' and '\\<') therefore come out as plain
    '>' and '<'. Values that are not strings are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    # Removing all backslashes also un-escapes the angle brackets in one pass.
    without_escapes = text.replace('\\', '')
    return without_escapes.strip()

from nltk.corpus import wordnet as wn

def get_broader_terms(condition):
    """
    Look up broader terms for `condition` in WordNet.

    For every noun synset of `condition`, the lemma names of its direct
    hypernyms are collected, with underscores replaced by spaces.

    Returns a list of the distinct terms (order is not meaningful).
    """
    hypernym_names = {
        lemma.name().replace('_', ' ')
        for synset in wn.synsets(condition, pos=wn.NOUN)
        for hypernym in synset.hypernyms()
        for lemma in hypernym.lemmas()
    }
    return list(hypernym_names)

# Widen a keyword list with the broader terms of each keyword
def expand_keywords(keywords_list):
    """
    Return the given keywords together with their broader terms.

    Each keyword is kept and the result of get_broader_terms() for it is
    added. Duplicates are removed; the expanded set is logged at INFO level.
    """
    widened = set()
    for term in keywords_list:
        widened.add(term)
        for broader in get_broader_terms(term):
            widened.add(broader)
    logging.info(f"Expanded Keywords: {widened}")
    return list(widened)

# Ask GPT for the patient's medical-condition keywords, then widen them
async def generate_keywords(patient_data):
    """
    Derive medical condition keywords from free-text patient data.

    The text is sent to gpt-3.5-turbo, the comma-separated reply is split,
    a fixed set of generic terms is dropped, and what remains is expanded
    with expand_keywords(). The call's token usage is added to token_usage.

    Returns the expanded keyword list, or an empty list if anything fails
    (the error is logged and printed).
    """
    prompt = "".join([
        "Extract specific medical condition keywords from the following patient data. ",
        "Focus on diseases, disorders, and only medical conditions. ",
        "Exclude general terms and all non-medical information and terms.\n",
        f"Patient Data:\n{patient_data}\n\n",
        "Medical Conditions (comma-separated):",
    ])
    try:
        response = await client.chat.completions.create(
            model='gpt-3.5-turbo',
            messages=[{"role": "user", "content": prompt}],
            max_tokens=60,
            temperature=0.5
        )

        # Book the usage of this call against gpt-3.5-turbo
        usage = response.usage
        counters = token_usage['gpt-3.5-turbo']
        counters['prompt_tokens'] += usage.prompt_tokens
        counters['completion_tokens'] += usage.completion_tokens
        counters['total_tokens'] += usage.total_tokens

        raw_answer = response.choices[0].message.content.strip()
        candidates = (piece.strip() for piece in raw_answer.split(','))

        # Terms too generic to be useful as condition keywords
        general_terms = {'consent', 'permission', 'therapy', 'treatment', 'medical procedure', 'medical science',
                         'hospital room', 'room', 'palliative care', 'clinical trial', 'surgery', 'radiotherapy',
                         'drug treatment', 'ecog performance status'}
        keywords_list = [
            candidate for candidate in candidates
            if candidate and candidate.lower() not in general_terms
        ]

        logging.info(f"Generated Keywords: {keywords_list}")

        # Add broader terms for each surviving keyword
        return expand_keywords(keywords_list)

    except Exception as e:
        logging.error(f"Error generating keywords: {e}")
        print(f"Error generating keywords: {e}")
        return []

# Read the list of trial identifiers from a CSV file
def load_nct_numbers(csv_file_path):
    """
    Return the distinct, non-missing values of the 'NCT Number' column.

    The CSV is read with pandas; the number of values found is logged.
    """
    table = pd.read_csv(csv_file_path)
    distinct_ids = table['NCT Number'].dropna().unique()
    nct_numbers = distinct_ids.tolist()
    logging.info(f"Loaded {len(nct_numbers)} NCT Numbers from CSV.")
    return nct_numbers

def fetch_trials_by_nct_numbers(nct_numbers):
    """
    Fetch trial details for given NCT Numbers.

    Trials already present in the local JSON cache ('trials_cache.json') are
    reused; the remaining IDs are requested from the ClinicalTrials.gov v2
    API in batches. The merged, de-duplicated result is written back to the
    cache file.

    Args:
        nct_numbers: list of NCT ID strings.

    Returns:
        A list of study dicts: every cached trial plus every newly fetched
        one. Cached trials that were not requested in this call are included.
    """
    base_url = 'https://clinicaltrials.gov/api/v2/studies'
    trials_data = []
    batch_size = 20  # Reduced batch size
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the notebook

    # Load cached trials if available (an unreadable cache is treated as empty)
    cache_file = 'trials_cache.json'
    cached_trials = []
    if os.path.exists(cache_file):
        with open(cache_file, 'r') as f:
            try:
                cached_trials = json.load(f)
            except json.JSONDecodeError:
                cached_trials = []
    cached_nct_ids = {
        trial['protocolSection']['identificationModule']['nctId'] for trial in cached_trials
    }

    # NCT IDs that need to be fetched
    nct_numbers_to_fetch = [nct for nct in nct_numbers if nct not in cached_nct_ids]

    nct_numbers_not_found = []

    for i in range(0, len(nct_numbers_to_fetch), batch_size):
        if i > 0:
            # Be polite between requests, including after a failed batch
            time.sleep(1)
        batch_nct_ids = nct_numbers_to_fetch[i:i+batch_size]
        params = {
            'format': 'json',
            'filter.ids': ','.join(batch_nct_ids),
            # Without an explicit page size the API returns only its default
            # page, which is smaller than a batch; the rest would then be
            # wrongly reported as "not found".
            'pageSize': len(batch_nct_ids),
            'fields': (
                'protocolSection.identificationModule.nctId,'
                'protocolSection.identificationModule.briefTitle,'
                'protocolSection.identificationModule.officialTitle,'
                'protocolSection.conditionsModule.conditions,'
                'protocolSection.descriptionModule.briefSummary,'
                'protocolSection.eligibilityModule.eligibilityCriteria'
            )
        }
        try:
            response = requests.get(base_url, params=params, timeout=request_timeout)
        except requests.RequestException as e:
            # Treat a transport failure like an HTTP error: report and move on
            logging.error(f"Error fetching data: {e}")
            print(f"Error fetching data for NCT IDs: {batch_nct_ids}")
            print(f"Request failed: {e}")
            continue
        if response.status_code != 200:
            logging.error(f"Error fetching data: HTTP {response.status_code}")
            print(f"Error fetching data for NCT IDs: {batch_nct_ids}")
            print(f"HTTP Status Code: {response.status_code}")
            print(f"Response content: {response.text}")
            continue
        data = response.json()
        studies = data.get('studies', [])
        if not studies:
            logging.warning(f"No studies found for NCT IDs: {batch_nct_ids}")
            nct_numbers_not_found.extend(batch_nct_ids)
        else:
            fetched_nct_ids = {
                study['protocolSection']['identificationModule']['nctId'] for study in studies
            }
            # Keep the request order so the report is stable between runs
            nct_numbers_not_found.extend(
                nct for nct in batch_nct_ids if nct not in fetched_nct_ids
            )
            trials_data.extend(studies)

    # Combine cached and newly fetched trials, keeping one entry per NCT ID
    # (a newly fetched trial replaces a cached one with the same ID)
    all_trials_unique = {
        trial['protocolSection']['identificationModule']['nctId']: trial
        for trial in cached_trials + trials_data
    }
    all_trials = list(all_trials_unique.values())

    # Save the combined trials to cache
    with open(cache_file, 'w') as f:
        json.dump(all_trials, f)

    logging.info(f"Fetched {len(trials_data)} new trials. Total trials: {len(all_trials)}")

    if nct_numbers_not_found:
        logging.warning(f"Could not find data for {len(nct_numbers_not_found)} NCT IDs: {nct_numbers_not_found}")
        print("Could not find data for the following NCT IDs:")
        print(nct_numbers_not_found)

    return all_trials

# Keep only the trials whose conditions overlap the given keywords
def filter_trials_by_conditions(trials, keywords, include_solid_tumor_markers=False):
    """
    Return the trials that share at least one term with `keywords`.

    Keywords are compared in lower case. For each trial, every listed
    condition (lower-cased) and its broader terms from get_broader_terms()
    are considered. When `include_solid_tumor_markers` is true, the term
    'solid tumor' is added to the keywords. The number of matches is logged.
    """
    wanted_terms = {kw.lower() for kw in keywords}
    if include_solid_tumor_markers:
        wanted_terms.add('solid tumor')

    def _condition_terms(trial):
        # Every condition of the trial plus its broader terms
        listed = trial.get('protocolSection', {}).get('conditionsModule', {}).get('conditions', [])
        terms = set()
        for condition in listed:
            lowered = condition.lower()
            terms.add(lowered)
            terms.update(get_broader_terms(lowered))
        return terms

    filtered_trials = [trial for trial in trials if _condition_terms(trial) & wanted_terms]
    logging.info(f"Filtered {len(filtered_trials)} trials based on condition keywords.")
    return filtered_trials

# Function to extract relevant information from each study
def extract_trial_info(study):
    """
    Extract relevant information from a study.

    Args:
        study: a study dict with an optional 'protocolSection' holding the
            identification, description, conditions and eligibility modules.

    Returns:
        A dict with the keys 'nct_id', 'title', 'conditions',
        'brief_summary' and 'eligibility_criteria'. Missing text fields are
        reported as 'N/A'. 'title' is the official title when present,
        otherwise the brief title.
    """
    protocol_section = study.get('protocolSection', {})
    identification_module = protocol_section.get('identificationModule', {})
    description_module = protocol_section.get('descriptionModule', {})
    conditions_module = protocol_section.get('conditionsModule', {})
    eligibility_module = protocol_section.get('eligibilityModule', {})

    nct_id = identification_module.get('nctId', 'N/A')
    # No 'N/A' default here: a truthy placeholder for the official title would
    # hide the brief title in the fallback below.
    official_title = identification_module.get('officialTitle')
    brief_title = identification_module.get('briefTitle')
    title = official_title or brief_title or 'N/A'
    conditions = conditions_module.get('conditions', [])
    brief_summary = description_module.get('briefSummary', 'N/A')
    eligibility_criteria = eligibility_module.get('eligibilityCriteria', 'N/A')

    # Clean and format text fields
    brief_summary = clean_text(brief_summary)
    eligibility_criteria = clean_text(eligibility_criteria)
    # Concatenate conditions if multiple
    conditions_str = '; '.join(conditions) if isinstance(conditions, list) else conditions

    trial_info = {
        'nct_id': nct_id,
        'title': title,
        'conditions': conditions_str,
        'brief_summary': brief_summary,
        'eligibility_criteria': eligibility_criteria
    }
    return trial_info

# Asynchronous function to summarize eligibility criteria
async def summarize_eligibility_criteria(eligibility_criteria):
    """
    Ask gpt-3.5-turbo for a summary of the eligibility criteria.

    The call's token usage is added to token_usage.

    Returns the summary text, or the original `eligibility_criteria` if the
    request or the response handling fails (the error is logged).
    """
    prompt = (
        f"Please summarize the following eligibility criteria in 500 characters or less:\n\n"
        f"{eligibility_criteria}"
    )
    try:
        response = await client.chat.completions.create(
            model='gpt-3.5-turbo',
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.5
        )
        # Collect token usage for gpt-3.5-turbo.
        # The response is read through attributes, as in generate_keywords();
        # subscripting it raised and sent every call to the fallback below.
        usage = response.usage
        token_usage['gpt-3.5-turbo']['prompt_tokens'] += usage.prompt_tokens
        token_usage['gpt-3.5-turbo']['completion_tokens'] += usage.completion_tokens
        token_usage['gpt-3.5-turbo']['total_tokens'] += usage.total_tokens

        summary = response.choices[0].message.content.strip()
        return summary
    except Exception as e:
        logging.error(f"Error summarizing eligibility criteria: {e}")
        return eligibility_criteria  # Fallback to original if summarization fails

# Split the model's "Match Score / Eligibility / Reason" reply into its fields
def _parse_match_result(result):
    """
    Parse the formatted reply requested by match_patient_to_trial_async().

    Returns a (match_score, eligibility, reason) tuple. Fields that cannot be
    found keep their defaults: 0.0, 'Unknown' and ''.
    """
    match_score = 0.0
    eligibility = 'Unknown'
    reason = ''
    lines = result.split('\n')
    for index, raw_line in enumerate(lines):
        # Tolerate indentation or trailing spaces around the labels
        line = raw_line.strip()
        if line.startswith('Match Score:'):
            match_score_text = line[len('Match Score:'):].replace('%', '').strip()
            try:
                match_score = float(match_score_text)
            except ValueError:
                match_score = 0.0
        elif line.startswith('Eligibility:'):
            eligibility = line[len('Eligibility:'):].strip()
        elif line.startswith('Reason:'):
            # The explanation may start on the label line itself or on the
            # next line; keep both and everything that follows.
            inline_reason = line[len('Reason:'):]
            reason = '\n'.join([inline_reason] + lines[index + 1:]).strip()
            break  # No need to parse further
    return match_score, eligibility, reason


# Asynchronous function to evaluate patient eligibility for a trial using OpenAI's GPT
async def match_patient_to_trial_async(patient_data, eligibility_criteria):
    """
    Ask gpt-4 whether the patient matches a trial's eligibility criteria.

    The call's token usage is added to token_usage.

    Args:
        patient_data: free-text patient description.
        eligibility_criteria: the trial's eligibility criteria text.

    Returns:
        A dict with 'match_result' (raw model reply), 'score' (float),
        'eligibility' and 'reason'. If the request or parsing raises, the
        error is logged and a dict with score 0.0, eligibility
        'Not Eligible' and empty text fields is returned.
    """
    prompt = (
        f"As a medical expert, assess the patient's eligibility for the clinical trial based on the eligibility criteria below.\n"
        f"Assume that any missing information is favorable to the patient's eligibility unless it is critical for safety or efficacy.\n"
        f"Patient Information:\n{patient_data}\n"
        f"Eligibility Criteria:\n{eligibility_criteria}\n"
        f"Provide a match score from 0% to 100% indicating how well the patient matches the eligibility criteria based on the available information. If anything matches exclusion criteria, it's 'Not Eligible'.\n"
        f"Conclude 'Eligible', 'Not Eligible', or 'More Info Needed'.\n"
        f"Explain briefly why the patient is eligible or not.\n"
        f"Please output in the following format:\n"
        f"Match Score: [percentage]\n"
        f"Eligibility: [Eligible / Not Eligible / More Info Needed]\n"
        f"Reason:\n[Your brief explanation here]"
    )
    try:
        # Use the shared module-level client, as generate_keywords() does,
        # instead of building a new, never-closed client for every trial.
        response = await client.chat.completions.create(
            model='gpt-4',
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=300,
            temperature=0
        )
        # Collect token usage for gpt-4
        usage = response.usage
        token_usage['gpt-4']['prompt_tokens'] += usage.prompt_tokens
        token_usage['gpt-4']['completion_tokens'] += usage.completion_tokens
        token_usage['gpt-4']['total_tokens'] += usage.total_tokens
        result = response.choices[0].message.content.strip()
        logging.info(f"OpenAI API response for trial: {result}")

        match_score, eligibility, reason = _parse_match_result(result)

        trial_data = {
            'match_result': result,
            'score': match_score,
            'eligibility': eligibility,
            'reason': reason
        }
        return trial_data
    except Exception as e:
        logging.error(f"Error in match_patient_to_trial_async: {e}")
        # NOTE(review): a failed API call is reported as 'Not Eligible', which
        # is indistinguishable from a real negative assessment. Kept as-is
        # because callers may rely on this value; consider a distinct status.
        return {
            'match_result': '',
            'score': 0.0,
            'eligibility': 'Not Eligible',
            'reason': ''
        }

# Initialize tokenizer for the default embedding model once, at import time
encoding = tiktoken.encoding_for_model('text-embedding-ada-002')

def count_tokens(text, model='text-embedding-ada-002'):
    """
    Return the number of tokens `text` encodes to for `model`.

    The pre-built module-level encoding is used for the default model. For
    any other model the matching tiktoken encoding is looked up; if tiktoken
    does not know the model, the default encoding is used instead.
    """
    if model == 'text-embedding-ada-002':
        tokenizer = encoding
    else:
        try:
            tokenizer = tiktoken.encoding_for_model(model)
        except KeyError:
            # Unknown model name: fall back to the default encoding
            tokenizer = encoding
    return len(tokenizer.encode(text))
import numpy as np

async def get_embedding(text):
    """
    Return the text-embedding-ada-002 embedding vector for `text`.

    The locally counted token total for `text` is added to token_usage once
    the request has succeeded. Errors from the API call propagate to the
    caller.
    """
    # Use the shared module-level client instead of constructing a new,
    # never-closed client for every embedding request.
    response = await client.embeddings.create(
        input=[text],  # Input as list
        model='text-embedding-ada-002'
    )
    # Record usage only after success, so failed requests are not counted
    token_usage['text-embedding-ada-002']['total_tokens'] += count_tokens(text)
    # Access embedding using dot notation
    return response.data[0].embedding

async def rank_trials_with_embeddings(patient_data, trials):
    """
    Score each trial by cosine similarity between the embedding of the
    patient data and the embedding of the trial's eligibility criteria.

    Trials whose eligibility criteria are empty or 'N/A' are skipped.

    Returns a list of dicts with 'nct_id', 'title', 'similarity_score' and
    'eligibility_criteria', in the order the trials were given.
    """
    patient_embedding = await get_embedding(patient_data)

    # Keep only trials that actually carry eligibility text
    trial_infos = []
    for info in map(extract_trial_info, trials):
        criteria = info['eligibility_criteria']
        if criteria and criteria != 'N/A':
            info['eligibility_criteria_summary'] = criteria  # Use full criteria
            trial_infos.append(info)

    logging.info(f"Matching patient to {len(trial_infos)} trials using embeddings.")
    print("Matching trials using embeddings...")

    trial_results = []
    for info in tqdm(trial_infos):
        criteria_text = info['eligibility_criteria_summary']
        trial_embedding = await get_embedding(criteria_text)
        # Cosine similarity: dot product over the product of the norms
        dot_product = np.dot(patient_embedding, trial_embedding)
        norm_product = np.linalg.norm(patient_embedding) * np.linalg.norm(trial_embedding)
        trial_results.append({
            'nct_id': info['nct_id'],
            'title': info['title'],
            'similarity_score': dot_product / norm_product,
            'eligibility_criteria': criteria_text
        })
    return trial_results


# Function to generate patient profile paragraph
def generate_patient_paragraph(patient_info):
    """
    Turn a patient record into a prose profile paragraph via gpt-3.5-turbo.

    The call's token usage is added to token_usage so that it shows up in the
    cost estimate.

    Args:
        patient_info: dict mapping field names to values.

    Returns:
        The generated paragraph, or a string starting with
        'Error generating paragraph:' if the request fails (the error is
        also logged).
    """
    # Convert the patient_info dictionary to a formatted string
    formatted_info = '\n'.join(f"{key}: {value}" for key, value in patient_info.items())

    prompt = (
        "Create a detailed patient profile paragraph based solely on the following information from the CSV. "
        "Do not add any additional information or make any assumptions beyond what is provided. "
        "Ensure that the paragraph includes a statement where the patient gives consent for their data to be used.\n\n"
        f"{formatted_info}"
    )

    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that creates patient profiles based only on provided data."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=300,
            temperature=0.7,
        )
        # Record usage like the other gpt-3.5-turbo helpers do
        usage = getattr(response, 'usage', None)
        if usage is not None:
            counters = token_usage['gpt-3.5-turbo']
            counters['prompt_tokens'] += usage.prompt_tokens
            counters['completion_tokens'] += usage.completion_tokens
            counters['total_tokens'] += usage.total_tokens
        # Access the response correctly using attribute notation
        paragraph = response.choices[0].message.content.strip()
        return paragraph
    except Exception as e:
        logging.error(f"Error generating paragraph: {e}")
        return f"Error generating paragraph: {str(e)}"

# ---- Voila app widgets ----

# CSV uploader: one .csv file at a time
upload_widget = widgets.FileUpload(
    description='Upload CSV',
    accept='.csv',
    multiple=False
)

# Free-text patient description; the upload handler writes the generated
# profile here, and the submit handler reads it back
patient_data_textarea = widgets.Textarea(
    description='Patient Data:',
    placeholder='Patient data will appear here...',
    value='',
    layout=widgets.Layout(width='800px', height='200px')
)

# Option that adds the 'solid tumor' term when filtering trials
include_solid_tumor_checkbox = widgets.Checkbox(
    description='Include Solid Tumor Markers',
    value=False,
    disabled=False,
    indent=False
)

# Button that starts the matching run
submit_button = widgets.Button(
    description='Submit',
    tooltip='Click to submit',
    icon='check',
    button_style='success'
)

# Area that collects everything the handlers print or display
output_area = widgets.Output()

# Click handler for the submit button (registered right after the definition)
def on_submit_button_clicked(b):
    """
    Run the full matching pipeline for the text in the patient textarea.

    Steps, all reported inside `output_area`:
      1. Generate condition keywords from the patient data.
      2. Filter the preloaded `trials_data` by those keywords.
      3. Match the patient against at most 20 of the filtered trials.
      4. Print the results sorted by score, followed by token usage and an
         estimated cost.

    Args:
        b: the button instance passed by ipywidgets; not used.

    NOTE(review): `process_trials_concurrently` is not defined in the visible
    part of this notebook; it must be provided elsewhere before a click.
    """
    with output_area:
        clear_output()
        patient_data = patient_data_textarea.value.strip()
        include_solid_tumor_markers = include_solid_tumor_checkbox.value
        if not patient_data:
            print("Please enter patient data.")
            return

        print("Generating keywords from patient data...")
        # Drive the coroutines synchronously from this callback
        loop = asyncio.get_event_loop()
        keywords = loop.run_until_complete(generate_keywords(patient_data))
        if not keywords:
            # generate_keywords returns [] on any error
            print("Failed to generate keywords.")
            return
        print("Keywords generated:", ', '.join(keywords))

        # Filter trials based on condition keywords
        # (trials_data and total_trials_fetched are module-level values set
        # when the cell loads the trial data)
        print("\nFiltering trials based on condition keywords...")
        filtered_trials = filter_trials_by_conditions(trials_data, keywords, include_solid_tumor_markers)
        print(f"Total trials fetched: {total_trials_fetched}")
        print(f"Filtered {len(filtered_trials)} trials based on condition keywords.")

        if not filtered_trials:
            print("No trials found matching the condition keywords.")
            return

        # Limit the number of trials to process to manage API usage
        max_trials_to_process = 20  # Adjust as needed
        trials_to_process = filtered_trials[:max_trials_to_process]

        print(f"\nMatching patient to {len(trials_to_process)} trials (this may take some time)...")

        # Run the ranking and matching asynchronously
        try:
            trial_results = loop.run_until_complete(process_trials_concurrently(patient_data, trials_to_process))
        except Exception as e:
            print(f"An error occurred during trial matching: {e}")
            return

        if not trial_results:
            print("No trials matched the patient data.")
            return

        # Sort trials by score (higher score first)
        trial_results.sort(key=lambda x: x['score'], reverse=True)

        # Display all matching trials
        print("\nMatching Clinical Trials:")
        for trial in trial_results:
            print(f"Title: {trial['title']}")
            print(f"NCT ID: {trial['nct_id']}")
            print(f"Eligibility: {trial['eligibility']}")
            print(f"Score: {trial['score']}%")
            print(f"Reason:\n{trial['reason']}\n")
            print("-" * 80)

        # Calculate costs
        # NOTE(review): token_usage is module-level and is not reset in this
        # handler, so the figures below accumulate across clicks.
        costs = {}

        # Prices per 1,000 tokens (hard-coded; verify against current pricing)
        costs['gpt-4'] = (
            (token_usage['gpt-4']['prompt_tokens'] / 1000) * 0.03 +  # Prompt tokens
            (token_usage['gpt-4']['completion_tokens'] / 1000) * 0.06  # Completion tokens
        )

        # gpt-3.5-turbo is costed with one rate applied to total tokens
        costs['gpt-3.5-turbo'] = (
            (token_usage['gpt-3.5-turbo']['total_tokens'] / 1000) * 0.002
        )

        costs['text-embedding-ada-002'] = (
            (token_usage['text-embedding-ada-002']['total_tokens'] / 1000) * 0.0004
        )

        total_cost = costs.get('gpt-4', 0) + costs.get('gpt-3.5-turbo', 0) + costs['text-embedding-ada-002']

        # Print token usage and costs (chat models only when they were used)
        print("\nToken Usage:")
        if token_usage['gpt-4']['total_tokens'] > 0:
            print(f"gpt-4:")
            print(f"  Prompt Tokens: {token_usage['gpt-4']['prompt_tokens']}")
            print(f"  Completion Tokens: {token_usage['gpt-4']['completion_tokens']}")
            print(f"  Total Tokens: {token_usage['gpt-4']['total_tokens']}")
            print(f"  Cost: ${costs['gpt-4']:.6f}")

        if token_usage['gpt-3.5-turbo']['total_tokens'] > 0:
            print(f"gpt-3.5-turbo:")
            print(f"  Total Tokens: {token_usage['gpt-3.5-turbo']['total_tokens']}")
            print(f"  Cost: ${costs['gpt-3.5-turbo']:.6f}")

        # The embedding line is always printed, even when the count is zero
        print(f"text-embedding-ada-002:")
        print(f"  Total Tokens: {token_usage['text-embedding-ada-002']['total_tokens']}")
        print(f"  Cost: ${costs['text-embedding-ada-002']:.6f}")

        print(f"\nEstimated Total Cost: ${total_cost:.6f}")

# Attach the callback to the submit button
submit_button.on_click(on_submit_button_clicked)

# Function to handle file upload and processing
def handle_file_upload(change):
    """
    Process a newly uploaded CSV and fill the patient textarea from it.

    The CSV must contain exactly one patient record. That record is shown,
    turned into a profile paragraph with generate_patient_paragraph(), and
    the paragraph is written to `patient_data_textarea`. All messages go to
    `output_area`.

    Args:
        change: the change notification passed by `observe`; not used, the
            current value is read from `upload_widget` directly.
    """
    with output_area:
        clear_output()
        uploaded = upload_widget.value
        if not uploaded:
            return

        # ipywidgets 7 exposes uploads as a dict keyed by file name,
        # ipywidgets 8 as a tuple of per-file dicts; accept both.
        files = uploaded.values() if isinstance(uploaded, dict) else uploaded
        uploaded_file = next(iter(files))
        content = uploaded_file['content']
        try:
            # Read CSV content into DataFrame using io.BytesIO
            df = pd.read_csv(io.BytesIO(content))
            if df.empty:
                print("The uploaded CSV file is empty.")
                return
            if len(df) != 1:
                print("Please upload a CSV file with exactly one patient record.")
                return

            # Convert the single row to a dictionary
            patient_info = df.iloc[0].to_dict()
            print("**Patient Information:**")
            display(df)

            # Generate paragraph using OpenAI
            print("\n**Generating patient profile paragraph...**\n")
            paragraph = generate_patient_paragraph(patient_info)
            if paragraph.startswith("Error generating paragraph:"):
                # The generator reports failures as text; show the message but
                # do not put it into the textarea as if it were patient data.
                print(paragraph)
                return
            print("**Generated Patient Profile:**\n")
            print(paragraph)

            # Insert the paragraph into the patient data textarea
            patient_data_textarea.value = paragraph

        except Exception as e:
            print(f"Error processing CSV file: {e}")

# Link the upload event to the handler function
upload_widget.observe(handle_file_upload, names='value')

# Startup: read the NCT IDs and pull the matching trials before showing the UI
csv_file_path = 'table_trials_andy.csv'  # Replace with your CSV file path
nct_numbers = load_nct_numbers(csv_file_path)
trials_data = fetch_trials_by_nct_numbers(nct_numbers)
total_trials_fetched = len(trials_data)

# Render the app: widgets stacked vertically in the order they are used
display(widgets.VBox(children=[
    upload_widget,
    patient_data_textarea,
    include_solid_tumor_checkbox,
    submit_button,
    output_area,
]))


Could not find data for the following NCT IDs:
['NCT0580550']


VBox(children=(FileUpload(value={}, accept='.csv', description='Upload CSV'), Textarea(value='', description='…