Figure out how to get clinical trial records from ClinicalTrials.gov

In [1]:
# Relative path to the JSON record of a single study (NCT05244993); used as the
# sample input for the helper functions defined below.
file_path = "../data/NCT05244993.json"

In [2]:
import json

def load_json_file(file_path):
    """
    Load a JSON file from the specified file path and return the parsed content.

    The file is decoded explicitly as UTF-8 instead of relying on the platform's
    default text encoding, so non-ASCII characters in the record (e.g. "≥" in
    eligibility criteria) are read the same way on every operating system.

    :param file_path: str or path-like, path to the JSON file
    :return: the decoded JSON value (a dict when the file holds a JSON object)
    :raises OSError: if the file cannot be opened
    :raises json.JSONDecodeError: if the file content is not valid JSON
    """
    # Without an explicit encoding, open() falls back to the locale's preferred
    # encoding, which is not UTF-8 on every platform.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def process_clinical_trial_data(data):
    """
    Process a JSON object of clinical trial data to extract relevant attributes.

    All values are read from ``data['protocolSection']``. A module or field that
    is absent from the record does not raise; the corresponding entry falls back
    to a placeholder instead ('Not provided' / 'Not specified', or an empty list
    for the intervention descriptions), so a batch of heterogeneous records can
    be processed without one sparse record aborting the run.

    :param data: dict, JSON object of clinical trial data
    :return: dict, structured summary of extracted information with the keys
        'NCT ID', 'Brief Title', 'Brief Summary', 'Official Title',
        'Conditions', 'Interventions Description' and 'Eligibility'
    """
    # `or {}` covers both a missing key and an explicit null in the JSON.
    protocol = (data or {}).get('protocolSection') or {}

    # Extract identification information
    identification_info = protocol.get('identificationModule') or {}

    # Extract conditions
    conditions_info = (protocol.get('conditionsModule') or {}).get('conditions')

    # Extract interventions; a record without this module yields no descriptions
    interventions_list = (
        (protocol.get('armsInterventionsModule') or {}).get('interventions') or []
    )
    interventions_descriptions = [
        intervention.get('description', 'No description provided')
        for intervention in interventions_list
    ]

    # Extract eligibility information
    eligibility_info = protocol.get('eligibilityModule') or {}
    eligibility_criteria_text = eligibility_info.get('eligibilityCriteria', 'Not provided')

    # Extract summary information
    summary_info = (protocol.get('descriptionModule') or {}).get('briefSummary')

    # Summarize the extracted information
    extracted_info = {
        'NCT ID': identification_info.get('nctId', 'Not provided'),
        'Brief Title': identification_info.get('briefTitle', 'Not provided'),
        'Brief Summary': summary_info if summary_info else 'Not provided',
        'Official Title': identification_info.get('officialTitle', 'Not provided'),
        'Conditions': conditions_info if conditions_info else 'Not provided',
        'Interventions Description': interventions_descriptions,
        'Eligibility': {
            # Plain .get() with a default (not `or`) so a legitimate False for
            # healthyVolunteers is preserved.
            'Healthy Volunteers': eligibility_info.get('healthyVolunteers', 'Not specified'),
            'Sex': eligibility_info.get('sex', 'Not specified'),
            'Minimum Age': eligibility_info.get('minimumAge', 'Not specified'),
            'Maximum Age': eligibility_info.get('maximumAge', 'Not specified'),
            'Standard Ages': eligibility_info.get('stdAges', 'Not specified'),
            # NOTE: the trailing space in this key is intentional for now;
            # existing consumers of this dict look the entry up by this exact key.
            'Criteria Text ': eligibility_criteria_text
        }
    }

    return extracted_info

# Try both helpers on the sample study: load the file at `file_path`, extract
# the summary, and display it as this cell's output.
json_data = load_json_file(file_path)
clinical_trial_summary = process_clinical_trial_data(json_data)
clinical_trial_summary

{'NCT ID': 'NCT05244993',
 'Brief Title': 'AK105 Plus Anlotinib Hydrochloride Combined With Albumin Paclitaxel as a First-line Therapy in Patients With Advanced Triple-negative Breast Cancer',
 'Brief Summary': 'This trial used a multicentre, single-arm design in which patients were treated with AK105 plus Anlotinib Hydrochloride combined with albumin paclitaxel. Patients included in this trial were advanced breast cancer with hormone receptor negative and Her2 negative. The primary endpoint is ORR, and the secondary endpoint is DCR, PFS, OS and safety.',
 'Official Title': 'AK105 Plus Anlotinib Hydrochloride Combined With Albumin Paclitaxel as a First-line Therapy in Patients With Advanced Triple-negative Breast Cancer',
 'Conditions': ['Breast Neoplasm Female'],
 'Interventions Description': ['AK105: 100mg per bottle, 200mg IV Day 1, cycled every 21 days',
  'Anlotinib Hydrochloride: 12mg per capsule, 12 mg PO once daily on Days 1-14, cycled every 21 days',
  'Albumin paclitaxel: 100

In [3]:
# NOTE(review): despite the ".json" suffix, the disabled draft below treats this
# name as a directory (it is passed to os.listdir) — confirm the path exists and
# is a folder before enabling that code.
folder = "ctg-studies.json"

# Go through all the JSON files in the folder and process them
import os

# Draft batch processor, currently disabled (kept commented out).
# def process_clinical_trials_folder(folder):
#     """
#     Process a folder of JSON files containing clinical trial data.
#
#     :param folder: str, path to the folder containing the JSON files
#     :return: list, list of structured summaries of extracted information
#     """
#     clinical_trials_summaries = []
#     for file in os.listdir(folder):
#         file_path = os.path.join(folder, file)
#         json_data = load_json_file(file_path)
#         clinical_trial_summary = process_clinical_trial_data(json_data)
#         clinical_trials_summaries.append(clinical_trial_summary)
#     return clinical_trials_summaries

# TODO: unfinished note ("Make a ...") — complete it or remove it.

In [4]:
import textwrap

# Render an extracted trial summary as readable plain text (conditions and eligibility only)
def format_clinical_trial_summary(summary):
    """
    Generate a formatted plain-text summary for a clinical trial data dictionary.

    Only the conditions and the eligibility section are rendered. Eligibility
    entries are written as ``key: value`` lines in dictionary order; the free-text
    criteria entry is emitted verbatim (not re-wrapped), preceded by a blank
    line, so its own bullet/numbering layout is preserved.

    :param summary: dict, structured summary of extracted clinical trial
        information; must contain 'Conditions' (a list of strings, or a single
        placeholder string such as 'Not provided') and 'Eligibility' (a dict)
    :return: str, formatted summary suitable for presentation to a doctor
    :raises KeyError: if 'Conditions' or 'Eligibility' is missing from summary
    """
    # Helper function to format a list with bullet points
    def format_list(items):
        # A bare string (e.g. the 'Not provided' placeholder) is one item;
        # iterating it directly would produce one bullet per character.
        if isinstance(items, str):
            items = [items]
        return "\n".join(f"• {item}" for item in items)

    # Helper function to wrap text for better readability
    def wrap_text(text, width=80):
        return "\n".join(textwrap.wrap(text, width=width))

    # Collect the pieces and join once at the end.
    parts = [
        "Clinical Trial Summary:\n\n",
        "Conditions:\n",
        f"{format_list(summary['Conditions'])}\n\n",
        "Eligibility Criteria:\n",
    ]

    for key, value in summary['Eligibility'].items():
        # The extractor currently emits this key with a trailing space
        # ('Criteria Text '); compare on the stripped key so both spellings work.
        if str(key).strip() == 'Criteria Text':
            parts.append(f"\n{value}")
        else:
            parts.append(f"{key}: {wrap_text(str(value))}\n")

    return "".join(parts)

# Render the summary extracted for the sample study as plain text and print it
formatted_summary = format_clinical_trial_summary(clinical_trial_summary)
print(formatted_summary)

Clinical Trial Summary:

Conditions:
• Breast Neoplasm Female

Eligibility Criteria:
Healthy Volunteers: False
Sex: FEMALE
Minimum Age: 18 Years
Maximum Age: 75 Years
Standard Ages: ['ADULT', 'OLDER_ADULT']

Inclusion Criteria:

* Female aged 18-75 years old.
* ECOG 0 or 1 point.
* Advanced triple-negative invasive breast cancer :

  1. The pathological classification is triple negative, specifically:

     1. ER negative: IHC\<1%.
     2. PR negative: IHC\<1%.
     3. HER2 negative: IHC-/+ or IHC++ but FISH/CISH is negative.
  2. Tumor staging: locally advanced or recurrent/metastatic breast cancer.
* If the last chemotherapy drug in the previous adjuvant/neoadjuvant treatment stage is paclitaxel, paclitaxel liposome, paclitaxel albumin or docetaxel, it will take ≥6 months from the end of treatment to enrollment.
* At least one objectively measurable lesion according to the RECIST 1.1 .
* The main organs are functioning well, and the blood test results within 14 days before enrollment