# Extracting Medical Essay Fragments from Clinical Trials API

This notebook demonstrates how to request and retrieve fragments of medical essays from the Clinical Trials API. The goal is to automate the extraction of relevant medical text fragments for further analysis or processing.

In [1]:
import numpy as np
import os
import requests
import json

## Fetching Clinical Trials Data

This section defines a function, `get_trials(expr)`, that fetches data from the Clinical Trials API. The function:

- **Builds the URL**: Incorporates the search expression (`expr`) into the API endpoint.
- **Makes the Request**: Uses the `requests` library to send an HTTP GET request to fetch the data.
- **Processes the Response**: If the request is successful, the JSON response is parsed and returned. If not, the function returns `None`.

In [5]:
# Function to fetch clinical trials data based on a given search expression
def get_trials(expr, timeout=30):
    """
    Fetches the NCT IDs of the studies matching a search expression.

    The request asks only for the `NCTId` field and for ranks 1 through 200,
    so at most the first 200 matches are returned.

    Parameters:
        expr (str): The search expression. It is inserted into the query string
            as-is, so it must already be URL-safe (e.g. "heart+attack").
        timeout (float or tuple): Seconds to wait for the server before giving up,
            forwarded to `requests.get`. Defaults to 30.

    Returns:
        dict or None: The parsed JSON response if the status code is 200,
        otherwise None.

    Raises:
        requests.exceptions.RequestException: If the request cannot be completed
            (connection error, timeout, ...).
    """
    # NOTE(review): this is the legacy "study_fields" query endpoint - confirm it is
    # still served before relying on it.
    base_url = "https://clinicaltrials.gov/api/query/study_fields?expr={EXPR}&fields=NCTId&min_rnk=1&max_rnk=200&fmt=json"
    url = base_url.format(EXPR=expr)

    # Without an explicit timeout, requests waits indefinitely on a silent server.
    response = requests.get(url, timeout=timeout)

    # Any non-200 status is reported to the caller as None.
    if response.status_code != 200:
        return None
    return response.json()

In [9]:
# Query the API for "covid", then keep only the list of per-study field records.
covid_payload = get_trials("covid")
covid = covid_payload['StudyFieldsResponse']['StudyFields']

In [12]:
# Start a fresh list of NCT IDs and seed it with the "covid" search results.
ncts = []
covid = get_trials("covid")['StudyFieldsResponse']['StudyFields']
# Keep the first 'NCTId' entry of every study record.
ncts.extend(study['NCTId'][0] for study in covid)

In [18]:
# Add the NCT IDs returned for the "heart attack" search to the running list.
# (The variable name keeps its original spelling in case other cells refer to it.)
heart_attact = get_trials("heart+attack")['StudyFieldsResponse']['StudyFields']
# Keep the first 'NCTId' entry of every study record.
ncts.extend(study['NCTId'][0] for study in heart_attact)

In [19]:
# Add the NCT IDs returned for the "cancer" search to the running list.
cancer = get_trials("cancer")['StudyFieldsResponse']['StudyFields']
# Keep the first 'NCTId' entry of every study record.
ncts.extend(study['NCTId'][0] for study in cancer)

In [20]:
# Request from https://clinicaltrials.gov/api/query/study_fields?expr={NCT}&fields=NCTId,BriefTitle,BriefSummary,BaselineGroupDescription,OutcomeMeasureDescription&fmt=json
def get_clinical_trials(nct_id, timeout=30):
    """
    Fetches the descriptive text fields of the study matching an NCT ID.

    The request asks for the fields NCTId, BriefTitle, BriefSummary,
    BaselineGroupDescription and OutcomeMeasureDescription.

    Parameters:
        nct_id (str): The NCT ID of the clinical trial, used as the search expression.
        timeout (float or tuple): Seconds to wait for the server before giving up,
            forwarded to `requests.get`. Defaults to 30.

    Returns:
        dict or None: The parsed JSON response if the status code is 200,
        otherwise None.

    Raises:
        requests.exceptions.RequestException: If the request cannot be completed
            (connection error, timeout, ...).
    """
    base_url = "https://clinicaltrials.gov/api/query/study_fields?expr={NCT}&fields=NCTId,BriefTitle,BriefSummary,BaselineGroupDescription,OutcomeMeasureDescription&fmt=json"
    url = base_url.format(NCT=nct_id)

    # Without an explicit timeout, requests waits indefinitely on a silent server.
    response = requests.get(url, timeout=timeout)

    # Any non-200 status is reported to the caller as None.
    if response.status_code != 200:
        return None
    return response.json()
    

In [21]:
# Save each section as a .txt file in the no-plain folder (NOTE: the function below only fetches the data; it does not write any file)
def save_clinical_trials(nct_id):
    """
    Fetches the descriptive text fields of the study matching an NCT ID.

    Despite its name, this function does not write anything to disk: it issues
    exactly the same request as `get_clinical_trials` and returns its result, so
    it simply delegates to that function instead of duplicating the code.

    Parameters:
        nct_id (str): The NCT ID of the clinical trial.

    Returns:
        dict or None: The clinical trial result as a dictionary if the response status code is 200,
        otherwise None.
    """
    return get_clinical_trials(nct_id)


In [22]:
def save_text(text, filename):
    """
    Saves the given text to a UTF-8 file, creating missing parent folders.

    Parameters:
        text (str): The text to be saved.
        filename (str): The path of the file to write, resolved against the
            current working directory. It may contain sub-folders
            (e.g. "ncts/NCT0001_summary.txt"); they are created when missing.
            An existing file is overwritten.
    """
    path = os.path.join(os.getcwd(), filename)

    # open() does not create intermediate folders, so make sure they exist first.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as file:
        file.write(text)

In [3]:
texts_folder = "full_texts"

# Rebuild the NCT ID list from the .txt files found in the texts folder.
ncts = []
for filename in os.listdir(texts_folder):
    # Anything that is not a .txt file is ignored.
    if not filename.endswith(".txt"):
        continue
    # The ID is the part of the name before the first "_", cut at the first "." if any.
    nct = filename.partition("_")[0].partition(".")[0]
    ncts.append(nct)

print(ncts)


['NCT00585195', 'NCT00761267', 'NCT00925002', 'NCT00935012', 'NCT00952380', 'NCT00974311', 'NCT01212991', 'NCT01463306', 'NCT01470612', 'NCT01543087', 'NCT01546038', 'NCT01557244', 'NCT01639001', 'NCT01664923', 'NCT01720524', 'NCT01747915', 'NCT01877668', 'NCT01900899', 'NCT01920061', 'NCT01934140', 'NCT01939158', 'NCT01945021', 'NCT01945775', 'NCT01962207', 'NCT01964716', 'NCT01970865', 'NCT01989676', 'NCT01994889', 'NCT02003924', 'NCT02034916', 'NCT02072824', 'NCT02092467', 'NCT02100514', 'NCT02117570', 'NCT02130557', 'NCT02133742', 'NCT02135029', 'NCT02187003', 'NCT02187744', 'NCT02213263', 'NCT02222922', 'NCT02226172', 'NCT02297438', 'NCT02310763', 'NCT02364999', 'NCT02367456', 'NCT02382796', 'NCT02384382', 'NCT02458287', 'NCT02475733', 'NCT02480153', 'NCT02484092', 'NCT02493751', 'NCT02497781', 'NCT02504294', 'NCT02528188', 'NCT02528253', 'NCT02534935', 'NCT02561195', 'NCT02573259', 'NCT02592434', 'NCT02600923', 'NCT02609828', 'NCT02650193', 'NCT02697773', 'NCT02709486', 'NCT02718

In [27]:
# A baseline/outcome description is only saved when it has more words than this.
MIN_FRAGMENT_WORDS = 100

# Every file below is written under "ncts/", so make sure that folder exists.
os.makedirs("ncts", exist_ok=True)

for nct in ncts:
    trial = get_clinical_trials(nct)
    if not trial:
        # The request did not succeed: nothing to save for this ID.
        continue

    studies = trial['StudyFieldsResponse'].get('StudyFields')
    if not studies:
        # The search matched no study, so there is no record to read.
        print(f"{nct}: no study returned, skipping")
        continue
    fields = studies[0]  # only the first matching study is used

    # Title and summary are indexed with [0] below; skip the trial if either is empty.
    if not fields['BriefTitle'] or not fields['BriefSummary']:
        print(f"{nct}: missing title or summary, skipping")
        continue
    title = fields['BriefTitle'][0]
    summary = fields['BriefSummary'][0]
    save_text(f"{title}\n{summary}", f"ncts/{nct}_summary.txt")

    # Baseline and outcome descriptions are handled identically; only the label
    # used in the file name differs.
    sections = (
        ("baseline", fields['BaselineGroupDescription']),
        ("outcome", fields['OutcomeMeasureDescription']),
    )
    for label, fragments in sections:
        for i, fragment in enumerate(fragments):
            # Keep only long fragments; the index i keeps file names unique.
            if len(fragment.split()) > MIN_FRAGMENT_WORDS:
                save_text(f"{title}\n{fragment}", f"ncts/{nct}_{label}_{i}.txt")