In [None]:
import numpy as np
import pandas as pd
import json

In [None]:
# This notebook pulls information about each clinical trial from clinicaltrials.gov based on its NCT ID,
# then merges it with patient enrollment information, thereby creating a retrospective trial-matching training and evaluation dataset.

In [None]:

#  OnCore database at DFCI has internal DFCI protocol number and external NCT_ID
oncore = pd.read_csv('../structured_data/oncore_all.clean.csv')
oncore = oncore[oncore.protocol_type == 'Treatment']
oncore = oncore[['protocol_no','nct_id']].rename(columns={'protocol_no':'protocol_number'})
oncore = oncore[~oncore.nct_id.isnull()]
oncore = oncore.groupby('nct_id').first().reset_index()

In [None]:
# Summarise the OnCore frame (row count, column dtypes, non-null counts).
oncore.info()

In [None]:
# Pull title, summaries and eligibility criteria from ClinicalTrials.gov (API v2)
# for every NCT ID found in OnCore, a few IDs per request.
import requests

CTGOV_STUDIES_URL = "https://clinicaltrials.gov/api/v2/studies"
IDS_PER_REQUEST = 4       # approximate number of NCT IDs sent in one query
REQUEST_TIMEOUT_S = 30    # fail instead of hanging forever on a stalled connection


def _study_to_frame(trial_dict):
    """Flatten one ClinicalTrials.gov study record into a one-row, all-string DataFrame.

    Parameters
    ----------
    trial_dict : dict
        One element of the ``studies`` list in the API response. It must contain
        ``protocolSection.identificationModule.nctId``; every other field is optional.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``nct_id``, ``title``, ``brief_summary``,
        ``is_drug``, ``detailed_summary`` and ``eligibility_criteria``. All values
        are strings; a field missing from the record becomes ``''``.

    Raises
    ------
    KeyError
        If the record has no ``protocolSection``, ``identificationModule`` or ``nctId``.
    """
    protocol = trial_dict['protocolSection']
    identification = protocol['identificationModule']
    # Optional modules: fall back to an empty dict so the lookups below never raise.
    description = protocol.get('descriptionModule') or {}
    oversight = protocol.get('oversightModule') or {}
    eligibility = protocol.get('eligibilityModule') or {}

    # `is_drug` is a boolean flag, so test for None explicitly; `or ''` would erase False.
    is_drug = oversight.get('isFdaRegulatedDrug')

    # Missing text fields are mapped to '' here; otherwise `.astype(str)` below would
    # turn None into the literal text 'None', which would then leak into the trial text.
    record = {
        'nct_id': identification['nctId'],
        'title': identification.get('officialTitle') or '',
        'brief_summary': description.get('briefSummary') or '',
        'is_drug': '' if is_drug is None else is_drug,
        'detailed_summary': description.get('detailedDescription') or '',
        'eligibility_criteria': eligibility.get('eligibilityCriteria') or '',
    }
    return pd.DataFrame([record]).astype(str).replace('nan', '')


# The batch count is derived from `oncore`, the frame whose IDs are being queried
# (`trial_frame` does not exist yet at this point on a fresh kernel).
nct_ids = oncore.nct_id.to_numpy()
chunks = np.array_split(nct_ids, len(nct_ids) // IDS_PER_REQUEST + 1)
trial_list = []

# NOTE(review): the v2 API paginates results and also documents a `filter.ids`
# parameter for NCT-number lookups; confirm that a comma-joined `query.term`
# returns every requested study on the first page.
for chunk in chunks:
    if len(chunk) == 0:
        # Nothing to look up; an empty query term would presumably not be
        # restricted to our trials.
        continue
    params = {"query.term": ",".join(chunk.tolist()), "format": "json"}

    response = requests.get(CTGOV_STUDIES_URL, params=params, timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()  # surface HTTP errors here rather than as a KeyError below
    responses = response.json()

    # One single-row frame per returned study; later cells concatenate `trial_list`.
    for trial_dict in responses.get('studies', []):
        trial_list.append(_study_to_frame(trial_dict))



In [None]:
# Number of study records collected from ClinicalTrials.gov (one single-row frame per study).
len(trial_list)

In [None]:
# Stack the per-study rows and collapse them to one row per NCT ID
# (nct_id becomes the index; the first non-null value per column is kept).
trial_frame = (
    pd.concat(trial_list)
    .groupby('nct_id')
    .first()
)
trial_frame.info()

In [None]:
# Keep only trials that have a non-empty brief summary.
trial_frame = trial_frame.loc[trial_frame['brief_summary'].ne('')]
trial_frame.info()

In [None]:
# Attach the internal protocol number from OnCore to each trial (inner join on NCT ID).
trial_frame = oncore.merge(trial_frame, on='nct_id')
trial_frame.info()

In [None]:
# Load the DFCI enrollment records, keeping rows with treatment type 'Tre' and a
# recorded treatment start date, then reduce to the columns used downstream
# under lower-case names.
enrollments = (
    pd.read_csv(
        '/data/clin_notes_outcomes/pan_dfci_2024/structured_data/PROTOCOL_ENROLLMENT_DFHCC.txt',
        sep='|',
        encoding='ISO-8859-1',
        low_memory=False,
    )
    .loc[lambda df: df['TREATMENT_TYPE_CD'] == 'Tre']
    .loc[
        lambda df: df['TREATMENT_START_DT'].notna(),
        ['DFCI_MRN', 'STUDY_NM', 'STUDY_NBR', 'TREATMENT_START_DT'],
    ]
    .rename(
        columns={
            'DFCI_MRN': 'dfci_mrn',
            'STUDY_NM': 'study_nm',
            'STUDY_NBR': 'protocol_nbr',
            'TREATMENT_START_DT': 'trial_start_dt',
        }
    )
)

# Rows without a protocol number are unspecified active protocols that are being
# masked; drop them.
known_enrollments = enrollments[enrollments['protocol_nbr'].notna()]


In [None]:
# Summarise the enrollments that carry a protocol number.
known_enrollments.info()

In [None]:
# Summarise the enrollments after collapsing to one row per (dfci_mrn, protocol_nbr) pair.
known_enrollments.groupby(['dfci_mrn','protocol_nbr']).first().info()

In [None]:
# Strip the dashes from the OnCore protocol number and cast it to a number,
# giving the 'protocol_nbr' key used to join against the enrollment records.
trial_frame['protocol_nbr'] = (
    trial_frame['protocol_number']
    .str.replace('-', '')
    .pipe(pd.to_numeric)
)

In [None]:
# Display the trial table, now including the numeric 'protocol_nbr' join key.
trial_frame

In [None]:
# Join the retrospective DFCI enrollments to the ClinicalTrials.gov trial text;
# only enrollments whose protocol number matches a retrieved trial are kept.
useful_enrollments = known_enrollments.merge(trial_frame, how='inner', on='protocol_nbr')

In [None]:
# Summarise the merged enrollment/trial table.
useful_enrollments.info()

In [None]:
# Number of distinct dfci_mrn values in the merged enrollments.
useful_enrollments.dfci_mrn.nunique()

In [None]:
# Build the free-text trial description: title, brief summary and eligibility
# criteria, separated by newlines.
useful_enrollments['trial_text'] = (
    useful_enrollments['title']
    + "\n"
    + useful_enrollments['brief_summary']
    + "\n"
    + useful_enrollments['eligibility_criteria']
)

In [None]:
# Collapse every run of whitespace (including newlines) into a single space.
useful_enrollments['trial_text'] = useful_enrollments['trial_text'].str.replace(
    pat="\\s+",
    repl=" ",
    regex=True,
)




In [None]:
# Persist the merged enrollment/trial table for downstream steps
# (the DataFrame index is written as the first column, pandas' default).
useful_enrollments.to_csv(
    path_or_buf='/data/clin_notes_outcomes/pan_dfci_2024/derived_data/useful_trial_enrollments.csv',
)