In [10]:
import pandas as pd
import re
from dateutil import parser

def my_parse_date(x, dayfirst=False):
    if x:
        return parser.parse(x, dayfirst=dayfirst).isoformat()
    return x

In [3]:
df = pd.read_json('/Users/eric/Downloads/clinical_trials_raw_2021-12-08.json.gz')
data = df.to_dict(orient='records')

In [11]:
parse_study(data[0])

{'has_results': False,
 'NCTId': 'NCT05113407',
 'other_ids': [{'type': 'org_study_id', 'id': '2021-A01876-35'}],
 'title': 'Observatory on the Use of the Shockwave Medical C2 Coronary Lithotripsy System in the General Population in France.',
 'acronym': 'France LILI',
 'summary': 'Coronary calcified lesions will have an increasing impact in the daily practice of coronary angioplasty, considering the epidemiological explosion of factors favoring coronary calcifications, first of all diabetes. Moreover, calcified lesions are underestimated in angiography and associated with an increase in angiographic complications, as well as with a worse clinical prognosis. The usual techniques for the preparation of calcified plaque, in particular rotary atherectomy, have a low penetration rate in France (3% of procedures) and are associated with an increase in per-procedural complications without clinical evidence of effectiveness4. A new device has been developed by Schockwave Medical Inc. for the 

In [12]:
data[0]

{'Rank': 1,
 'Study': {'ProtocolSection': {'IdentificationModule': {'NCTId': 'NCT05113407',
    'OrgStudyIdInfo': {'OrgStudyId': '2021-A01876-35'},
    'Organization': {'OrgFullName': 'French Cardiology Society',
     'OrgClass': 'OTHER'},
    'BriefTitle': 'Observatory on the Use of the Shockwave Medical C2 Coronary Lithotripsy System in the General Population in France.',
    'OfficialTitle': 'Observatory on the Use of the Shockwave Medical C2 Coronary Lithotripsy System in the General Population in France.',
    'Acronym': 'France LILI'},
   'StatusModule': {'StatusVerifiedDate': 'November 2021',
    'OverallStatus': 'Recruiting',
    'ExpandedAccessInfo': {'HasExpandedAccess': 'No'},
    'StartDateStruct': {'StartDate': 'November 16, 2021',
     'StartDateType': 'Actual'},
    'PrimaryCompletionDateStruct': {'PrimaryCompletionDate': 'December 2023',
     'PrimaryCompletionDateType': 'Anticipated'},
    'CompletionDateStruct': {'CompletionDate': 'June 2024',
     'CompletionDateType

In [24]:
def parse_study(input_study):
    x = input_study.get('Study')
    protocol = x.get('ProtocolSection', {})
    # Results
    results = x.get('ResultsSection')
    elt = {'has_results': (results is not None)}
    # Identification
    identification_module = protocol.get('IdentificationModule', {})
    elt['NCTId'] = identification_module.get('NCTId')
    elt['other_ids'] = []
    if identification_module.get("OrgStudyIdInfo", {}).get("OrgStudyId"):
        elt['other_ids'].append({'type': "org_study_id",
                                 "id": identification_module.get("OrgStudyIdInfo", {}).get("OrgStudyId")})
    for second_id_elt in identification_module.get("SecondaryIdInfoList", {}).get("SecondaryIdInfo", []):
        if second_id_elt.get("SecondaryId"):
            elt['other_ids'].append({'type': second_id_elt.get('SecondaryIdType'),
                                     'id': second_id_elt.get('SecondaryId')})
            if second_id_elt.get("SecondaryIdType") == "EudraCT Number":
                elt['eudraCT'] = second_id_elt.get("SecondaryId")
    elt['title'] = identification_module.get('OfficialTitle')
    elt['acronym'] = identification_module.get('Acronym')
    #description
    description_module = protocol.get('DescriptionModule', {})
    summary = description_module.get('BriefSummary')
    if summary:
        elt['summary'] = summary
    # Status
    status_module = protocol.get('StatusModule', {})
    study_start_date = status_module.get('StartDateStruct', {}).get('StartDate')
    study_start_date_type = status_module.get('StartDateStruct', {}).get('StartDateType')
    elt['study_start_date'] = my_parse_date(study_start_date)
    elt['study_start_date_type'] = study_start_date_type
    elt['status'] = status_module.get("OverallStatus")
    study_completion_date = status_module.get('CompletionDateStruct', {}).get('CompletionDate')
    study_completion_date_type = status_module.get('CompletionDateStruct', {}).get('CompletionDateType')
    elt['study_completion_date'] = my_parse_date(study_completion_date)
    elt['study_completion_date_type'] = study_completion_date_type
    study_first_submit_date = status_module.get('StudyFirstSubmitDate')
    study_first_submit_qc_date = status_module.get('StudyFirstSubmitQCDate')
    results_first_submit_date = status_module.get('ResultsFirstSubmitDate')
    results_first_submit_qc_date = status_module.get('ResultsFirstSubmitQCDate')
    elt['study_first_submit_date'] = my_parse_date(study_first_submit_date)
    elt['study_first_submit_qc_date'] = my_parse_date(study_first_submit_qc_date)
    elt['results_first_submit_date'] = my_parse_date(results_first_submit_date)
    elt['results_first_submit_qc_date'] = my_parse_date(results_first_submit_qc_date)
    # Design
    design_module = protocol.get('DesignModule', {})
    study_type = design_module.get('StudyType')
    elt['study_type'] = study_type
    design_info = design_module.get('DesignInfo', {})
    time_perspective = design_info.get('DesignTimePerspectiveList', {}).get('DesignTimePerspective', [])
    elt['time_perspective'] = time_perspective
    elt['design_allocation'] = design_info.get('DesignAllocation')
    elt['primary_purpose'] = design_info.get('DesignPrimaryPurpose')
    enrollment_info = design_module.get("EnrollmentInfo", {})
    enrollment_count = enrollment_info.get("EnrollmentCount")
    enrollment_type = enrollment_info.get("EnrollmentType")
    elt['enrollment_count'] = enrollment_count
    elt['enrollment_type'] = enrollment_type
    # References
    ref_module = protocol.get('ReferencesModule', {})
    ref_list = ref_module.get("ReferenceList", {})
    references = ref_list.get('Reference', [])
    elt['references'] = references
    for r in references:
        if 'doi:' in r.get('ReferenceCitation', '').lower():
            doi = re.sub(".*doi:", '', r.get('ReferenceCitation', '')).strip().lower()
            doi = doi.split(" ")[0]
            if doi[-1] == ".":
                doi = doi[:-1]
            r['doi'] = doi
    # Type can be result, derived or background
    elt['publications_result'] = []
    for r in references:
        if r.get('ReferenceType') in ['result', 'derived'] and 'protocol' not in r['ReferenceCitation'].lower():
            if 'doi' in r:
                elt['publications_result'].append(r['doi'])
            elif 'ReferencePMID' in r:
                elt['publications_result'].append(r['ReferencePMID'])
            elif 'ReferenceCitation' in r:
                elt['publications_result'].append(r['ReferenceCitation'])
            else:
                elt['publications_result'].append('other')
    elt['has_publications_result'] = len(elt['publications_result']) > 0
    elt['has_results_or_publications'] = elt['has_results'] or elt['has_publications_result']
    # IPD individual patient data
    ipd_module = protocol.get('IPDSharingStatementModule', {})
    ipd_sharing = ipd_module.get('IPDSharing')
    elt['ipd_sharing'] = ipd_sharing
    ipd_sharing_description = ipd_module.get('IPDSharingDescription')
    elt['ipd_sharing'] = ipd_sharing
    elt['ipd_sharing_description'] = ipd_sharing_description
    # Sponsor
    sponsor_module = protocol.get('SponsorCollaboratorsModule', {})
    lead_sponsor = sponsor_module.get('LeadSponsor', {}).get('LeadSponsorName')
    elt['lead_sponsor'] = lead_sponsor
    # ContactLocation
    locations_module = protocol.get('ContactsLocationsModule', {})
    locations = locations_module.get('LocationList', {}).get('Location', [])
    location_country = list(set(
        [x.get('LocationCountry') for x in locations if "LocationCountry" in x]))
    location_facility = list(set(
        [x.get('LocationFacility') for x in locations if "LocationFacility" in x]))
    elt['location_country'] = location_country
    elt['location_facility'] = location_facility
    contacts = locations_module.get('CentralContactList', {}).get('CentralContact', [])
    if contacts:
        elt['contacts'] = contacts
    for contact in contacts:
        if contact.get('CentralContactEMail'):
            elt['email'] = contact.get('CentralContactEMail')
            break
    officials = locations_module.get('OverallOfficialList', {}).get('OverallOfficial', [])
    if officials:
        elt['officials'] = officials
    # Intervention
    intervention_module = protocol.get('ArmsInterventionsModule', {})
    interventions = intervention_module.get('InterventionList', {}).get('Intervention', [])
    intervention_type = list(set(
        [w.get('InterventionType') for w in interventions if 'InterventionType' in w]))
    elt['intervention_type'] = intervention_type
    return elt



In [26]:
res = []
for d in data:
    res.append(parse_study(d))

In [29]:
dd = pd.DataFrame(res)

In [47]:
dd['has_email'] = dd.email.apply(lambda x:'@' in str(x))

In [51]:
dd[(dd.study_type=='Interventional') & (dd.status!='Completed')].has_email.value_counts()

True     6603
False    4659
Name: has_email, dtype: int64

In [54]:
dd[(dd.study_type=='Interventional')].groupby('status').has_email.mean()

status
Active, not recruiting     0.000000
Completed                  0.000000
Enrolling by invitation    0.000000
Not yet recruiting         0.953682
Recruiting                 0.965396
Suspended                  0.000000
Terminated                 0.000000
Unknown status             0.733538
Withdrawn                  0.000000
Name: has_email, dtype: float64

In [55]:
dd[(dd.study_type=='Interventional') & (dd.status=='Completed')]

Unnamed: 0,has_results,NCTId,other_ids,title,acronym,summary,study_start_date,study_start_date_type,status,study_completion_date,...,ipd_sharing_description,lead_sponsor,location_country,location_facility,contacts,email,officials,intervention_type,eudraCT,has_email
11,False,NCT00799565,"[{'type': 'org_study_id', 'id': '2008-01'}]",Genetic Polymorphisms in Idiopathic Mitral Val...,MVP-France,This prospective nation-wide (France) study ai...,2008-12-12T00:00:00,,Completed,2011-07-12T00:00:00,...,,French Cardiology Society,[France],"[Hôpital Lariboisière, Hôpital de la Cavale Bl...",,,"[{'OverallOfficialName': 'Albert Alain Hagège,...",[Genetic],,False
27,False,NCT03029897,"[{'type': 'org_study_id', 'id': '16-135'}]",Impact of E-reporting by Patients With Relapsi...,VigiP-SEP,Adverse drug reactions are collected exhaustiv...,2017-05-05T00:00:00,Actual,Completed,2019-04-24T00:00:00,...,,"University Hospital, Caen",[France],[CHU Caen],,,,[Other],,False
51,False,NCT00842166,"[{'type': 'org_study_id', 'id': '2008 19'}, {'...",,,The aim is to document viral etiologies of iso...,2009-02-12T00:00:00,,Completed,2011-07-12T00:00:00,...,,Assistance Publique Hopitaux De Marseille,[France],[Service des Maladies Infectieuses et Tropical...,,,"[{'OverallOfficialName': 'REMI CHARREL', 'Over...",[Other],,False
70,False,NCT01742741,"[{'type': 'org_study_id', 'id': 'France'}, {'t...",Early Feasibility Study 2 of Outpatient Contro...,,"An unblinded, randomized, cross-over design wi...",2013-05-12T00:00:00,,Completed,2013-05-12T00:00:00,...,,University of Virginia,[France],[Centre d'Investigation Clinique CHU Montipell...,,,"[{'OverallOfficialName': 'Eric Renard, MD, PhD...",[Device],,False
77,False,NCT00821925,"[{'type': 'org_study_id', 'id': '06-PP-04'}]",Estimation of Osteoporosis' Prevalence in Fran...,,The purpose of this study is to estimate Osteo...,2006-12-12T00:00:00,,Completed,,...,,Centre Hospitalier Universitaire de Nice,[France],[Department of Rheumatology],,,[{'OverallOfficialName': 'Liana Euller-Ziegler...,[Procedure],,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31691,False,NCT00429195,"[{'type': 'org_study_id', 'id': 'LIPGENE Dieta...",LIPGENE Dietary Intervention Study,,"The LIPGENE Human Dietary Intervention Study, ...",2004-02-12T00:00:00,,Completed,2007-01-12T00:00:00,...,,University College Dublin,[Ireland],"[Nutrigenomics Research Group, Institute of Mo...",,,"[{'OverallOfficialName': 'Helen M Roche, PhD',...",[Behavioral],,False
31696,True,NCT02509494,"[{'type': 'org_study_id', 'id': 'CR107372'}, {...","A Staged Phase 3 Study, Including a Double-Bli...",EBOVAC-Salone,The purpose of this study is the evaluation of...,2015-09-30T00:00:00,Actual,Completed,2019-07-03T00:00:00,...,,Janssen Vaccines & Prevention B.V.,[Sierra Leone],[],,,[{'OverallOfficialName': 'Janssen Vaccines & P...,[Biological],,False
31702,False,NCT02389660,"[{'type': 'org_study_id', 'id': 'W73/7. PR/201...",European Comparative Effectiveness Research on...,E-COMPARED,"Effective, accessible, and affordable depressi...",2015-06-12T00:00:00,Actual,Completed,2018-06-12T00:00:00,...,,"University of Social Sciences and Humanities, ...",[Poland],[University of Social Sciences and Humanities],,,"[{'OverallOfficialName': 'Roman Cieslak, PhD',...",[Behavioral],,False
31704,False,NCT01127204,"[{'type': 'org_study_id', 'id': 'ANRS 12206 MO...",Randomized Phase 3 Trial to Evaluate Two Simpl...,,The MONOD trial aim to evaluate the implementa...,2011-06-12T00:00:00,,Completed,2015-04-12T00:00:00,...,,"ANRS, Emerging Infectious Diseases","[Burkina Faso, Côte D'Ivoire]",[Service de maladies infectieuses - CHU Charle...,,,[{'OverallOfficialName': 'Marguerite Timite-Ko...,[Drug],,False
