# Clinical trials analysis

#### Reading clinical trials data using pandas

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("ctg-studies.csv")

#### Looking at the data

In [3]:
df.head()

Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Study Design,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents
0,NCT05013879,Kinesiotape for Edema After Bilateral Total Kn...,https://clinicaltrials.gov/study/NCT05013879,,COMPLETED,The purpose of this study is to determine if k...,NO,"Arthroplasty Complications|Arthroplasty, Repla...",DEVICE: Kinesio(R)Tape for edema control,Change from baseline and during 1-2-day time i...,...,Allocation: RANDOMIZED|Intervention Model: SIN...,2021-13203,2021-10-18,2023-11-24,2023-11-24,2021-08-19,,2024-02-23,"Burke Rehabilitation Hospital, White Plains, N...",
1,NCT01402479,An Open-labeled Trial of Ramipril in Patients ...,https://clinicaltrials.gov/study/NCT01402479,,COMPLETED,Physiology of migraine involving renin-angiote...,NO,Migraine With Hypertension,DRUG: Ramipril,"headache frequency, headache days, 12 week",...,Allocation: NON_RANDOMIZED|Intervention Model:...,0408-131-005,2004-10,2005-07,2005-07,2011-07-26,,2011-08-08,"Seoul National University Hospital, Seoul, 110...",
2,NCT00812279,Investigate the Exposure to Selected Smoke Con...,https://clinicaltrials.gov/study/NCT00812279,,COMPLETED,The overall purpose of this clinical study con...,NO,Smoking,OTHER: Distillation based smoking article (SMA...,To demonstrate a reduction in the three primar...,...,Allocation: RANDOMIZED|Intervention Model: PAR...,YVD-CS01-EU,2008-11,2009-02,2009-02,2008-12-22,,2019-11-07,"MTZ Clinical Research Inc., Warsaw, 02-106, Po...",
3,NCT03878979,Preoperative Immune Checkpoint Inhibitor for P...,https://clinicaltrials.gov/study/NCT03878979,,COMPLETED,"Nivolumab, also known as (Bristol Myers Squibb...",YES,Head and Neck Squamous Cell Carcinoma|Head and...,DRUG: Nivolumab 480mg and surgical resection,Safety as Measured by Number of Participants W...,...,Allocation: NON_RANDOMIZED|Intervention Model:...,J1923|IRB00207577|CA209-9H7,2019-07-08,2023-10-17,2023-10-17,2019-03-18,2024-11-21,2024-11-21,Johns Hopkins Sidney Kimmel Comprehensive Canc...,"Study Protocol and Statistical Analysis Plan, ..."
4,NCT05865379,Efficacy and Safety of BUFY01 Versus SVS20 in ...,https://clinicaltrials.gov/study/NCT05865379,BUSTON-01,NOT_YET_RECRUITING,The goal of this interventional investigation ...,NO,Dry Eye Disease,DEVICE: BUFY01 eye drops in single-dose contai...,"Signs, Change from baseline in Oxford score (0...",...,Allocation: RANDOMIZED|Intervention Model: PAR...,BUFY01-CT-2101,2025-06,2026-03,2026-06,2023-05-18,,2025-05-14,,


In [4]:
#Column names
df.columns

Index(['NCT Number', 'Study Title', 'Study URL', 'Acronym', 'Study Status',
       'Brief Summary', 'Study Results', 'Conditions', 'Interventions',
       'Primary Outcome Measures', 'Secondary Outcome Measures',
       'Other Outcome Measures', 'Sponsor', 'Collaborators', 'Sex', 'Age',
       'Phases', 'Enrollment', 'Funder Type', 'Study Type', 'Study Design',
       'Other IDs', 'Start Date', 'Primary Completion Date', 'Completion Date',
       'First Posted', 'Results First Posted', 'Last Update Posted',
       'Locations', 'Study Documents'],
      dtype='object')

In [5]:
# rows,column number
df.shape

(543009, 30)

In [6]:
#types of data in columns
df.dtypes

NCT Number                     object
Study Title                    object
Study URL                      object
Acronym                        object
Study Status                   object
Brief Summary                  object
Study Results                  object
Conditions                     object
Interventions                  object
Primary Outcome Measures       object
Secondary Outcome Measures     object
Other Outcome Measures         object
Sponsor                        object
Collaborators                  object
Sex                            object
Age                            object
Phases                         object
Enrollment                    float64
Funder Type                    object
Study Type                     object
Study Design                   object
Other IDs                      object
Start Date                     object
Primary Completion Date        object
Completion Date                object
First Posted                   object
Results Firs

#### Filtering out terminated, withdrawn studies

In [7]:
#Options under study status
df["Study Status"].value_counts()

Study Status
COMPLETED                    297097
UNKNOWN                       78928
RECRUITING                    66701
TERMINATED                    31561
NOT_YET_RECRUITING            24075
ACTIVE_NOT_RECRUITING         21019
WITHDRAWN                     15357
ENROLLING_BY_INVITATION        4680
SUSPENDED                      1670
WITHHELD                        922
NO_LONGER_AVAILABLE             491
AVAILABLE                       261
APPROVED_FOR_MARKETING          221
TEMPORARILY_NOT_AVAILABLE        26
Name: count, dtype: int64

In [8]:
#creating a df with studies that are terminated, withdrawn, suspended, no longer available, temporarily not available
dropdf= df[df["Study Status"].isin([
    "TERMINATED", 
    "WITHDRAWN", 
    "SUSPENDED", 
    "NO_LONGER_AVAILABLE", 
    "TEMPORARILY_NOT_AVAILABLE"])].index
#.index gives the df a new index

In [9]:
#Dropping terminated, withdrawn, suspended, no longer available, temporarily not available
df.drop(dropdf, inplace=True)

In [10]:
#Checking if the terminated/withdrawn studies were dropped
df["Study Status"].value_counts()

Study Status
COMPLETED                  297097
UNKNOWN                     78928
RECRUITING                  66701
NOT_YET_RECRUITING          24075
ACTIVE_NOT_RECRUITING       21019
ENROLLING_BY_INVITATION      4680
WITHHELD                      922
AVAILABLE                     261
APPROVED_FOR_MARKETING        221
Name: count, dtype: int64

#### Removing device, behavioral, procedural, other studies

In [11]:
df["Interventions"].value_counts()

Interventions
OTHER: No intervention                                                                                 1202
OTHER: no intervention                                                                                  448
OTHER: Questionnaire                                                                                    362
OTHER: Exercise                                                                                         348
BEHAVIORAL: Exercise                                                                                    251
                                                                                                       ... 
PROCEDURE: Bronchoscope guided advancement|PROCEDURE: Conventional advancement                            1
BEHAVIORAL: Therapeutic education on fever|BEHAVIORAL: Therapeutic education on household accidents       1
BEHAVIORAL: Project With                                                                                  1
DEVICE: Comput

In [12]:
#creating a df with studies that are device, behavioral, procedural, other
drop_index = df[df["Interventions"].str.contains("OTHER|DEVICE|BEHAVIORAL|PROCEDURE|DIAGNOSTIC_TEST|DIETARY_SUPPLEMENT", case=False, na=False)].index
df.drop(drop_index, inplace=True)


In [13]:
df["Interventions"].value_counts()

Interventions
DRUG: Metformin                                              160
DRUG: Dexmedetomidine                                        132
DRUG: Rituximab                                              113
DRUG: Pembrolizumab                                          110
DRUG: Ketamine                                                91
                                                            ... 
DRUG: Ricolinostat                                             1
BIOLOGICAL: AlloVax|BIOLOGICAL: CRCL|BIOLOGICAL: AlloStim      1
DRUG: Torasemide Prolonged Release|DRUG: Furosemide            1
DRUG: Sertraline|DRUG: Venlafaxine Extended Release            1
DRUG: Iron Carboxymaltose|DRUG: Placebo                        1
Name: count, Length: 125859, dtype: int64

In [14]:
# creating a new column by extracing word before : using regular expression "^(\w+):"
df["Intervention_Type"] = df["Interventions"].str.extract(r"^(\w+):")


In [15]:
df.head()

Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,Intervention_Type
1,NCT01402479,An Open-labeled Trial of Ramipril in Patients ...,https://clinicaltrials.gov/study/NCT01402479,,COMPLETED,Physiology of migraine involving renin-angiote...,NO,Migraine With Hypertension,DRUG: Ramipril,"headache frequency, headache days, 12 week",...,0408-131-005,2004-10,2005-07,2005-07,2011-07-26,,2011-08-08,"Seoul National University Hospital, Seoul, 110...",,DRUG
3,NCT03878979,Preoperative Immune Checkpoint Inhibitor for P...,https://clinicaltrials.gov/study/NCT03878979,,COMPLETED,"Nivolumab, also known as (Bristol Myers Squibb...",YES,Head and Neck Squamous Cell Carcinoma|Head and...,DRUG: Nivolumab 480mg and surgical resection,Safety as Measured by Number of Participants W...,...,J1923|IRB00207577|CA209-9H7,2019-07-08,2023-10-17,2023-10-17,2019-03-18,2024-11-21,2024-11-21,Johns Hopkins Sidney Kimmel Comprehensive Canc...,"Study Protocol and Statistical Analysis Plan, ...",DRUG
5,NCT03791879,Caudal Dexmedetomidine Analgesia in Pediatrics .,https://clinicaltrials.gov/study/NCT03791879,,COMPLETED,Dexmedetomidine (DEXM) is a highly selective α...,NO,164 Boys for Hypospadias Surgery Under General...,DRUG: Caudal dexamedatomidine analgesia,Time to (1st analgesic request objective pain ...,...,MFM IR.18.11.322 - 2018/11/11,2019-01-01,2020-10-01,2020-10-15,2019-01-03,,2021-03-01,"Anesthesia department,Faculty of medicine, Man...",,DRUG
6,NCT03877679,The Effect of Topical Curcumin Versus Topical ...,https://clinicaltrials.gov/study/NCT03877679,,UNKNOWN,Introduce a new anti-inflammatory and antioxid...,NO,Oral Lichen Planus,DRUG: Triamcinolone|DRUG: Turmeric paste,"Pain intensity, measured by Visual Analog Scal...",...,OMED2:5:1,2019-05-01,2020-05-01,2020-06-01,2019-03-18,,2019-03-18,,,DRUG
8,NCT00684879,Screening Behavior in Adults With Hereditary H...,https://clinicaltrials.gov/study/NCT00684879,,COMPLETED,This study will explore the factors that influ...,NO,Osler-Rendu-Weber Disease|Osler-Rendu Disease|...,,,...,999908143|08-HG-N143,2008-05-21,,2016-01-07,2008-05-28,,2019-11-25,National Human Genome Research Institute (NHGR...,,


In [16]:
#Number of Drugs, Biologicals, radiations etc.
df["Intervention_Type"].value_counts()

Intervention_Type
DRUG                   141020
BIOLOGICAL              18087
RADIATION                2823
COMBINATION_PRODUCT      1672
GENETIC                  1546
Name: count, dtype: int64

In [17]:
# Show rows where "Interventions" contains the word "Radiation"
df[df["Interventions"].str.contains("RADIATION", case=True, na=False)]

Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,Intervention_Type
567,NCT05944679,Clinical Utility to Follow-up Radiographs Duri...,https://clinicaltrials.gov/study/NCT05944679,,COMPLETED,Primary prosthetic surgery for the treatment o...,NO,Prosthetic Knee Surgery,RADIATION: No Xray|RADIATION: Xray,"KSS Test change, Change value from preoperativ...",...,Radioprotesi,2022-01-20,2024-05-31,2024-05-31,2023-07-13,,2024-08-14,"Hospital Parc taulí, Sabadell, Barcelona, 0820...",,RADIATION
768,NCT04931979,SRT in Combination With Pembrolizumab in Patie...,https://clinicaltrials.gov/study/NCT04931979,Pembro-SRT,RECRUITING,To evaluate the efficacy and safety of a pembr...,NO,Urologic Cancer|Biochemical Recurrence of Mali...,DRUG: Pembrolizumab Injection [Keytruda]|RADIA...,"Complete biochemical response, number of patie...",...,MK-3475-C51|2021-001291-42|P003141,2022-10-20,2025-07-01,2026-04-01,2021-06-18,,2023-03-01,"Clinic of Urology, Medical Center - University...",,DRUG
982,NCT04058379,Artificial Intelligence Analysis of Initial Sc...,https://clinicaltrials.gov/study/NCT04058379,RADIOMIC-TBI,UNKNOWN,We assume that an early iterative automatic CT...,NO,"Trauma, Brain",RADIATION: CT scan,Clinical evolution during first 7 days in ICU ...,...,38RC19.193,2020-01-01,2021-04-12,2021-10-12,2019-08-15,,2021-04-28,"University Hospital Grenoble, Grenoble, France",,RADIATION
1072,NCT02802579,ECG Triggered Dual Source CT for Non-invasive ...,https://clinicaltrials.gov/study/NCT02802579,,COMPLETED,Coronary arterial disease is a risk factor for...,NO,"Obesity, Morbid",RADIATION: standard protocol|RADIATION: enhanc...,"Image quality, Coronary arteries (with at leas...",...,RAD0701,2007-12,2015-12,2015-12,2016-06-16,,2016-06-17,"Cantonal Hospital St Gallen, St. Gallen, 9007,...",,RADIATION
1106,NCT03223779,Study of TAS-102 Plus Radiation Therapy for th...,https://clinicaltrials.gov/study/NCT03223779,,UNKNOWN,This research study is studying a drug in comb...,NO,Colorectal Cancer,DRUG: TAS-102|RADIATION: Photon SBRT,"Maximum Tolerated Dose (MTD), MTD will be dete...",...,17-231,2017-10-13,2023-01,2025-01,2017-07-21,,2022-09-08,"Massachusetts General Hospital, Boston, Massac...",,DRUG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542297,NCT02999087,Randomized Trial of Avelumab-cetuximab-radioth...,https://clinicaltrials.gov/study/NCT02999087,REACH,ACTIVE_NOT_RECRUITING,The purpose of this study is to demonstrate th...,NO,HNSCC,DRUG: Cetuximab|DRUG: avelumab|DRUG: Cisplatin...,"Progression free survival, Time between random...",...,GORTEC 2017-01,2017-09-14,2027-12,2027-12,2016-12-21,,2025-05-07,"Centre Hospitalier Bretagne Sud, Lorient, 5632...",,DRUG
542553,NCT05727787,vGRID SBRT: A Phase I Clinical Trial in Unrese...,https://clinicaltrials.gov/study/NCT05727787,vGRID SBRT,RECRUITING,This trial will provide the maximum tolerated ...,NO,Liver Cancer,RADIATION: Stereotactic Body Radiation Treatment,Maximum tolerated dose (MTD) of Single Fractio...,...,IIT-2021-GRID-SBRT-HCC,2023-02-23,2025-05-28,2025-12-28,2023-02-14,,2025-04-04,"University of Kansas Medical Center, Kansas Ci...",,RADIATION
542692,NCT04357587,Safety and Feasibility of PD-1 Blockade in the...,https://clinicaltrials.gov/study/NCT04357587,,COMPLETED,Colorectal cancer is the third most common can...,NO,Rectal Neoplasms,DRUG: Pembrolizumab|RADIATION: External beam r...,Rate of adverse events (AEs) as defined by the...,...,CASE1220,2020-08-06,2023-09-25,2023-09-25,2020-04-22,,2024-05-29,"Cleveland Clinic, Case Comprehensive Cancer Ce...",,DRUG
542905,NCT03277287,Assessment of Fractional CO2 Laser in Treatmen...,https://clinicaltrials.gov/study/NCT03277287,,COMPLETED,This is a prospective comparative study on 120...,NO,Cleft Lip Post-surgical Scar,RADIATION: Fractional CO2 laser,"Vancouver scar scale, It assess four variables...",...,CO2 laser in cleft lip scar,2017-09-01,2020-01-30,2020-02-15,2017-09-11,,2020-04-21,"Mohamed shadad, Assiut, Egypt",,RADIATION


In [184]:
# Show rows where "Interventions" contains the word "DRUG"
df[df["Interventions"].str.contains("GENETIC", case=True, na=False)]


Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,Intervention_Type
229,NCT05203679,Evaluation of the Safety and Efficacy of Hemop...,https://clinicaltrials.gov/study/NCT05203679,,ACTIVE_NOT_RECRUITING,"This is a multi-center, single-arm, open-label...",NO,Hemophilia B,GENETIC: Single dose intravenous injection of ...,"Annualized bleeding rate (ABR), To assess ABR,...",...,BBM001-CLN1001,2021-12-30,2024-04-16,2028-06-30,2022-01-24,,2025-02-26,"Anhui Provincial Hospital, Hefei, Anhui, 23002...",,GENETIC
318,NCT03557879,Exome Analysis in Hearing Impaired Patients,https://clinicaltrials.gov/study/NCT03557879,NGS-NSHL,UNKNOWN,Hearing impairment is the most frequent sensor...,NO,Hearing Impairment,GENETIC: Exome sequencing,"identification of candidate genes, Description...",...,UF96802,2018-06-04,2018-12-31,2019-06-01,2018-06-15,,2018-06-15,"Uhmontpellier, Montpellier, 34295, France",,GENETIC
903,NCT05402579,Diabetic Ketoacidosis From New SGLT2i: Can Gen...,https://clinicaltrials.gov/study/NCT05402579,DaNGER,RECRUITING,Sodium glucose co-transporter 2 (SGLT2) inhibi...,NO,Diabetes Type 2|DKA|Diabetic Ketoacidosis,GENETIC: Genomic analysis,Identification of genomic variants associated ...,...,CTO 3737,2022-07-29,2025-02-01,2025-02-01,2022-06-02,,2024-11-20,St. Joseph's Health Centre (Unity Health Toron...,,GENETIC
1186,NCT02354079,HYPOCHOL : A Genetically-based Strategy to Ide...,https://clinicaltrials.gov/study/NCT02354079,HYPOCHOL,ACTIVE_NOT_RECRUITING,The aim of this study is to identify new targe...,NO,Hypobetalipoproteinemia,GENETIC: hypobetalipoproteinemia genetic and g...,type and number of genetic abnormalities leadi...,...,RC14_0400,2016-01-07,2022-01-07,2032-01-07,2015-02-03,,2023-08-04,"CHU de Nantes, Nantes, 44093, France",,GENETIC
1270,NCT03955679,AveXis Managed Access Program Cohort for Acces...,https://clinicaltrials.gov/study/NCT03955679,,APPROVED_FOR_MARKETING,The purpose of this Cohort Treatment Protocol ...,NO,Spinal Muscular Atrophy,GENETIC: AVXS-101,,...,AVXS-101-MAP-001,,,,2019-05-20,,2020-08-28,"Nationwide Children's Hospital, Columbus, Ohio...",,GENETIC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540235,NCT06652633,Long-term Follow-up of Participants Treated wi...,https://clinicaltrials.gov/study/NCT06652633,Hesperia,RECRUITING,This is a long-term follow-up study for partic...,NO,Hematological Malignancies,GENETIC: GLPG CAR T-cell therapy,Percentage of participants with targeted adver...,...,LTF-CL-001|2023-510173-34-00,2024-09-09,2039-07,2039-07,2024-10-22,,2024-10-22,"Antwerp University Hospital, Edegem, 2650, Bel...",,GENETIC
540412,NCT06103487,Long Term Follow-Up for RGX-111,https://clinicaltrials.gov/study/NCT06103487,,ENROLLING_BY_INVITATION,RGX-111-5101 is a long-term follow up study th...,NO,Mucopolysaccharidosis I,GENETIC: No Intervention,"Evaluation of the long-term safety of RGX-111,...",...,RGX-111-5101,2023-07-24,2027-09,2027-09,2023-10-26,,2024-07-03,"Children's Hospital of Orange County, Orange, ...",,GENETIC
540703,NCT03392987,A Safety and Efficacy Study of Cryopreserved O...,https://clinicaltrials.gov/study/NCT03392987,,ACTIVE_NOT_RECRUITING,OTL-200 is autologous CD34+ cells transduced w...,NO,Lysosomal Storage Disease|Metachromatic Leukod...,GENETIC: OTL-200,Change in Gross Motor Function Measure (GMFM) ...,...,205756|2017-001730-26,2018-01-25,2022-04-06,2026-01-31,2018-01-08,,2025-01-27,Ospedale San Raffaele - Telethon Institute for...,,GENETIC
542227,NCT06042387,Genome Analysis Across Populations in Inflamma...,https://clinicaltrials.gov/study/NCT06042387,,RECRUITING,"Objective:\n\nTo use clinical, genetic and gen...",NO,Ulcerative Colitis|Healthy|Crohn Disease|Infla...,GENETIC: Blood or saliva sample collection,"Genomics studies across populations, Whole Exo...",...,02-0234-E|U01DK062423,2022-05-01,2027-05,2027-09,2023-09-18,,2025-03-25,"Sinai Health System, Toronto, Ontario, M5G 1X5...",,GENETIC


In [18]:
#Removing radiation rows as well
dropradiation=df[df["Interventions"].str.contains("RADIATION", case=True, na=False)].index

In [19]:
df.drop(dropradiation, inplace=True)

In [20]:
# Getting number of various therapies 
df["Interventions"].value_counts()

Interventions
DRUG: Metformin                                               160
DRUG: Dexmedetomidine                                         132
DRUG: Rituximab                                               113
DRUG: Pembrolizumab                                           110
DRUG: Ketamine                                                 91
                                                             ... 
DRUG: Bonefos (Clodronate, BAY94-8393)                          1
DRUG: Disitamab Vedotin for injection                           1
DRUG: Liposomal Cyclosporine A|DRUG: standard of care           1
DRUG: clarithromycin, metronidazole, proton pump inhibitor      1
DRUG: Iron Carboxymaltose|DRUG: Placebo                         1
Name: count, Length: 122320, dtype: int64

In [21]:
#getting number of drugs, biologicals, genetic and combination therapies
df["Intervention_Type"].value_counts()

Intervention_Type
DRUG                   139947
BIOLOGICAL              17897
COMBINATION_PRODUCT      1664
GENETIC                  1542
Name: count, dtype: int64

In [22]:
df.head()

Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,Intervention_Type
1,NCT01402479,An Open-labeled Trial of Ramipril in Patients ...,https://clinicaltrials.gov/study/NCT01402479,,COMPLETED,Physiology of migraine involving renin-angiote...,NO,Migraine With Hypertension,DRUG: Ramipril,"headache frequency, headache days, 12 week",...,0408-131-005,2004-10,2005-07,2005-07,2011-07-26,,2011-08-08,"Seoul National University Hospital, Seoul, 110...",,DRUG
3,NCT03878979,Preoperative Immune Checkpoint Inhibitor for P...,https://clinicaltrials.gov/study/NCT03878979,,COMPLETED,"Nivolumab, also known as (Bristol Myers Squibb...",YES,Head and Neck Squamous Cell Carcinoma|Head and...,DRUG: Nivolumab 480mg and surgical resection,Safety as Measured by Number of Participants W...,...,J1923|IRB00207577|CA209-9H7,2019-07-08,2023-10-17,2023-10-17,2019-03-18,2024-11-21,2024-11-21,Johns Hopkins Sidney Kimmel Comprehensive Canc...,"Study Protocol and Statistical Analysis Plan, ...",DRUG
5,NCT03791879,Caudal Dexmedetomidine Analgesia in Pediatrics .,https://clinicaltrials.gov/study/NCT03791879,,COMPLETED,Dexmedetomidine (DEXM) is a highly selective α...,NO,164 Boys for Hypospadias Surgery Under General...,DRUG: Caudal dexamedatomidine analgesia,Time to (1st analgesic request objective pain ...,...,MFM IR.18.11.322 - 2018/11/11,2019-01-01,2020-10-01,2020-10-15,2019-01-03,,2021-03-01,"Anesthesia department,Faculty of medicine, Man...",,DRUG
6,NCT03877679,The Effect of Topical Curcumin Versus Topical ...,https://clinicaltrials.gov/study/NCT03877679,,UNKNOWN,Introduce a new anti-inflammatory and antioxid...,NO,Oral Lichen Planus,DRUG: Triamcinolone|DRUG: Turmeric paste,"Pain intensity, measured by Visual Analog Scal...",...,OMED2:5:1,2019-05-01,2020-05-01,2020-06-01,2019-03-18,,2019-03-18,,,DRUG
8,NCT00684879,Screening Behavior in Adults With Hereditary H...,https://clinicaltrials.gov/study/NCT00684879,,COMPLETED,This study will explore the factors that influ...,NO,Osler-Rendu-Weber Disease|Osler-Rendu Disease|...,,,...,999908143|08-HG-N143,2008-05-21,,2016-01-07,2008-05-28,,2019-11-25,National Human Genome Research Institute (NHGR...,,


In [190]:
df.columns

Index(['NCT Number', 'Study Title', 'Study URL', 'Acronym', 'Study Status',
       'Brief Summary', 'Study Results', 'Conditions', 'Interventions',
       'Primary Outcome Measures', 'Secondary Outcome Measures',
       'Other Outcome Measures', 'Sponsor', 'Collaborators', 'Sex', 'Age',
       'Phases', 'Enrollment', 'Funder Type', 'Study Type', 'Study Design',
       'Other IDs', 'Start Date', 'Primary Completion Date', 'Completion Date',
       'First Posted', 'Results First Posted', 'Last Update Posted',
       'Locations', 'Study Documents', 'Intervention_Type'],
      dtype='object')

In [23]:
#Dropping columns unnecessary for this analysis
df.drop(["Study URL","Acronym","Brief Summary", "Other IDs","Study Design","Study Documents","Primary Outcome Measures","Secondary Outcome Measures", "Other Outcome Measures"], axis=1, inplace=True)

In [24]:
df.columns

Index(['NCT Number', 'Study Title', 'Study Status', 'Study Results',
       'Conditions', 'Interventions', 'Sponsor', 'Collaborators', 'Sex', 'Age',
       'Phases', 'Enrollment', 'Funder Type', 'Study Type', 'Start Date',
       'Primary Completion Date', 'Completion Date', 'First Posted',
       'Results First Posted', 'Last Update Posted', 'Locations',
       'Intervention_Type'],
      dtype='object')

In [25]:
df.head()

Unnamed: 0,NCT Number,Study Title,Study Status,Study Results,Conditions,Interventions,Sponsor,Collaborators,Sex,Age,...,Funder Type,Study Type,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Intervention_Type
1,NCT01402479,An Open-labeled Trial of Ramipril in Patients ...,COMPLETED,NO,Migraine With Hypertension,DRUG: Ramipril,Seoul National University Hospital,,ALL,"ADULT, OLDER_ADULT",...,OTHER,INTERVENTIONAL,2004-10,2005-07,2005-07,2011-07-26,,2011-08-08,"Seoul National University Hospital, Seoul, 110...",DRUG
3,NCT03878979,Preoperative Immune Checkpoint Inhibitor for P...,COMPLETED,YES,Head and Neck Squamous Cell Carcinoma|Head and...,DRUG: Nivolumab 480mg and surgical resection,Sidney Kimmel Comprehensive Cancer Center at J...,Bristol-Myers Squibb,ALL,"ADULT, OLDER_ADULT",...,OTHER,INTERVENTIONAL,2019-07-08,2023-10-17,2023-10-17,2019-03-18,2024-11-21,2024-11-21,Johns Hopkins Sidney Kimmel Comprehensive Canc...,DRUG
5,NCT03791879,Caudal Dexmedetomidine Analgesia in Pediatrics .,COMPLETED,NO,164 Boys for Hypospadias Surgery Under General...,DRUG: Caudal dexamedatomidine analgesia,Mansoura University,,MALE,CHILD,...,OTHER,INTERVENTIONAL,2019-01-01,2020-10-01,2020-10-15,2019-01-03,,2021-03-01,"Anesthesia department,Faculty of medicine, Man...",DRUG
6,NCT03877679,The Effect of Topical Curcumin Versus Topical ...,UNKNOWN,NO,Oral Lichen Planus,DRUG: Triamcinolone|DRUG: Turmeric paste,Cairo University,,ALL,"CHILD, ADULT, OLDER_ADULT",...,OTHER,INTERVENTIONAL,2019-05-01,2020-05-01,2020-06-01,2019-03-18,,2019-03-18,,DRUG
8,NCT00684879,Screening Behavior in Adults With Hereditary H...,COMPLETED,NO,Osler-Rendu-Weber Disease|Osler-Rendu Disease|...,,National Human Genome Research Institute (NHGRI),,ALL,"ADULT, OLDER_ADULT",...,NIH,OBSERVATIONAL,2008-05-21,,2016-01-07,2008-05-28,,2019-11-25,National Human Genome Research Institute (NHGR...,


In [26]:
df["Funder Type"].value_counts()

Funder Type
OTHER        118421
INDUSTRY      77309
NIH            6950
OTHER_GOV      5107
NETWORK        1781
FED            1157
INDIV           304
UNKNOWN          38
Name: count, dtype: int64

In [27]:
#Remove Other, NIH, Other_gov, network, fed, Indiv, unknown
#creating a df with studies that are Other, NIH, Other_gov, network, fed, Indiv, unknown

drop_fundtype = df[df["Funder Type"].str.contains("OTHER|NIH|OTHER_GOV|NETWORK|FED|INDIV|UNKNOWN", case=True, na=False)].index



In [28]:
df.drop(drop_fundtype, inplace=True)

In [29]:
df["Funder Type"].value_counts()

Funder Type
INDUSTRY    77309
Name: count, dtype: int64

In [30]:
df.head()

Unnamed: 0,NCT Number,Study Title,Study Status,Study Results,Conditions,Interventions,Sponsor,Collaborators,Sex,Age,...,Funder Type,Study Type,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Intervention_Type
10,NCT01353079,Efficacy and Safety Study Of Short Ragweed Pol...,COMPLETED,YES,Allergy,BIOLOGICAL: Short Ragweed Pollen Allergenic Ex...,Greer Laboratories,,ALL,ADULT,...,INDUSTRY,INTERVENTIONAL,2011-04,2011-11,2012-04,2011-05-12,2015-02-05,2015-02-05,"Site 7, Warrenton, Virginia, 20186, United States",BIOLOGICAL
18,NCT02840279,A Multiple Ascending Dose Study of BPN14770 in...,COMPLETED,NO,Alzheimer's Disease,DRUG: BPN14770|DRUG: Placebo,Tetra Discovery Partners,,ALL,"ADULT, OLDER_ADULT",...,INDUSTRY,INTERVENTIONAL,2016-06,2016-11,2016-12,2016-07-21,,2017-01-18,"Jasper Clinic, Kalamazoo, Michigan, 49007, Uni...",DRUG
23,NCT00721279,Sifrol (Pramipexole) Onset of Action and Impac...,COMPLETED,YES,Restless Legs Syndrome,DRUG: Sifrol® (pramipexole dihydrochloride),Boehringer Ingelheim,,ALL,"ADULT, OLDER_ADULT",...,INDUSTRY,OBSERVATIONAL,2007-09,2008-10,,2008-07-24,2010-01-08,2014-06-04,"Boehringer Ingelheim Investigational Site, Abs...",DRUG
39,NCT03284879,Post-Marketing Surveillance Study of OTEZLA,COMPLETED,NO,Psoriasis,,Amgen,,ALL,"CHILD, ADULT, OLDER_ADULT",...,INDUSTRY,OBSERVATIONAL,2017-09-05,2021-10-31,2022-03-20,2017-09-15,,2022-04-29,"Tugi dermatology clinic, Kitakyushu, Fukuoka, ...",
48,NCT06091579,"A Study to Evaluate the Safety, Tolerability a...",COMPLETED,NO,Healthy Volunteers,DRUG: Treprostinil Palmitil Inhalation Powder|...,Insmed Incorporated,,ALL,ADULT,...,INDUSTRY,INTERVENTIONAL,2020-09-17,2021-01-12,2021-01-12,2023-10-19,,2023-10-19,"USA001, Austin, Texas, 78744, United States",DRUG


In [31]:
# Which columns have the most NA values, can we remove the rows corresponding them
df.isnull().sum()

NCT Number                     0
Study Title                    0
Study Status                   0
Study Results                  0
Conditions                   937
Interventions               6248
Sponsor                        0
Collaborators              66144
Sex                         1002
Age                          922
Phases                     11162
Enrollment                  2260
Funder Type                  922
Study Type                   922
Start Date                  2116
Primary Completion Date     5475
Completion Date             4756
First Posted                   0
Results First Posted       55910
Last Update Posted             0
Locations                   9222
Intervention_Type           6249
dtype: int64

In [32]:
#Phases are essential for this analysis, hence removing rows with no disclosed phases
df["Phases"].value_counts()

Phases
PHASE1           23640
PHASE3           17113
PHASE2           15603
PHASE4            5017
PHASE1|PHASE2     4243
PHASE2|PHASE3     1082
EARLY_PHASE1       371
Name: count, dtype: int64

In [33]:
df["Phases"].isnull().sum()

11162

In [34]:
#also removing rows with no interventions, conditions, funder type, study type
df.dropna(subset=["Phases", "Interventions", "Conditions","Funder Type","Study Type"], inplace=True)

In [203]:
df.isnull().sum()

NCT Number                     0
Study Title                    0
Study Status                   0
Study Results                  0
Conditions                     0
Interventions                  0
Sponsor                        0
Collaborators              56719
Sex                           39
Age                            0
Phases                         0
Enrollment                   806
Funder Type                    0
Study Type                     0
Start Date                   659
Primary Completion Date     3712
Completion Date             3009
First Posted                   0
Results First Posted       46254
Last Update Posted             0
Locations                   6725
Intervention_Type              0
dtype: int64

In [35]:
#Remove Collaborators, results posted
df.drop(["Collaborators","Results First Posted"], axis=1, inplace=True)

In [36]:
df.head()

Unnamed: 0,NCT Number,Study Title,Study Status,Study Results,Conditions,Interventions,Sponsor,Sex,Age,Phases,Enrollment,Funder Type,Study Type,Start Date,Primary Completion Date,Completion Date,First Posted,Last Update Posted,Locations,Intervention_Type
10,NCT01353079,Efficacy and Safety Study Of Short Ragweed Pol...,COMPLETED,YES,Allergy,BIOLOGICAL: Short Ragweed Pollen Allergenic Ex...,Greer Laboratories,ALL,ADULT,PHASE3,429.0,INDUSTRY,INTERVENTIONAL,2011-04,2011-11,2012-04,2011-05-12,2015-02-05,"Site 7, Warrenton, Virginia, 20186, United States",BIOLOGICAL
18,NCT02840279,A Multiple Ascending Dose Study of BPN14770 in...,COMPLETED,NO,Alzheimer's Disease,DRUG: BPN14770|DRUG: Placebo,Tetra Discovery Partners,ALL,"ADULT, OLDER_ADULT",PHASE1,77.0,INDUSTRY,INTERVENTIONAL,2016-06,2016-11,2016-12,2016-07-21,2017-01-18,"Jasper Clinic, Kalamazoo, Michigan, 49007, Uni...",DRUG
48,NCT06091579,"A Study to Evaluate the Safety, Tolerability a...",COMPLETED,NO,Healthy Volunteers,DRUG: Treprostinil Palmitil Inhalation Powder|...,Insmed Incorporated,ALL,ADULT,PHASE1,42.0,INDUSTRY,INTERVENTIONAL,2020-09-17,2021-01-12,2021-01-12,2023-10-19,2023-10-19,"USA001, Austin, Texas, 78744, United States",DRUG
80,NCT05943379,RC48-ADC in Combination With Gemcitabine in Hi...,RECRUITING,NO,NMIBC,DRUG: RC48-ADC in Combination with gemcitabine,"RemeGen Co., Ltd.",ALL,"ADULT, OLDER_ADULT",PHASE2,85.0,INDUSTRY,INTERVENTIONAL,2023-06-08,2025-12-01,2026-12-01,2023-07-13,2023-07-13,"Peking University First Hosptital, Beijing, Be...",DRUG
83,NCT01104779,Safety and Efficacy of Cariprazine in Schizoph...,COMPLETED,YES,Schizophrenia,DRUG: Cariprazine|DRUG: Placebo,Forest Laboratories,ALL,ADULT,PHASE3,446.0,INDUSTRY,INTERVENTIONAL,2010-04-27,2011-12-15,2011-12-15,2010-04-15,2018-11-14,"Forest Investigative Site 48, Costa Mesa, Cali...",DRUG


In [37]:
df["Start Date"].value_counts()

Start Date
2007-10       272
2009-06       250
2008-06       244
2008-01       238
2010-10       232
             ... 
2008-03-05      1
2011-09-05      1
2009-10-27      1
2008-11-27      1
2009-10-07      1
Name: count, Length: 6528, dtype: int64

AttributeError: 'Index' object has no attribute 'head'