In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportions_ztest, proportions_chisquare
import statsmodels.api as sm

In [2]:
import sys
from pathlib import Path
import os
cwd = os.getcwd()
parent = str(Path(cwd).parents[0])
sys.path.append(parent)

In [3]:
#importing custom functions for analysis

from lib.functions import ci_calc, z_test, summarizer, check_dupes, simple_logistic_regression, crosstab

#ci_calc will compute a simple confidence interval around a proportion
#summarizer gives a nice output of proportions and CIs and returns and item with them
#check_dupes helps with the date analysis
#simple_logicstic_regression and crosstab do what they say on the tin

# Data Loading and Setup

In [4]:
#Loading in all the data and creating the analysis dataset.

#Primary dataset with dates turned into dates
df = pd.read_csv(parent + '/data/final_dataset/final_dataset.csv')
df['euctr_results_date'] = pd.to_datetime(df['euctr_results_date'])
df['ctgov_results_date'] = pd.to_datetime(df['ctgov_results_date'])
df['isrctn_results_date'] = pd.to_datetime(df['isrctn_results_date'])
df['journal_pub_date'] = pd.to_datetime(df['journal_pub_date'])

#Manually collected sponsor data for regression
regression = pd.read_csv(parent + '/data/additional_data/manual_reg_data.csv')

#Regression data derived from the EUCTR
other_reg_data = pd.read_csv(parent + '/data/additional_data/spon_country_data.csv')

#the original sample (and replacements) for data on inferred dates
sample = pd.read_csv(parent + '/data/samples/euctr_search_sample_final.csv')
replacements  = pd.read_csv(parent + '/data/samples/replacement_sample.csv')
full_sample = pd.concat([sample,replacements])

#the results section scrape and making one column we need into dates
dec_results = pd.read_csv(parent + '/data/source_data/' + 'euctr_data_quality_results_scrape_dec_2020.csv.zip')
dec_results['first_version_date'] = pd.to_datetime(dec_results.first_version_date)

In [5]:
#Setting search reference dates
search_start_date = pd.to_datetime('2020-12-11')
primary_search_completion_date = pd.to_datetime('2021-07-22')
last_search_any = pd.to_datetime('2023-01-03')

# Analysis Prep

## Detail the exclusions and get the inferred status of the final sample

In [6]:
#Number and reason for exclusions
exclusions = df[df.replaced == 1]
exclusions.replaced_reason.value_counts()

Withdrawn             15
Ongoing                3
No protocol access     2
Name: replaced_reason, dtype: int64

In [7]:
sample_inferred_status = df[df.replaced.isna()][['euctr_id']].merge(full_sample[['eudract_number', 'inferred']], how='left', left_on='euctr_id', right_on='eudract_number')

In [8]:
#Inferred completion date status of the final sample
sample_inferred_status.inferred.value_counts()

0    353
1    147
Name: inferred, dtype: int64

In [9]:
exclusions_inferred_status = exclusions[['euctr_id']].merge(full_sample[['eudract_number', 'inferred']], how='left', left_on='euctr_id', right_on='eudract_number')

In [10]:
#Inferred completion date status of the trials we had to replace
exclusions_inferred_status.inferred.value_counts()

0    14
1     6
Name: inferred, dtype: int64

## Setting up the analysis dataset

In [11]:
#For the analysis, we don't want to include these excluded trials so we will make an analyses dataframe moving forward.
analysis_df = df[df.replaced != 1].reset_index(drop=True)

#Quick check the final dataset is the length we expect
assert(len(analysis_df) == 500)

In [12]:
#Here we make binary variable to indicate whether recorded results were published after our cutoff. 
#Per protocol, nothing available after we began searches should be included in the final results.
#Creating binary variables allows us to only correct for this once, rather than have to keep doing it throughout.

#Included EUCTR results
analysis_df['euctr_results_inc'] = np.where((analysis_df.euctr_results == 'Yes') & 
                                            (analysis_df.euctr_results_date <= search_start_date), 1, 0)

#Included clinicaltrials.gov results
analysis_df['ctgov_results_inc'] = np.where((analysis_df.ctgov_results == 'Yes') & 
                                            (analysis_df.ctgov_results_date <= search_start_date), 1, 0)

#Included ISRCTN results
analysis_df['isrctn_results_inc'] = np.where((analysis_df.isrctn_results == 'Yes') & 
                                            (analysis_df.isrctn_results_date <= search_start_date), 1, 0)

#Included journal results
analysis_df['journal_results_inc'] = np.where((analysis_df.journal_result == 'Yes') & 
                                            (analysis_df.journal_pub_date <= search_start_date), 1, 0)

#A catch-all for any result
analysis_df['any_results_inc'] = np.where(((analysis_df.euctr_results_inc == 1) | 
                                          (analysis_df.ctgov_results_inc == 1) | 
                                          (analysis_df.isrctn_results_inc == 1) | 
                                          (analysis_df.journal_results_inc == 1)), 1, 0)

In [206]:
#Exporting the analysis dataset so it can be used elsewhere.

analysis_df.to_csv(parent + '/data/final_dataset/' + 'analysis_df.csv')

# Main Analysis

In [13]:
analysis_df.head()

Unnamed: 0,euctr_id,dual_searched,searched_by,senior_reviewed,replaced,replaced_reason,euctr_results,euctr_results_link,euctr_results_format,euctr_results_date,...,journal_pub_date,journal_reg_numbers,team_discuss,additional_results_located,notes,euctr_results_inc,ctgov_results_inc,isrctn_results_inc,journal_results_inc,any_results_inc
0,2014-003401-15,1,JM,,,,Yes,https://www.clinicaltrialsregister.eu/ctr-sear...,Tabular,2015-08-16,...,2018-03-23,ClinicalTrials.gov,No,,First published from journal website,1,1,0,1,1
1,2011-001616-57,1,JM,,,,Yes,https://www.clinicaltrialsregister.eu/ctr-sear...,Tabular,2016-03-24,...,2018-03-14,ClinicalTrials.gov,No,,,1,0,0,1,1
2,2004-002743-27,1,JM,,,,Yes,https://www.clinicaltrialsregister.eu/ctr-sear...,Tabular,2015-04-25,...,2010-12-10,,No,,,1,1,0,1,1
3,2013-003561-34,1,JM,,,,Yes,https://www.clinicaltrialsregister.eu/ctr-sear...,Tabular,2017-10-21,...,2017-02-06,ClinicalTrials.gov,No,Additional Results: https://dx.doi.org/10.1182...,,1,1,0,1,1
4,2006-001414-33,1,JM,,,,Yes,https://www.clinicaltrialsregister.eu/ctr-sear...,Journal Article,2019-04-05,...,2010-09-24,ISRCTN,No,Additional Results: https://doi.org/10.1186/14...,,1,0,0,1,1


In [199]:
full_sample.head()

Unnamed: 0.1,Unnamed: 0,eudract_number,final_date,inferred
0,5552,2014-003401-15,2015-02-03,0
1,7336,2011-001616-57,2014-09-12,0
2,12314,2004-002743-27,2008-01-31,0
3,6567,2013-003561-34,2017-10-02,0
4,13573,2006-001414-33,2013-07-01,0


In [201]:
temp = analysis_df.merge(full_sample[['eudract_number', 'inferred']], how='left', left_on='euctr_id', right_on='eudract_number')

In [204]:
temp2.columns

Index(['euctr_id', 'dual_searched', 'searched_by', 'senior_reviewed',
       'replaced', 'replaced_reason', 'euctr_results', 'euctr_results_link',
       'euctr_results_format', 'euctr_results_date', 'ctgov_xreg', 'nct_id',
       'ctgov_results', 'ctgov_results_link', 'ctgov_results_date',
       'isrctn_xreg', 'isrctn_id', 'isrctn_results', 'isrctn_results_type',
       'isrctn_results_link', 'isrctn_additional_links', 'isrctn_results_date',
       'journal_result', 'journal_link', 'journal_source', 'journal_match',
       'journal_pub_date', 'journal_reg_numbers', 'team_discuss',
       'additional_results_located', 'notes', 'euctr_results_inc',
       'ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc',
       'any_results_inc', 'eudract_number', 'inferred'],
      dtype='object')

In [205]:
temp2 = temp[temp.inferred == 1]
temp2[(temp2.nct_id.notnull()) | (temp2.isrctn_id.notnull()) | (temp2.journal_results_inc == 1)]

Unnamed: 0,euctr_id,dual_searched,searched_by,senior_reviewed,replaced,replaced_reason,euctr_results,euctr_results_link,euctr_results_format,euctr_results_date,...,team_discuss,additional_results_located,notes,euctr_results_inc,ctgov_results_inc,isrctn_results_inc,journal_results_inc,any_results_inc,eudract_number,inferred
5,2005-005658-37,1,JM,,,,No,,,NaT,...,No,,ClinicalTrials.gov lists trial as Terminated; ...,0,0,0,0,0,2005-005658-37,1
16,2006-003995-36,1,JM,,,,No,,,NaT,...,No,,,0,0,0,1,1,2006-003995-36,1
17,2015-003397-33,1,JM,1.0,,,No,,,NaT,...,Yes,,Both registry entries and the paper are linked...,0,0,0,1,1,2015-003397-33,1
23,2009-012543-42,1,JM,,,,No,,,NaT,...,No,,,0,0,0,0,0,2009-012543-42,1
27,2012-002105-22,1,JM,,,,No,,,NaT,...,No,,EUCTR results appeared after the start of sear...,0,0,0,0,0,2012-002105-22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,2011-005017-37,1,JS,,,,No,,,NaT,...,No,Interim analysis: https://pubmed.ncbi.nlm.nih....,"PDF: Published Online: October 25, 2018.",0,0,0,1,1,2011-005017-37,1
477,2010-022699-30,1,JS,,,,No,,,NaT,...,No,These are substudy results that added on anoth...,,0,0,0,1,1,2010-022699-30,1
480,2005-003293-25,0,,,,,No,,,NaT,...,No,,Use Pubmed epub date date for this journal,0,0,0,1,1,2005-003293-25,1
484,2009-016090-14,0,,,,,No,,,NaT,...,No,,,0,0,0,0,0,2009-016090-14,1


## Results on the EUCTR

In [14]:
euctr_results = analysis_df[(analysis_df.euctr_results_inc == 1)]
total_found_euctr = len(euctr_results)
summarizer(total_found_euctr, len(analysis_df))

Outcome of Interest: 264
Total: 500
Proportion: 52.8%
95% CI: 48.42-57.18


(0.48424184223256195, 0.528, 0.5717581577674381)

In [15]:
#What do these results look like?
euctr_results.euctr_results_format.value_counts()

Tabular                                      116
CSR Synopsis                                  86
ClinicalTrials.gov Results                    24
Journal Article                                8
Tabular and Journal Article                    8
Short Report                                   6
Report                                         6
Tabular and CSR Synopsis                       4
Tabular and Short Report                       4
Notice of termination with low enrollment      1
Tabular and Report                             1
Name: euctr_results_format, dtype: int64

In [16]:
#Lets get percents and CIs for these.
results_types = euctr_results.euctr_results_format.value_counts().to_frame()
results_types['percent'] = round((results_types.euctr_results_format / total_found_euctr) * 100,2)

#Adding CIs (i'm sure there's a more elegant way to do this with .map() or .apply() 
#but it's too much of a weird edge case to bother
ci_lower = []
ci_upper = []
for x in results_types.euctr_results_format.to_list():
    cis = ci_calc(x, total_found_euctr, printer=False)
    ci_lower.append(round(cis[0] * 100,2))
    ci_upper.append(round(cis[2] * 100,2))
    
results_types['ci_lower'] = ci_lower
results_types['ci_upper'] = ci_upper

results_types = results_types.reset_index().rename(columns={'index':'results_type'})
results_types

Unnamed: 0,results_type,euctr_results_format,percent,ci_lower,ci_upper
0,Tabular,116,43.94,37.95,49.93
1,CSR Synopsis,86,32.58,26.92,38.23
2,ClinicalTrials.gov Results,24,9.09,5.62,12.56
3,Journal Article,8,3.03,0.96,5.1
4,Tabular and Journal Article,8,3.03,0.96,5.1
5,Short Report,6,2.27,0.47,4.07
6,Report,6,2.27,0.47,4.07
7,Tabular and CSR Synopsis,4,1.52,0.04,2.99
8,Tabular and Short Report,4,1.52,0.04,2.99
9,Notice of termination with low enrollment,1,0.38,-0.36,1.12


In [134]:
#Now we want to group like with like to get our final descriptive stats to report

#Total with just tabular results
summarizer(results_types[results_types.results_type == 'Tabular'].euctr_results_format[0], total_found_euctr)

Outcome of Interest: 116
Total: 264
Proportion: 43.94%
95% CI: 37.95-49.93


(0.37952383252571037, 0.4393939393939394, 0.4992640462621684)

In [18]:
#Total with just a Document
doc_types = ['CSR Synopsis', 
             'ClinicalTrials.gov Results', 
             'Journal Article', 
             'Short Report', 
             'Report', 
             'Notice of termination with low enrollment']
ci_calc(results_types[results_types.results_type.isin(doc_types)].euctr_results_format.sum(), total_found_euctr)

Proportion: 49.62%
95% CI: 43.59-55.65


(0.43589902145456233, 0.4962121212121212, 0.55652522096968)

In [19]:
#Total with both Tabular and Document results
ci_calc(results_types[~results_types.results_type.isin(doc_types + ['Tabular'])].euctr_results_format.sum(), total_found_euctr)

Proportion: 6.44%
95% CI: 3.48-9.4


(0.03478498177391675, 0.06439393939393939, 0.09400289701396203)

## Cross-Registration and results availability on other registries

In [20]:
#First we'll look at cross-registration on ClinicalTrials.gov
ctg_xreg = analysis_df[analysis_df.nct_id.notnull()]
print(f'Out of {len(analysis_df)} trials {len(ctg_xreg)} were cross-registered on ClinicalTrials.gov')
ci_calc(len(ctg_xreg),len(analysis_df))

Out of 500 trials 339 were cross-registered on ClinicalTrials.gov
Proportion: 67.8%
95% CI: 63.7-71.9


(0.6370443472619469, 0.678, 0.7189556527380532)

In [21]:
#CTG x-reg results rate
ctg_results = ctg_xreg[(ctg_xreg.ctgov_results_inc == 1)]
print(f'Of the {len(ctg_xreg)} trials registered on ClinicalTrials.gov, {len(ctg_results)} had results')
ci_calc(len(ctg_results), len(ctg_xreg))

Of the 339 trials registered on ClinicalTrials.gov, 132 had results
Proportion: 38.94%
95% CI: 33.75-44.13


(0.3374732043090307, 0.3893805309734513, 0.4412878576378719)

In [22]:
#Proportion cross-registered on the ISRCTN 
isrctn_xreg = analysis_df[analysis_df.isrctn_id.notnull()]
print(f'Out of {len(analysis_df)} trials {len(isrctn_xreg)} were cross-registered on the ISRCTN')
ci_calc(len(isrctn_xreg), len(analysis_df))

Out of 500 trials 32 were cross-registered on the ISRCTN
Proportion: 6.4%
95% CI: 4.25-8.55


(0.042546459201334624, 0.064, 0.08545354079866538)

In [23]:
#ISRCTN x-reg results rate
isrctn_results = isrctn_xreg[(isrctn_xreg.isrctn_results_inc == 1)]
print(f'Of the {len(isrctn_xreg)} trials registered on the ISRCTN, {len(isrctn_results)} had results')
ci_calc(len(isrctn_results), len(isrctn_xreg))

Of the 32 trials registered on the ISRCTN, 2 had results
Proportion: 6.25%
95% CI: -2.14-14.64


(-0.021370016617978563, 0.0625, 0.14637001661797855)

In [24]:
#How many trials registered on all three registries
triple_reg = len(analysis_df[analysis_df.nct_id.notnull() & analysis_df.isrctn_id.notnull()])
print(f'{triple_reg} registered on all three registires')
ci_calc(triple_reg, len(analysis_df))

9 registered on all three registires
Proportion: 1.8%
95% CI: 0.63-2.97


(0.006346320615359283, 0.018, 0.029653679384640714)

## Results in the literature

In [25]:
#How many had results in the literature
journal_results = analysis_df[(analysis_df.journal_results_inc == 1)]
print(f'Out of {len(analysis_df)} trials on the EUCTR, {len(journal_results)} had results in the literature')
ci_calc(len(journal_results), len(analysis_df))

Out of 500 trials on the EUCTR, 293 had results in the literature
Proportion: 58.6%
95% CI: 54.28-62.92


(0.5428262226253018, 0.586, 0.6291737773746982)

In [26]:
#How did we find journal results?

journal_results.journal_source.value_counts()

ClinicalTrials.gov                                 117
Google Scholar                                      90
PubMed                                              46
EUCTR                                               28
ISRCTN                                               7
CSR on EUCTR and ClinicalTrials.gov                  1
Forward citation from review on Google scholar       1
Forward citation from ISRCTN articles                1
Citation checked located Cochrane review             1
Linked to in another document in Google Scholar      1
Name: journal_source, dtype: int64

## Summarizing results availability

In [27]:
#Getting lists of ids for all results types

all_trial_ids = set(analysis_df.euctr_id.to_list())
euctr_results_ids = set(euctr_results.euctr_id.to_list())
ctg_results_ids = set(ctg_results.euctr_id.to_list())
isrctn_results_ids = set(isrctn_results.euctr_id.to_list())
journal_results_ids = set(journal_results.euctr_id.to_list())

In [28]:
#Overall, inclusive of duplicates, how many results did we locate?
len(euctr_results_ids) + len(ctg_results_ids) + len(isrctn_results_ids) + len(journal_results_ids)

691

In [29]:
#How many had results anywhere?
results_nowhere = all_trial_ids - euctr_results_ids - ctg_results_ids - isrctn_results_ids - journal_results_ids
has_some_result = len(analysis_df) - len(results_nowhere)
#sense check
assert(len(results_nowhere) + has_some_result == 500)

print(f'{has_some_result} of {len(analysis_df)} trials had results somewhere')
ci_calc(has_some_result,len(analysis_df))

381 of 500 trials had results somewhere
Proportion: 76.2%
95% CI: 72.47-79.93


(0.7246718012864269, 0.762, 0.7993281987135731)

In [30]:
#What did enrollment look like for the trials with no results?
#This uses enrollment numbers from the manual data we collected on enrollment

no_results = analysis_df[analysis_df.euctr_id.isin(results_nowhere)].reset_index(drop=True)
no_results[['euctr_id']].merge(regression[['Trial ID', 'Enrollment']], how='left', left_on='euctr_id', right_on='Trial ID').Enrollment.sum()

33801

## What results were unique to each dissemination route?

In [31]:
#How many had results on just the EUCTR?
just_euctr = len(euctr_results_ids - ctg_results_ids - isrctn_results_ids - journal_results_ids)
print(f'{just_euctr} trials had results on just the EUCTR')
ci_calc(just_euctr, len(euctr_results))

54 trials had results on just the EUCTR
Proportion: 20.45%
95% CI: 15.59-25.32


(0.15588717957072745, 0.20454545454545456, 0.25320372952018166)

In [133]:
#What did the distribution of documents look like for unique results

just_euctr_ids = euctr_results_ids - ctg_results_ids - isrctn_results_ids - journal_results_ids
euctr_results[euctr_results.euctr_id.isin(just_euctr_ids)].euctr_results_format.value_counts()

CSR Synopsis                                 29
Tabular                                      15
Short Report                                  3
Tabular and CSR Synopsis                      2
Report                                        2
Notice of termination with low enrollment     1
Tabular and Report                            1
Tabular and Journal Article                   1
Name: euctr_results_format, dtype: int64

In [32]:
#How many had results on just on ClinicalTrials.gov?
just_ctg = len(ctg_results_ids - euctr_results_ids - isrctn_results_ids - journal_results_ids)
print(f'{just_ctg} trials had results on just ClinicalTrials.gov')
ci_calc(just_ctg,len(ctg_results))

3 trials had results on just ClinicalTrials.gov
Proportion: 2.27%
95% CI: -0.27-4.82


(-0.0026971240190091124, 0.022727272727272728, 0.048151669473554565)

In [33]:
#How many had results on just on the ISRCTN?
just_isrctn = len(isrctn_results_ids - euctr_results_ids - ctg_results_ids - journal_results_ids)
print(f'{just_isrctn} trials had results on just the ISRCTN')

0 trials had results on just the ISRCTN


In [34]:
#How many had results just in the literature?
just_pub = len(journal_results_ids - euctr_results_ids - ctg_results_ids - isrctn_results_ids)
print(f'{just_pub} trials had results only in a journal publication')
ci_calc(just_pub,len(journal_results))

107 trials had results only in a journal publication
Proportion: 36.52%
95% CI: 31.01-42.03


(0.31005580797466115, 0.3651877133105802, 0.42031961864649925)

In [35]:
#How many have no results on the EUCTR but results anywhere else?
not_euctr = analysis_df[(analysis_df.euctr_results_inc == 0) & ((analysis_df.ctgov_results_inc == 1) | 
                                                               (analysis_df.isrctn_results_inc == 1) | 
                                                               (analysis_df.journal_results_inc == 1))]
print(f'{len(not_euctr)} trials without EUCTR results had results somewhere else')
ci_calc(len(not_euctr), len(analysis_df))

117 trials without EUCTR results had results somewhere else
Proportion: 23.4%
95% CI: 19.69-27.11


(0.19688979249855912, 0.234, 0.2711102075014409)

In [36]:
#How many had results nowhere?
print(f'{len(results_nowhere)} trials had no results located')
ci_calc(len(results_nowhere), len(analysis_df))

119 trials had no results located
Proportion: 23.8%
95% CI: 20.07-27.53


(0.2006718012864269, 0.238, 0.27532819871357306)

## Getting data on combinations of results availability

We will visualise these in an upset chart in the paper

In [37]:
upset_plot_data = analysis_df[['euctr_results_inc', 'ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc']]

upset_plot_data.to_csv(parent + '/data/graphing_data/upset_data.csv')

# Data Quality, Completion Status, and Reporting

For overall population numbers, see the `Data Processing` notebooke

In [38]:
#Making a new DF for this population to investiage results availability by inferred and available completion dates

analysis_df_2 = analysis_df.merge(full_sample[['eudract_number', 'inferred']], 
                                  how='left', 
                                  left_on='euctr_id', right_on='eudract_number').drop('eudract_number', axis=1)

inferred = analysis_df_2[analysis_df_2.inferred == 1]
print(f'Inferred: {len(inferred)}; {round((len(inferred)/len(analysis_df_2)) * 100, 2)}%')
stated = analysis_df_2[analysis_df_2.inferred == 0]
print(f'Stated: {len(stated)}; {round((len(stated)/len(analysis_df_2)) * 100, 2)}%')

Inferred: 147; 29.4%
Stated: 353; 70.6%


In [39]:
#How many of the inferred ones had results anywhere?
inferred_res_sw = len(inferred[(inferred.euctr_results_inc == 1) | (inferred.ctgov_results_inc == 1) | (inferred.isrctn_results_inc == 1) | (inferred.journal_results_inc == 1)])
print(f'Inferred Dates with any results: {inferred_res_sw}')
print(f'Total inferred dates: {len(inferred)}')
ci_calc(inferred_res_sw, len(inferred))

Inferred Dates with any results: 70
Total inferred dates: 147
Proportion: 47.62%
95% CI: 39.55-55.69


(0.3954531334145431, 0.47619047619047616, 0.5569278189664093)

In [40]:
#How many of the extracted ones had results anywhere?
stated_res_sw = len(stated[(stated.euctr_results_inc == 1) | (stated.ctgov_results_inc == 1) | (stated.isrctn_results_inc == 1) | (stated.journal_results_inc == 1)])
print(f'Extracted with any results: {stated_res_sw}')
print(f'Total extracted dates: {len(stated)}')
ci_calc(stated_res_sw, len(stated))

Extracted with any results: 311
Total extracted dates: 353
Proportion: 88.1%
95% CI: 84.72-91.48


(0.8472445581989536, 0.8810198300283286, 0.9147951018577036)

In [41]:
#a is the number of trials with results
#b is the total number of trials

a = [inferred_res_sw, stated_res_sw]
b = [len(inferred),len(stated)]

stat, pval = proportions_ztest(a, b)
print(pval)

3.516998150004491e-22


In [42]:
#How many of the inferred ones had results somewhere else?
inferred_res_swe = len(inferred[(inferred.ctgov_results_inc == 1) | (inferred.isrctn_results_inc == 1) | (inferred.journal_results_inc == 1)])
print(f'Inferred with results outside the EUCTR: {inferred_res_swe}')
print(f'Total Inferred: {len(inferred)}')
ci_calc(inferred_res_swe, len(inferred))

Inferred with results outside the EUCTR: 70
Total Inferred: 147
Proportion: 47.62%
95% CI: 39.55-55.69


(0.3954531334145431, 0.47619047619047616, 0.5569278189664093)

In [43]:
#How many of the extracted ones had results somewhere else?
stated_res_swe = len(stated[(stated.ctgov_results_inc == 1) | (stated.isrctn_results_inc == 1) | (stated.journal_results_inc == 1)])
print(f'Extracted with results outside the EUCTR: {stated_res_swe}')
print(f'Total Extracted: {len(stated)}')
ci_calc(stated_res_swe, len(stated))

Extracted with results outside the EUCTR: 257
Total Extracted: 353
Proportion: 72.8%
95% CI: 68.16-77.45


(0.681626281828091, 0.7280453257790368, 0.7744643697299826)

In [136]:
a = [inferred_res_swe, stated_res_swe]
b = [len(inferred),len(stated)]

stat, pval = proportions_ztest(a, b)
print(pval)

6.903275707982106e-08


## Now we have to do this for each registry

In [45]:
#EUCTR

#Results posted to EUCTR by date of search - stated
stated_results_euctr = stated[(stated.euctr_results_inc == 1)]
print(f'Extracted with EUCTR results: {len(stated_results_euctr)}')
print(f'Total extracted: {len(stated)}')
ci_calc(len(stated_results_euctr), len(stated))

print('\n')

#Results posted to EUCTR by date of search - inferred
inferred_results_euctr = inferred[(inferred.euctr_results_inc == 1)]
print(f'Inferred with EUCTR results: {len(inferred_results_euctr)}')
print(f'Total inferred: {len(inferred)}')
ci_calc(len(inferred_results_euctr), len(inferred))

Extracted with EUCTR results: 264
Total extracted: 353
Proportion: 74.79%
95% CI: 70.26-79.32


Inferred with EUCTR results: 0
Total inferred: 147
Proportion: 0.0%
95% CI: 0.0-0.0


(0.0, 0.0, 0.0)

In [46]:
a = [len(inferred_results_euctr), len(stated_results_euctr)]
b = [len(inferred),len(stated)]

stat, pval = proportions_ztest(a, b)
print(pval)

1.376627481812372e-52


In [47]:
#CTG extracted dates
stated_ctg = stated[stated.nct_id.notnull()]
stated_ctg_results = stated_ctg[(stated_ctg.ctgov_results_inc == 1)]
print(f'Extracted with CTG results: {len(stated_ctg_results)}')
print(f'Total CTG extracted: {len(stated_ctg)}')
ci_calc(len(stated_ctg_results), len(stated_ctg))

print('\n')

#CTG inferred dates
inferred_ctg = inferred[inferred.nct_id.notnull()]
inferred_ctg_results = inferred_ctg[(inferred_ctg.ctgov_results_inc == 1)]
print(f'Inferred with CTG results: {len(inferred_ctg_results)}')
print(f'Total CTG inferred: {len(inferred_ctg)}')
ci_calc(len(inferred_ctg_results), len(inferred_ctg))

Extracted with CTG results: 130
Total CTG extracted: 270
Proportion: 48.15%
95% CI: 42.19-54.11


Inferred with CTG results: 2
Total CTG inferred: 69
Proportion: 2.9%
95% CI: -1.06-6.86


(-0.010599874223468034, 0.028985507246376812, 0.06857088871622166)

In [48]:
a = [len(inferred_ctg_results), len(stated_ctg_results)]
b = [len(inferred_ctg),len(stated_ctg)]

stat, pval = proportions_ztest(a, b)
print(pval)

6.011656192173383e-12


In [49]:
#isrctn extracted dates
stated_isrctn = stated[stated.isrctn_id.notnull()]
stated_isrctn_results = stated_isrctn[(stated_isrctn.isrctn_results_inc == 1)]
print(f'Extracted with ISRCTN results: {len(stated_isrctn_results)}')
print(f'Total ISRCTN extracted: {len(stated_isrctn)}')
ci_calc(len(stated_isrctn_results), len(stated_isrctn))

print('\n')

#isrctn inferred dates
inferred_isrctn = inferred[inferred.isrctn_id.notnull()]
inferred_isrctn_results = inferred_isrctn[(inferred_isrctn.isrctn_results_inc == 1)]
print(f'Inferred with ISRCTN results:{len(inferred_isrctn_results)}')
print(f'Total ISRCTN extracted: {len(inferred_isrctn)}')
ci_calc(len(inferred_isrctn_results), len(inferred_isrctn))

Extracted with ISRCTN results: 2
Total ISRCTN extracted: 29
Proportion: 6.9%
95% CI: -2.33-16.12


Inferred with ISRCTN results:0
Total ISRCTN extracted: 3
Proportion: 0.0%
95% CI: 0.0-0.0


(0.0, 0.0, 0.0)

In [50]:
a = [len(inferred_isrctn_results), len(stated_isrctn_results)]
b = [len(inferred_isrctn),len(stated_isrctn)]

stat, pval = proportions_ztest(a, b)
print(pval)

0.6385149374549752


In [51]:
#journal extracted dates
stated_journal = stated[(stated.journal_results_inc == 1)]
print(f'Extracted with results in a Journal: {len(stated_journal)}')
print(f'Total Extracted: {len(stated)}')
ci_calc(len(stated_journal), len(stated))

print('\n')

#journal inferred dates
inferred_journal = inferred[(inferred.journal_results_inc == 1)]
print(f'Inferred with results in a Journal: {len(inferred_journal)}')
print(f'Total Inferred: {len(inferred)}')
ci_calc(len(inferred_journal), len(inferred))

Extracted with results in a Journal: 224
Total Extracted: 353
Proportion: 63.46%
95% CI: 58.43-68.48


Inferred with results in a Journal: 69
Total Inferred: 147
Proportion: 46.94%
95% CI: 38.87-55.01


(0.38871035097043183, 0.46938775510204084, 0.5500651592336498)

In [52]:
a = [len(inferred_journal), len(stated_journal)]
b = [len(inferred),len(stated)]

stat, pval = proportions_ztest(a, b)
print(pval)

0.0006348653817690258


# Publication Date Analysis

In [53]:
date_df = analysis_df[['euctr_id', 'euctr_results', 'euctr_results_date', 'nct_id', 'ctgov_results', 'ctgov_results_date', 
             'isrctn_id', 'isrctn_results', 'isrctn_results_date', 'journal_result', 'journal_pub_date']].reset_index(drop=True)

earliest_results_date = dec_results.first_version_date.min()

In [54]:
#Sense checking to make sure there are no duplicate values
just_dates = date_df[['euctr_results_date','ctgov_results_date', 'isrctn_results_date', 'journal_pub_date']].reset_index(drop=True)
just_dates['test'] = just_dates.apply(check_dupes, axis=1)
just_dates.test.value_counts()

#There are no repeat dates so no need to worry about that.

False    500
Name: test, dtype: int64

In [55]:
#Getting rid of results dates from after we started searching
date_df['euctr_results_date'] = np.where(date_df['euctr_results_date'] > search_start_date, pd.NaT, date_df['euctr_results_date'])
date_df['euctr_results_date'] = pd.to_datetime(date_df['euctr_results_date'])

date_df['ctgov_results_date'] = np.where(date_df['ctgov_results_date'] > search_start_date, pd.NaT, date_df['ctgov_results_date'])
date_df['ctgov_results_date'] = pd.to_datetime(date_df['ctgov_results_date'])

date_df['isrctn_results_date'] = np.where(date_df['isrctn_results_date'] > search_start_date, pd.NaT, date_df['isrctn_results_date'])
date_df['isrctn_results_date'] = pd.to_datetime(date_df['isrctn_results_date'])

date_df['journal_pub_date'] = np.where(date_df['journal_pub_date'] > search_start_date, pd.NaT, date_df['journal_pub_date'])
date_df['journal_pub_date'] = pd.to_datetime(date_df['journal_pub_date'])

In [56]:
#Getting the earliest publication date for each trial
date_df['min_date'] = date_df[['euctr_results_date',
                               'ctgov_results_date', 
                               'isrctn_results_date', 
                               'journal_pub_date']].min(axis=1)

In [57]:
#Getting the total number of results available
date_df['results_counts'] = (date_df == 'Yes').T.sum()

## Pre-EUCTR Results Section trials

In [58]:
#Trials that had a result of some kind before the EUCTR results section launched
pre_euctr = date_df[(date_df.min_date < earliest_results_date)].reset_index(drop=True)

print(len(pre_euctr))

135


In [59]:
#How many of these went on the publish a result on the EUCTR

print(len(pre_euctr[pre_euctr.euctr_results == 'Yes']))

ci_calc(95,135)

94
Proportion: 70.37%
95% CI: 62.67-78.07


(0.6266759528652704, 0.7037037037037037, 0.780731454542137)

In [60]:
#Here we can extract where the earliest result was extracted for a given trial

conds = [pre_euctr.euctr_results_date == pre_euctr.min_date, 
         pre_euctr.ctgov_results_date == pre_euctr.min_date, 
         pre_euctr.isrctn_results_date == pre_euctr.min_date, 
         pre_euctr.journal_pub_date == pre_euctr.min_date]

out = ['EUCTR', 'CTgov', 'ISRCTN', 'Journal']

pre_euctr['earliest_results'] = np.select(conds, out, 'No Result')

In [61]:
#Lets now look at the distribution of where trials were first to report prior to the EUCTR
#This has to be limited to only trials that were also cross-registered on ClinicalTrials.gov to compare
#like with like
first_report_pre = pre_euctr[pre_euctr.nct_id.notnull()].earliest_results.value_counts()
first_report_pre

Journal    67
CTgov      31
Name: earliest_results, dtype: int64

In [62]:
#Can use this to get the CIs for those
#Journals
ci_calc(first_report_pre[0],first_report_pre.sum())

Proportion: 68.37%
95% CI: 59.16-77.57


(0.5915998519916292, 0.6836734693877551, 0.7757470867838809)

In [63]:
#CT gov
ci_calc(first_report_pre[1],first_report_pre.sum())

Proportion: 31.63%
95% CI: 22.43-40.84


(0.22425291321611912, 0.3163265306122449, 0.4084001480083707)

## Post-EUCTR Results Section Trials

In [64]:
#Make the sample
post_euctr = date_df[(date_df.min_date >= earliest_results_date)].reset_index(drop=True)

#Trials with any result after the launch of the EUCTR results section
print(len(post_euctr))

247


In [65]:
#How many of these ended up on the EUCTR at all
len(post_euctr[post_euctr.euctr_results == 'Yes'])

172

In [66]:
#And the CI for that
ci_calc(len(post_euctr[post_euctr.euctr_results == 'Yes']),len(post_euctr))

Proportion: 69.64%
95% CI: 63.9-75.37


(0.6390099628541973, 0.6963562753036437, 0.7537025877530902)

In [67]:
#Adding the earliest dissemination route

conds = [post_euctr.euctr_results_date == post_euctr.min_date, 
         post_euctr.ctgov_results_date == post_euctr.min_date, 
         post_euctr.isrctn_results_date == post_euctr.min_date, 
         post_euctr.journal_pub_date == post_euctr.min_date]

out = ['EUCTR', 'CTgov', 'ISRCTN', 'Journal']

post_euctr['earliest_results'] = np.select(conds, out, 'No Result')

In [68]:
first_report_post = post_euctr[post_euctr.nct_id.notnull()].earliest_results.value_counts()
first_report_post

Journal    89
EUCTR      83
CTgov      20
Name: earliest_results, dtype: int64

In [69]:
first_report_post.sum()

192

In [70]:
#Journal CIs
ci_calc(first_report_post[0], first_report_post.sum())

Proportion: 46.35%
95% CI: 39.3-53.41


(0.39300452712164496, 0.4635416666666667, 0.5340788062116884)

In [71]:
#EUCTR CIs
ci_calc(first_report_post[1], first_report_post.sum())

Proportion: 43.23%
95% CI: 36.22-50.24


(0.36221772884531195, 0.4322916666666667, 0.5023656044880214)

In [72]:
#CTG CIs
ci_calc(first_report_post[2], first_report_post.sum())

Proportion: 10.42%
95% CI: 6.1-14.74


(0.060956747926332336, 0.10416666666666667, 0.147376585407001)

In [142]:
#What about trials not on ClinicalTrials.gov.
#We can ignore the trial with the earliest ISRCTN result here
first_pub_no_ctg = post_euctr[(post_euctr.nct_id.isna())].earliest_results.value_counts()
first_pub_no_ctg

Journal    31
EUCTR      23
ISRCTN      1
Name: earliest_results, dtype: int64

In [143]:
#CI for journal
ci_calc(first_pub_no_ctg[0],first_pub_no_ctg.sum() - 1)

Proportion: 57.41%
95% CI: 44.22-70.6


(0.44218458175100844, 0.5740740740740741, 0.7059635663971398)

In [144]:
#CI for EUCTR
ci_calc(first_pub_no_ctg[1],first_pub_no_ctg.sum()- 1)

Proportion: 42.59%
95% CI: 29.4-55.78


(0.2940364336028603, 0.42592592592592593, 0.5578154182489916)

In [76]:
#CI for ISRCTN
ci_calc(first_pub_no_ctg[2],first_pub_no_ctg.sum())

Proportion: 1.82%
95% CI: -1.71-5.35


(-0.017129092394234546, 0.01818181818181818, 0.05349272875787091)

## Data for Start Year Figure

Here we just get the data we would need and export it. Figures are made in a separate notebook.

In [77]:
graphing_df = analysis_df[['euctr_id', 
                           'euctr_results_inc', 
                           'any_results_inc']].merge(regression[['Trial ID', 
                                                                 'Trial Start Year']], 
                                                    how='left', left_on='euctr_id', right_on='Trial ID').drop('Trial ID', axis=1)

graphing_df.to_csv(parent + '/data/graphing_data/start_year_data.csv')

# Reporting of Trial IDs

Might need to re-adjust this so that only things eligible to have that ID (i.e. registered there) are in the denom

In [78]:
trial_id_df = analysis_df[['euctr_id', 'nct_id', 'isrctn_id', 'journal_results_inc', 'journal_reg_numbers']].reset_index(drop=True)

In [79]:
reg_id_df = trial_id_df[trial_id_df.journal_results_inc == 1].journal_reg_numbers.value_counts(dropna=False).to_frame().reset_index()

In [181]:
#How many EUCTR/Publication pairs had an EUCTR ID

euctr_pub_ids = trial_id_df[(trial_id_df.journal_results_inc == 1) & (trial_id_df.euctr_id.notnull())]
print(f'There are {len(euctr_pub_ids)} trials with an EUCTR registration and a matched publication')
print(f'Below are the ones with a Trial ID excluding the {euctr_pub_ids.journal_reg_numbers.value_counts()["None"]} with no ID')
euctr_id_match = euctr_pub_ids[euctr_pub_ids.journal_reg_numbers != 'None'].journal_reg_numbers.value_counts()
euctr_id_match

There are 293 trials with an EUCTR registration and a matched publication
Below are the ones with a Trial ID excluding the 58 with no ID


ClinicalTrials.gov                   157
EUCTR/EudraCT                         33
EUCTR/EudraCT, ClinicalTrials.gov     27
ISRCTN                                 7
EUCTR/EudraCT, ISRCTN                  6
Other registration number              3
ClinicalTrials.gov, ISRCTN             2
Name: journal_reg_numbers, dtype: int64

In [182]:
#Stats on number containing an EUCTR ID
summarizer(euctr_id_match.filter(like='EUCTR/EudraCT').sum(), len(euctr_pub_ids))

Outcome of Interest: 66
Total: 293
Proportion: 22.53%
95% CI: 17.74-27.31


(0.17742166011265287, 0.22525597269624573, 0.2730902852798386)

In [177]:
#How many CTG/Publication pairs had an NCT ID

ctg_pub_ids = trial_id_df[(trial_id_df.journal_results_inc == 1) & (trial_id_df.nct_id.notnull())]
print(f'There are {len(ctg_pub_ids)} trials with a ClinicalTrials.gov registration and a matched publication')
print(f'Below are the ones with a Trial ID excluding the {ctg_pub_ids.journal_reg_numbers.value_counts()["None"]} with no ID')
ctg_id_match = ctg_pub_ids[ctg_pub_ids.journal_reg_numbers != 'None'].journal_reg_numbers.value_counts()
ctg_id_match

There are 222 trials with a ClinicalTrials.gov registration and a matched publication
Below are the ones with a Trial ID excluding the 25 with no ID


ClinicalTrials.gov                   157
EUCTR/EudraCT, ClinicalTrials.gov     26
EUCTR/EudraCT                         11
ClinicalTrials.gov, ISRCTN             2
ISRCTN                                 1
Name: journal_reg_numbers, dtype: int64

In [184]:
#Stats on number containing an NCT ID
summarizer(ctg_id_match.filter(like='ClinicalTrials.gov').sum(), len(ctg_pub_ids))

Outcome of Interest: 185
Total: 222
Proportion: 83.33%
95% CI: 78.43-88.24


(0.7843088149430074, 0.8333333333333334, 0.8823578517236593)

In [179]:
#How many EUCTR/Publication pairs had an ISRCTN ID

isrctn_pub_ids = trial_id_df[(trial_id_df.journal_results_inc == 1) & (trial_id_df.isrctn_id.notnull())]
print(f'There are {len(isrctn_pub_ids)} trials with an ISRCTN registration and a matched publication')
print(f'Below are the ones with a Trial ID excluding the {isrctn_pub_ids.journal_reg_numbers.value_counts()["None"]} with no ID')
isrctn_id_match = isrctn_pub_ids[isrctn_pub_ids.journal_reg_numbers != 'None'].journal_reg_numbers.value_counts()
isrctn_id_match

There are 24 trials with an ISRCTN registration and a matched publication
Below are the ones with a Trial ID excluding the 3 with no ID


ISRCTN                               7
EUCTR/EudraCT, ISRCTN                6
ClinicalTrials.gov                   3
EUCTR/EudraCT, ClinicalTrials.gov    2
ClinicalTrials.gov, ISRCTN           2
EUCTR/EudraCT                        1
Name: journal_reg_numbers, dtype: int64

In [186]:
#Stats on number containing an ISRCTN ID
summarizer(isrctn_id_match.filter(like='ISRCTN').sum(), len(isrctn_pub_ids))

Outcome of Interest: 15
Total: 24
Proportion: 62.5%
95% CI: 43.13-81.87


(0.43131049331468674, 0.625, 0.8186895066853133)

# Exploratory Analayses

In [86]:
#Creating the exploratory analysis dataset through merging a few different DFs 
#and aligning the columns for ease of use.

exploratory_final = analysis_df[['euctr_id', 'euctr_results_inc', 'ctgov_results_inc', 'isrctn_results_inc', 
                                 'journal_results_inc', 'any_results_inc', 'nct_id', 'isrctn_id', 
                                 'journal_result']].merge(full_sample[['eudract_number', 
                                                                  'inferred']], 
                                                          how='left', 
                                                          left_on='euctr_id', 
                                                          right_on='eudract_number')

exploratory_final = exploratory_final.merge(regression, 
                                            how='left', 
                                            left_on='euctr_id', 
                                            right_on='Trial ID').drop(['eudract_number', 
                                                                       'Timestamp', 
                                                                       'Notes', 
                                                                       'Trial ID'], axis=1)

exploratory_final = exploratory_final.merge(other_reg_data, 
                                            how='left', 
                                            left_on='euctr_id', 
                                            right_on='trial_id').drop(['Unnamed: 0', 'trial_id'], axis=1)

exploratory_final.columns = ['euctr_id', 'euctr_results_inc', 'ctgov_results_inc', 'isrctn_results_inc', 
                             'journal_results_inc', 'any_results_inc', 'nct_id', 'isrctn_id', 'journal_result', 
                             'inferred', 'trial_start_yr', 'enrollment', 'location', 'sponsor_status', 
                             'protocol_country', 'sponsor_country']

In [87]:
exploratory_final.head()

Unnamed: 0,euctr_id,euctr_results_inc,ctgov_results_inc,isrctn_results_inc,journal_results_inc,any_results_inc,nct_id,isrctn_id,journal_result,inferred,trial_start_yr,enrollment,location,sponsor_status,protocol_country,sponsor_country
0,2014-003401-15,1,1,0,1,1,NCT02269488,,Yes,0,2014,100,Non-EEA,Commercial,0.0,Japan
1,2011-001616-57,1,0,0,1,1,NCT01403636,,Yes,0,2011,167,EEA and Non-EEA,Commercial,2.0,France
2,2004-002743-27,1,1,0,1,1,NCT00630747,,Yes,0,2004,94,EEA and Non-EEA,Commercial,5.0,United States
3,2013-003561-34,1,1,0,1,1,NCT01980628,,Yes,0,2013,63,EEA and Non-EEA,Commercial,3.0,United States
4,2006-001414-33,1,0,0,1,1,,ISRCTN70127774,Yes,0,2006,60,EEA Only,Non-Commercial,1.0,United Kingdom


In [88]:
exploratory_final[exploratory_final.enrollment.isna()]

Unnamed: 0,euctr_id,euctr_results_inc,ctgov_results_inc,isrctn_results_inc,journal_results_inc,any_results_inc,nct_id,isrctn_id,journal_result,inferred,trial_start_yr,enrollment,location,sponsor_status,protocol_country,sponsor_country


Run the next two cells on the relevant variables in `exploratory_final` to get data for Table 1 of the paper.

#We will run `.describe()` on `enrollment` and `protocol_country`

#We will run `.value_counts()` on `sponsor_status`,`location`, and `trial_start_yr`

In [106]:
exploratory_final.protocol_country.describe()

count    500.000000
mean       2.348000
std        2.550275
min        0.000000
25%        1.000000
50%        1.000000
75%        3.000000
max       16.000000
Name: protocol_country, dtype: float64

In [111]:
exploratory_final.trial_start_yr.value_counts().sort_index()

1999     1
2002     1
2003     1
2004    14
2005    28
2006    49
2007    43
2008    60
2009    47
2010    46
2011    49
2012    51
2013    35
2014    33
2015    21
2016    17
2017     3
2018     1
Name: trial_start_yr, dtype: int64

## Analysis 1: Regression

In [92]:
#Taking only what we need:
regression_final = exploratory_final[['euctr_id', 'euctr_results_inc', 'any_results_inc', 'inferred', 
                                      'trial_start_yr', 'enrollment', 'location', 'sponsor_status', 
                                      'protocol_country']].reset_index(drop=True)

regression_final = regression_final[regression_final.any_results_inc == 1].reset_index(drop=True)

regression_final = regression_final.join(pd.get_dummies(regression_final[['location', 'sponsor_status']]), how='left')

In [93]:
regression_final.location.value_counts()

EEA Only           216
EEA and Non-EEA    151
Non-EEA             14
Name: location, dtype: int64

In [94]:
y_reg = regression_final['euctr_results_inc'].reset_index(drop=True)
x_reg = regression_final[['inferred', 'trial_start_yr', 'enrollment',
                         'protocol_country', 'location_EEA and Non-EEA', 'location_Non-EEA', 
                         'sponsor_status_Commercial']].reset_index(drop=True)

In [95]:
simple_logistic_regression(y_reg, x_reg)

         Current function value: 0.300646
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:      euctr_results_inc   No. Observations:                  381
Model:                          Logit   Df Residuals:                      373
Method:                           MLE   Df Model:                            7
Date:                Fri, 24 Feb 2023   Pseudo R-squ.:                  0.5125
Time:                        13:14:42   Log-Likelihood:                -114.55
converged:                      False   LL-Null:                       -234.98
Covariance Type:            nonrobust   LLR p-value:                 2.426e-48
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
inferred                    -43.9988   1.17e+05     -0.000      1.000    -2.3e+05     2.3e+05
trial_start_yr      

  result = func(self.values, **kwargs)


Unnamed: 0,OR,2.5%,97.5%,p_value
inferred,0.0,0.0,inf,0.9997
trial_start_yr,1.12,1.0,1.24,0.04123
enrollment,1.0,1.0,1.0,0.38933
protocol_country,1.1,0.92,1.31,0.29338
location_EEA and Non-EEA,1.01,0.4,2.51,0.98723
location_Non-EEA,24323130000.0,0.0,inf,0.99984
sponsor_status_Commercial,4.64,2.06,10.44,0.00021
cons,0.0,0.0,0.0,0.04175


If we run the regression per protocol it will not converge because, as shown earlier, no trials with inferred completion dates have a result on the EUCTR making that a perfect predictor. I will therefore remove this from the regression as it is a derived variable anyway.

In [96]:
#Re-running the regression without the "inferred" variable
y_reg1 = regression_final['euctr_results_inc'].reset_index(drop=True)
x_reg1 = regression_final[['trial_start_yr', 'enrollment',
                         'protocol_country', 'location_EEA and Non-EEA', 'location_Non-EEA', 
                         'sponsor_status_Commercial']].reset_index(drop=True)

In [97]:
simple_logistic_regression(y_reg1, x_reg1)

Optimization terminated successfully.
         Current function value: 0.452351
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:      euctr_results_inc   No. Observations:                  381
Model:                          Logit   Df Residuals:                      374
Method:                           MLE   Df Model:                            6
Date:                Fri, 24 Feb 2023   Pseudo R-squ.:                  0.2666
Time:                        13:14:42   Log-Likelihood:                -172.35
converged:                       True   LL-Null:                       -234.98
Covariance Type:            nonrobust   LLR p-value:                 1.271e-24
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
trial_start_yr                0.0798      0.042      1.882      0.060      -0.

Unnamed: 0,OR,2.5%,97.5%,p_value
trial_start_yr,1.08,1.0,1.18,0.05984
enrollment,1.0,1.0,1.0,0.60377
protocol_country,1.2,1.0,1.44,0.05426
location_EEA and Non-EEA,1.15,0.53,2.5,0.71838
location_Non-EEA,3.11,0.37,26.26,0.29745
sponsor_status_Commercial,9.41,4.89,18.11,0.0
cons,0.0,0.0,367.38,0.05867


Check univariable ORs here with any of these variables:

`trial_start_yr`, `enrollment`, `protocol_country`, `location_EEA and Non-EEA`, `location_Non-EEA`, `sponsor_status_Commercial`

In [190]:
x_regu = regression_final[['location_EEA and Non-EEA', 'location_Non-EEA']].reset_index(drop=True)

simple_logistic_regression(y_reg1, x_regu)

Optimization terminated successfully.
         Current function value: 0.549436
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:      euctr_results_inc   No. Observations:                  381
Model:                          Logit   Df Residuals:                      378
Method:                           MLE   Df Model:                            2
Date:                Fri, 24 Feb 2023   Pseudo R-squ.:                  0.1091
Time:                        17:03:52   Log-Likelihood:                -209.33
converged:                       True   LL-Null:                       -234.98
Covariance Type:            nonrobust   LLR p-value:                 7.274e-12
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
location_EEA and Non-EEA     1.7340      0.281      6.172      0.000       1.183

Unnamed: 0,OR,2.5%,97.5%,p_value
location_EEA and Non-EEA,5.66,3.27,9.82,0.0
location_Non-EEA,10.6,1.36,82.44,0.02412
cons,1.23,0.94,1.6,0.1351


In [99]:
#Holm-Bonferroni corrected thresholds
print(.05 / (7 - 1 + 1))
print(.05 / (7 - 2 + 1))
print(.05 / (7 - 3 + 1))
print(.05 / (7 - 4 + 1))
print(.05 / (7 - 5 + 1))

0.0071428571428571435
0.008333333333333333
0.01
0.0125
0.016666666666666666


# Analysis 2: Sponsor Country
Each trial will be assigned a “sponsor country” based on the most frequent sponsor country assigned in the EUCTR country protocols. A protocol of a specific country need not contain a sponsor from that country. If no single country appears most frequently, the trial will be coded as having “multi-country” sponsorship. The percent of trials reported to the EUCTR, other registries, and the literature will be reported for each unique sponsor country in the sample.

In [100]:
spon_country = exploratory_final[['euctr_id', 'nct_id', 'isrctn_id', 'journal_result', 'euctr_results_inc', 
                                  'ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc', 
                                  'any_results_inc', 'sponsor_country']].reset_index(drop=True)

In [193]:
#First for the EUCTR
spon_country_reporting = crosstab(spon_country, 'euctr_results_inc', 'sponsor_country').reset_index()
spon_country_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']
spon_country_reporting['prct_reported'] = round((spon_country_reporting.reported / spon_country_reporting['all'])*100,2)
spon_country_reporting.sort_values(by='all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
26,All,236,264,500,52.8
25,United States,19,49,68,72.06
24,United Kingdom,15,46,61,75.41
8,Germany,29,30,59,50.85
7,France,21,28,49,57.14
12,Italy,38,8,46,17.39
20,Spain,31,5,36,13.89
16,Netherlands,25,9,34,26.47
22,Switzerland,4,25,29,86.21
5,Denmark,8,14,22,63.64


In [194]:
#Now for the other dissemination routes

#CTG
ct_gov_trials = spon_country[spon_country.nct_id.notnull()].reset_index(drop=True)
ctg_reporting = crosstab(ct_gov_trials, 'ctgov_results_inc', 'sponsor_country').reset_index()
ctg_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']
ctg_reporting['prct_reported'] = round((ctg_reporting.reported / ctg_reporting['all'])*100,2)
ctg_reporting.sort_values(by='all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
23,All,207,132,339,38.94
22,United States,24,41,65,63.08
8,Germany,34,7,41,17.07
7,France,28,11,39,28.21
21,United Kingdom,18,14,32,43.75
19,Switzerland,8,19,27,70.37
17,Spain,20,0,20,0.0
14,Netherlands,15,4,19,21.05
5,Denmark,12,4,16,25.0
10,Italy,13,3,16,18.75


In [103]:
#ISRCTN
isrctn_trials = spon_country[spon_country.isrctn_id.notnull()].reset_index(drop=True)
isrctn_reporting = crosstab(isrctn_trials, 'isrctn_results_inc', 'sponsor_country').reset_index()
isrctn_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']
isrctn_reporting['prct_reported'] = (isrctn_reporting.reported / isrctn_reporting['all']) * 100
isrctn_reporting.sort_values(by='all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
7,All,30,2,32,6.25
6,United Kingdom,20,1,21,4.761905
4,Germany,3,0,3,0.0
5,Netherlands,2,1,3,33.333333
0,Austria,2,0,2,0.0
1,Canada,1,0,1,0.0
2,Denmark,1,0,1,0.0
3,France,1,0,1,0.0


In [191]:
#Journal Reporting
journal_reporting = crosstab(spon_country, 'journal_results_inc', 'sponsor_country').reset_index()
journal_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']
journal_reporting['prct_reported'] = round((journal_reporting.reported / journal_reporting['all'])*100,2)
journal_reporting.sort_values(by='all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
26,All,207,293,500,58.6
25,United States,23,45,68,66.18
24,United Kingdom,16,45,61,73.77
8,Germany,28,31,59,52.54
7,France,27,22,49,44.9
12,Italy,25,21,46,45.65
20,Spain,19,17,36,47.22
16,Netherlands,10,24,34,70.59
22,Switzerland,10,19,29,65.52
5,Denmark,10,12,22,54.55


In [192]:
#Any Reporting
any_reporting = crosstab(spon_country, 'any_results_inc', 'sponsor_country').reset_index()
any_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']
any_reporting['prct_reported'] = round((any_reporting.reported / any_reporting['all'])*100,2)
any_reporting.sort_values(by='all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
26,All,119,381,500,76.2
25,United States,11,57,68,83.82
24,United Kingdom,6,55,61,90.16
8,Germany,15,44,59,74.58
7,France,13,36,49,73.47
12,Italy,21,25,46,54.35
20,Spain,18,18,36,50.0
16,Netherlands,8,26,34,76.47
22,Switzerland,1,28,29,96.55
5,Denmark,6,16,22,72.73
