In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportions_ztest, proportions_chisquare
import statsmodels.api as sm
from scipy.stats import median_test
import matplotlib.pyplot as plt

In [2]:
import sys
from pathlib import Path
import os
# Make the parent of the current working directory importable so that the
# project-local `lib` package (imported in the next cell) can be found.
cwd = os.getcwd()
parent = str(Path(cwd).parents[0])
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if parent not in sys.path:
    sys.path.append(parent)

In [3]:
#importing custom functions for analysis

from lib.functions import ci_calc, z_test, summarizer, check_dupes, simple_logistic_regression, crosstab

#ci_calc will compute a simple confidence interval around a proportion
#summarizer returns a nice output of proportions and CIs
#check_dupes helps with the date analysis
#simple_logistic_regression and crosstab do what they say on the tin

# Data Loading and Setup

In [4]:
#Read every input file and build the raw frames the rest of the notebook works from.

#Primary dataset; the four results-date columns are converted to datetimes
df = pd.read_csv(parent + '/data/final_dataset/final_dataset.csv')
for date_col in ('euctr_results_date', 'ctgov_results_date', 'isrctn_results_date', 'journal_pub_date'):
    df[date_col] = pd.to_datetime(df[date_col])

#Manually collected sponsor data for regression
regression = pd.read_csv(parent + '/data/additional_data/manual_reg_data.csv')

#Regression data derived from the EUCTR
other_reg_data = pd.read_csv(parent + '/data/additional_data/spon_country_data.csv')

#The original sample and its replacements, stacked into one frame (carries the inferred-date flag)
sample = pd.read_csv(parent + '/data/samples/euctr_search_sample_final.csv')
replacements = pd.read_csv(parent + '/data/samples/replacement_sample.csv')
full_sample = pd.concat([sample, replacements])

#The results-section scrape; first_version_date is the one column we need as a datetime
dec_results = pd.read_csv(parent + '/data/source_data/' + 'euctr_data_quality_results_scrape_dec_2020.csv.zip')
dec_results['first_version_date'] = pd.to_datetime(dec_results['first_version_date'])

In [5]:
#Reference dates for the searches, held as pandas Timestamps so they compare directly with date columns
search_start_date = pd.Timestamp('2020-12-11')
primary_search_completion_date = pd.Timestamp('2021-07-22')
last_search_any = pd.Timestamp('2023-08-16')

# Analysis Prep

## Detail the exclusions and get the inferred status of the final sample

In [6]:
#Trials flagged as replaced, tabulated by the recorded reason for replacement
exclusions = df.loc[df['replaced'] == 1]
exclusions['replaced_reason'].value_counts()

Withdrawn             16
Ongoing                3
No protocol access     2
Observational          1
Name: replaced_reason, dtype: int64

In [7]:
#Attach the `inferred` flag from the sampling frames to every trial that was not replaced
kept_trial_ids = df.loc[df['replaced'].isna(), ['euctr_id']]
sample_inferred_status = kept_trial_ids.merge(full_sample[['eudract_number', 'inferred']],
                                              how='left',
                                              left_on='euctr_id',
                                              right_on='eudract_number')

In [8]:
#Counts of inferred (1) vs stated (0) completion dates among the final sample
sample_inferred_status['inferred'].value_counts()

0    354
1    146
Name: inferred, dtype: int64

In [9]:
#Same lookup as for the final sample, but for the trials that had to be replaced
inferred_lookup = full_sample[['eudract_number', 'inferred']]
exclusions_inferred_status = exclusions[['euctr_id']].merge(inferred_lookup,
                                                            how='left',
                                                            left_on='euctr_id',
                                                            right_on='eudract_number')

In [10]:
#Counts of inferred (1) vs stated (0) completion dates among the replaced trials
exclusions_inferred_status['inferred'].value_counts()

0    15
1     7
Name: inferred, dtype: int64

## Setting up the analysis dataset

In [11]:
#The analysis frame is every trial not flagged as replaced, with a fresh 0..n-1 index.
analysis_df = df.loc[df['replaced'] != 1].reset_index(drop=True)

#Sanity check: the final dataset must contain the expected number of trials
assert len(analysis_df) == 500

In [12]:
#Binary flags marking whether a recorded result counts towards the analysis.
#Per protocol a result only counts if it was available on or before the date we began searching,
#so each flag is 1 when the source says 'Yes' AND its date is <= search_start_date, else 0.
#Building the flags once means the cutoff never has to be re-applied further down.

#flag column -> (availability column, date column)
inclusion_sources = {
    'euctr_results_inc': ('euctr_results', 'euctr_results_date'),      #EUCTR
    'ctgov_results_inc': ('ctgov_results', 'ctgov_results_date'),      #ClinicalTrials.gov
    'isrctn_results_inc': ('isrctn_results', 'isrctn_results_date'),   #ISRCTN
    'journal_results_inc': ('journal_result', 'journal_pub_date'),     #journal publication
}

for inc_col, (available_col, date_col) in inclusion_sources.items():
    is_available = analysis_df[available_col] == 'Yes'
    before_cutoff = analysis_df[date_col] <= search_start_date
    analysis_df[inc_col] = np.where(is_available & before_cutoff, 1, 0)

#A catch-all: 1 when any of the four routes has an included result
has_any_route = analysis_df[list(inclusion_sources)].eq(1).any(axis=1)
analysis_df['any_results_inc'] = np.where(has_any_route, 1, 0)

In [13]:
#Exporting the analysis dataset so it can be used elsewhere.
#No `index` argument is passed, so pandas' default applies and the row index
#is written as the first (unnamed) column of the CSV.

analysis_df.to_csv(parent + '/data/final_dataset/' + 'analysis_df.csv')

# Main Analysis

## Results on the EUCTR

In [16]:
#Trials with an included EUCTR result, as a share of the whole analysis sample
has_euctr_result = analysis_df['euctr_results_inc'] == 1
euctr_results = analysis_df.loc[has_euctr_result]
total_found_euctr = len(euctr_results)
summarizer(total_found_euctr, len(analysis_df))

Outcome of Interest: 266
Total: 500
Proportion: 53.2%
95% CI: 48.83-57.57


(0.48826291729893273, 0.532, 0.5757370827010673)

In [17]:
#Breakdown of the formats in which EUCTR results were provided
euctr_results['euctr_results_format'].value_counts()

Tabular                        117
CSR Synopsis                    88
ClinicalTrials.gov Results      24
Journal Article                  7
Tabular and Journal Article      7
Short Report                     6
Tabular and CSR Synopsis         4
Tabular and Short Report         4
Report                           4
Notice of no analysis            4
Tabular and Report               1
Name: euctr_results_format, dtype: int64

In [18]:
#Percentages and 95% CIs for each EUCTR results format.
#The frame is built explicitly from the value_counts index/values rather than via
#.to_frame(): the column name produced by value_counts() differs between pandas
#versions (the source column's name before 2.0, 'count' from 2.0 on), and this cell
#and later cells need the counts under the name 'euctr_results_format'.
format_counts = euctr_results.euctr_results_format.value_counts()
results_types = pd.DataFrame({'results_type': format_counts.index,
                              'euctr_results_format': format_counts.to_numpy()})

results_types['percent'] = ((results_types.euctr_results_format / total_found_euctr) * 100).round(2)

#ci_calc returns a tuple; element 0 is used as the lower bound and element 2 as the upper bound
format_cis = [ci_calc(n_format, total_found_euctr, printer=False)
              for n_format in results_types.euctr_results_format.to_list()]
ci_lower = [round(ci[0] * 100, 2) for ci in format_cis]
ci_upper = [round(ci[2] * 100, 2) for ci in format_cis]

results_types['ci_lower'] = ci_lower
results_types['ci_upper'] = ci_upper

results_types

Unnamed: 0,results_type,euctr_results_format,percent,ci_lower,ci_upper
0,Tabular,117,43.98,38.02,49.95
1,CSR Synopsis,88,33.08,27.43,38.74
2,ClinicalTrials.gov Results,24,9.02,5.58,12.47
3,Journal Article,7,2.63,0.71,4.56
4,Tabular and Journal Article,7,2.63,0.71,4.56
5,Short Report,6,2.26,0.47,4.04
6,Tabular and CSR Synopsis,4,1.5,0.04,2.97
7,Tabular and Short Report,4,1.5,0.04,2.97
8,Report,4,1.5,0.04,2.97
9,Notice of no analysis,4,1.5,0.04,2.97


In [19]:
#Now we want to group like with like to get our final descriptive stats to report

#Total with just tabular results.
#Selected by label and summed (as the other cells do) instead of indexing with [0]:
#[0] is a label lookup on the row index and only works while 'Tabular' happens to be
#the most frequent format and therefore sits in row 0.
tabular_only_total = results_types[results_types.results_type == 'Tabular'].euctr_results_format.sum()
summarizer(tabular_only_total, total_found_euctr)

Outcome of Interest: 117
Total: 266
Proportion: 43.98%
95% CI: 38.02-49.95


(0.38019835583519473, 0.4398496240601504, 0.499500892285106)

In [20]:
#Formats that consist of a document only (no tabular results), and their combined share
doc_types = [
    'CSR Synopsis',
    'ClinicalTrials.gov Results',
    'Journal Article',
    'Short Report',
    'Report',
    'Notice of no analysis',
]
is_doc_only = results_types['results_type'].isin(doc_types)
doc_only_total = results_types.loc[is_doc_only, 'euctr_results_format'].sum()
summarizer(doc_only_total, total_found_euctr)

Outcome of Interest: 133
Total: 266
Proportion: 50.0%
95% CI: 43.99-56.01


(0.43991234473047336, 0.5, 0.5600876552695266)

In [21]:
#One summary per document-only format, each out of all trials with EUCTR results
for doc in doc_types:
    print(doc)
    doc_total = results_types.loc[results_types['results_type'] == doc, 'euctr_results_format'].sum()
    summarizer(doc_total, total_found_euctr)
    print('\n')

CSR Synopsis
Outcome of Interest: 88
Total: 266
Proportion: 33.08%
95% CI: 27.43-38.74


ClinicalTrials.gov Results
Outcome of Interest: 24
Total: 266
Proportion: 9.02%
95% CI: 5.58-12.47


Journal Article
Outcome of Interest: 7
Total: 266
Proportion: 2.63%
95% CI: 0.71-4.56


Short Report
Outcome of Interest: 6
Total: 266
Proportion: 2.26%
95% CI: 0.47-4.04


Report
Outcome of Interest: 4
Total: 266
Proportion: 1.5%
95% CI: 0.04-2.97


Notice of no analysis
Outcome of Interest: 4
Total: 266
Proportion: 1.5%
95% CI: 0.04-2.97




In [22]:
#Reports: the 'Short Report' and 'Report' formats combined.
#Counted from results_types rather than typed in by hand, so the figure follows the data.

report_formats = ['Short Report', 'Report']
report_total = results_types[results_types.results_type.isin(report_formats)].euctr_results_format.sum()
ci_calc(int(report_total), total_found_euctr)

Proportion: 3.76%
95% CI: 1.47-6.05


(0.014735175950589725, 0.03759398496240601, 0.060452793974222305)

In [23]:
#Tab and Document: every combined format, i.e. those labelled 'Tabular and <document>'.
#Counted from results_types rather than typed in by hand, so the figure follows the data.

is_tab_and_doc = results_types.results_type.str.startswith('Tabular and ')
tab_and_doc_total = results_types[is_tab_and_doc].euctr_results_format.sum()
ci_calc(int(tab_and_doc_total), total_found_euctr)

Proportion: 6.02%
95% CI: 3.16-8.87


(0.03157686467507926, 0.06015037593984962, 0.08872388720461999)

In [24]:
#Tab and CSR and Tab and Report, combined.
#Counted from results_types rather than typed in by hand, so the figure follows the data.

tab_csr_report_formats = ['Tabular and CSR Synopsis', 'Tabular and Report']
tab_csr_report_total = results_types[results_types.results_type.isin(tab_csr_report_formats)].euctr_results_format.sum()
ci_calc(int(tab_csr_report_total), total_found_euctr)

Proportion: 1.88%
95% CI: 0.25-3.51


(0.0024762890839276873, 0.018796992481203006, 0.03511769587847832)

In [25]:
#Everything that is neither document-only nor tabular-only, i.e. both Tabular and Document results
single_format_types = doc_types + ['Tabular']
is_combined = ~results_types['results_type'].isin(single_format_types)
summarizer(results_types.loc[is_combined, 'euctr_results_format'].sum(), total_found_euctr)

Outcome of Interest: 16
Total: 266
Proportion: 6.02%
95% CI: 3.16-8.87


(0.03157686467507926, 0.06015037593984962, 0.08872388720461999)

## Cross-Registration and results availability on other registries

In [26]:
#First we look at trials registered only on the EUCTR (no NCT ID and no ISRCTN ID)
lacks_nct = analysis_df['nct_id'].isna()
lacks_isrctn = analysis_df['isrctn_id'].isna()
euctr_only_reg = analysis_df[lacks_nct & lacks_isrctn]
print(f'Out of {len(analysis_df)} trials {len(euctr_only_reg)} were only on the EUCTR')
ci_calc(len(euctr_only_reg), len(analysis_df))

Out of 500 trials 138 were only on the EUCTR
Proportion: 27.6%
95% CI: 23.68-31.52


(0.23681725179623056, 0.276, 0.3151827482037695)

In [27]:
#Cross-registration on ClinicalTrials.gov: any trial that carries an NCT ID
ctg_xreg = analysis_df.dropna(subset=['nct_id'])
print(f'Out of {len(analysis_df)} trials {len(ctg_xreg)} were cross-registered on ClinicalTrials.gov')
ci_calc(len(ctg_xreg),len(analysis_df))

Out of 500 trials 339 were cross-registered on ClinicalTrials.gov
Proportion: 67.8%
95% CI: 63.7-71.9


(0.6370443472619469, 0.678, 0.7189556527380532)

In [28]:
#Results rate among the ClinicalTrials.gov cross-registrations
ctg_results = ctg_xreg.loc[ctg_xreg['ctgov_results_inc'].eq(1)]
print(f'Of the {len(ctg_xreg)} trials registered on ClinicalTrials.gov, {len(ctg_results)} had results')
ci_calc(len(ctg_results), len(ctg_xreg))

Of the 339 trials registered on ClinicalTrials.gov, 133 had results
Proportion: 39.23%
95% CI: 34.04-44.43


(0.34035281554493385, 0.39233038348082594, 0.444307951416718)

In [29]:
#Cross-registration on the ISRCTN: any trial that carries an ISRCTN ID
isrctn_xreg = analysis_df.dropna(subset=['isrctn_id'])
print(f'Out of {len(analysis_df)} trials {len(isrctn_xreg)} were cross-registered on the ISRCTN')
ci_calc(len(isrctn_xreg), len(analysis_df))

Out of 500 trials 32 were cross-registered on the ISRCTN
Proportion: 6.4%
95% CI: 4.25-8.55


(0.042546459201334624, 0.064, 0.08545354079866538)

In [30]:
#Results rate among the ISRCTN cross-registrations
isrctn_results = isrctn_xreg.loc[isrctn_xreg['isrctn_results_inc'].eq(1)]
print(f'Of the {len(isrctn_xreg)} trials registered on the ISRCTN, {len(isrctn_results)} had results')
ci_calc(len(isrctn_results), len(isrctn_xreg))

Of the 32 trials registered on the ISRCTN, 2 had results
Proportion: 6.25%
95% CI: -2.14-14.64


(-0.021370016617978563, 0.0625, 0.14637001661797855)

In [31]:
#Trials carrying both an NCT ID and an ISRCTN ID, i.e. present on all three registries
on_ctg = analysis_df['nct_id'].notnull()
on_isrctn = analysis_df['isrctn_id'].notnull()
triple_reg = len(analysis_df[on_ctg & on_isrctn])
print(f'{triple_reg} registered on all three registires')
ci_calc(triple_reg, len(analysis_df))

9 registered on all three registires
Proportion: 1.8%
95% CI: 0.63-2.97


(0.006346320615359283, 0.018, 0.029653679384640714)

## Results in the literature

In [32]:
#Trials with an included journal publication, as a share of the whole sample
journal_results = analysis_df.loc[analysis_df['journal_results_inc'].eq(1)]
print(f'Out of {len(analysis_df)} trials on the EUCTR, {len(journal_results)} had results in the literature')
ci_calc(len(journal_results), len(analysis_df))

Out of 500 trials on the EUCTR, 293 had results in the literature
Proportion: 58.6%
95% CI: 54.28-62.92


(0.5428262226253018, 0.586, 0.6291737773746982)

In [33]:
#Where each included journal publication was located

journal_results['journal_source'].value_counts()

ClinicalTrials.gov                                 117
Google Scholar                                      90
PubMed                                              46
EUCTR                                               28
ISRCTN                                               7
CSR on EUCTR and ClinicalTrials.gov                  1
Forward citation from review on Google scholar       1
Forward citation from ISRCTN articles                1
Citation checked located Cochrane review             1
Linked to in another document in Google Scholar      1
Name: journal_source, dtype: int64

## Summarizing results availability

In [34]:
#EUCTR IDs as sets, one per dissemination route plus the full sample, for the set arithmetic below

all_trial_ids = set(analysis_df['euctr_id'].tolist())
euctr_results_ids = set(euctr_results['euctr_id'].tolist())
ctg_results_ids = set(ctg_results['euctr_id'].tolist())
isrctn_results_ids = set(isrctn_results['euctr_id'].tolist())
journal_results_ids = set(journal_results['euctr_id'].tolist())

In [35]:
#Total results located across all four routes; a trial is counted once per route it appears in
sum(len(route_ids) for route_ids in (euctr_results_ids, ctg_results_ids, isrctn_results_ids, journal_results_ids))

694

In [36]:
#How many had results anywhere?
#results_nowhere is the set of trial IDs that appear in none of the four per-route ID sets.
ids_with_any_result = euctr_results_ids | ctg_results_ids | isrctn_results_ids | journal_results_ids
results_nowhere = all_trial_ids - ids_with_any_result
has_some_result = len(analysis_df) - len(results_nowhere)

#Sense check against an independent count. The previous check
#(len(results_nowhere) + has_some_result == 500) only restated len(analysis_df) == 500,
#because has_some_result is itself defined as len(analysis_df) - len(results_nowhere).
assert has_some_result == int(analysis_df.any_results_inc.sum())

print(f'{has_some_result} of {len(analysis_df)} trials had results somewhere')
ci_calc(has_some_result,len(analysis_df))

383 of 500 trials had results somewhere
Proportion: 76.6%
95% CI: 72.89-80.31


(0.7288897924985591, 0.766, 0.8031102075014409)

In [37]:
#Total enrollment across the trials with no results located anywhere.
#Enrollment comes from the manually collected regression data, joined on trial ID.

no_results = analysis_df[analysis_df['euctr_id'].isin(results_nowhere)].reset_index(drop=True)
no_results_enrollment = no_results[['euctr_id']].merge(regression[['Trial ID', 'Enrollment']],
                                                       how='left',
                                                       left_on='euctr_id',
                                                       right_on='Trial ID')
no_results_enrollment['Enrollment'].sum()

33673

## What results were unique to each dissemination route?

In [38]:
#Trials whose only located result is on the EUCTR, out of all trials with any result
just_euctr = len(euctr_results_ids.difference(ctg_results_ids, isrctn_results_ids, journal_results_ids))
print(f'{just_euctr} trials had results on just the EUCTR')
summarizer(just_euctr, has_some_result)

55 trials had results on just the EUCTR
Outcome of Interest: 55
Total: 383
Proportion: 14.36%
95% CI: 10.85-17.87


(0.10848138075064588, 0.14360313315926893, 0.17872488556789196)

In [39]:
#Format breakdown for the results that exist only on the EUCTR

just_euctr_ids = euctr_results_ids.difference(ctg_results_ids, isrctn_results_ids, journal_results_ids)
only_on_euctr = euctr_results[euctr_results['euctr_id'].isin(just_euctr_ids)]
only_on_euctr['euctr_results_format'].value_counts()

CSR Synopsis                30
Tabular                     15
Short Report                 3
Notice of no analysis        3
Tabular and CSR Synopsis     2
Tabular and Report           1
Report                       1
Name: euctr_results_format, dtype: int64

In [40]:
#Total across the EUCTR-only format counts (rows with a missing format are not counted by value_counts)
is_only_on_euctr = euctr_results['euctr_id'].isin(just_euctr_ids)
euctr_results.loc[is_only_on_euctr, 'euctr_results_format'].value_counts().sum()

55

In [41]:
# Values for appendix table
# The arguments are typed in by hand: 1 trial out of the 55 EUCTR-only results counted above.
# NOTE(review): presumably this cell was re-run with each count from the EUCTR-only format
# breakdown in turn; only the call for a count of 1 is preserved here — confirm.

summarizer(1,55)

Outcome of Interest: 1
Total: 55
Proportion: 1.82%
95% CI: -1.71-5.35


(-0.017129092394234546, 0.01818181818181818, 0.05349272875787091)

In [42]:
#Trials whose only located result is on ClinicalTrials.gov, out of all trials with any result
just_ctg = len(ctg_results_ids.difference(euctr_results_ids, isrctn_results_ids, journal_results_ids))
print(f'{just_ctg} trials had results on just ClinicalTrials.gov')
summarizer(just_ctg,has_some_result)

3 trials had results on just ClinicalTrials.gov
Outcome of Interest: 3
Total: 383
Proportion: 0.78%
95% CI: -0.1-1.67


(-0.0009960778236745831, 0.007832898172323759, 0.0166618741683221)

In [43]:
#Trials whose only located result is on the ISRCTN
just_isrctn = len(isrctn_results_ids.difference(euctr_results_ids, ctg_results_ids, journal_results_ids))
print(f'{just_isrctn} trials had results on just the ISRCTN')

0 trials had results on just the ISRCTN


In [44]:
#Trials whose only located result is a journal publication, out of all trials with any result
just_pub = len(journal_results_ids.difference(euctr_results_ids, ctg_results_ids, isrctn_results_ids))
print(f'{just_pub} trials had results only in a journal publication')
summarizer(just_pub,has_some_result)

108 trials had results only in a journal publication
Outcome of Interest: 108
Total: 383
Proportion: 28.2%
95% CI: 23.69-32.7


(0.23691967043954865, 0.2819843342036554, 0.3270489979677621)

In [45]:
#Trials with no included EUCTR result but an included result on at least one other route
lacks_euctr_result = analysis_df['euctr_results_inc'] == 0
result_elsewhere = analysis_df[['ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc']].eq(1).any(axis=1)
not_euctr = analysis_df[lacks_euctr_result & result_elsewhere]
print(f'{len(not_euctr)} trials without EUCTR results had results somewhere else')
ci_calc(len(not_euctr), len(analysis_df))

117 trials without EUCTR results had results somewhere else
Proportion: 23.4%
95% CI: 19.69-27.11


(0.19688979249855912, 0.234, 0.2711102075014409)

In [46]:
#Trials for which no result was located on any route, as a share of the whole sample
print(f'{len(results_nowhere)} trials had no results located')
nowhere_count, trial_count = len(results_nowhere), len(analysis_df)
ci_calc(nowhere_count, trial_count)

117 trials had no results located
Proportion: 23.4%
95% CI: 19.69-27.11


(0.19688979249855912, 0.234, 0.2711102075014409)

In [201]:
#Trials with an included result on any route other than the EUCTR (regardless of EUCTR status)
non_euctr_routes = ['ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc']
outside_euctr = analysis_df[analysis_df[non_euctr_routes].eq(1).any(axis=1)]
print(f'{len(outside_euctr)} had a result outside the EUCTR')
ci_calc(len(outside_euctr), len(analysis_df))

328 had a result outside the EUCTR
Proportion: 65.6%
95% CI: 61.44-69.76


(0.6143608159926255, 0.656, 0.6976391840073746)

## Getting data on combinations of results availability

We will visualise these in an upset chart in the paper

In [176]:
#Per-trial results flags for the results-availability upset chart
upset_plot_data = analysis_df.loc[:, ['euctr_results_inc', 'ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc']]

#upset_plot_data.to_csv(parent + '/data/graphing_data/upset_data.csv')

#Per-trial registry IDs for the cross-registration upset chart
cross_reg_upset = analysis_df.loc[:, ['euctr_id', 'nct_id', 'isrctn_id']]

#cross_reg_upset.to_csv(parent + '/data/graphing_data/upset_reg_data.csv')

# Data Quality, Completion Status, and Reporting

For overall population numbers, see the `Data Processing` notebook

In [49]:
#New frame carrying the `inferred` flag, used to compare results availability between trials
#with an inferred completion date and those with a stated one

inferred_flags = full_sample[['eudract_number', 'inferred']]
analysis_df_2 = (analysis_df
                 .merge(inferred_flags, how='left', left_on='euctr_id', right_on='eudract_number')
                 .drop(columns='eudract_number'))

inferred = analysis_df_2[analysis_df_2['inferred'] == 1]
print(f'Inferred: {len(inferred)}; {round((len(inferred)/len(analysis_df_2)) * 100, 2)}%')
stated = analysis_df_2[analysis_df_2['inferred'] == 0]
print(f'Stated: {len(stated)}; {round((len(stated)/len(analysis_df_2)) * 100, 2)}%')

Inferred: 146; 29.2%
Stated: 354; 70.8%


In [50]:
#Inferred-date trials with an included result on at least one of the four routes
inferred_any_route = inferred[['euctr_results_inc', 'ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc']].eq(1).any(axis=1)
inferred_res_sw = len(inferred[inferred_any_route])
print(f'Inferred Dates with any results: {inferred_res_sw}')
print(f'Total inferred dates: {len(inferred)}')
ci_calc(inferred_res_sw, len(inferred))

Inferred Dates with any results: 71
Total inferred dates: 146
Proportion: 48.63%
95% CI: 40.52-56.74


(0.40522643774242184, 0.4863013698630137, 0.5673763019836056)

In [51]:
#Stated (extracted) date trials with an included result on at least one of the four routes
stated_any_route = stated[['euctr_results_inc', 'ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc']].eq(1).any(axis=1)
stated_res_sw = len(stated[stated_any_route])
print(f'Extracted with any results: {stated_res_sw}')
print(f'Total extracted dates: {len(stated)}')
ci_calc(stated_res_sw, len(stated))

Extracted with any results: 312
Total extracted dates: 354
Proportion: 88.14%
95% CI: 84.77-91.5


(0.8476696470571278, 0.8813559322033898, 0.9150422173496519)

In [52]:
#z-test comparing the two groups' proportions with a result anywhere (inferred vs stated dates)
#a holds each group's count of trials with results; b holds each group's total number of trials
a, b = [inferred_res_sw, stated_res_sw], [len(inferred), len(stated)]

stat, pval = proportions_ztest(count=a, nobs=b)
print(pval)

2.3783244255374413e-21


In [53]:
#Inferred-date trials with an included result on any route other than the EUCTR
inferred_non_euctr = inferred[['ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc']].eq(1).any(axis=1)
inferred_res_swe = len(inferred[inferred_non_euctr])
print(f'Inferred with results outside the EUCTR: {inferred_res_swe}')
print(f'Total Inferred: {len(inferred)}')
ci_calc(inferred_res_swe, len(inferred))

Inferred with results outside the EUCTR: 70
Total Inferred: 146
Proportion: 47.95%
95% CI: 39.84-56.05


(0.3984151949615977, 0.4794520547945205, 0.5604889146274433)

In [54]:
#Stated (extracted) date trials with an included result on any route other than the EUCTR
stated_non_euctr = stated[['ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc']].eq(1).any(axis=1)
stated_res_swe = len(stated[stated_non_euctr])
print(f'Extracted with results outside the EUCTR: {stated_res_swe}')
print(f'Total Extracted: {len(stated)}')
ci_calc(stated_res_swe, len(stated))

Extracted with results outside the EUCTR: 258
Total Extracted: 354
Proportion: 72.88%
95% CI: 68.25-77.51


(0.682501227544367, 0.7288135593220338, 0.7751258910997006)

In [55]:
#z-test comparing the two groups' proportions with a result outside the EUCTR (inferred vs stated)
a, b = [inferred_res_swe, stated_res_swe], [len(inferred), len(stated)]

stat, pval = proportions_ztest(count=a, nobs=b)
print(pval)

9.453467511956692e-08


## Now we have to do this for each registry

In [57]:
#EUCTR results by completion-date status

#Stated (extracted) dates: included EUCTR results out of all stated-date trials
stated_results_euctr = stated.loc[stated['euctr_results_inc'].eq(1)]
print(f'Extracted with EUCTR results: {len(stated_results_euctr)}')
print(f'Total extracted: {len(stated)}')
ci_calc(len(stated_results_euctr), len(stated))

print('\n')

#Inferred dates: included EUCTR results out of all inferred-date trials
inferred_results_euctr = inferred.loc[inferred['euctr_results_inc'].eq(1)]
print(f'Inferred with EUCTR results: {len(inferred_results_euctr)}')
print(f'Total inferred: {len(inferred)}')
ci_calc(len(inferred_results_euctr), len(inferred))

Extracted with EUCTR results: 265
Total extracted: 354
Proportion: 74.86%
95% CI: 70.34-79.38


Inferred with EUCTR results: 1
Total inferred: 146
Proportion: 0.68%
95% CI: -0.65-2.02


(-0.006529288616355393, 0.00684931506849315, 0.020227918753341692)

In [58]:
#z-test comparing EUCTR results proportions between inferred and stated completion dates
a, b = [len(inferred_results_euctr), len(stated_results_euctr)], [len(inferred), len(stated)]

stat, pval = proportions_ztest(count=a, nobs=b)
print(pval)

1.319454443770701e-51


In [59]:
#ClinicalTrials.gov results by completion-date status; denominators are the cross-registered trials only

#Stated (extracted) dates
stated_ctg = stated.dropna(subset=['nct_id'])
stated_ctg_results = stated_ctg.loc[stated_ctg['ctgov_results_inc'].eq(1)]
print(f'Extracted with CTG results: {len(stated_ctg_results)}')
print(f'Total CTG extracted: {len(stated_ctg)}')
ci_calc(len(stated_ctg_results), len(stated_ctg))

print('\n')

#Inferred dates
inferred_ctg = inferred.dropna(subset=['nct_id'])
inferred_ctg_results = inferred_ctg.loc[inferred_ctg['ctgov_results_inc'].eq(1)]
print(f'Inferred with CTG results: {len(inferred_ctg_results)}')
print(f'Total CTG inferred: {len(inferred_ctg)}')
ci_calc(len(inferred_ctg_results), len(inferred_ctg))

Extracted with CTG results: 131
Total CTG extracted: 271
Proportion: 48.34%
95% CI: 42.39-54.29


Inferred with CTG results: 2
Total CTG inferred: 68
Proportion: 2.94%
95% CI: -1.07-6.96


(-0.010746937358755954, 0.029411764705882353, 0.06957046677052066)

In [60]:
#z-test comparing ClinicalTrials.gov results proportions between inferred and stated completion dates
a, b = [len(inferred_ctg_results), len(stated_ctg_results)], [len(inferred_ctg), len(stated_ctg)]

stat, pval = proportions_ztest(count=a, nobs=b)
print(pval)

7.122031160427556e-12


In [61]:
#ISRCTN results by completion-date status; denominators are the ISRCTN cross-registered trials only

#isrctn extracted dates
stated_isrctn = stated[stated.isrctn_id.notnull()]
stated_isrctn_results = stated_isrctn[(stated_isrctn.isrctn_results_inc == 1)]
print(f'Extracted with ISRCTN results: {len(stated_isrctn_results)}')
print(f'Total ISRCTN extracted: {len(stated_isrctn)}')
ci_calc(len(stated_isrctn_results), len(stated_isrctn))

print('\n')

#isrctn inferred dates
inferred_isrctn = inferred[inferred.isrctn_id.notnull()]
inferred_isrctn_results = inferred_isrctn[(inferred_isrctn.isrctn_results_inc == 1)]
print(f'Inferred with ISRCTN results: {len(inferred_isrctn_results)}')
#This total is for the inferred group; the label previously said "extracted" (copy-paste slip)
print(f'Total ISRCTN inferred: {len(inferred_isrctn)}')
ci_calc(len(inferred_isrctn_results), len(inferred_isrctn))

Extracted with ISRCTN results: 2
Total ISRCTN extracted: 29
Proportion: 6.9%
95% CI: -2.33-16.12


Inferred with ISRCTN results:0
Total ISRCTN extracted: 3
Proportion: 0.0%
95% CI: 0.0-0.0


(0.0, 0.0, 0.0)

In [62]:
#z-test comparing ISRCTN results proportions between inferred and stated completion dates
a, b = [len(inferred_isrctn_results), len(stated_isrctn_results)], [len(inferred_isrctn), len(stated_isrctn)]

stat, pval = proportions_ztest(count=a, nobs=b)
print(pval)

0.6385149374549752


In [63]:
#Journal publications by completion-date status

#Stated (extracted) dates
stated_journal = stated.loc[stated['journal_results_inc'].eq(1)]
print(f'Extracted with results in a Journal: {len(stated_journal)}')
print(f'Total Extracted: {len(stated)}')
ci_calc(len(stated_journal), len(stated))

print('\n')

#Inferred dates
inferred_journal = inferred.loc[inferred['journal_results_inc'].eq(1)]
print(f'Inferred with results in a Journal: {len(inferred_journal)}')
print(f'Total Inferred: {len(inferred)}')
ci_calc(len(inferred_journal), len(inferred))

Extracted with results in a Journal: 224
Total Extracted: 354
Proportion: 63.28%
95% CI: 58.26-68.3


Inferred with results in a Journal: 69
Total Inferred: 146
Proportion: 47.26%
95% CI: 39.16-55.36


(0.3916192111656352, 0.4726027397260274, 0.5535862682864195)

In [64]:
#z-test comparing journal publication proportions between inferred and stated completion dates
a, b = [len(inferred_journal), len(stated_journal)], [len(inferred), len(stated)]

stat, pval = proportions_ztest(count=a, nobs=b)
print(pval)

0.0009461100911769659


# Publication Date Analysis

Because very few trials had results on the ISRCTN, and all but one of those results were never the earliest available, we exclude the ISRCTN from this analysis. You can verify this below using the original `date_df` dataframe.

In [65]:
#Per-trial IDs, inclusion flags and dates for every dissemination route
date_df_columns = ['euctr_id', 'euctr_results_inc', 'euctr_results_date',
                   'nct_id', 'ctgov_results_inc', 'ctgov_results_date',
                   'isrctn_id', 'isrctn_results_inc', 'isrctn_results_date',
                   'journal_results_inc', 'journal_pub_date']
date_df = analysis_df.loc[:, date_df_columns].reset_index(drop=True)


#Earliest first-version date in the December 2020 results scrape,
#i.e. the earliest results available on the EUCTR
earliest_euctr_results_date = dec_results['first_version_date'].min()
print(earliest_euctr_results_date)

2014-03-14 00:00:00


In [66]:
#It's easiest to blank out the date of any result that is excluded, so that the
#min/max date logic further down only ever sees included results.
#We also drop the ISRCTN columns here.
#A fresh copy is made so it can be compared against date_df for sanity checks.

date_df2 = date_df.drop(['isrctn_id', 'isrctn_results_inc', 'isrctn_results_date'], axis=1).reset_index(drop=True)

#Series.where keeps a date where its inclusion flag is non-zero and sets it to NaT otherwise.
#This replaces np.where(..., pd.NaT, <datetime column>), which mixes pd.NaT with datetime64
#values in a plain numpy array and then relies on pd.to_datetime to recover the dates.
for inc_col, date_col in [('euctr_results_inc', 'euctr_results_date'),
                          ('ctgov_results_inc', 'ctgov_results_date'),
                          ('journal_results_inc', 'journal_pub_date')]:
    date_df2[date_col] = pd.to_datetime(date_df2[date_col].where(date_df2[inc_col] != 0))

In [67]:
#Sense check: flag any trial whose date columns contain a repeated value,
#since ties would make the min/max comparisons below ambiguous
just_dates = date_df2.loc[:, ['euctr_results_date', 'ctgov_results_date', 'journal_pub_date']].reset_index(drop=True)
just_dates['test'] = just_dates.apply(check_dupes, axis=1)
just_dates['test'].value_counts()

#There are no repeat dates so no need to worry about that.

False    500
Name: test, dtype: int64

In [68]:
#Earliest and latest included result date per trial, taken row-wise across the three routes
results_date_cols = ['euctr_results_date', 'ctgov_results_date', 'journal_pub_date']

date_df2['min_date'] = date_df2[results_date_cols].min(axis=1)

date_df2['max_date'] = date_df2[results_date_cols].max(axis=1)

In [69]:
#Number of routes (EUCTR, ClinicalTrials.gov, journal) with an included result, per trial
date_df2['results_counts'] = date_df2[['euctr_results_inc', 'ctgov_results_inc', 'journal_results_inc']].sum(axis=1)

# Time to Reporting

In [152]:
#Label each trial with the route whose date equals its earliest date.
#np.select takes the first matching condition; rows matching none get 'No Result'.
out = ['EUCTR', 'CTgov', 'Journal']
conds = [date_df2[route_date] == date_df2['min_date']
         for route_date in ('euctr_results_date', 'ctgov_results_date', 'journal_pub_date')]

date_df2['earliest_results'] = np.select(conds, out, 'No Result')

In [156]:
# All trials with an NCT ID (i.e. cross-registered on ClinicalTrials.gov),
# tabulated by which route held the earliest result

date_df2[date_df2.nct_id.notnull()].earliest_results.value_counts()

Journal      156
EUCTR         84
CTgov         51
No Result     48
Name: earliest_results, dtype: int64

In [165]:
#Share of cross-registered trials with any result whose earliest result was a journal publication.
#Both figures are derived from the data instead of being typed in by hand:
#numerator = 'Journal' count, denominator = all cross-registered trials minus 'No Result'.
ctg_earliest_counts = date_df2[date_df2.nct_id.notnull()].earliest_results.value_counts()
summarizer(ctg_earliest_counts['Journal'], ctg_earliest_counts.drop('No Result', errors='ignore').sum())

Outcome of Interest: 156
Total: 291
Proportion: 53.61%
95% CI: 47.88-59.34


(0.4787836419550631, 0.5360824742268041, 0.5933813064985451)

In [162]:
#Those with a first result dated before the earliest EUCTR results date (trials with an NCT ID only)
#NOTE(review): the lower date bound is applied to journal_pub_date rather than min_date, so any row
#without an included journal publication (NaT) fails the comparison and is dropped — confirm intended.
#NOTE(review): '2008-09-30' presumably marks the start of results posting on ClinicalTrials.gov — confirm.

date_df2[(date_df2.nct_id.notnull()) & (date_df2.min_date < earliest_euctr_results_date) & (date_df2.journal_pub_date > pd.to_datetime('2008-09-30'))].earliest_results.value_counts()

Journal    64
CTgov      23
Name: earliest_results, dtype: int64

In [167]:
#Share of the pre-EUCTR-results group whose earliest result was on ClinicalTrials.gov.
#Recomputed with the same filter as the preceding tabulation instead of hand-typed counts.
pre_euctr_counts = date_df2[(date_df2.nct_id.notnull())
                            & (date_df2.min_date < earliest_euctr_results_date)
                            & (date_df2.journal_pub_date > pd.to_datetime('2008-09-30'))].earliest_results.value_counts()
summarizer(pre_euctr_counts['CTgov'], pre_euctr_counts.sum())

Outcome of Interest: 23
Total: 87
Proportion: 26.44%
95% CI: 17.17-35.7


(0.17169953711647767, 0.26436781609195403, 0.3570360950674304)

In [160]:
#Trials with an NCT ID whose first result is dated on or after the earliest EUCTR results date
is_ctg_registered = date_df2['nct_id'].notnull()
first_result_post_euctr = date_df2['min_date'] >= earliest_euctr_results_date
date_df2.loc[is_ctg_registered & first_result_post_euctr, 'earliest_results'].value_counts()

Journal    89
EUCTR      84
CTgov      20
Name: earliest_results, dtype: int64

In [170]:
#Share of the post-EUCTR-results group whose earliest result was a journal publication.
#Recomputed with the same filter as the preceding tabulation instead of hand-typed counts.
post_euctr_counts = date_df2[(date_df2.nct_id.notnull())
                             & (date_df2.min_date >= earliest_euctr_results_date)].earliest_results.value_counts()
summarizer(post_euctr_counts['Journal'], post_euctr_counts.sum())

Outcome of Interest: 89
Total: 193
Proportion: 46.11%
95% CI: 39.08-53.15


(0.3908113273641295, 0.46113989637305697, 0.5314684653819844)

# Data for Time to Reporting K-M Curves

The medians and 95% CIs are computed in the `Figures` notebook.

In [171]:
#The K-M sample: trials whose first result is dated on or after the earliest EUCTR results date
first_result_after_launch = date_df2['min_date'] >= earliest_euctr_results_date
post_euctr = date_df2.loc[first_result_after_launch].reset_index(drop=True)

#Size of that sample
print(len(post_euctr))

248


In [172]:
#Bring in the sampling-frame columns for each trial and parse final_date as a datetime
km_df = pd.merge(post_euctr, full_sample, how='left', left_on='euctr_id', right_on='eudract_number')

km_df['final_date'] = pd.to_datetime(km_df['final_date'])

In [173]:
#Days from final_date to the result on each route. Where a route has no result date,
#the days from final_date to the start of our searches are used instead.
one_day = pd.Timedelta(1, "d")
days_to_search_start = (search_start_date - km_df['final_date']) / one_day

for days_col, route_date_col in (('euctr_days', 'euctr_results_date'),
                                 ('ctg_days', 'ctgov_results_date'),
                                 ('pub_days', 'journal_pub_date')):
    days_to_result = (km_df[route_date_col] - km_df['final_date']) / one_day
    km_df[days_col] = np.where(days_to_result.isna(), days_to_search_start, days_to_result)



In [174]:
#Export the time-to-reporting data for the K-M curves.
#No `index` argument is passed, so the row index is written as the first (unnamed) column.
km_df.to_csv(parent + '/data/graphing_data/time_to_pub.csv')

## Data for Start Year Figure

Here we just get the data we would need and export it. Figures are made in a separate notebook.

In [175]:
#Results flags per trial joined to the manually collected trial start year
start_years = regression[['Trial ID', 'Trial Start Year']]
graphing_df = (analysis_df[['euctr_id', 'euctr_results_inc', 'any_results_inc']]
               .merge(start_years, how='left', left_on='euctr_id', right_on='Trial ID')
               .drop(columns='Trial ID'))

#graphing_df.to_csv(parent + '/data/graphing_data/start_year_data.csv')

# Reporting of Trial IDs

In [94]:
#Subset of analysis_df used for the trial ID reporting analysis: the three registry IDs,
#the journal results flag, and the journal_reg_numbers coding
trial_id_df = analysis_df[['euctr_id', 'nct_id', 'isrctn_id', 'journal_results_inc', 'journal_reg_numbers']].reset_index(drop=True)

In [95]:
#Count of each journal_reg_numbers value (missing values included) among trials
#with journal_results_inc == 1, as a DataFrame
reg_id_df = trial_id_df[trial_id_df.journal_results_inc == 1].journal_reg_numbers.value_counts(dropna=False).to_frame().reset_index()

In [96]:
#How many EUCTR/Publication pairs had an EUCTR ID

#Trials with a matched publication and a non-null EUCTR ID
euctr_pub_ids = trial_id_df[(trial_id_df.journal_results_inc == 1) & (trial_id_df.euctr_id.notnull())]
print(f'There are {len(euctr_pub_ids)} trials with an EUCTR registration and a matched publication')

#Count the 'None' codes directly: indexing value_counts() with ["None"] raises a
#KeyError whenever no publication in the subset is coded 'None'
euctr_no_id = (euctr_pub_ids.journal_reg_numbers == 'None').sum()
print(f'Below are the ones with a Trial ID excluding the {euctr_no_id} with no ID')

#Frequency of each registration-number coding among publications that reported some ID
euctr_id_match = euctr_pub_ids[euctr_pub_ids.journal_reg_numbers != 'None'].journal_reg_numbers.value_counts()
euctr_id_match

There are 293 trials with an EUCTR registration and a matched publication
Below are the ones with a Trial ID excluding the 58 with no ID


ClinicalTrials.gov                   157
EUCTR/EudraCT                         33
EUCTR/EudraCT, ClinicalTrials.gov     27
ISRCTN                                 7
EUCTR/EudraCT, ISRCTN                  6
Other registration number              3
ClinicalTrials.gov, ISRCTN             2
Name: journal_reg_numbers, dtype: int64

In [97]:
#Stats on number containing an EUCTR ID
#Numerator: summed counts of every journal_reg_numbers category whose label contains 'EUCTR/EudraCT'
#Denominator: all trials with an EUCTR ID and a matched publication
summarizer(euctr_id_match.filter(like='EUCTR/EudraCT').sum(), len(euctr_pub_ids))

Outcome of Interest: 66
Total: 293
Proportion: 22.53%
95% CI: 17.74-27.31


(0.17742166011265287, 0.22525597269624573, 0.2730902852798386)

In [98]:
#How many CTG/Publication pairs had an NCT ID

#Trials with a matched publication and a non-null NCT ID
ctg_pub_ids = trial_id_df[(trial_id_df.journal_results_inc == 1) & (trial_id_df.nct_id.notnull())]
print(f'There are {len(ctg_pub_ids)} trials with a ClinicalTrials.gov registration and a matched publication')

#Count the 'None' codes directly: indexing value_counts() with ["None"] raises a
#KeyError whenever no publication in the subset is coded 'None'
ctg_no_id = (ctg_pub_ids.journal_reg_numbers == 'None').sum()
print(f'Below are the ones with a Trial ID excluding the {ctg_no_id} with no ID')

#Frequency of each registration-number coding among publications that reported some ID
ctg_id_match = ctg_pub_ids[ctg_pub_ids.journal_reg_numbers != 'None'].journal_reg_numbers.value_counts()
ctg_id_match

There are 222 trials with a ClinicalTrials.gov registration and a matched publication
Below are the ones with a Trial ID excluding the 25 with no ID


ClinicalTrials.gov                   157
EUCTR/EudraCT, ClinicalTrials.gov     26
EUCTR/EudraCT                         11
ClinicalTrials.gov, ISRCTN             2
ISRCTN                                 1
Name: journal_reg_numbers, dtype: int64

In [99]:
#Stats on number containing an NCT ID
#Numerator: summed counts of every journal_reg_numbers category whose label contains 'ClinicalTrials.gov'
#Denominator: all trials with an NCT ID and a matched publication
summarizer(ctg_id_match.filter(like='ClinicalTrials.gov').sum(), len(ctg_pub_ids))

Outcome of Interest: 185
Total: 222
Proportion: 83.33%
95% CI: 78.43-88.24


(0.7843088149430074, 0.8333333333333334, 0.8823578517236593)

In [100]:
#How many ISRCTN/Publication pairs had an ISRCTN ID

#Trials with a matched publication and a non-null ISRCTN ID
isrctn_pub_ids = trial_id_df[(trial_id_df.journal_results_inc == 1) & (trial_id_df.isrctn_id.notnull())]
print(f'There are {len(isrctn_pub_ids)} trials with an ISRCTN registration and a matched publication')

#Count the 'None' codes directly: indexing value_counts() with ["None"] raises a
#KeyError whenever no publication in the subset is coded 'None'
isrctn_no_id = (isrctn_pub_ids.journal_reg_numbers == 'None').sum()
print(f'Below are the ones with a Trial ID excluding the {isrctn_no_id} with no ID')

#Frequency of each registration-number coding among publications that reported some ID
isrctn_id_match = isrctn_pub_ids[isrctn_pub_ids.journal_reg_numbers != 'None'].journal_reg_numbers.value_counts()
isrctn_id_match

There are 24 trials with an ISRCTN registration and a matched publication
Below are the ones with a Trial ID excluding the 3 with no ID


ISRCTN                               7
EUCTR/EudraCT, ISRCTN                6
ClinicalTrials.gov                   3
EUCTR/EudraCT, ClinicalTrials.gov    2
ClinicalTrials.gov, ISRCTN           2
EUCTR/EudraCT                        1
Name: journal_reg_numbers, dtype: int64

In [101]:
#Stats on number containing an ISRCTN ID
#Numerator: summed counts of every journal_reg_numbers category whose label contains 'ISRCTN'
#Denominator: all trials with an ISRCTN ID and a matched publication
summarizer(isrctn_id_match.filter(like='ISRCTN').sum(), len(isrctn_pub_ids))

Outcome of Interest: 15
Total: 24
Proportion: 62.5%
95% CI: 43.13-81.87


(0.43131049331468674, 0.625, 0.8186895066853133)

# Exploratory Analyses

In [102]:
#Creating the exploratory analysis dataset through merging a few different DFs 
#and aligning the columns for ease of use.

#Step 1: results flags and registry IDs from analysis_df, left-joined to the
#'inferred' flag from the sample (matched on the EudraCT number)
exploratory_final = analysis_df[['euctr_id', 'euctr_results_inc', 'ctgov_results_inc', 'isrctn_results_inc', 
                                 'journal_results_inc', 'any_results_inc', 'nct_id', 'isrctn_id', 
                                 'journal_result']].merge(full_sample[['eudract_number', 
                                                                  'inferred']], 
                                                          how='left', 
                                                          left_on='euctr_id', 
                                                          right_on='eudract_number')

#Step 2: add the manually collected regression data, then drop the join keys
#and the Timestamp/Notes columns that are not needed for analysis
exploratory_final = exploratory_final.merge(regression, 
                                            how='left', 
                                            left_on='euctr_id', 
                                            right_on='Trial ID').drop(['eudract_number', 
                                                                       'Timestamp', 
                                                                       'Notes', 
                                                                       'Trial ID'], axis=1)

#Step 3: add the EUCTR-derived regression data, dropping its join key and the
#'Unnamed: 0' column
exploratory_final = exploratory_final.merge(other_reg_data, 
                                            how='left', 
                                            left_on='euctr_id', 
                                            right_on='trial_id').drop(['Unnamed: 0', 'trial_id'], axis=1)

#Step 4: rename every column BY POSITION. This relies on the column order produced
#by the three merges above; if a source file gains, loses, or reorders columns the
#names will be misassigned (or the assignment fails if the column count changes).
exploratory_final.columns = ['euctr_id', 'euctr_results_inc', 'ctgov_results_inc', 'isrctn_results_inc', 
                             'journal_results_inc', 'any_results_inc', 'nct_id', 'isrctn_id', 'journal_result', 
                             'inferred', 'trial_start_yr', 'enrollment', 'location', 'sponsor_status', 
                             'protocol_country', 'sponsor_country']

In [103]:
exploratory_final.head()

Unnamed: 0,euctr_id,euctr_results_inc,ctgov_results_inc,isrctn_results_inc,journal_results_inc,any_results_inc,nct_id,isrctn_id,journal_result,inferred,trial_start_yr,enrollment,location,sponsor_status,protocol_country,sponsor_country
0,2014-003401-15,1,1,0,1,1,NCT02269488,,Yes,0,2014,100,Non-EEA,Commercial,0.0,Japan
1,2011-001616-57,1,0,0,1,1,NCT01403636,,Yes,0,2011,167,EEA and Non-EEA,Commercial,2.0,France
2,2004-002743-27,1,1,0,1,1,NCT00630747,,Yes,0,2004,94,EEA and Non-EEA,Commercial,5.0,United States
3,2013-003561-34,1,1,0,1,1,NCT01980628,,Yes,0,2013,63,EEA and Non-EEA,Commercial,3.0,United States
4,2006-001414-33,1,0,0,1,1,,ISRCTN70127774,Yes,0,2006,60,EEA Only,Non-Commercial,1.0,United Kingdom


In [104]:
exploratory_final[exploratory_final.enrollment.isna()]

Unnamed: 0,euctr_id,euctr_results_inc,ctgov_results_inc,isrctn_results_inc,journal_results_inc,any_results_inc,nct_id,isrctn_id,journal_result,inferred,trial_start_yr,enrollment,location,sponsor_status,protocol_country,sponsor_country


Run the next two cells on the relevant variables in `exploratory_final` to get data for Table 1 of the paper.

#We will run `.describe()` on `enrollment` and `protocol_country`

#We will run `.value_counts()` on `sponsor_status`,`location`, and `trial_start_yr`

In [128]:
exploratory_final.enrollment.describe()

count      500.00000
mean       240.33600
std        820.53967
min          1.00000
25%         36.00000
50%         70.50000
75%        196.50000
max      16000.00000
Name: enrollment, dtype: float64

In [134]:
exploratory_final.location.value_counts().sort_index()

EEA Only           319
EEA and Non-EEA    167
Non-EEA             14
Name: location, dtype: int64

## Analysis 1: Regression

In [107]:
#Taking only what we need:
regression_final = exploratory_final[['euctr_id', 'euctr_results_inc', 'any_results_inc', 'inferred', 
                                      'trial_start_yr', 'enrollment', 'location', 'sponsor_status', 
                                      'protocol_country']].reset_index(drop=True)

#The regression only covers trials with results available through at least one route
regression_final = regression_final[regression_final.any_results_inc == 1].reset_index(drop=True)

#Dummy-code the categorical predictors. dtype=int is passed explicitly because
#pandas >= 2.0 returns boolean dummies by default; mixed with the numeric predictors
#that yields an object-dtype design matrix, which statsmodels refuses to fit.
location_sponsor_dummies = pd.get_dummies(regression_final[['location', 'sponsor_status']], dtype=int)
regression_final = regression_final.join(location_sponsor_dummies, how='left')

In [108]:
regression_final.location.value_counts()

EEA Only           217
EEA and Non-EEA    152
Non-EEA             14
Name: location, dtype: int64

In [109]:
#Outcome: the EUCTR results flag
y_reg = regression_final.loc[:, 'euctr_results_inc'].reset_index(drop=True)

#Predictors for the full model, including the "inferred" flag
full_model_predictors = [
    'inferred',
    'trial_start_yr',
    'enrollment',
    'protocol_country',
    'location_EEA and Non-EEA',
    'location_Non-EEA',
    'sponsor_status_Commercial',
]
x_reg = regression_final.loc[:, full_model_predictors].reset_index(drop=True)

In [110]:
simple_logistic_regression(y_reg, x_reg)

Optimization terminated successfully.
         Current function value: 0.309099
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:      euctr_results_inc   No. Observations:                  383
Model:                          Logit   Df Residuals:                      375
Method:                           MLE   Df Model:                            7
Date:                Wed, 23 Aug 2023   Pseudo R-squ.:                  0.4978
Time:                        10:06:51   Log-Likelihood:                -118.38
converged:                       True   LL-Null:                       -235.71
Covariance Type:            nonrobust   LLR p-value:                 5.085e-47
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
inferred                     -5.3757      1.082     -4.967      0.000      -7.

Unnamed: 0,OR,2.5%,97.5%,p_value
inferred,0.0,0.0,0.04,0.0
trial_start_yr,1.13,1.02,1.25,0.02369
enrollment,1.0,1.0,1.0,0.70082
protocol_country,1.11,0.92,1.34,0.26077
location_EEA and Non-EEA,1.01,0.4,2.53,0.98185
location_Non-EEA,9.68,0.27,350.99,0.21518
sponsor_status_Commercial,4.87,2.18,10.88,0.00011
cons,0.0,0.0,0.0,0.02397


If we run the regression per protocol, the `inferred` variable produces unstable estimates (an odds ratio of effectively 0) because only 1 trial flagged as `inferred` has results on the EUCTR, as shown earlier. I will therefore remove this variable from the regression, as it is a derived variable anyway.

In [111]:
#Re-running the regression without the "inferred" variable
y_reg1 = regression_final.loc[:, 'euctr_results_inc'].reset_index(drop=True)

#Same predictors as the full model, minus "inferred"
reduced_model_predictors = [
    'trial_start_yr',
    'enrollment',
    'protocol_country',
    'location_EEA and Non-EEA',
    'location_Non-EEA',
    'sponsor_status_Commercial',
]
x_reg1 = regression_final.loc[:, reduced_model_predictors].reset_index(drop=True)

In [112]:
simple_logistic_regression(y_reg1, x_reg1)

Optimization terminated successfully.
         Current function value: 0.443707
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:      euctr_results_inc   No. Observations:                  383
Model:                          Logit   Df Residuals:                      376
Method:                           MLE   Df Model:                            6
Date:                Wed, 23 Aug 2023   Pseudo R-squ.:                  0.2790
Time:                        10:07:22   Log-Likelihood:                -169.94
converged:                       True   LL-Null:                       -235.71
Covariance Type:            nonrobust   LLR p-value:                 6.072e-26
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
trial_start_yr                0.0888      0.043      2.066      0.039       0.

Unnamed: 0,OR,2.5%,97.5%,p_value
trial_start_yr,1.09,1.0,1.19,0.03885
enrollment,1.0,1.0,1.0,0.95769
protocol_country,1.22,1.0,1.48,0.04679
location_EEA and Non-EEA,1.14,0.52,2.49,0.73875
location_Non-EEA,3.16,0.37,26.74,0.29176
sponsor_status_Commercial,9.75,5.04,18.85,0.0
cons,0.0,0.0,0.0,0.03799


Check univariable ORs here with any of these variables:

`trial_start_yr`, `enrollment`, `protocol_country`, `location_EEA and Non-EEA`, `location_Non-EEA`, `sponsor_status_Commercial`

In [119]:
#Univariable check, shown here for trial location: the two location dummies are
#entered together (the remaining location level, 'EEA Only', has no dummy in the model).
#Swap in any other column(s) from the list above to check the other variables.
x_regu = regression_final[['location_EEA and Non-EEA', 'location_Non-EEA']].reset_index(drop=True)

simple_logistic_regression(y_reg1, x_regu)

Optimization terminated successfully.
         Current function value: 0.543839
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:      euctr_results_inc   No. Observations:                  383
Model:                          Logit   Df Residuals:                      380
Method:                           MLE   Df Model:                            2
Date:                Wed, 23 Aug 2023   Pseudo R-squ.:                  0.1163
Time:                        10:18:19   Log-Likelihood:                -208.29
converged:                       True   LL-Null:                       -235.71
Covariance Type:            nonrobust   LLR p-value:                 1.232e-12
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
location_EEA and Non-EEA     1.8133      0.286      6.347      0.000       1.253

Unnamed: 0,OR,2.5%,97.5%,p_value
location_EEA and Non-EEA,6.13,3.5,10.73,0.0
location_Non-EEA,10.71,1.38,83.28,0.02351
cons,1.21,0.93,1.59,0.15464


p-values uni:
com spon: <0.00001
nonEEA: <0.0001
prot_country: <0.0001
enrollment: .05
start_yr: 0.17209
EEA/NonEEA: .02412



In [120]:
#Holm-Bonferroni corrected thresholds: alpha / (m - rank + 1), printed for the
#1st to 5th smallest p-values
#NOTE(review): m is 7 here, while the notes above list 6 univariable p-values -
#confirm the intended number of comparisons
holm_alpha = .05
holm_m = 7
for holm_rank in range(1, 6):
    print(holm_alpha / (holm_m - holm_rank + 1))

0.0071428571428571435
0.008333333333333333
0.01
0.0125
0.016666666666666666


# Analysis 2: Sponsor Country
Each trial will be assigned a “sponsor country” based on the most frequent sponsor country assigned in the EUCTR country protocols. A protocol of a specific country need not contain a sponsor from that country. If no single country appears most frequently, the trial will be coded as having “multi-country” sponsorship. The percent of trials reported to the EUCTR, other registries, and the literature will be reported for each unique sponsor country in the sample.

In [121]:
#Columns needed for the sponsor country breakdown: the trial and registry IDs,
#the results flag for each dissemination route, and the assigned sponsor country
spon_country = exploratory_final[['euctr_id', 'nct_id', 'isrctn_id', 'journal_result', 'euctr_results_inc', 
                                  'ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc', 
                                  'any_results_inc', 'sponsor_country']].reset_index(drop=True)

In [122]:
#First for the EUCTR: EUCTR results flag tabulated by sponsor country
euctr_by_country = crosstab(spon_country, 'euctr_results_inc', 'sponsor_country')
spon_country_reporting = euctr_by_country.reset_index()
spon_country_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']

#Percent of each country's trials with results, to 2 decimal places
euctr_share_reported = spon_country_reporting['reported'].div(spon_country_reporting['all'])
spon_country_reporting['prct_reported'] = euctr_share_reported.mul(100).round(2)

#Countries with the most trials first
spon_country_reporting.sort_values('all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
26,All,234,266,500,53.2
25,United States,19,51,70,72.86
24,United Kingdom,15,46,61,75.41
8,Germany,29,29,58,50.0
7,France,21,28,49,57.14
12,Italy,38,8,46,17.39
20,Spain,31,5,36,13.89
16,Netherlands,25,9,34,26.47
22,Switzerland,3,25,28,89.29
5,Denmark,8,14,22,63.64


In [123]:
#Now for the other dissemination routes

#CTG: restricted to trials that have an NCT ID
has_nct_id = spon_country['nct_id'].notna()
ct_gov_trials = spon_country.loc[has_nct_id].reset_index(drop=True)

ctg_by_country = crosstab(ct_gov_trials, 'ctgov_results_inc', 'sponsor_country')
ctg_reporting = ctg_by_country.reset_index()
ctg_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']

#Percent of each country's trials with results, to 2 decimal places
ctg_share_reported = ctg_reporting['reported'].div(ctg_reporting['all'])
ctg_reporting['prct_reported'] = ctg_share_reported.mul(100).round(2)

#Countries with the most trials first
ctg_reporting.sort_values('all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
23,All,206,133,339,39.23
22,United States,24,43,67,64.18
8,Germany,33,7,40,17.5
7,France,28,11,39,28.21
21,United Kingdom,18,14,32,43.75
19,Switzerland,8,18,26,69.23
17,Spain,20,0,20,0.0
14,Netherlands,15,4,19,21.05
5,Denmark,12,4,16,25.0
10,Italy,13,3,16,18.75


In [124]:
#ISRCTN: restricted to trials that have an ISRCTN ID
isrctn_trials = spon_country[spon_country.isrctn_id.notnull()].reset_index(drop=True)
isrctn_reporting = crosstab(isrctn_trials, 'isrctn_results_inc', 'sponsor_country').reset_index()
isrctn_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']

#Percent reported, rounded to 2 decimal places like the other dissemination-route
#tables (previously left unrounded here only)
isrctn_reporting['prct_reported'] = round((isrctn_reporting.reported / isrctn_reporting['all'])*100,2)

#Countries with the most trials first
isrctn_reporting.sort_values(by='all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
7,All,30,2,32,6.25
6,United Kingdom,20,1,21,4.761905
4,Germany,3,0,3,0.0
5,Netherlands,2,1,3,33.333333
0,Austria,2,0,2,0.0
1,Canada,1,0,1,0.0
2,Denmark,1,0,1,0.0
3,France,1,0,1,0.0


In [125]:
#Journal Reporting: journal results flag tabulated by sponsor country
journal_by_country = crosstab(spon_country, 'journal_results_inc', 'sponsor_country')
journal_reporting = journal_by_country.reset_index()
journal_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']

#Percent of each country's trials with results, to 2 decimal places
journal_share_reported = journal_reporting['reported'].div(journal_reporting['all'])
journal_reporting['prct_reported'] = journal_share_reported.mul(100).round(2)

#Countries with the most trials first
journal_reporting.sort_values('all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
26,All,207,293,500,58.6
25,United States,24,46,70,65.71
24,United Kingdom,16,45,61,73.77
8,Germany,27,31,58,53.45
7,France,27,22,49,44.9
12,Italy,25,21,46,45.65
20,Spain,19,17,36,47.22
16,Netherlands,10,24,34,70.59
22,Switzerland,10,18,28,64.29
5,Denmark,10,12,22,54.55


In [126]:
#Any Reporting: any-results flag tabulated by sponsor country
any_by_country = crosstab(spon_country, 'any_results_inc', 'sponsor_country')
any_reporting = any_by_country.reset_index()
any_reporting.columns = ['sponsor_country', 'not_reported', 'reported', 'all']

#Percent of each country's trials with results, to 2 decimal places
any_share_reported = any_reporting['reported'].div(any_reporting['all'])
any_reporting['prct_reported'] = any_share_reported.mul(100).round(2)

#Countries with the most trials first
any_reporting.sort_values('all', ascending=False)

Unnamed: 0,sponsor_country,not_reported,reported,all,prct_reported
26,All,117,383,500,76.6
25,United States,11,59,70,84.29
24,United Kingdom,6,55,61,90.16
8,Germany,14,44,58,75.86
7,France,13,36,49,73.47
12,Italy,21,25,46,54.35
20,Spain,18,18,36,50.0
16,Netherlands,8,26,34,76.47
22,Switzerland,1,27,28,96.43
5,Denmark,6,16,22,72.73


# Peer Review Additions

Additions to the analysis requested by, or added following, peer review

## Breakdown of results by sponsor type

In [219]:
full_sample

Unnamed: 0.1,Unnamed: 0,eudract_number,final_date,inferred
0,5552,2014-003401-15,2015-02-03,0
1,7336,2011-001616-57,2014-09-12,0
2,12314,2004-002743-27,2008-01-31,0
3,6567,2013-003561-34,2017-10-02,0
4,13573,2006-001414-33,2013-07-01,0
...,...,...,...,...
17,1061,2008-000083-17,2010-01-13,1
18,8457,2007-004350-82,2012-03-23,0
19,19616,2015-002963-40,2017-07-22,1
20,2873,2011-001007-12,2012-01-13,0


In [220]:
#Add sponsor_status (from other_reg_data) and the inferred flag (from the sample)
#to the analysis data; both are left joins on the EudraCT number
sponsor_lookup = other_reg_data[['trial_id', 'sponsor_status']]
inferred_lookup = full_sample[['eudract_number', 'inferred']]

spon_results = analysis_df.merge(sponsor_lookup, how='left', left_on='euctr_id', right_on='trial_id')
spon_results = spon_results.merge(inferred_lookup, how='left', left_on='euctr_id', right_on='eudract_number')

In [221]:
spon_results.columns

Index(['euctr_id', 'dual_searched', 'searched_by', 'senior_reviewed',
       'replaced', 'replaced_reason', 'euctr_results', 'euctr_results_link',
       'euctr_results_format', 'euctr_results_date', 'ctgov_xreg', 'nct_id',
       'ctgov_results', 'ctgov_results_link', 'ctgov_results_date',
       'isrctn_xreg', 'isrctn_id', 'isrctn_results', 'isrctn_results_type',
       'isrctn_results_link', 'isrctn_additional_links', 'isrctn_results_date',
       'journal_result', 'journal_link', 'journal_source', 'journal_match',
       'journal_pub_date', 'journal_reg_numbers', 'main_result_abstract',
       'excluded_abstract', 'discl_no_analysis', 'team_discuss',
       'additional_results_located', 'notes', 'euctr_results_inc',
       'ctgov_results_inc', 'isrctn_results_inc', 'journal_results_inc',
       'any_results_inc', 'trial_id', 'sponsor_status', 'eudract_number',
       'inferred'],
      dtype='object')

In [179]:
#All Results
crosstab(spon_results, 'any_results_inc', 'sponsor_status')

any_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,37,240,277
Non-Commercial,79,143,222
Unknown,1,0,1
All,117,383,500


In [186]:
#Any results by sponsor type, all trials (counts copied from the crosstab above)
#Commercial: 240 of 277
summarizer(240,277)
print('\n')
#Non-Commercial: 143 of 222
summarizer(143,222)

Outcome of Interest: 240
Total: 277
Proportion: 86.64%
95% CI: 82.64-90.65


Outcome of Interest: 143
Total: 222
Proportion: 64.41%
95% CI: 58.12-70.71


(0.5811633439714392, 0.6441441441441441, 0.707124944316849)

In [223]:
crosstab(spon_results[spon_results.inferred==0], 'any_results_inc', 'sponsor_status')

any_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,27,233,260
Non-Commercial,15,79,94
All,42,312,354


In [225]:
#Any results by sponsor type, trials with inferred == 0 (counts copied from the crosstab above)
#Commercial: 233 of 260
summarizer(233,260)
print('\n')
#Non-Commercial: 79 of 94
summarizer(79,94)

Outcome of Interest: 233
Total: 260
Proportion: 89.62%
95% CI: 85.91-93.32


Outcome of Interest: 79
Total: 94
Proportion: 84.04%
95% CI: 76.64-91.45


(0.7663928527837035, 0.8404255319148937, 0.9144582110460838)

In [224]:
crosstab(spon_results[spon_results.inferred==1], 'any_results_inc', 'sponsor_status')

any_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,10,7,17
Non-Commercial,64,64,128
Unknown,1,0,1
All,75,71,146


In [226]:
#Any results by sponsor type, trials with inferred == 1 (counts copied from the crosstab above)
#Commercial: 7 of 17
summarizer(7,17)
print('\n')
#Non-Commercial: 64 of 128
summarizer(64,128)

Outcome of Interest: 7
Total: 17
Proportion: 41.18%
95% CI: 17.78-64.57


Outcome of Interest: 64
Total: 128
Proportion: 50.0%
95% CI: 41.34-58.66


(0.41337941930464794, 0.5, 0.5866205806953521)

In [228]:
#Commercial sponsors, any results: inferred == 0 (233 of 260) vs inferred == 1 (7 of 17)
#a is the number of trials with results
#b is the total number of trials

a = [233, 7]
b = [260,17]

stat, pval = proportions_ztest(a, b)
print(pval)

1.2873952343372565e-08


In [229]:
#Non-Commercial sponsors, any results: inferred == 0 (79 of 94) vs inferred == 1 (64 of 128)
#a is the number of trials with results, b is the total number of trials
a = [79, 64]
b = [94,128]

stat, pval = proportions_ztest(a, b)
print(pval)

1.6531601528822604e-07


In [187]:
#EUCTR
crosstab(spon_results, 'euctr_results_inc', 'sponsor_status')

euctr_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,63,214,277
Non-Commercial,170,52,222
Unknown,1,0,1
All,234,266,500


In [188]:
#EUCTR results by sponsor type, all trials (counts copied from the crosstab above)
#Commercial: 214 of 277
summarizer(214,277)
print('\n')
#Non-Commercial: 52 of 222
summarizer(52,222)

Outcome of Interest: 214
Total: 277
Proportion: 77.26%
95% CI: 72.32-82.19


Outcome of Interest: 52
Total: 222
Proportion: 23.42%
95% CI: 17.85-28.99


(0.17852172203546213, 0.23423423423423423, 0.28994674643300633)

In [231]:
#How many had results on just the EUCTR?
#The *_results_ids names are defined earlier in the notebook (presumably sets of trial IDs
#with results on each route - confirm); chained subtraction leaves the IDs in
#euctr_results_ids that appear in none of the other three
j_e = spon_results[spon_results.euctr_id.isin(euctr_results_ids - ctg_results_ids - isrctn_results_ids - journal_results_ids)]
crosstab(j_e, 'euctr_results_inc', 'sponsor_status')

euctr_results_inc,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Commercial,41,41
Non-Commercial,14,14
All,55,55


In [216]:
#Trials whose only results were on the EUCTR, as a share of trials with any results
#(240 Commercial and 143 Non-Commercial trials had any results)
#Commercial: 41 of 240
summarizer(41,240)
print('\n')
#Non-Commercial: 14 of 143
summarizer(14,143)

Outcome of Interest: 41
Total: 240
Proportion: 17.08%
95% CI: 12.32-21.84


Outcome of Interest: 14
Total: 143
Proportion: 9.79%
95% CI: 4.92-14.66


(0.04919291008563699, 0.0979020979020979, 0.1466112857185588)

In [235]:
crosstab(spon_results[spon_results.inferred==0], 'euctr_results_inc', 'sponsor_status')

euctr_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,46,214,260
Non-Commercial,43,51,94
All,89,265,354


In [237]:
#EUCTR results by sponsor type, trials with inferred == 0 (counts copied from the crosstab above)
#Commercial: 214 of 260
summarizer(214,260)
print('\n')
#Non-Commercial: 51 of 94
summarizer(51,94)

Outcome of Interest: 214
Total: 260
Proportion: 82.31%
95% CI: 77.67-86.95


Outcome of Interest: 51
Total: 94
Proportion: 54.26%
95% CI: 44.18-64.33


(0.4418406381192571, 0.5425531914893617, 0.6432657448594662)

In [236]:
crosstab(spon_results[spon_results.inferred==1], 'euctr_results_inc', 'sponsor_status')

euctr_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,17,0,17
Non-Commercial,127,1,128
Unknown,1,0,1
All,145,1,146


In [238]:
#EUCTR results by sponsor type, trials with inferred == 1 (counts copied from the crosstab above)
#NOTE(review): the printed intervals are degenerate (0.0-0.0 for 0 of 17) or have a negative
#lower bound (1 of 128), consistent with a normal-approximation interval - consider a
#Wilson or exact interval for counts this small
#Commercial: 0 of 17
summarizer(0,17)
print('\n')
#Non-Commercial: 1 of 128
summarizer(1,128)

Outcome of Interest: 0
Total: 17
Proportion: 0.0%
95% CI: 0.0-0.0


Outcome of Interest: 1
Total: 128
Proportion: 0.78%
95% CI: -0.74-2.31


(-0.007440068263182659, 0.0078125, 0.023065068263182657)

In [239]:
#Commercial sponsors, EUCTR results: inferred == 0 (214 of 260) vs inferred == 1 (0 of 17)
#a is the number of trials with results
#b is the total number of trials

a = [214, 0]
b = [260,17]

stat, pval = proportions_ztest(a, b)
print(pval)

4.37882920168881e-15


In [240]:
#Non-Commercial sponsors, EUCTR results: inferred == 0 (51 of 94) vs inferred == 1 (1 of 128)
#a is the number of trials with results
#b is the total number of trials; the inferred group has 128 trials in total
#(127 is the number WITHOUT results, so it is not the denominator)
a = [51, 1]
b = [94,128]

stat, pval = proportions_ztest(a, b)
print(pval)

1.9645687089927648e-20


In [191]:
#ClinicalTrials.gov
crosstab(spon_results[spon_results.nct_id.notnull()], 'ctgov_results_inc', 'sponsor_status')

ctgov_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,106,129,235
Non-Commercial,100,4,104
All,206,133,339


In [192]:
#ClinicalTrials.gov results by sponsor type, among trials with an NCT ID
#(counts copied from the crosstab above)
#Commercial: 129 of 235
summarizer(129,235)
print('\n')
#Non-Commercial: 4 of 104
summarizer(4,104)

Outcome of Interest: 129
Total: 235
Proportion: 54.89%
95% CI: 48.53-61.26


Outcome of Interest: 4
Total: 104
Proportion: 3.85%
95% CI: 0.15-7.54


(0.0015011899162653206, 0.038461538461538464, 0.0754218870068116)

In [211]:
#How many had results on just CTgov?
#Chained subtraction leaves the IDs in ctg_results_ids that appear in none of the other
#three *_results_ids collections (defined earlier in the notebook; presumably sets - confirm)
j_c = spon_results[spon_results.euctr_id.isin(ctg_results_ids - euctr_results_ids - isrctn_results_ids - journal_results_ids)]
crosstab(j_c, 'ctgov_results_inc', 'sponsor_status')

ctgov_results_inc,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Commercial,2,2
Non-Commercial,1,1
All,3,3


In [217]:
#Trials whose only results were on ClinicalTrials.gov, as a share of trials with any results
#(240 Commercial and 143 Non-Commercial trials had any results)
#NOTE(review): both printed intervals have a negative lower bound - consider a Wilson or
#exact interval for counts this small
#Commercial: 2 of 240
summarizer(2,240)
print('\n')
#Non-Commercial: 1 of 143
summarizer(1,143)

Outcome of Interest: 2
Total: 240
Proportion: 0.83%
95% CI: -0.32-1.98


Outcome of Interest: 1
Total: 143
Proportion: 0.7%
95% CI: -0.67-2.07


(-0.006665278531784565, 0.006993006993006993, 0.020651292517798552)

In [245]:
crosstab(spon_results[(spon_results.inferred==0) & spon_results.nct_id.notnull()], 'ctgov_results_inc', 'sponsor_status')

ctgov_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,99,128,227
Non-Commercial,41,3,44
All,140,131,271


In [246]:
#ClinicalTrials.gov results by sponsor type, trials with inferred == 0 and an NCT ID
#(counts copied from the crosstab above)
#Commercial: 128 of 227
summarizer(128,227)
print('\n')
#Non-Commercial: 3 of 44
#NOTE(review): the printed interval for this group has a negative lower bound
summarizer(3,44)

Outcome of Interest: 128
Total: 227
Proportion: 56.39%
95% CI: 49.94-62.84


Outcome of Interest: 3
Total: 44
Proportion: 6.82%
95% CI: -0.63-14.27


(-0.0062964573330352575, 0.06818181818181818, 0.1426600936966716)

In [247]:
crosstab(spon_results[(spon_results.inferred==1) & spon_results.nct_id.notnull()], 'ctgov_results_inc', 'sponsor_status')

ctgov_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,7,1,8
Non-Commercial,59,1,60
All,66,2,68


In [248]:
#ClinicalTrials.gov results by sponsor type, trials with inferred == 1 and an NCT ID
#(counts copied from the crosstab above)
#NOTE(review): both printed intervals have a negative lower bound - consider a Wilson or
#exact interval for counts this small
#Commercial: 1 of 8
summarizer(1,8)
print('\n')
#Non-Commercial: 1 of 60
summarizer(1,60)

Outcome of Interest: 1
Total: 8
Proportion: 12.5%
95% CI: -10.42-35.42


Outcome of Interest: 1
Total: 60
Proportion: 1.67%
95% CI: -1.57-4.91


(-0.01572663396673477, 0.016666666666666666, 0.049059967300068105)

In [249]:
#Commercial sponsors, ClinicalTrials.gov results: inferred == 0 (128 of 227) vs inferred == 1 (1 of 8)
#NOTE(review): the second group has only 8 trials, so the z-test's normal approximation
#may be unreliable here - consider an exact test
#a is the number of trials with results
#b is the total number of trials

a = [128, 1]
b = [227,8]

stat, pval = proportions_ztest(a, b)
print(pval)

0.014214029343954382


In [250]:
#Non-Commercial sponsors, ClinicalTrials.gov results: inferred == 0 (3 of 44) vs inferred == 1 (1 of 60)
#a is the number of trials with results, b is the total number of trials
a = [3, 1]
b = [44, 60]

stat, pval = proportions_ztest(a, b)
print(pval)

0.17712582445647074


In [193]:
#ISRCTN
crosstab(spon_results[spon_results.isrctn_id.notnull()], 'isrctn_results_inc', 'sponsor_status')

isrctn_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,2,0,2
Non-Commercial,28,2,30
All,30,2,32


In [196]:
#ISRCTN results by sponsor type, among trials with an ISRCTN ID
#(counts copied from the crosstab above)
#NOTE(review): the printed intervals are degenerate (0.0-0.0 for 0 of 2) or have a
#negative lower bound (2 of 30)
#Commercial: 0 of 2
summarizer(0,2)
print('\n')
#Non-Commercial: 2 of 30
summarizer(2,30)

Outcome of Interest: 0
Total: 2
Proportion: 0.0%
95% CI: 0.0-0.0


Outcome of Interest: 2
Total: 30
Proportion: 6.67%
95% CI: -2.26-15.59


(-0.022595660005692506, 0.06666666666666667, 0.15592899333902582)

In [213]:
#The ISRCTN has no unique results

In [251]:
crosstab(spon_results[(spon_results.inferred==0) & spon_results.isrctn_id.notnull()], 'isrctn_results_inc', 'sponsor_status')

isrctn_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,2,0,2
Non-Commercial,25,2,27
All,27,2,29


In [253]:
#ISRCTN results by sponsor type, trials with inferred == 0 and an ISRCTN ID
#(counts copied from the crosstab above)
#NOTE(review): the printed intervals are degenerate (0.0-0.0 for 0 of 2) or have a
#negative lower bound (2 of 27)
#Commercial: 0 of 2
summarizer(0,2)
print('\n')
#Non-Commercial: 2 of 27
summarizer(2,27)

Outcome of Interest: 0
Total: 2
Proportion: 0.0%
95% CI: 0.0-0.0


Outcome of Interest: 2
Total: 27
Proportion: 7.41%
95% CI: -2.47-17.29


(-0.02471193201347796, 0.07407407407407407, 0.1728600801616261)

In [252]:
crosstab(spon_results[(spon_results.inferred==1) & spon_results.isrctn_id.notnull()], 'isrctn_results_inc', 'sponsor_status')

isrctn_results_inc,0,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Non-Commercial,3,3
All,3,3


In [255]:
#Commercial sponsors, ISRCTN results: inferred == 0 (0 of 2) vs inferred == 1 (0 of 0)
#No Commercial trial with an ISRCTN ID has inferred == 1 (the crosstab above only has a
#Non-Commercial row), so the second group is empty, the test is undefined, and the
#printed p-value is nan
#a is the number of trials with results
#b is the total number of trials

a = [0,0]
b = [2,0]

stat, pval = proportions_ztest(a, b)
print(pval)

nan


In [256]:
#Non-Commercial sponsors, ISRCTN results: inferred == 0 (2 of 27) vs inferred == 1 (0 of 3)
#a is the number of trials with results, b is the total number of trials
a = [2,0]
b = [27,3]

stat, pval = proportions_ztest(a, b)
print(pval)

0.6255852315243253


In [257]:
#Journal Articles
crosstab(spon_results, 'journal_results_inc', 'sponsor_status')

journal_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,110,167,277
Non-Commercial,96,126,222
Unknown,1,0,1
All,207,293,500


In [258]:
#Journal results by sponsor type, all trials (counts copied from the crosstab above)
#Commercial: 167 of 277
summarizer(167,277)
print('\n')
#Non-Commercial: 126 of 222
summarizer(126,222)

Outcome of Interest: 167
Total: 277
Proportion: 60.29%
95% CI: 54.53-66.05


Outcome of Interest: 126
Total: 222
Proportion: 56.76%
95% CI: 50.24-63.27


(0.5023976009162529, 0.5675675675675675, 0.6327375342188822)

In [259]:
#How many had results just in a journal?
#NOTE: the name j_e is also assigned by the EUCTR-only cell earlier in this section;
#once this cell has run, j_e holds the journal-only trials instead
j_e = spon_results[spon_results.euctr_id.isin(journal_results_ids - euctr_results_ids - isrctn_results_ids - ctg_results_ids)]
crosstab(j_e, 'journal_results_inc', 'sponsor_status')

journal_results_inc,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Commercial,18,18
Non-Commercial,90,90
All,108,108


In [260]:
#Trials whose only results were in a journal
#NOTE(review): the denominators here are ALL trials by sponsor type (277 / 222), whereas the
#EUCTR-only and CTgov-only summaries earlier in this section divide by trials with any
#results (240 / 143) - confirm which denominator is intended
#Commercial: 18 of 277
summarizer(18,277)
print('\n')
#Non-Commercial: 90 of 222
summarizer(90,222)

Outcome of Interest: 18
Total: 277
Proportion: 6.5%
95% CI: 3.6-9.4


Outcome of Interest: 90
Total: 222
Proportion: 40.54%
95% CI: 34.08-47.0


(0.34081993456673687, 0.40540540540540543, 0.469990876244074)

In [261]:
crosstab(spon_results[(spon_results.inferred==0)], 'journal_results_inc', 'sponsor_status')

journal_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,100,160,260
Non-Commercial,30,64,94
All,130,224,354


In [265]:
#Journal results by sponsor type, trials with inferred == 0 (counts copied from the crosstab above)
#Commercial: 160 of 260
summarizer(160,260)
print('\n')
#Non-Commercial: 64 of 94
summarizer(64,94)

Outcome of Interest: 160
Total: 260
Proportion: 61.54%
95% CI: 55.62-67.45


Outcome of Interest: 64
Total: 94
Proportion: 68.09%
95% CI: 58.66-77.51


(0.5866154843900065, 0.6808510638297872, 0.7750866432695679)

In [266]:
crosstab(spon_results[(spon_results.inferred==1)], 'journal_results_inc', 'sponsor_status')

journal_results_inc,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,10,7,17
Non-Commercial,66,62,128
Unknown,1,0,1
All,77,69,146


In [267]:
#Journal results by sponsor type, trials with inferred == 1 (counts copied from the crosstab above)
#Commercial: 7 of 17
summarizer(7,17)
print('\n')
#Non-Commercial: 62 of 128
summarizer(62,128)

Outcome of Interest: 7
Total: 17
Proportion: 41.18%
95% CI: 17.78-64.57


Outcome of Interest: 62
Total: 128
Proportion: 48.44%
95% CI: 39.78-57.1


(0.3977967248410885, 0.484375, 0.5709532751589115)

In [268]:
#Commercial sponsors, journal results: inferred == 0 (160 of 260) vs inferred == 1 (7 of 17)
#a is the number of trials with results
#b is the total number of trials

a = [160, 7]
b = [260,17]

stat, pval = proportions_ztest(a, b)
print(pval)

0.09644705649057483


In [269]:
#Non-Commercial sponsors, journal results: inferred == 0 (64 of 94) vs inferred == 1 (62 of 128)
#a is the number of trials with results, b is the total number of trials
a = [64, 62]
b = [94,128]

stat, pval = proportions_ztest(a, b)
print(pval)

0.0035039926186231063


In [270]:
#Non-EUCTR
#Flag (1/0) trials whose ctgov, isrctn, or journal results indicator equals 1
has_non_euctr_results = (
    (spon_results.ctgov_results_inc == 1)
    | (spon_results.isrctn_results_inc == 1)
    | (spon_results.journal_results_inc == 1)
)
spon_results['non_euctr_results'] = np.where(has_non_euctr_results, 1, 0)

In [271]:
#Tabulate the non_euctr_results flag (0/1) against sponsor_status across every row of spon_results
crosstab(spon_results, 'non_euctr_results', 'sponsor_status')

non_euctr_results,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,78,199,277
Non-Commercial,93,129,222
Unknown,1,0,1
All,172,328,500


In [272]:
#Non-EUCTR results as a proportion of all trials per sponsor type.
#Each pair is (trials with the outcome, total trials), taken from the crosstab above.
comm_counts = (199, 277)
noncomm_counts = (129, 222)

summarizer(*comm_counts)
print('\n')
summarizer(*noncomm_counts)

Outcome of Interest: 199
Total: 277
Proportion: 71.84%
95% CI: 66.54-77.14


Outcome of Interest: 129
Total: 222
Proportion: 58.11%
95% CI: 51.62-64.6


(0.516178354934141, 0.581081081081081, 0.6459838072280211)

In [275]:
#Non-EUCTR results by sponsor status, restricted to rows where inferred == 0
extracted_trials = spon_results[spon_results.inferred == 0]
crosstab(extracted_trials, 'non_euctr_results', 'sponsor_status')

non_euctr_results,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,68,192,260
Non-Commercial,28,66,94
All,96,258,354


In [277]:
#Non-EUCTR results among inferred == 0 trials, per sponsor type.
#Each pair is (trials with the outcome, total trials), taken from the crosstab above.
comm_counts = (192, 260)
noncomm_counts = (66, 94)

summarizer(*comm_counts)
print('\n')
summarizer(*noncomm_counts)

Outcome of Interest: 192
Total: 260
Proportion: 73.85%
95% CI: 68.5-79.19


Outcome of Interest: 66
Total: 94
Proportion: 70.21%
95% CI: 60.97-79.46


(0.6096758801641288, 0.7021276595744681, 0.7945794389848074)

In [276]:
#Non-EUCTR results by sponsor status, restricted to rows where inferred == 1
inferred_trials = spon_results[spon_results.inferred == 1]
crosstab(inferred_trials, 'non_euctr_results', 'sponsor_status')

non_euctr_results,0,1,All
sponsor_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Commercial,10,7,17
Non-Commercial,65,63,128
Unknown,1,0,1
All,76,70,146


In [278]:
#Non-EUCTR results among inferred == 1 trials, per sponsor type.
#Each pair is (trials with the outcome, total trials), taken from the crosstab above.
comm_counts = (7, 17)
noncomm_counts = (63, 128)

summarizer(*comm_counts)
print('\n')
summarizer(*noncomm_counts)

Outcome of Interest: 7
Total: 17
Proportion: 41.18%
95% CI: 17.78-64.57


Outcome of Interest: 63
Total: 128
Proportion: 49.22%
95% CI: 40.56-57.88


(0.40557749375145474, 0.4921875, 0.5787975062485452)

In [279]:
#Two-sample z-test of proportions for Commercial sponsors, non-EUCTR results:
#inferred == 0 trials (192 of 260) against inferred == 1 trials (7 of 17).
#a holds the number of trials with results in each group
#b holds the total number of trials in each group
a = [192, 7]
b = [260, 17]

stat, pval = proportions_ztest(count=a, nobs=b)
print(pval)

0.0037138451221579794


In [280]:
#Two-sample z-test of proportions for Non-Commercial sponsors, non-EUCTR results:
#inferred == 0 trials (66 of 94) against inferred == 1 trials (63 of 128).
#a holds the number of trials with results, b the total number of trials
a = [66, 63]
b = [94, 128]

stat, pval = proportions_ztest(count=a, nobs=b)
print(pval)

0.0017326688873375395


In [283]:
#Per-rank significance thresholds of the form alpha / (m - rank + 1), the form used by the
#Holm-Bonferroni step-down procedure, with alpha = .05 and m = 11. Only ranks 1 through 8 are printed.
for rank in range(1, 9):
    print(.05 / (11 - rank + 1))

0.004545454545454546
0.005
0.005555555555555556
0.00625
0.0071428571428571435
0.008333333333333333
0.01
0.0125


## Time to Searches

Characterize how much follow-up time trials had: the number of days between each trial's final date and the start of the searches.

In [None]:
#Let's make a copy of the sample data, keeping only the rows whose
#eudract_number appears among the euctr_id values of analysis_df

in_analysis = full_sample.eudract_number.isin(analysis_df.euctr_id.to_list())
sample2 = full_sample[in_analysis].copy()

In [None]:
#Parse final_date as datetimes, then express the gap from each final_date up to
#search_start_date as a (possibly fractional) number of days
final_dates = pd.to_datetime(sample2['final_date'])
one_day = pd.Timedelta(days=1)

sample2['final_date'] = final_dates
sample2['days_to_search'] = (search_start_date - final_dates) / one_day

In [None]:
#Summary statistics of days_to_search for rows where inferred == 0
sample2.loc[sample2.inferred == 0, 'days_to_search'].describe()

In [None]:
#Summary statistics of days_to_search for rows where inferred == 1
sample2.loc[sample2.inferred == 1, 'days_to_search'].describe()

In [None]:
#Mood's median test: do the inferred (inferred == 1) and extracted (inferred == 0)
#groups share the same median days_to_search? Only the p-value is printed.
inferred_data = sample2.loc[sample2.inferred == 1, 'days_to_search']
extracted_data = sample2.loc[sample2.inferred == 0, 'days_to_search']

stat, p, med, tbl = median_test(inferred_data, extracted_data)
print(p)

In [None]:
#Data for plotting
#The export below is disabled (commented out). Uncomment it to write the inferred and
#days_to_search columns of sample2 to data/graphing_data/days_to_search.csv.
#sample2[['inferred', 'days_to_search']].to_csv(parent + '/data/graphing_data/days_to_search.csv')