In [18]:
from google.cloud import bigquery
import pandas as pd

In [19]:
# Project id passed to pd.read_gbq for every BigQuery query in this notebook.
projectid = "ebmdatalab"

In [20]:
# Load one row per registration from the sponsor table. The SQL takes the text
# after the last hyphen of eudract_number_with_country as the trial location and
# reads the "country" key of the `sponsors` JSON column as the sponsor country
# (surrounding double quotes trimmed).
euctr_country_data = pd.read_gbq(
    """
select
eudract_number,
eudract_number_with_country,
regexp_extract(eudract_number_with_country, '-([^-]+)$') as trial_locations,
trim(json_extract(sponsors, "$.country"),'"')  as sponsor_country
from
`euctr.sponsor_info` 
order by eudract_number
""",
    project_id=projectid,
    dialect='standard',
)

In [21]:
# Preview the first rows of the query result.
euctr_country_data.head()

Unnamed: 0,eudract_number,eudract_number_with_country,trial_locations,sponsor_country
0,2004-000007-18,2004-000007-18-SE,SE,Sweden
1,2004-000012-13,2004-000012-13-CZ,CZ,United States
2,2004-000012-13,2004-000012-13-IT,IT,Italy
3,2004-000012-13,2004-000012-13-EE,EE,United States
4,2004-000015-25,2004-000015-25-LT,LT,United States


In [22]:
# Independent copy holding only the trial id and its location code.
trial_location_all = euctr_country_data.loc[:, ['eudract_number', 'trial_locations']].copy()

In [23]:
# Expand the location code into one indicator column per code, named
# "trial_locations_<code>". dtype is pinned to int so the indicators are 0/1 on
# every pandas version (pandas >= 2.0 would otherwise produce booleans).
trial_location_dummies = pd.get_dummies(trial_location_all, columns=['trial_locations'], dtype=int)

In [24]:
# Collapse to one row per trial: taking the max of each indicator column marks a
# location as present when any of the trial's rows has it.
trial_location_grouped = (
    trial_location_dummies
    .groupby(by='eudract_number', as_index=False)
    .max()
)

In [25]:
# Preview the per-trial location indicators.
trial_location_grouped.head()

Unnamed: 0,eudract_number,trial_locations_3rd,trial_locations_AT,trial_locations_BE,trial_locations_BG,trial_locations_CZ,trial_locations_DE,trial_locations_DK,trial_locations_EE,trial_locations_ES,...,trial_locations_LV,trial_locations_MT,trial_locations_NL,trial_locations_NO,trial_locations_PL,trial_locations_PT,trial_locations_RO,trial_locations_SE,trial_locations_SI,trial_locations_SK
0,2004-000007-18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2004-000012-13,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2004-000015-25,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,2004-000016-10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2004-000020-32,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Number of trials per location: drop the id column and sum each indicator column.
trial_location_totals = trial_location_grouped.drop(columns='eudract_number').sum()

In [27]:
# Show the per-location trial counts.
print(trial_location_totals)

trial_locations_3rd     1203
trial_locations_AT      3761
trial_locations_BE      5296
trial_locations_BG      1645
trial_locations_CZ      3882
trial_locations_DE     10226
trial_locations_DK      3550
trial_locations_EE       950
trial_locations_ES      8427
trial_locations_FI      2319
trial_locations_FR      4513
trial_locations_GB      8580
trial_locations_GR      1525
trial_locations_HR       276
trial_locations_HU      3940
trial_locations_IE      1052
trial_locations_IS       129
trial_locations_IT      7051
trial_locations_LT      1146
trial_locations_LU         7
trial_locations_LV       980
trial_locations_MT        18
trial_locations_NL      4809
trial_locations_NO       505
trial_locations_PL      2571
trial_locations_PT      1362
trial_locations_RO       225
trial_locations_SE      3594
trial_locations_SI       344
trial_locations_SK      1631
dtype: int64


In [28]:
# Independent copy holding only the trial id and the sponsor's country.
# Reads from euctr_country_data (the query result that carries sponsor_country);
# the previous source name `table` is not defined by any cell in this notebook,
# so the cell failed with NameError on a fresh kernel.
sponsor_country_all = euctr_country_data[['eudract_number', 'sponsor_country']].copy()

In [29]:
# Expand the sponsor country into one indicator column per country, named
# "sponsor_country_<name>". dtype is pinned to int so the indicators are 0/1 on
# every pandas version (pandas >= 2.0 would otherwise produce booleans).
sponsor_country_dummies = pd.get_dummies(sponsor_country_all, columns=['sponsor_country'], dtype=int)

In [30]:
# Collapse to one row per trial: taking the max of each indicator column marks a
# sponsor country as present when any of the trial's rows has it.
sponsor_country_grouped = (
    sponsor_country_dummies
    .groupby(by='eudract_number', as_index=False)
    .max()
)

In [31]:
# Number of trials per sponsor country: drop the id column and sum each indicator column.
sponsor_country_totals = sponsor_country_grouped.drop(columns='eudract_number').sum()

In [32]:
# Show the per-sponsor-country trial counts.
print(sponsor_country_totals)

sponsor_country_Argentina                                  1
sponsor_country_Australia                                 96
sponsor_country_Austria                                 1270
sponsor_country_Belgium                                 2122
sponsor_country_Belize                                     3
sponsor_country_Benin                                      1
sponsor_country_Bermuda                                    3
sponsor_country_Bhutan                                     2
sponsor_country_Brazil                                     3
sponsor_country_Bulgaria                                  59
sponsor_country_Canada                                   126
sponsor_country_China                                      8
sponsor_country_Croatia                                   38
sponsor_country_Cyprus                                     4
sponsor_country_Czech Republic                           391
sponsor_country_Denmark                                 1884
sponsor_country_Djibouti

In [33]:
# Load one row per trial id with the maximum of results_expected and has_results
# across that trial's rows in the source table.
euctr_results_data = pd.read_gbq(
    """
select
trial_id as eudract_number,
max(results_expected) as results_expected,
max(has_results) as has_results
from `euctr.jan_10_19_trials_csv` 
group by trial_id
""",
    project_id=projectid,
    dialect='standard',
)

In [34]:
# Attach the results flags to the per-trial location indicators; the default
# inner join keeps only trials present in both frames.
country_and_results = trial_location_grouped.merge(euctr_results_data, on='eudract_number')

In [35]:
# Preview the merged location indicators and results flags.
country_and_results.head()

Unnamed: 0,eudract_number,trial_locations_3rd,trial_locations_AT,trial_locations_BE,trial_locations_BG,trial_locations_CZ,trial_locations_DE,trial_locations_DK,trial_locations_EE,trial_locations_ES,...,trial_locations_NL,trial_locations_NO,trial_locations_PL,trial_locations_PT,trial_locations_RO,trial_locations_SE,trial_locations_SI,trial_locations_SK,results_expected,has_results
0,2004-000007-18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,2004-000012-13,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,2004-000015-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,2004-000016-10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2004-000020-32,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [118]:
# Pick the location indicator columns by their "trial_locations_" prefix rather
# than by a fixed positional slice, so the list stays correct if the set of
# location codes in the data grows or shrinks.
countries = [col for col in country_and_results.columns if col.startswith('trial_locations_')]
print(countries)

['trial_locations_3rd', 'trial_locations_AT', 'trial_locations_BE', 'trial_locations_BG', 'trial_locations_CZ', 'trial_locations_DE', 'trial_locations_DK', 'trial_locations_EE', 'trial_locations_ES', 'trial_locations_FI', 'trial_locations_FR', 'trial_locations_GB', 'trial_locations_GR', 'trial_locations_HR', 'trial_locations_HU', 'trial_locations_IE', 'trial_locations_IS', 'trial_locations_IT', 'trial_locations_LT', 'trial_locations_LU', 'trial_locations_LV', 'trial_locations_MT', 'trial_locations_NL', 'trial_locations_NO', 'trial_locations_PL', 'trial_locations_PT', 'trial_locations_RO', 'trial_locations_SE', 'trial_locations_SI', 'trial_locations_SK']


In [114]:
def country_reported(country, data=None):
    """Count due and reported trials for one trial-location indicator column.

    Parameters
    ----------
    country : str
        Name of an indicator column (e.g. 'trial_locations_PL').
    data : pandas.DataFrame, optional
        Frame holding the indicator column plus 'results_expected' and
        'has_results'. Defaults to the notebook-level ``country_and_results``.

    Returns
    -------
    tuple of (int, int)
        ``(results_expected, expected_with_results)``: the number of rows where
        the indicator and 'results_expected' both equal 1, and the number of
        those rows where 'has_results' also equals 1.
    """
    if data is None:
        data = country_and_results
    # Boolean masks replace the per-row Python loop; the counts are identical.
    is_due = (data[country] == 1) & (data['results_expected'] == 1)
    is_reported = is_due & (data['has_results'] == 1)
    return int(is_due.sum()), int(is_reported.sum())
    

In [115]:
# Spot-check the counts for a single location column.
print(country_reported('trial_locations_PL'))

(613, 489)


In [166]:
# Build {column name: [due, reported, percent reported]} for every location column.
location_country_reported_dict = {}
for country in countries:
    n_expected, n_reported = country_reported(country)
    # A location with no due trials is recorded as 0 instead of dividing by zero.
    percentage = 0 if n_expected == 0 else round((n_reported / n_expected) * 100, 2)
    location_country_reported_dict[country] = [n_expected, n_reported, percentage]

In [167]:
# Show the raw per-location summary dictionary.
print(location_country_reported_dict)

{'trial_locations_3rd': [0, 0, 0], 'trial_locations_AT': [1022, 525, 51.37], 'trial_locations_BE': [860, 650, 75.58], 'trial_locations_BG': [473, 369, 78.01], 'trial_locations_CZ': [968, 768, 79.34], 'trial_locations_DE': [3280, 2101, 64.05], 'trial_locations_DK': [970, 457, 47.11], 'trial_locations_EE': [294, 234, 79.59], 'trial_locations_ES': [960, 770, 80.21], 'trial_locations_FI': [479, 343, 71.61], 'trial_locations_FR': [274, 165, 60.22], 'trial_locations_GB': [1655, 1099, 66.4], 'trial_locations_GR': [252, 189, 75.0], 'trial_locations_HR': [39, 32, 82.05], 'trial_locations_HU': [979, 771, 78.75], 'trial_locations_IE': [132, 87, 65.91], 'trial_locations_IS': [44, 20, 45.45], 'trial_locations_IT': [1150, 858, 74.61], 'trial_locations_LT': [351, 257, 73.22], 'trial_locations_LU': [1, 1, 100.0], 'trial_locations_LV': [304, 231, 75.99], 'trial_locations_MT': [2, 2, 100.0], 'trial_locations_NL': [455, 370, 81.32], 'trial_locations_NO': [28, 23, 82.14], 'trial_locations_PL': [613, 489, 

In [168]:
# Turn the summary dictionary into a table: one row per location column, with
# the three list entries as named columns.
reporting_by_country = pd.DataFrame.from_dict(
    data=location_country_reported_dict,
    orient='index',
    columns=["Results Expected", "Results Available", "Percent Reported"],
)

In [169]:
# Show locations ranked from highest to lowest percent reported.
print(reporting_by_country.sort_values(by = "Percent Reported", ascending = False))

                     Results Expected  Results Available  Percent Reported
trial_locations_MT                  2                  2            100.00
trial_locations_LU                  1                  1            100.00
trial_locations_RO                 33                 29             87.88
trial_locations_SI                 65                 54             83.08
trial_locations_SK                443                364             82.17
trial_locations_NO                 28                 23             82.14
trial_locations_HR                 39                 32             82.05
trial_locations_NL                455                370             81.32
trial_locations_PT                189                152             80.42
trial_locations_ES                960                770             80.21
trial_locations_PL                613                489             79.77
trial_locations_EE                294                234             79.59
trial_locations_CZ       

In [102]:
# Attach the results flags to the per-trial sponsor-country indicators; the
# default inner join keeps only trials present in both frames.
sponsor_country_and_results = sponsor_country_grouped.merge(euctr_results_data, on='eudract_number')

In [103]:
# Preview the merged sponsor-country indicators and results flags.
sponsor_country_and_results.head()

Unnamed: 0,eudract_number,sponsor_country_Argentina,sponsor_country_Australia,sponsor_country_Austria,sponsor_country_Belgium,sponsor_country_Belize,sponsor_country_Benin,sponsor_country_Bermuda,sponsor_country_Bhutan,sponsor_country_Brazil,...,sponsor_country_Taiwan,sponsor_country_Turkey,sponsor_country_Ukraine,sponsor_country_United Arab Emirates,sponsor_country_United Kingdom,sponsor_country_United States,sponsor_country_United States Minor Outlying Islands,sponsor_country_Uruguay,results_expected,has_results
0,2004-000007-18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2004-000012-13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,2004-000015-25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,2004-000016-10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2004-000020-32,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [112]:
# Pick the sponsor-country indicator columns by their "sponsor_country_" prefix
# rather than by a fixed positional slice, so the list stays correct if the set
# of sponsor countries in the data grows or shrinks.
sponsor_countries = [col for col in sponsor_country_and_results.columns if col.startswith('sponsor_country_')]
print(sponsor_countries)

['sponsor_country_Argentina', 'sponsor_country_Australia', 'sponsor_country_Austria', 'sponsor_country_Belgium', 'sponsor_country_Belize', 'sponsor_country_Benin', 'sponsor_country_Bermuda', 'sponsor_country_Bhutan', 'sponsor_country_Brazil', 'sponsor_country_Bulgaria', 'sponsor_country_Canada', 'sponsor_country_China', 'sponsor_country_Croatia', 'sponsor_country_Cyprus', 'sponsor_country_Czech Republic', 'sponsor_country_Denmark', 'sponsor_country_Djibouti', 'sponsor_country_Estonia', 'sponsor_country_European Union', 'sponsor_country_Falkland Islands (Malvinas)', 'sponsor_country_Finland', 'sponsor_country_France', 'sponsor_country_France, Metropolitan', 'sponsor_country_French Guiana', 'sponsor_country_Georgia', 'sponsor_country_Germany', 'sponsor_country_Greece', 'sponsor_country_Greenland', 'sponsor_country_Hong Kong', 'sponsor_country_Hungary', 'sponsor_country_Iceland', 'sponsor_country_India', 'sponsor_country_Ireland', 'sponsor_country_Israel', 'sponsor_country_Italy', 'sponso

In [123]:
def sponsor_country_reported(country, data=None):
    """Count due and reported trials for one sponsor-country indicator column.

    Parameters
    ----------
    country : str
        Name of an indicator column (e.g. 'sponsor_country_Italy').
    data : pandas.DataFrame, optional
        Frame holding the indicator column plus 'results_expected' and
        'has_results'. Defaults to the notebook-level
        ``sponsor_country_and_results``.

    Returns
    -------
    tuple of (int, int)
        ``(results_expected, expected_with_results)``: the number of rows where
        the indicator and 'results_expected' both equal 1, and the number of
        those rows where 'has_results' also equals 1.
    """
    if data is None:
        data = sponsor_country_and_results
    # Boolean masks replace the per-row Python loop; the counts are identical.
    is_due = (data[country] == 1) & (data['results_expected'] == 1)
    is_reported = is_due & (data['has_results'] == 1)
    return int(is_due.sum()), int(is_reported.sum())

In [161]:
# Build {column name: [due, reported, percent reported]} for every sponsor-country column.
sponsor_country_reported_dict = {}
for country in sponsor_countries:
    n_expected, n_reported = sponsor_country_reported(country)
    # A sponsor country with no due trials is recorded as 0 instead of dividing by zero.
    percentage = 0 if n_expected == 0 else round((n_reported / n_expected) * 100, 2)
    sponsor_country_reported_dict[country] = [n_expected, n_reported, percentage]

In [162]:
# Show the raw per-sponsor-country summary dictionary.
print(sponsor_country_reported_dict)

{'sponsor_country_Argentina': [1, 0, 0.0], 'sponsor_country_Australia': [28, 11, 39.29], 'sponsor_country_Austria': [506, 134, 26.48], 'sponsor_country_Belgium': [364, 276, 75.82], 'sponsor_country_Belize': [1, 1, 100.0], 'sponsor_country_Benin': [0, 0, 0], 'sponsor_country_Bermuda': [2, 0, 0.0], 'sponsor_country_Bhutan': [0, 0, 0], 'sponsor_country_Brazil': [0, 0, 0], 'sponsor_country_Bulgaria': [15, 9, 60.0], 'sponsor_country_Canada': [21, 2, 9.52], 'sponsor_country_China': [1, 0, 0.0], 'sponsor_country_Croatia': [5, 2, 40.0], 'sponsor_country_Cyprus': [3, 0, 0.0], 'sponsor_country_Czech Republic': [93, 57, 61.29], 'sponsor_country_Denmark': [679, 227, 33.43], 'sponsor_country_Djibouti': [0, 0, 0], 'sponsor_country_Estonia': [28, 19, 67.86], 'sponsor_country_European Union': [2, 2, 100.0], 'sponsor_country_Falkland Islands (Malvinas)': [0, 0, 0], 'sponsor_country_Finland': [155, 62, 40.0], 'sponsor_country_France': [421, 260, 61.76], 'sponsor_country_France, Metropolitan': [9, 6, 66.

In [163]:
# Turn the summary dictionary into a table: one row per sponsor-country column,
# with the three list entries as named columns.
reporting_by_sponsor_country = pd.DataFrame.from_dict(
    data=sponsor_country_reported_dict,
    orient='index',
    columns=["Results Expected", "Results Available", "Percent Reported"],
)

In [164]:
# Show sponsor countries ranked from highest to lowest percent reported.
print(reporting_by_sponsor_country.sort_values(by = "Percent Reported", ascending = False))

                                             Results Expected  \
sponsor_country_Singapore                                   1   
sponsor_country_European Union                              2   
sponsor_country_Belize                                      1   
sponsor_country_Russian Federation                          4   
sponsor_country_Netherlands Antilles                        1   
sponsor_country_Luxembourg                                  1   
sponsor_country_Jersey                                      1   
sponsor_country_Ukraine                                     1   
sponsor_country_Jamaica                                     1   
sponsor_country_Switzerland                               632   
sponsor_country_Slovakia                                   18   
sponsor_country_Belgium                                   364   
sponsor_country_Spain                                     403   
sponsor_country_Hungary                                   105   
sponsor_country_Netherlan

[79 rows x 3 columns]


In [170]:
# Write the per-location reporting table to a CSV file in the working directory.
reporting_by_country.to_csv(path_or_buf='reporting_by_country.csv')

In [171]:
# Write the per-sponsor-country reporting table to a CSV file in the working directory.
reporting_by_sponsor_country.to_csv(path_or_buf='reporting_by_sponsor_country.csv')