# Allocation of CCGs into intervention and control groups

Note - set seed for random allocation to ensure repeatability

In [1]:

# Set dates of baseline and follow-up periods
d4 = '2019-03-01' # month after end of follow-up period
d3 = '2018-09-01' # follow-up start
d2 = '2018-07-01' # month after end of baseline period
d1 = '2018-01-01' # baseline start


# Import dataset from BigQuery
import pandas as pd
import numpy as np
GBQ_PROJECT_ID = '620265099307'

q = '''SELECT * FROM ebmdatalab.measures.ccg_data_lpzomnibus
WHERE EXTRACT (YEAR from month)  >= 2018
'''
df1 = pd.read_gbq(q, GBQ_PROJECT_ID, dialect='standard',verbose=False)

        ## note: parsing dates is quite memory-intensive, make sure not too many programmes running

df1["month"] = pd.to_datetime(df1.month)

df1.head() # this gives the first few rows of data

Unnamed: 0,pct_id,month,numerator,denominator,calc_value,percentile
0,03Y,2018-06-01,8061.86989,104.165,77.395189,0.0
1,04V,2018-06-01,57374.25724,393.317,145.872813,0.5
2,01V,2018-06-01,38228.77713,125.03,305.756835,1.0
3,02Q,2018-06-01,18145.94579,117.489,154.44804,0.582474
4,07G,2018-06-01,29043.80496,176.896,164.185764,0.664948


In [2]:
### filter out the baseline and follow-up periods
import datetime

conditions = [
    (df1['month']  >= d4), # after follow-up period
    (df1['month']  >= d3), # follow-up
    (df1['month']  >= d2), # mid
    (df1['month']  >= d1), # baseline
    (df1['month']  < d1)] # before

choices = ['after', 'follow-up', 'mid', 'baseline','before']
df1['period'] = np.select(conditions, choices, default='0')

df1.head()

Unnamed: 0,pct_id,month,numerator,denominator,calc_value,percentile,period
0,03Y,2018-06-01,8061.86989,104.165,77.395189,0.0,baseline
1,04V,2018-06-01,57374.25724,393.317,145.872813,0.5,baseline
2,01V,2018-06-01,38228.77713,125.03,305.756835,1.0,baseline
3,02Q,2018-06-01,18145.94579,117.489,154.44804,0.582474,baseline
4,07G,2018-06-01,29043.80496,176.896,164.185764,0.664948,baseline


In [3]:
### aggregate data over 6-month baseline

# take columns of interest from df
df2 = df1[["pct_id","period", "month", "numerator","denominator"]]

# Perform groupby aggregation
agg_6m = df2.groupby(["pct_id","period"]).sum() 

### calculate aggregated measure values
agg_6m["calc_value"] = agg_6m.numerator / agg_6m.denominator

agg_6m = agg_6m.reset_index()
agg_6m = agg_6m.loc[agg_6m.period=="baseline"].rename(columns={"calc_value":"baseline"}).drop("period",axis=1)
agg_6m.head()

Unnamed: 0,pct_id,numerator,denominator,baseline
0,00C,108261.10945,649.675,166.638872
1,00D,254840.09124,1752.237,145.436999
2,00J,317390.42671,1557.677,203.75882
3,00K,247479.68728,1782.485,138.839703
4,00L,238929.6414,1951.199,122.452729


In [4]:
### calculate percentile for each ccg for single measure during baseline period and select the worst 40 to be randomised

df3 = agg_6m.copy()
df3["baseline_ranking"] = df3["baseline"].rank(method='min', pct=True)

top40 = df3.sort_values(by="baseline_ranking", ascending=False).head(40).reset_index(drop=True)
top40

Unnamed: 0,pct_id,numerator,denominator,baseline,baseline_ranking
0,01V,233995.2,749.704,312.116808,1.0
1,01X,361479.5,1185.007,305.044227,0.994872
2,99K,297555.5,1022.388,291.039742,0.989744
3,09P,311660.1,1130.895,275.587156,0.984615
4,01J,272497.8,989.946,275.26533,0.979487
5,09W,487557.1,1795.837,271.492954,0.974359
6,01F,211759.2,787.334,268.957273,0.969231
7,99J,769106.7,2953.688,260.388597,0.964103
8,05G,340137.1,1308.84,259.876765,0.958974
9,99A,828848.2,3196.121,259.32942,0.953846


In [5]:
### allocate bottom CCGs to intervention and control groups 

# set seeds for random number generation to ensure repeatable
# seed1 = 321

df5 = top40.copy()
import random as rd

#np.random.seed(seed1)
df5['rand_num'] = np.random.rand(len(df5))
df5["allocation_ranking"] = df5.rand_num.rank()

df5["allocation_code"]= df5.allocation_ranking.mod(2)

#create final allocation groups
df5['allocation'] = np.where(df5['allocation_code']==0,'con','I')

print (df5.loc[df5.allocation=="I"].pct_id.count(), 'CCGs have been assigned to the intervention group,')
print ("with an average spend of £",round(df5.loc[df5.allocation=="I"].baseline.sum(),0), "per 1000.")
print (df5.loc[df5.allocation=="con"].pct_id.count(), 'CCGs have been assigned to the control group,')
print ("with an average spend of £",round(df5.loc[df5.allocation=="con"].baseline.sum(),0), "per 1000.")

#df5.loc[df5.allocation == "I"].to_csv('allocated_ccgs.csv')

20 CCGs have been assigned to the intervention group,
with an average spend of £ 4740.0 per 1000.
20 CCGs have been assigned to the control group,
with an average spend of £ 4515.0 per 1000.


In [6]:
### import CCG names for CCGs allocated to intervention group
q = '''
SELECT
  code,
  name
FROM
  ebmdatalab.hscic.ccgs
WHERE org_type = "CCG"
'''

ccg = pd.io.gbq.read_gbq(q, GBQ_PROJECT_ID, dialect='standard',verbose=False)

ccg.head()
dfm = df5.loc[df5.allocation == "I"].merge(ccg, how='left', left_on='pct_id',right_on='code')
dfm[["pct_id","name"]]

Unnamed: 0,pct_id,name
0,01V,NHS SOUTHPORT AND FORMBY CCG
1,01X,NHS ST HELENS CCG
2,99K,NHS HIGH WEALD LEWES HAVENS CCG
3,09P,NHS HASTINGS AND ROTHER CCG
4,09W,NHS MEDWAY CCG
5,01F,NHS HALTON CCG
6,01T,NHS SOUTH SEFTON CCG
7,11N,NHS KERNOW CCG
8,99E,NHS BASILDON AND BRENTWOOD CCG
9,03T,NHS LINCOLNSHIRE EAST CCG
