# Allocation of CCGs into intervention and control groups

Note - set seed for random allocation to ensure repeatability

In [5]:
# Period boundaries as ISO date strings (first day of a month), listed in
# chronological order. Each period runs from its start date up to, but not
# including, the next boundary.
d1 = '2018-01-01'  # baseline period starts
d2 = '2018-07-01'  # first month after the baseline period ends
d3 = '2019-01-01'  # follow-up period starts
d4 = '2019-07-01'  # first month after the follow-up period ends


# Import dataset from BigQuery
import pandas as pd
import numpy as np
# Google Cloud project id handed to every read_gbq call in this notebook
GBQ_PROJECT_ID = '620265099307'

# Monthly rows of the measure table for January-August 2018 (inclusive).
# NOTE(review): the date bounds are hard-coded here rather than derived from
# d1/d2 -- confirm they should stay in step with the period constants.
q = '''SELECT * FROM ebmdatalab.measures.ccg_data_lpzomnibus
WHERE month >= '2018-01-01' AND month <= '2018-08-01'
'''
df1 = pd.read_gbq(
    query=q,
    project_id=GBQ_PROJECT_ID,
    dialect='standard',
    verbose=False,
)

# Parse the month column to datetimes so later cells can compare it with the
# period boundaries
df1 = df1.assign(month=pd.to_datetime(df1["month"]))

df1.head() # preview the first few rows of data

Unnamed: 0,pct_id,month,numerator,denominator,calc_value,percentile
0,00C,2018-03-01,17915.92145,108.235,165.527985,0.639175
1,00C,2018-04-01,16142.79865,108.273,149.093483,0.639175
2,00C,2018-02-01,17319.07538,108.293,159.927931,0.71134
3,00C,2018-08-01,17364.79986,108.503,160.039813,0.706186
4,00C,2018-05-01,16877.4711,108.299,155.841431,0.695876


In [6]:
# Spot-check a single row: CCG 03Y in June 2018
is_june_2018 = df1["month"] == '2018-06-01'
is_ccg_03y = df1["pct_id"] == '03Y'
df1.loc[is_june_2018 & is_ccg_03y]

Unnamed: 0,pct_id,month,numerator,denominator,calc_value,percentile
483,03Y,2018-06-01,7451.69973,104.165,71.537462,0.010309


In [7]:
### classify the data by period
import datetime

# (label, row mask) pairs ordered from the latest boundary to the earliest.
# np.select uses the FIRST condition that is true for a row, so each month is
# labelled with the most recent period whose start date it has reached.
period_rules = [
    ('after',     df1['month'] >= d4),  # on/after the end of follow-up
    ('follow-up', df1['month'] >= d3),  # follow-up period
    ('mid',       df1['month'] >= d2),  # gap between baseline and follow-up
    ('baseline',  df1['month'] >= d1),  # baseline period
    ('before',    df1['month'] < d1),   # earlier than the baseline start
]

choices = [label for label, _ in period_rules]
conditions = [mask for _, mask in period_rules]

# rows matching none of the rules fall back to '0'
df1['period'] = np.select(conditions, choices, default='0')

df1.head()

Unnamed: 0,pct_id,month,numerator,denominator,calc_value,percentile,period
0,00C,2018-03-01,17915.92145,108.235,165.527985,0.639175,baseline
1,00C,2018-04-01,16142.79865,108.273,149.093483,0.639175,baseline
2,00C,2018-02-01,17319.07538,108.293,159.927931,0.71134,baseline
3,00C,2018-08-01,17364.79986,108.503,160.039813,0.706186,mid
4,00C,2018-05-01,16877.4711,108.299,155.841431,0.695876,baseline


In [8]:
### aggregate the data over each period, and
### then extract just the 6 months of baseline data

# take columns of interest from df
df2 = df1[["pct_id","period", "month", "numerator","denominator"]]

# Perform groupby aggregation.
# Only the numeric measure components are summed: "month" is a datetime
# column, so summing it is meaningless. Older pandas silently dropped it from
# the result, but pandas >= 2.0 raises a TypeError when it is included.
agg_6m = df2.groupby(["pct_id","period"])[["numerator","denominator"]].sum()

### calculate aggregated measure values
# ratio of the period totals (not a mean of the monthly ratios)
agg_6m["calc_value"] = agg_6m.numerator / agg_6m.denominator

# Keep only the baseline rows; the aggregated value becomes the "baseline"
# column and the now-constant "period" column is dropped.
agg_6m = agg_6m.reset_index()
agg_6m = agg_6m.loc[agg_6m.period=="baseline"].rename(columns={"calc_value":"baseline"}).drop("period",axis=1)
agg_6m.head()

Unnamed: 0,pct_id,numerator,denominator,baseline
0,00C,102119.89023,649.675,157.186116
2,00D,237929.15765,1752.237,135.785945
4,00J,300047.52211,1557.677,192.624994
6,00K,235510.63095,1782.485,132.124888
8,00L,209692.13257,1951.199,107.468348


In [9]:
### List the 50 CCGs with the highest baseline value, for pre-screening
# CCGs 99P, 99Q and 08H are left out as per the exclusion criteria
excluded_ccgs = ["08H", "99P", "99Q"]

df3 = agg_6m.copy()
eligible_ccgs = df3.loc[~df3.pct_id.isin(excluded_ccgs)]

# highest baseline first; reset_index keeps the old row label as an "index" column
eligible_ccgs.sort_values(by="baseline", ascending=False).head(50).reset_index()


Unnamed: 0,index,pct_id,numerator,denominator,baseline
0,50,01V,228220.99627,749.704,304.414804
1,54,01X,347631.36917,1185.007,293.358072
2,380,99K,289294.82914,1022.388,282.959922
3,302,09P,304240.05036,1130.895,269.025905
4,42,01J,264583.05225,989.946,267.270187
5,304,09W,475802.54155,1795.837,264.94751
6,36,01F,204807.24274,787.334,260.127522
7,378,99J,752549.25554,2953.688,254.782921
8,364,99A,803559.48806,3196.121,251.417105
9,160,05G,326127.37475,1308.84,249.172836


### The selected CCGs are pre-screened for joint medicines optimisation teams

Specifically, the 50 CCGs above were reviewed by a pharmacist for membership of joint medicines optimisation teams. The pharmacist created a spreadsheet indicating membership, `joint_teams.csv`, which is used in the following cells.

This is to avoid contamination between CCGs that work together. Therefore, we block randomise taking these teams into account. 


In [5]:
# Load the pharmacist's joint team membership spreadsheet
team = pd.read_csv('joint_teams.csv')

# Give each joint team a proxy id: the largest (alphabetically last) ccg_id
# among its listed members stands in for the whole team. That member becomes
# the CCG we visit as the intervention for that team. "count" records how many
# listed CCGs belong to the team.
team2 = (
    team.groupby("joint_team")["ccg_id"]
    .agg(["count", "max"])
    .rename(columns={"max": "joint_id"})
    .reset_index()
)

# attach the team size and proxy id to every member row
team = team.merge(team2, on="joint_team")
team.head()

Unnamed: 0,ccg_id,joint_team,count,joint_id
0,99E,"""Basildon, Brentwood and Thurrock MMT""",1,99E
1,99K,Brighton and HWLH MMT,1,99K
2,09F,EHS and HR CCGs MMT,2,09P
3,09P,EHS and HR CCGs MMT,2,09P
4,03E,Harrogate Shared Services MMT,1,03E


In [6]:
# Attach joint team information to the aggregated prescribing data; the left
# join keeps every CCG, including those not listed in any team
j1 = agg_6m.merge(team, how="left", left_on="pct_id", right_on="ccg_id")

# CCGs with no team match act as their own unit: use their own code as joint_id
no_team = j1["ccg_id"].isnull()
j1.loc[no_team, "joint_id"] = j1.loc[no_team, "pct_id"]

# ccg_id duplicates pct_id for matched rows, so it is no longer needed
j1 = j1.drop(columns="ccg_id")
j1.head()

Unnamed: 0,pct_id,numerator,denominator,baseline,joint_team,count,joint_id
0,00C,108261.10945,649.675,166.638872,,,00C
1,00D,254840.09124,1752.237,145.436999,,,00D
2,00J,317390.42671,1557.677,203.75882,North of England CSU (NECS),1.0,00J
3,00K,247479.68728,1782.485,138.839703,,,00K
4,00L,238929.6414,1951.199,122.452729,,,00L


In [7]:
# group CCG data up to joint teams
# The columns are selected with a list ([[...]]): indexing a GroupBy with a
# bare tuple of keys was deprecated in pandas 1.0 and raises in pandas 2.0.
j2 = j1.groupby("joint_id")[["numerator","denominator"]].sum().reset_index()

# baseline for the unit is the ratio of the summed components, not an average
# of the member CCGs' individual ratios
j2["baseline"] = j2.numerator / j2.denominator
j2.head()

Unnamed: 0,joint_id,numerator,denominator,baseline
0,00C,108261.10945,649.675,166.638872
1,00D,254840.09124,1752.237,145.436999
2,00J,317390.42671,1557.677,203.75882
3,00K,247479.68728,1782.485,138.839703
4,00L,238929.6414,1951.199,122.452729


In [8]:
### Rank every CCG / joint team by its baseline value as a percentile, then
### keep the 40 highest-ranked units for randomisation

# The percentile rank is computed over ALL units, before exclusions are applied
j3 = j2.assign(baseline_ranking=j2["baseline"].rank(method='min', pct=True))

# units removed from the candidate pool
excluded_units = ["08H", "99P", "99Q"]
eligible_units = j3.loc[~j3.joint_id.isin(excluded_units)]

top40 = (
    eligible_units
    .sort_values(by="baseline_ranking", ascending=False)
    .head(40)
    .reset_index(drop=True)
)
top40

Unnamed: 0,joint_id,numerator,denominator,baseline,baseline_ranking
0,01X,361479.5443,1185.007,305.044227,1.0
1,99K,297555.54012,1022.388,291.039742,0.994709
2,01J,272497.81256,989.946,275.26533,0.989418
3,01V,457415.61157,1680.072,272.259529,0.984127
4,09W,487557.09178,1795.837,271.492954,0.978836
5,01F,211759.20528,787.334,268.957273,0.973545
6,99J,769106.67373,2953.688,260.388597,0.968254
7,05G,340137.10538,1308.84,259.876765,0.962963
8,99A,828848.2062,3196.121,259.32942,0.957672
9,11A,844705.12882,3376.395,250.179594,0.952381


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the units held in top40
top40.describe()

Unnamed: 0,numerator,denominator,baseline,baseline_ranking
count,40.0,40.0,40.0,40.0
mean,404539.759662,1814.3347,223.749036,0.890608
std,207453.129851,907.986977,33.911384,0.066143
min,123669.20078,678.013,178.288868,0.783069
25%,274889.740273,1162.33125,197.203361,0.834656
50%,345460.292115,1559.266,216.03241,0.891534
75%,494593.84158,2160.1475,247.246846,0.948413
max,871509.12001,4482.042,305.044227,1.0


In [10]:
### Randomly allocate the units in top40 to intervention and control groups

import random as rd

# Fixed seed so the allocation is repeatable. The legacy global NumPy RNG is
# used on purpose: switching generator or seed would change the allocation.
seed1 = 321
np.random.seed(seed1)

# Each unit draws a uniform random number; the units are ranked by that draw
# and the rank's parity decides the arm (even rank -> control, odd rank ->
# intervention).
df5 = (
    top40
    .assign(rand_num=np.random.rand(len(top40)))
    .assign(allocation_ranking=lambda frame: frame.rand_num.rank())
    .assign(allocation_code=lambda frame: frame.allocation_ranking.mod(2))
)

# create final allocation groups
df5['allocation'] = np.where(df5['allocation_code']==0,'con','I')

# Report the size, mean baseline and SD of each arm
arm_messages = (
    ("I", 'CCGs have been assigned to the intervention group,'),
    ("con", 'CCGs have been assigned to the control group,'),
)
for arm_code, arm_message in arm_messages:
    arm_rows = df5.loc[df5.allocation == arm_code]
    print (arm_rows.joint_id.count(), arm_message)
    print ("with an average spend of £",round(arm_rows.baseline.mean(),0), "per 1000. SD:",round(arm_rows.baseline.std(),0))


20 CCGs have been assigned to the intervention group,
with an average spend of £ 214.0 per 1000. SD: 26.0
20 CCGs have been assigned to the control group,
with an average spend of £ 233.0 per 1000. SD: 38.0


In [11]:
### Look up names for the CCGs allocated to the intervention group
q = '''
SELECT
  code,
  name
FROM
  ebmdatalab.hscic.ccgs
WHERE org_type = "CCG" 
'''

ccg = pd.read_gbq(q, GBQ_PROJECT_ID, dialect='standard', verbose=False)

ccg.head()

# intervention-arm units joined to their CCG name via the proxy id
intervention_units = df5.loc[df5.allocation == "I"]
dfm = intervention_units.merge(ccg, how='left', left_on='joint_id', right_on='code')

# Also join back to the joint team info to show how many listed CCGs each
# joint team contains (units without a team get NaN here).
dfm = (
    dfm[["joint_id","name"]]
    .merge(team2, on="joint_id", how="left")
    .sort_values(by="joint_team")
    .rename(columns={"count":"CCGs_included"})
)
dfm

# save the list of CCGs to visit
dfm.to_csv('allocated_ccgs_visit.csv')

### Calculate baseline stats for whole population, to use to give context in power calculation

In [12]:
# Distribution of the baseline value across every unit in j3 (not only those
# selected for randomisation), reporting extra percentiles alongside the
# default summary statistics
j3["baseline"].describe(percentiles = [.1, .25, .5, .75, .8,.85, .9])

count    189.000000
mean     154.205435
std       46.667971
min       79.955639
10%      103.708413
25%      118.214294
50%      150.439042
75%      173.738830
80%      182.084205
85%      202.321241
90%      222.014348
max      305.044227
Name: baseline, dtype: float64