In [2]:
# replace pracs csv load with SQL

# Set dates of baseline and follow-up periods
d4 = '2017-07-01' # month after end of follow-up period
d3 = '2017-01-01' # follow-up start
d2 = '2016-07-01' # month after end of baseline period
d1 = '2016-01-01' # baseline start

# Import dataset from BigQuery
import pandas as pd
import numpy as np
GBQ_PROJECT_ID = '620265099307'

q = '''SELECT * FROM ebmdatalab.measures.practice_data_ktt9_cephalosporins
WHERE EXTRACT (YEAR from month)  >= 2015
'''
df1 = pd.io.gbq.read_gbq(q, GBQ_PROJECT_ID, dialect='standard',verbose=False)

        ## note: parsing dates is quite memory-intensive, make sure not too many programmes running

df1["month"] = pd.to_datetime(df1.month)

df1.head() # this gives the first few rows of data

Unnamed: 0,numerator,denominator,practice_id,pct_id,month,calc_value,percentile
0,16,220,A81001,00K,2015-10-01,0.072727,0.332002
1,9,197,A81001,00K,2016-04-01,0.045685,0.125554
2,26,292,A81001,00K,2015-12-01,0.089041,0.569527
3,11,206,A81001,00K,2017-09-01,0.053398,0.155435
4,14,227,A81001,00K,2016-09-01,0.061674,0.202769


In [4]:
### filter out the baseline and follow-up periods
import datetime

conditions = [
    (df1['month']  >= d4), # after follow-up period
    (df1['month']  >= d3), # follow-up
    (df1['month']  >= d2), # mid
    (df1['month']  >= d1), # baseline
    (df1['month']  < d1)] # before

choices = ['after', 'follow-up', 'mid', 'baseline','before']
df1['period'] = np.select(conditions, choices, default='0')

df1.head()

Unnamed: 0,numerator,denominator,practice_id,pct_id,month,calc_value,percentile,period
0,16,220,A81001,00K,2015-10-01,0.072727,0.332002,before
1,9,197,A81001,00K,2016-04-01,0.045685,0.125554,baseline
2,26,292,A81001,00K,2015-12-01,0.089041,0.569527,before
3,11,206,A81001,00K,2017-09-01,0.053398,0.155435,after
4,14,227,A81001,00K,2016-09-01,0.061674,0.202769,mid


In [10]:
### import practices eligible for study
q = '''
SELECT 
prac.code AS practice_id, 
ccg_id, 
total_list_size, 
CAST (open_date AS DATE) AS open_date, 
CASE WHEN d.code IS NULL THEN 0 ELSE 1 END as dispensing,
sum(items) as items,
1000*IEEE_divide(sum(items),total_list_size) AS items_per_thou
      
from ebmdatalab.hscic.practices prac
INNER JOIN ebmdatalab.hscic.practice_statistics stat ON prac.code = stat.practice 
LEFT JOIN ebmdatalab.hscic.prescribing_2018_01 p ON prac.code = p.practice -- latest month's prescribing data -- replace with [latest month]
LEFT JOIN  ebmdatalab.hscic.practices_mem m ON prac.code = m.practice AND SUBSTR(CAST(m.join_date AS STRING),1,4) > '2016' AND m.ccg_change_flag = 1
LEFT JOIN ebmdatalab.bsa.dispensing_practices_jan2017 d ON prac.code= d.code AND dispensing_patients > 0

WHERE 
DATE(stat.month) = '2017-06-01'   -- replace with '2017-06-01'
AND prac.ccg_id NOT IN ('99P','10Q')  -- exclude any practices involved in preliminary testing
AND prac.code NOT IN ('')       -- exclude any practices involved in preliminary testing -- replace with [list of practices to exclude]
                                -- also need to exclude practices without sufficient contact details. 
AND prac.setting = 4            -- include standard practices only
AND prac.status_code = 'A'      -- active status (exclude dormant and closed)
AND stat.total_list_size >= 500    -- minimum list size (at latest quarter)
AND CAST (open_date AS DATE) < '2017-01-01' -- exclude practices opened during baseline period (Jan-Jun)
AND SUBSTR(ccg_id,1,1) NOT BETWEEN 'A' AND 'Z'      -- this will exclude any practices belonging to NHS Trusts rather than CCGs as they are not standard practices
AND m.practice IS NULL    -- exclude practices which have changed CCG from 2017 onwards

GROUP BY stat.month, practice_id, dispensing, ccg_id, total_list_size, open_date

HAVING items >= 1000 
ORDER BY items asc, total_list_size asc
'''
#prac = pd.read_csv('eligible_pracs_test.csv')
prac = pd.io.gbq.read_gbq(q, GBQ_PROJECT_ID, dialect='standard',verbose=False)

prac.head()

Unnamed: 0,practice_id,ccg_id,total_list_size,open_date,dispensing,items,items_per_thou
0,E87702,08Y,2060,1974-04-01,0,1040,504.854369
1,M92042,06A,3445,1988-03-30,0,1083,314.36865
2,F83672,07R,10832,1993-04-01,0,1109,102.381832
3,Y00060,05A,562,2003-04-01,0,1111,1976.868327
4,A82620,01H,744,1974-04-01,1,1113,1495.967742


In [12]:
## join data with eligible practices
# take columns of interest from df
df2 = df1[["practice_id","period", "month", "numerator","denominator"]]
df2 = df2.set_index(["practice_id","period", "month"])
dfm = df2.reset_index()
dfm = prac.merge(dfm, how='left', on='practice_id')
# take columns of interest from df
dfm = dfm[["practice_id","period","month", "numerator","denominator"]].loc[(dfm.period=="baseline")]# | (dfm.period=="follow-up")]
dfm.head(20)

Unnamed: 0,practice_id,period,month,numerator,denominator
10,E87702,baseline,2016-02-01,1,57
16,E87702,baseline,2016-03-01,1,59
19,E87702,baseline,2016-06-01,1,63
28,E87702,baseline,2016-05-01,4,51
32,E87702,baseline,2016-04-01,2,59
36,E87702,baseline,2016-01-01,0,63
38,M92042,baseline,2016-05-01,11,133
43,M92042,baseline,2016-04-01,9,133
56,M92042,baseline,2016-02-01,7,133
60,M92042,baseline,2016-01-01,2,113


In [13]:
### aggregate data over 6-month periods ( we will want to calculate the change between each)

# Perform groupby aggregation
agg_6m = dfm.groupby(["practice_id","period"]).sum() 

### filter out measures not meeting threshold values (i.e. less than 10 items prescribed on average per month)
agg_6m_f = agg_6m.loc[(agg_6m.denominator>60)] # filtering only on sum should suffice

### calculate aggregated measure values
agg_6m_f["calc_value"] = agg_6m_f.numerator / agg_6m_f.denominator

#agg_6m_f.head()
print agg_6m.denominator.loc[(agg_6m.denominator<=60)].count(), 'practices excluded for prescribing <60 total antibiotics in 6 months.'




4 practices excluded for prescribing <60 total antibiotics in 6 months.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [14]:
# extract calc_value column and unstack years
agg_6m_f = agg_6m_f.loc[~agg_6m_f.calc_value.isnull()] # exclude the extra row at bottom
dfx = agg_6m_f.reset_index()
dfx = dfx[["practice_id","calc_value","denominator"]]
#dfx = dfx.set_index(["practice_id","period"]) 
#dfx = dfx.unstack(1)

#dfx.columns = dfx.columns.droplevel()
dfx.columns.values[1] = 'baseline'

dfx.head(10)

Unnamed: 0,practice_id,baseline,denominator
0,A81001,0.051416,1342
1,A81002,0.075054,6915
2,A81004,0.081412,2776
3,A81005,0.112743,2315
4,A81006,0.068154,4035
5,A81007,0.081341,2803
6,A81009,0.077602,2719
7,A81011,0.064709,3848
8,A81012,0.100945,2011
9,A81013,0.111321,1590


In [15]:
### calculate percentile for each practice for single measure during baseline period and flag those which are in worst 20%

df3 = dfx #.reset_index()
df3["baseline_ranking"] = df3["baseline"].rank(method='min', pct=True)
df3["baseline_worst20"] = df3["baseline_ranking"] >= 0.8

#df3.to_csv('cephalosporin_change_201617.csv')
df3.head(10)

Unnamed: 0,practice_id,baseline,denominator,baseline_ranking,baseline_worst20
0,A81001,0.051416,1342,0.136403,False
1,A81002,0.075054,6915,0.4,False
2,A81004,0.081412,2776,0.481295,False
3,A81005,0.112743,2315,0.806475,True
4,A81006,0.068154,4035,0.312662,False
5,A81007,0.081341,2803,0.480576,False
6,A81009,0.077602,2719,0.432374,False
7,A81011,0.064709,3848,0.269928,False
8,A81012,0.100945,2011,0.706331,False
9,A81013,0.111321,1590,0.796403,False


In [16]:
### lookup practice ccg to use for allocation
df4 = df3.loc[df3.baseline_worst20==True]#.set_index(["practice_id","baseline"])
df4 = df4[["practice_id","baseline","baseline_ranking"]]
df5 = df4.merge(prac, how='left', on='practice_id')
#df4.columns.values[2] = 'calc_value'
#df4.columns.values[3] = 'baseline_ranking'

df5 = df5[["practice_id","ccg_id","baseline",'baseline_ranking']]
df5.head()

### repeat for the non-allocated practices (those outside worst 20%, maybe needed for controls for clicks)
others = df3.loc[df3.baseline_worst20==False]
others = others[["practice_id","baseline","baseline_ranking"]]
others = others.merge(prac, how='left', on='practice_id')
#others.columns.values[2] = 'calc_value'
#others.columns.values[3] = 'baseline_ranking'

others = others[["practice_id","ccg_id","baseline",'baseline_ranking']]
others.head()


Unnamed: 0,practice_id,ccg_id,baseline,baseline_ranking
0,A81001,00K,0.051416,0.136403
1,A81002,00K,0.075054,0.4
2,A81004,00M,0.081412,0.481295
3,A81006,00K,0.068154,0.312662
4,A81007,00K,0.081341,0.480576


In [17]:
### allocate bottom 20% practices to intervention and control groups 
import random as rd
df5['rand_num'] = np.random.rand(len(df5))
#df5 = df5.set_index(["practice_id","ccg_id"])
df5["allocation_ranking"] = df5.groupby('ccg_id').rand_num.rank()

df5["allocation_code"]= df5.allocation_ranking.mod(2)
df5 = df5.sort_values(by=['ccg_id','allocation_ranking']) 
         
df5

#assign each ccg to a random start point
ccgs = df5.loc[df5.allocation_ranking ==1].reset_index()
ccgs = ccgs[['ccg_id']]
ccgs['start_int'] = np.random.randint(1,3, size=len(ccgs)) 
ccgs['start_int2'] = np.random.randint(1,3, size=len(ccgs)) 

print ccgs.ccg_id.count(), 'CCGs are included in the intervention.'

# join tables back together
df6 = df5.merge(ccgs, how='left', on='ccg_id')

#create final allocation groups
df6['final_allocation'] = np.where(df6['start_int']==2,df6.allocation_code,1-df6.allocation_code)
df6['allocation'] = np.where(df6['final_allocation']==0,'con','I')
#df6.sort_values(by=['ccg_id', 'rand_num']) 
print df6.loc[df6.allocation=="I"].practice_id.count(), 'practices have been assigned to the intervention group.'
print df6.loc[df6.allocation=="con"].practice_id.count(), 'practices have been assigned to the control group.'

161 CCGs are included in the intervention.
694 practices have been assigned to the intervention group.
697 practices have been assigned to the control group.


In [18]:
### Allocate Intervention practices into groups A and B
# stratify by ccg and baseline ranking

df6["ranking2"] = df6.groupby(['ccg_id','allocation']).baseline_ranking.rank()
df6["allocation_code2"]= df6.ranking2.mod(2)
df6["group_ab_2"]= np.where(df6['start_int2']==2,df6.allocation_code2,1-df6.allocation_code2)
df6['group_ab_3'] = np.where(df6['group_ab_2']==0,'A','B') 
df6['group_ab'] = np.where(df6['allocation']=='con','con',df6.group_ab_3) 
# XXX add a cost_saving_measure column for practices in group A
df7 = df6[['practice_id','ccg_id','baseline','baseline_ranking','allocation','group_ab']].sort_values(by=['ccg_id', 'practice_id'])

print df7.loc[df7.group_ab=="A"].practice_id.count(), 'practices have been assigned to Intervention A.'
print df7.loc[df7.group_ab=="A"].baseline.mean(), 'is the mean baseline measure for practices in Intervention A.'
print df7.loc[df7.group_ab=="B"].practice_id.count(), 'practices have been assigned to Intervention B.'
print df7.loc[df7.group_ab=="B"].baseline.mean(), 'is the mean baseline measurefor practices in Intervention B.'

343 practices have been assigned to Intervention A.
0.13865503090536346 is the mean baseline measure for practices in Intervention A.
351 practices have been assigned to Intervention B.
0.1369134404492331 is the mean baseline measurefor practices in Intervention B.


In [21]:
df7.to_csv('practice_allocations.csv') 
#df7.groupby(df7.group_ab).practice_id.count()

In [20]:
df7.head()

Unnamed: 0,practice_id,ccg_id,baseline,baseline_ranking,allocation,group_ab
0,A83050,00J,0.124754,0.878705,I,B
1,A81025,00K,0.128418,0.897266,con,con
2,A81066,00K,0.116811,0.834532,I,B
3,A84034,00L,0.114097,0.813957,I,B
10,A81005,00M,0.112743,0.806475,con,con
