In [1]:
import pandas as pd
import numpy as np
GBQ_PROJECT_ID = '620265099307'

#q = '''SELECT * FROM ebmdatalab.outlier_detection.chem_by_subpara_by_practice_juntoaug17
#WHERE 1=1'''
#df1 = pd.io.gbq.read_gbq(q, GBQ_PROJECT_ID, dialect='standard')
# rows: pct, chemical, subpara, num, denom, ratio (num and denom are items not quantity)
df1 = pd.read_csv('chem_by_subpara_by_practice_juntoaug17',dtype={'subpara': object}) # read from local csv as memory issues with pd.io

q2 = '''SELECT DISTINCT chemical, chemical_code from ebmdatalab.hscic.bnf'''
chem = pd.io.gbq.read_gbq(q2, GBQ_PROJECT_ID, dialect='standard',verbose=False)

q3 = '''SELECT DISTINCT subpara, subpara_code from ebmdatalab.hscic.bnf'''
subp = pd.io.gbq.read_gbq(q3, GBQ_PROJECT_ID, dialect='standard',verbose=False)


In [2]:
# need to flag where ccgs have not prescribed any items of the denominator in order to clean the data. 

# Step 1: amend the datafrome to include a line for every CCG and every chemical and subparagraph.

# list all subpara-chemical combinations 
a = df1[["subpara", "chemical"]].drop_duplicates()

#list all practices
b = df1[["pct","practice"]].drop_duplicates()

# cross join to make table of all CCGs and all subpara combinations 
a['tmp'] = 1
b['tmp'] = 1
c = b.merge(a, on="tmp").drop('tmp', axis=1) # 237,636 rows

# join to data - need to list every possible chemical against every CCG
data = c.merge(df1, how="left", on=["pct","practice","subpara","chemical"])  # 237,636 rows
data


# Step 2: identify those with zero subparas
# subpara totals by ccg
subpara = df1[["pct","practice","subpara","denom"]].groupby(["subpara","pct","practice"]).max().reset_index() # 42,917 rows

#list all possible subparagraphs and all ccgs
a2 = df1[["subpara"]].drop_duplicates()
a2['tmp'] = 1

# cross join to CCGs to make table of all CCGs and all subpara combinations 
c2 = b.merge(a2, on="tmp").drop('tmp', axis=1) # 56,097 rows

# join to subpara data by ccg to identify subparas prescribed by each ccg.  
d = c2.merge(subpara,how="left", on=["subpara","pct","practice"])

# for subparas never prescribed, replace NAs with zeros so that there is data present to indicate this
d = d.fillna(0)

# join back to original dataset
d2 = d.merge(data, how="left", on=["subpara","pct","practice"], suffixes=("_subpara",""))
# check how many have zero denominators:
# data.loc[(data["denom_subpara"]==0)] # 19,665 rows 

# exclude combinations where denominators are zero THEN replace NAs with 0:
data2 = d2.loc[(d2["denom_subpara"]!=0)]
data2 = data2.fillna(0)
#data2 = data2.set_index(['pct','chemical'])
data2.head(3)

Unnamed: 0,pct,practice,subpara,denom_subpara,chemical,num,denom,ratio
0,04V,C82041,307000,44.0,0307000J0,44.0,44.0,1.0
1,04V,C82041,307000,44.0,0307000K0,0.0,0.0,0.0
2,04V,C82041,307000,44.0,0307000Q0,0.0,0.0,0.0


# National
- take ratios of chemical/subparagraph from above
- calculate the *national* mean ratio and standard deviation for each chemical

In [3]:
df2_nat = data2[["chemical", "ratio"]].groupby(["chemical"]).describe()
df2_nat.columns = df2_nat.columns.droplevel()
df2_nat.head(3)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
chemical,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0101010C0,5624.0,0.03567,0.148797,0.0,0.0,0.0,0.0,1.0
0101010F0,5624.0,0.003186,0.044284,0.0,0.0,0.0,0.0,1.0
0101010G0,5624.0,0.414975,0.394887,0.0,0.0,0.375,0.8,1.0


- calculate the number of standard deviations each practice is from the national mean, for each chemical

In [4]:
output_nat = data2.merge(df2_nat[['mean','std']],how='left', left_on=['chemical'],right_index=True)

output_nat['stds_from_mean'] = (output_nat['ratio'] - output_nat['mean']) / output_nat['std']
output_nat = output_nat.dropna().sort_values('stds_from_mean',ascending =False)
output_nat = output_nat.merge(chem, how="left", left_on = "chemical",right_on="chemical_code",suffixes=(""," name"))
output_nat = output_nat.merge(subp, how="left", left_on = "subpara",right_on="subpara_code",suffixes=(""," name"))
output_nat = output_nat.drop(['subpara','denom_subpara','chemical'],axis=1)
output_nat = output_nat.set_index('practice')
#output_nat.head(3)

## 99P
### More than usual

- filter out ones with small ratio

In [5]:
output_nat.loc[(output_nat.pct=='99P')
               &(output_nat.denom>200)
               &(output_nat['mean']>0.01)].sort_values('stds_from_mean',
                                                       ascending =False).head(20)

Unnamed: 0_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,chemical_code,subpara name,subpara_code
practice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83012,99P,293.0,478.0,0.612971,0.017529,0.034551,17.233945,Telmisartan,0205052Q0,Angiotensin-II Receptor Antagonists,205052
L83643,99P,136.0,281.0,0.483986,0.049498,0.043056,10.091244,Nifedipine,0206020R0,Calcium-Channel Blockers,206020
L83663,99P,164.0,561.0,0.292335,0.039735,0.02549,9.909595,Paroxetine Hydrochloride,0403030P0,Selective Serotonin Re-Uptake Inhibitors,403030
L83134,99P,711.0,1087.0,0.654094,0.051798,0.074624,8.071075,Lercanidipine Hydrochloride,0206020L0,Calcium-Channel Blockers,206020
L83023,99P,186.0,653.0,0.284839,0.017529,0.034551,7.73679,Telmisartan,0205052Q0,Angiotensin-II Receptor Antagonists,205052
L83099,99P,807.0,1375.0,0.586909,0.051798,0.074624,7.170764,Lercanidipine Hydrochloride,0206020L0,Calcium-Channel Blockers,206020
L83136,99P,185.0,765.0,0.24183,0.025756,0.031765,6.802342,Dabigatran Etexilate,0208020X0,Oral Anticoagulants,208020
L83098,99P,38.0,244.0,0.155738,0.011232,0.023091,6.258055,Triamcinolone Acetonide,1202010T0,Drugs Used In Nasal Allergy,1202010
L83050,99P,326.0,662.0,0.492447,0.027746,0.074263,6.25754,Ketoprofen,100302010,"Rubefacients,Top NSAIDS,Capsaicin & Poul",1003020
L83007,99P,176.0,229.0,0.768559,0.125677,0.103972,6.183209,Isophane Insulin,0601012S0,Intermediate And Long-Acting Insulins,601012


### Less than usual

- filter out ones with small ratio

In [6]:
output_nat.loc[(output_nat.pct=='99P')
               &(output_nat.denom>200)
               &(output_nat['mean']<0.99)].sort_values('stds_from_mean',
                                                       ascending =True).head(20)

Unnamed: 0_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,chemical_code,subpara name,subpara_code
practice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83112,99P,131.0,262.0,0.5,0.884759,0.084858,-4.534128,Tamsulosin Hydrochloride,0704010U0,Drugs For Urinary Retention,704010
L83673,99P,123.0,290.0,0.424138,0.805378,0.090115,-4.230574,Amitriptyline Hydrochloride,0403010B0,Tricyclic & Related Antidepressant Drugs,403010
L83134,99P,312.0,1087.0,0.287029,0.708416,0.130218,-3.236014,Amlodipine,0206020A0,Calcium-Channel Blockers,206020
L83136,99P,115.0,213.0,0.539906,0.822748,0.089521,-3.159482,Amoxicillin,0501013B0,Broad-Spectrum Penicillins,501013
L83128,99P,33.0,265.0,0.124528,0.715778,0.189366,-3.122261,Tiotropium,0301020Q0,Antimuscarinic Bronchodilators,301020
L83015,99P,42.0,247.0,0.17004,0.805329,0.204129,-3.112188,Baclofen,1002020C0,Skeletal Muscle Relaxants,1002020
L83002,99P,782.0,1309.0,0.597403,0.875431,0.090323,-3.07817,Alendronic Acid,0606020A0,Bisphosphonates and Other Drugs,606020
L83673,99P,40.0,427.0,0.093677,0.399537,0.102324,-2.989135,Citalopram Hydrobromide,0403030D0,Selective Serotonin Re-Uptake Inhibitors,403030
L83065,99P,477.0,573.0,0.832461,0.945259,0.038063,-2.963484,Salbutamol,0301011R0,Selective Beta(2)-Agonists,301011
L83076,99P,2.0,477.0,0.004193,0.313202,0.105334,-2.933618,Tramadol Hydrochloride,040702040,Opioid Analgesics,407020


# Within CCGs
- take ratios of chemical/subparagraph from top
- calculate the *CCG level* mean ratio and standard deviation for each chemical
- *all* results from here have rows with small ratio filtered out

In [7]:
df2 = data2[["pct","chemical", "ratio"]].groupby(["pct","chemical"],sort=False).describe()
df2.columns = df2.columns.droplevel()
df2.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
pct,chemical,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
04V,0307000J0,47.0,0.985675,0.027347,0.891892,0.980487,1.0,1.0,1.0
04V,0307000K0,47.0,0.006574,0.014032,0.0,0.0,0.0,0.0,0.066667
04V,0307000Q0,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- calculate the number of standard deviations each practice is from the CCG level mean, for each chemical

In [8]:
output = data2.merge(df2[['mean','std']],how='left', left_on=['pct','chemical'],right_index=True)

output['stds_from_mean'] = (output['ratio'] - output['mean']) / output['std']
output = output.dropna().sort_values('stds_from_mean',ascending =False)
output = output.merge(chem, how="left", left_on = "chemical",right_on="chemical_code",suffixes=(""," name"))
output = output.merge(subp, how="left", left_on = "subpara",right_on="subpara_code",suffixes=(""," name"))
output = output.drop(['subpara','denom_subpara','chemical'],axis=1)
output = output.set_index('practice')
#output.head(3)

## 99P
### More than usual

In [9]:
output.loc[(output.pct=='99P')
           &(output.denom>200)
           &(output['mean']>0.01)].sort_values('stds_from_mean',
                                               ascending =False).head(20)

Unnamed: 0_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,chemical_code,subpara name,subpara_code
practice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83024,99P,265.0,520.0,0.509615,0.010224,0.054215,9.211343,Influenza,1404000H0,Vaccines And Antisera,1404000
L83663,99P,164.0,561.0,0.292335,0.031725,0.029198,8.92574,Paroxetine Hydrochloride,0403030P0,Selective Serotonin Re-Uptake Inhibitors,403030
L83012,99P,293.0,478.0,0.612971,0.018599,0.06977,8.519005,Telmisartan,0205052Q0,Angiotensin-II Receptor Antagonists,205052
L83113,99P,98.0,230.0,0.426087,0.026975,0.053875,7.408123,Diethylamine Salicylate,1003020I0,"Rubefacients,Top NSAIDS,Capsaicin & Poul",1003020
L83073,99P,113.0,497.0,0.227364,0.016114,0.030845,6.848756,Glipizide,0601021P0,Sulfonylureas,601021
L83136,99P,185.0,765.0,0.24183,0.025481,0.032314,6.695251,Dabigatran Etexilate,0208020X0,Oral Anticoagulants,208020
L83643,99P,136.0,281.0,0.483986,0.080926,0.060801,6.629179,Nifedipine,0206020R0,Calcium-Channel Blockers,206020
L83651,99P,91.0,377.0,0.241379,0.022427,0.034682,6.313059,Etodolac,1001010E0,Non-Steroidal Anti-Inflammatory Drugs,1001010
L83673,99P,132.0,290.0,0.455172,0.070876,0.061679,6.23063,Trazodone Hydrochloride,0403010X0,Tricyclic & Related Antidepressant Drugs,403010
L83643,99P,113.0,667.0,0.169415,0.018555,0.025513,5.912975,Escitalopram,0403030X0,Selective Serotonin Re-Uptake Inhibitors,403030


### Less than usual

In [10]:
output.loc[(output.pct=='99P')
           &(output.denom>200)
           &(output['mean']<0.99)].sort_values('stds_from_mean',
                                               ascending =True).head(20)

Unnamed: 0_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,chemical_code,subpara name,subpara_code
practice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83073,99P,366.0,497.0,0.736419,0.970214,0.042183,-5.542384,Gliclazide,0601021M0,Sulfonylureas,601021
L83673,99P,123.0,290.0,0.424138,0.813865,0.07035,-5.539809,Amitriptyline Hydrochloride,0403010B0,Tricyclic & Related Antidepressant Drugs,403010
L83112,99P,131.0,262.0,0.5,0.865584,0.073971,-4.94227,Tamsulosin Hydrochloride,0704010U0,Drugs For Urinary Retention,704010
L83039,99P,300.0,421.0,0.712589,0.927178,0.046157,-4.64912,Furosemide,0202020L0,Loop Diuretics,202020
L83107,99P,205.0,282.0,0.72695,0.927178,0.046157,-4.33798,Furosemide,0202020L0,Loop Diuretics,202020
L83128,99P,33.0,265.0,0.124528,0.705643,0.143997,-4.035616,Tiotropium,0301020Q0,Antimuscarinic Bronchodilators,301020
L83065,99P,477.0,573.0,0.832461,0.944131,0.027882,-4.005139,Salbutamol,0301011R0,Selective Beta(2)-Agonists,301011
L83097,99P,128.0,206.0,0.621359,0.884079,0.066316,-3.96165,Finasteride,0604020C0,Male Sex Hormones And Antagonists,604020
L83134,99P,312.0,1087.0,0.287029,0.689802,0.10383,-3.87917,Amlodipine,0206020A0,Calcium-Channel Blockers,206020
L83643,99P,276.0,491.0,0.562118,0.813865,0.07035,-3.578479,Amitriptyline Hydrochloride,0403010B0,Tricyclic & Related Antidepressant Drugs,403010


## Practices that deviate most from their CCG
### 99P

In [12]:
np.abs(output.loc[(output.pct=='99P')&(output.denom>200)&(output['mean']>0.01),['stds_from_mean']]).groupby(level=0).mean().sort_values(['stds_from_mean'],ascending=False).head(20)

Unnamed: 0_level_0,stds_from_mean
practice,Unnamed: 1_level_1
L83673,1.460568
L83643,1.394315
L83012,1.11337
L83642,1.073722
L83080,1.002173
L83663,0.982917
L83651,0.980097
L83107,0.96589
L83023,0.919517
L83639,0.907021
