In [1]:
import pandas as pd
import numpy as np
GBQ_PROJECT_ID = '620265099307'

#q = '''SELECT * FROM ebmdatalab.outlier_detection.chem_by_subpara_by_practice_juntoaug17
#WHERE 1=1'''
#df1 = pd.io.gbq.read_gbq(q, GBQ_PROJECT_ID, dialect='standard')
# rows: pct, chemical, subpara, num, denom, ratio (num and denom are items not quantity)
df1 = pd.read_csv('chem_by_subpara_by_practice_juntoaug17',dtype={'subpara': object}) # read from local csv as memory issues with pd.io

q2 = '''SELECT DISTINCT chemical, chemical_code from ebmdatalab.hscic.bnf'''
chem = pd.io.gbq.read_gbq(q2, GBQ_PROJECT_ID, dialect='standard',verbose=False)

q3 = '''SELECT DISTINCT subpara, subpara_code from ebmdatalab.hscic.bnf'''
subp = pd.io.gbq.read_gbq(q3, GBQ_PROJECT_ID, dialect='standard',verbose=False)


In [2]:
# need to flag where ccgs have not prescribed any items of the denominator in order to clean the data. 

# Step 1: amend the datafrome to include a line for every CCG and every chemical and subparagraph.

# list all subpara-chemical combinations 
a = df1[["subpara", "chemical"]].drop_duplicates()

#list all practices
b = df1[["pct","practice"]].drop_duplicates()

# cross join to make table of all CCGs and all subpara combinations 
a['tmp'] = 1
b['tmp'] = 1
c = b.merge(a, on="tmp").drop('tmp', axis=1) # 237,636 rows

# join to data - need to list every possible chemical against every CCG
data = c.merge(df1, how="left", on=["pct","practice","subpara","chemical"])  # 237,636 rows
data


# Step 2: identify those with zero subparas
# subpara totals by ccg
subpara = df1[["pct","practice","subpara","denom"]].groupby(["subpara","pct","practice"]).max().reset_index() # 42,917 rows

#list all possible subparagraphs and all ccgs
a2 = df1[["subpara"]].drop_duplicates()
a2['tmp'] = 1

# cross join to CCGs to make table of all CCGs and all subpara combinations 
c2 = b.merge(a2, on="tmp").drop('tmp', axis=1) # 56,097 rows

# join to subpara data by ccg to identify subparas prescribed by each ccg.  
d = c2.merge(subpara,how="left", on=["subpara","pct","practice"])

# for subparas never prescribed, replace NAs with zeros so that there is data present to indicate this
d = d.fillna(0)

# join back to original dataset
d2 = d.merge(data, how="left", on=["subpara","pct","practice"], suffixes=("_subpara",""))
# check how many have zero denominators:
# data.loc[(data["denom_subpara"]==0)] # 19,665 rows 

# exclude combinations where denominators are zero THEN replace NAs with 0:
data2 = d2.loc[(d2["denom_subpara"]!=0)]
data2 = data2.fillna(0)
#data2 = data2.set_index(['pct','chemical'])
data2.head(3)

Unnamed: 0,pct,practice,subpara,denom_subpara,chemical,num,denom,ratio
0,04V,C82041,307000,44.0,0307000J0,44.0,44.0,1.0
1,04V,C82041,307000,44.0,0307000K0,0.0,0.0,0.0
2,04V,C82041,307000,44.0,0307000Q0,0.0,0.0,0.0


# National
- take ratios of chemical/subparagraph from above
- calculate the *national* mean ratio and standard deviation for each chemical

In [3]:
df2_nat = data2[["chemical", "ratio"]].groupby(["chemical"]).describe()
df2_nat.columns = df2_nat.columns.droplevel()
df2_nat.head(3)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
chemical,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0101010C0,5624.0,0.03567,0.148797,0.0,0.0,0.0,0.0,1.0
0101010F0,5624.0,0.003186,0.044284,0.0,0.0,0.0,0.0,1.0
0101010G0,5624.0,0.414975,0.394887,0.0,0.0,0.375,0.8,1.0


- calculate the number of standard deviations each practice is from the national mean, for each chemical

In [4]:
output_nat = data2.merge(df2_nat[['mean','std']],how='left', left_on=['chemical'],right_index=True)

output_nat['stds_from_mean'] = (output_nat['ratio'] - output_nat['mean']) / output_nat['std']
output_nat = output_nat.dropna().sort_values('stds_from_mean',ascending =False)
output_nat = output_nat.merge(chem, how="left", left_on = "chemical",right_on="chemical_code",suffixes=(""," name"))
output_nat = output_nat.merge(subp, how="left", left_on = "subpara",right_on="subpara_code",suffixes=(""," name"))
output_nat = output_nat.drop(['subpara','denom_subpara','chemical'],axis=1)
output_nat = output_nat.set_index(['practice','chemical_code'])
#output_nat.head(3)

### More than usual

In [5]:
output_nat.loc[output_nat.denom>200].sort_values('stds_from_mean',
                                                 ascending =False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
F81074,020400010,06Q,1.0,970.0,0.001031,1.406066e-07,1.2e-05,85.61542,Pindolol With Diuretic,Beta-Adrenoceptor Blocking Drugs,204000
L81622,0212000AG,11H,2.0,659.0,0.003035,4.139819e-07,3.5e-05,85.60958,Policosanol,Lipid-Regulating Drugs,212000
M85025,021200000,04X,1.0,1384.0,0.000723,9.856e-08,8e-06,85.60958,Other Lipid-Regulating Preps,Lipid-Regulating Drugs,212000
F81027,0212000V0,07H,1.0,6452.0,0.000155,2.114182e-08,2e-06,85.60958,Omega-3 Marine Triglycerides,Lipid-Regulating Drugs,212000
M82043,0403010C0,05N,1.0,665.0,0.001504,2.051793e-07,1.8e-05,85.597898,Amoxapine,Tricyclic & Related Antidepressant Drugs,403010


- filter out ones with small ratio

In [6]:
output_nat.loc[(output_nat.denom>200)
               &(output_nat['mean']>0.01)].sort_values('stds_from_mean',
                                                       ascending =False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
M85670,0103050R0,13P,222.0,347.0,0.639769,0.020097,0.027576,22.471677,Pantoprazole,Proton Pump Inhibitors,103050
L84615,0205052V0,11M,425.0,701.0,0.606277,0.021632,0.029649,19.718936,Valsartan,Angiotensin-II Receptor Antagonists,205052
C83022,0205052Q0,99D,1537.0,2306.0,0.666522,0.017529,0.034551,18.78389,Telmisartan,Angiotensin-II Receptor Antagonists,205052
F81751,0205052V0,06Q,308.0,547.0,0.563071,0.021632,0.029649,18.261701,Valsartan,Angiotensin-II Receptor Antagonists,205052
M85139,0103050R0,13P,579.0,1137.0,0.509235,0.020097,0.027576,17.737998,Pantoprazole,Proton Pump Inhibitors,103050


### Less than usual

In [7]:
output_nat.loc[output_nat.denom>200].sort_values('stds_from_mean',
                                                 ascending =True).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
P81100,0501012G0,01A,315.0,316.0,0.996835,1.0,3.7e-05,-85.580372,Flucloxacillin Sodium,Penicillinase-Resistant Penicillins,501012
P81016,0906027G0,00R,118.0,330.0,0.357576,0.997524,0.024251,-26.389,Vitamin B Compound,Vitamin B Compound,906027
L81044,0906026M0,11T,561.0,605.0,0.927273,0.999868,0.003746,-19.377469,Thiamine Hydrochloride,Thiamine Hydrochloride (B1),906026
P84047,0602010V0,14L,244.0,259.0,0.942085,0.997819,0.003942,-14.140383,Levothyroxine Sodium,Thyroid Hormones,602010
K84016,0602010V0,10Q,429.0,451.0,0.95122,0.997819,0.003942,-11.822848,Levothyroxine Sodium,Thyroid Hormones,602010


- filter out ones with small ratio

In [8]:
output_nat.loc[(output_nat.denom>200)
               &(output_nat['mean']<0.99)].sort_values('stds_from_mean',
                                                       ascending =True).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
J81078,0205040D0,11J,126.0,236.0,0.533898,0.978408,0.041976,-10.589725,Doxazosin Mesilate,Alpha-Adrenoceptor Blocking Drugs,205040
L82039,0205040D0,11N,130.0,233.0,0.55794,0.978408,0.041976,-10.016972,Doxazosin Mesilate,Alpha-Adrenoceptor Blocking Drugs,205040
C83631,0906040G0,99D,235.0,365.0,0.643836,0.95513,0.031335,-9.934361,Colecalciferol,Vitamin D,906040
B81095,0601060D0,03F,169.0,258.0,0.655039,0.962421,0.03115,-9.867778,Glucose Blood Testing Reagents,Diabetic Diagnostic & Monitoring Agents,601060
M83121,0205040D0,05G,165.0,289.0,0.570934,0.978408,0.041976,-9.707403,Doxazosin Mesilate,Alpha-Adrenoceptor Blocking Drugs,205040


## 99P
### More than usual

In [9]:
output_nat.loc[(output_nat.pct=='99P')
               &(output_nat.denom>200)].sort_values('stds_from_mean',
                                                    ascending =False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83643,0212000AI,99P,1.0,667.0,0.001499,2e-06,5.3e-05,28.199412,Alirocumab,Lipid-Regulating Drugs,212000
L83628,0407010AA,99P,1.0,909.0,0.0011,1e-06,4.1e-05,26.591893,Aspirin & Caffeine,Non-Opioid Analgesics And Compound Prep,407010
L83012,0205051Y0,99P,146.0,789.0,0.185044,0.001178,0.007182,25.601196,Perindopril Arginine,Angiotensin-Converting Enzyme Inhibitors,205051
L83016,0901011Y0,99P,42.0,771.0,0.054475,0.000139,0.002582,21.042445,Ferric Maltol,Oral Iron,901011
L83044,0204000AC,99P,2.0,1567.0,0.001276,3e-06,6.2e-05,20.632495,Bisoprolol Fumarate/Aspirin,Beta-Adrenoceptor Blocking Drugs,204000


- filter out ones with small ratio

In [10]:
output_nat.loc[(output_nat.pct=='99P')
               &(output_nat.denom>200)
               &(output_nat['mean']>0.01)].sort_values('stds_from_mean',
                                                       ascending =False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83012,0205052Q0,99P,293.0,478.0,0.612971,0.017529,0.034551,17.233945,Telmisartan,Angiotensin-II Receptor Antagonists,205052
L83643,0206020R0,99P,136.0,281.0,0.483986,0.049498,0.043056,10.091244,Nifedipine,Calcium-Channel Blockers,206020
L83663,0403030P0,99P,164.0,561.0,0.292335,0.039735,0.02549,9.909595,Paroxetine Hydrochloride,Selective Serotonin Re-Uptake Inhibitors,403030
L83134,0206020L0,99P,711.0,1087.0,0.654094,0.051798,0.074624,8.071075,Lercanidipine Hydrochloride,Calcium-Channel Blockers,206020
L83023,0205052Q0,99P,186.0,653.0,0.284839,0.017529,0.034551,7.73679,Telmisartan,Angiotensin-II Receptor Antagonists,205052


### Less than usual

In [11]:
output_nat.loc[(output_nat.pct=='99P')
               &(output_nat.denom>200)].sort_values('stds_from_mean'
                                                    ,ascending =True).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83102,0602010V0,99P,1150.0,1179.0,0.975403,0.997819,0.003942,-5.687275,Levothyroxine Sodium,Thyroid Hormones,602010
L83112,0704010U0,99P,131.0,262.0,0.5,0.884759,0.084858,-4.534128,Tamsulosin Hydrochloride,Drugs For Urinary Retention,704010
L83673,0403010B0,99P,123.0,290.0,0.424138,0.805378,0.090115,-4.230574,Amitriptyline Hydrochloride,Tricyclic & Related Antidepressant Drugs,403010
L83134,0206020A0,99P,312.0,1087.0,0.287029,0.708416,0.130218,-3.236014,Amlodipine,Calcium-Channel Blockers,206020
L83136,0501013B0,99P,115.0,213.0,0.539906,0.822748,0.089521,-3.159482,Amoxicillin,Broad-Spectrum Penicillins,501013


- filter out ones with small ratio

In [12]:
output_nat.loc[(output_nat.pct=='99P')
               &(output_nat.denom>200)
               &(output_nat['mean']<0.99)].sort_values('stds_from_mean',
                                                       ascending =True).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83112,0704010U0,99P,131.0,262.0,0.5,0.884759,0.084858,-4.534128,Tamsulosin Hydrochloride,Drugs For Urinary Retention,704010
L83673,0403010B0,99P,123.0,290.0,0.424138,0.805378,0.090115,-4.230574,Amitriptyline Hydrochloride,Tricyclic & Related Antidepressant Drugs,403010
L83134,0206020A0,99P,312.0,1087.0,0.287029,0.708416,0.130218,-3.236014,Amlodipine,Calcium-Channel Blockers,206020
L83136,0501013B0,99P,115.0,213.0,0.539906,0.822748,0.089521,-3.159482,Amoxicillin,Broad-Spectrum Penicillins,501013
L83128,0301020Q0,99P,33.0,265.0,0.124528,0.715778,0.189366,-3.122261,Tiotropium,Antimuscarinic Bronchodilators,301020


# Within CCGs
- take ratios of chemical/subparagraph from top
- calculate the *CCG level* mean ratio and standard deviation for each chemical
- *all* results from here have rows with small ratio filtered out

In [13]:
df2 = data2[["pct","chemical", "ratio"]].groupby(["pct","chemical"]).describe()
df2.columns = df2.columns.droplevel()
df2.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
pct,chemical,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00C,0101010C0,11.0,0.015455,0.037779,0.0,0.0,0.0,0.0,0.12
00C,0101010F0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00C,0101010G0,11.0,0.922104,0.136904,0.55,0.936538,0.965517,1.0,1.0


- calculate the number of standard deviations each practice is from the CCG level mean, for each chemical

In [14]:
output = data2.merge(df2[['mean','std']],how='left', left_on=['pct','chemical'],right_index=True)

output['stds_from_mean'] = (output['ratio'] - output['mean']) / output['std']
output = output.dropna().sort_values('stds_from_mean',ascending =False)
output = output.merge(chem, how="left", left_on = "chemical",right_on="chemical_code",suffixes=(""," name"))
output = output.merge(subp, how="left", left_on = "subpara",right_on="subpara_code",suffixes=(""," name"))
output = output.drop(['subpara','denom_subpara','chemical'],axis=1)
output = output.set_index(['practice','chemical_code'])
#output.head(3)

### More than usual

In [15]:
output.loc[(output.denom>200)
           &(output['mean']>0.01)].sort_values('stds_from_mean',
                                               ascending =False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
D81056,1404000H0,06H,436.0,525.0,0.830476,0.013463,0.085288,9.579452,Influenza,Vaccines And Antisera,1404000
L83024,1404000H0,99P,265.0,520.0,0.509615,0.010224,0.054215,9.211343,Influenza,Vaccines And Antisera,1404000
J81074,0205052I0,11J,1213.0,1510.0,0.803311,0.037695,0.085128,8.993761,Irbesartan,Angiotensin-II Receptor Antagonists,205052
L83663,0403030P0,99P,164.0,561.0,0.292335,0.031725,0.029198,8.92574,Paroxetine Hydrochloride,Selective Serotonin Re-Uptake Inhibitors,403030
D81021,0301020S0,06H,457.0,528.0,0.86553,0.032666,0.09439,8.823692,Glycopyrronium Bromide,Antimuscarinic Bronchodilators,301020


### Less than usual

In [16]:
output.loc[(output.denom>200)
           &(output['mean']<0.99)].sort_values('stds_from_mean',
                                               ascending =True).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L84020,0601021M0,11M,205.0,374.0,0.548128,0.971112,0.05633,-7.509083,Gliclazide,Sulfonylureas,601021
J81078,0606020A0,11J,183.0,368.0,0.497283,0.886661,0.060762,-6.408204,Alendronic Acid,Bisphosphonates and Other Drugs,606020
N85038,0601021M0,12F,99.0,278.0,0.356115,0.956773,0.094338,-6.367063,Gliclazide,Sulfonylureas,601021
J81647,0301011R0,11J,149.0,220.0,0.677273,0.938204,0.041694,-6.258211,Salbutamol,Selective Beta(2)-Agonists,301011
L85044,0704010U0,11X,181.0,335.0,0.540299,0.923622,0.062176,-6.165172,Tamsulosin Hydrochloride,Drugs For Urinary Retention,704010


## 99P
### More than usual

In [17]:
output.loc[(output.pct=='99P')
           &(output.denom>200)
           &(output['mean']>0.01)].sort_values('stds_from_mean',
                                               ascending =False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83024,1404000H0,99P,265.0,520.0,0.509615,0.010224,0.054215,9.211343,Influenza,Vaccines And Antisera,1404000
L83663,0403030P0,99P,164.0,561.0,0.292335,0.031725,0.029198,8.92574,Paroxetine Hydrochloride,Selective Serotonin Re-Uptake Inhibitors,403030
L83012,0205052Q0,99P,293.0,478.0,0.612971,0.018599,0.06977,8.519005,Telmisartan,Angiotensin-II Receptor Antagonists,205052
L83113,1003020I0,99P,98.0,230.0,0.426087,0.026975,0.053875,7.408123,Diethylamine Salicylate,"Rubefacients,Top NSAIDS,Capsaicin & Poul",1003020
L83073,0601021P0,99P,113.0,497.0,0.227364,0.016114,0.030845,6.848756,Glipizide,Sulfonylureas,601021


### Less than usual

In [18]:
output.loc[(output.pct=='99P')
           &(output.denom>200)
           &(output['mean']<0.99)].sort_values('stds_from_mean',
                                               ascending =True).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean,std,stds_from_mean,chemical name,subpara name,subpara_code
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L83073,0601021M0,99P,366.0,497.0,0.736419,0.970214,0.042183,-5.542384,Gliclazide,Sulfonylureas,601021
L83673,0403010B0,99P,123.0,290.0,0.424138,0.813865,0.07035,-5.539809,Amitriptyline Hydrochloride,Tricyclic & Related Antidepressant Drugs,403010
L83112,0704010U0,99P,131.0,262.0,0.5,0.865584,0.073971,-4.94227,Tamsulosin Hydrochloride,Drugs For Urinary Retention,704010
L83039,0202020L0,99P,300.0,421.0,0.712589,0.927178,0.046157,-4.64912,Furosemide,Loop Diuretics,202020
L83107,0202020L0,99P,205.0,282.0,0.72695,0.927178,0.046157,-4.33798,Furosemide,Loop Diuretics,202020


In [19]:
output_sml = output.loc[(output.denom>200)]
output_nat_sml = output_nat.loc[(output_nat.denom>200)]
combined = output_nat_sml.merge(output_sml[['mean', 'std', 'stds_from_mean']], 
                                how='outer', 
                                left_index=True,
                                right_index=True,
                                copy=False,
                                suffixes=('_nat', '_ccg'))
combined = combined[['pct', 'num', 'denom', 'ratio',
                     'mean_nat','std_nat','stds_from_mean_nat',
                     'mean_ccg','std_ccg','stds_from_mean_ccg',
                      'chemical name', 'subpara name']]
combined

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean_nat,std_nat,stds_from_mean_nat,mean_ccg,std_ccg,stds_from_mean_ccg,chemical name,subpara name
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A81001,,00K,17.0,466.0,0.036481,0.037464,0.059876,-0.016418,0.008797,0.017213,1.608262,,Corticosteroids (Respiratory)
A81001,0103050E0,00K,54.0,1068.0,0.050562,0.028752,0.022506,0.969048,0.044969,0.023588,0.237095,Esomeprazole,Proton Pump Inhibitors
A81001,0103050L0,00K,632.0,1068.0,0.591760,0.419759,0.177378,0.969690,0.555449,0.095357,0.380796,Lansoprazole,Proton Pump Inhibitors
A81001,0103050P0,00K,375.0,1068.0,0.351124,0.525997,0.180483,-0.968918,0.384444,0.098640,-0.337798,Omeprazole,Proton Pump Inhibitors
A81001,0103050R0,00K,6.0,1068.0,0.005618,0.020097,0.027576,-0.525055,0.012014,0.014682,-0.435661,Pantoprazole,Proton Pump Inhibitors
A81001,0103050T0,00K,1.0,1068.0,0.000936,0.005395,0.009110,-0.489442,0.003124,0.003322,-0.658624,Rabeprazole Sodium,Proton Pump Inhibitors
A81001,0106020C0,00K,36.0,229.0,0.157205,0.180048,0.150468,-0.151811,0.212195,0.128830,-0.426841,Bisacodyl,Stimulant Laxatives
A81001,0106020I0,00K,90.0,229.0,0.393013,0.220825,0.154789,1.112400,0.419275,0.110455,-0.237761,Docusate Sodium,Stimulant Laxatives
A81001,0106020L0,00K,12.0,229.0,0.052402,0.038669,0.043689,0.314331,0.015775,0.014736,2.485506,Glycerol,Stimulant Laxatives
A81001,0106020M0,00K,86.0,229.0,0.375546,0.536016,0.182937,-0.877185,0.333335,0.111970,0.376987,Senna,Stimulant Laxatives


In [21]:
combined.loc[((combined.pct=='99P')
             &(combined.denom>200)
             &(combined['mean_nat']>0.01))].sort_values('stds_from_mean_nat',ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,pct,num,denom,ratio,mean_nat,std_nat,stds_from_mean_nat,mean_ccg,std_ccg,stds_from_mean_ccg,chemical name,subpara name
practice,chemical_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
L83012,0205052Q0,99P,293.0,478.0,0.612971,0.017529,0.034551,17.233945,0.018599,0.06977,8.519005,Telmisartan,Angiotensin-II Receptor Antagonists
L83643,0206020R0,99P,136.0,281.0,0.483986,0.049498,0.043056,10.091244,0.080926,0.060801,6.629179,Nifedipine,Calcium-Channel Blockers
L83663,0403030P0,99P,164.0,561.0,0.292335,0.039735,0.02549,9.909595,0.031725,0.029198,8.92574,Paroxetine Hydrochloride,Selective Serotonin Re-Uptake Inhibitors
L83134,0206020L0,99P,711.0,1087.0,0.654094,0.051798,0.074624,8.071075,0.098824,0.106922,5.193195,Lercanidipine Hydrochloride,Calcium-Channel Blockers
L83023,0205052Q0,99P,186.0,653.0,0.284839,0.017529,0.034551,7.73679,0.018599,0.06977,3.815964,Telmisartan,Angiotensin-II Receptor Antagonists
L83099,0206020L0,99P,807.0,1375.0,0.586909,0.051798,0.074624,7.170764,0.098824,0.106922,4.564845,Lercanidipine Hydrochloride,Calcium-Channel Blockers
L83136,0208020X0,99P,185.0,765.0,0.24183,0.025756,0.031765,6.802342,0.025481,0.032314,6.695251,Dabigatran Etexilate,Oral Anticoagulants
L83098,1202010T0,99P,38.0,244.0,0.155738,0.011232,0.023091,6.258055,0.013971,0.029895,4.742131,Triamcinolone Acetonide,Drugs Used In Nasal Allergy
L83050,100302010,99P,326.0,662.0,0.492447,0.027746,0.074263,6.25754,0.036006,0.077837,5.864096,Ketoprofen,"Rubefacients,Top NSAIDS,Capsaicin & Poul"
L83007,0601012S0,99P,176.0,229.0,0.768559,0.125677,0.103972,6.183209,0.378898,0.190804,2.04221,Isophane Insulin,Intermediate And Long-Acting Insulins


In [23]:
import base64
import requests
from time import sleep
from itertools import chain
from cStringIO import StringIO
from datetime import timedelta, date
from IPython.display import display, HTML
#import matplotlib.pyplot as plt

%pylab inline

# Turn off the max column width so the HTML 
# image tags don't get truncated 
pd.set_option('display.max_colwidth', -1)

# Turning off the max column will display all the data in
# our arrays so limit the number of element to display
pd.set_option('display.max_seq_items', 2)

def boxline(boxp, ratio, figsize=(5, 2), **kwags):
    fig, ax = plt.subplots(1, 1, figsize=figsize, **kwags)
    ax.boxplot(boxp['ratio_y'], 0, '', 0)
    #ax.hist(boxp['ratio_y'])
    if isinstance(ratio['ratio'], float):
        ax.plot(ratio['ratio'],1, 'ro')
    else:
        ax.plot(ratio['ratio'],np.ones(len(ratio)), 'ro')
    
    for k,v in ax.spines.items():
        v.set_visible(False)
    #ax.set_xticks([])
    ax.set_yticks([])    
    img = StringIO()
    plt.savefig(img)
    img.seek(0)
    #plt.show()
    plt.close()

    return '<img src="data:image/png;base64,{}"/>'.format(base64.b64encode(img.read()))

Populating the interactive namespace from numpy and matplotlib


In [31]:
def make_table(df, data):
    box = df.reset_index()[['chemical_code','ratio']].merge(data.reset_index(),
                   how='left',
                   left_on='chemical_code',
                   right_on='chemical')
    '''box = box.groupby(['chemical']).aggregate({'ratio_y': boxline})


    for index, row in box2.iterrows():
        print row
        row['ratio_y'] = boxline2(row['ratio_y'],row['ratio'])'''
    box = box.set_index('chemical')
    '''box2 = df.reset_index().merge(box,
                  how='left',
                  left_on='chemical_code',
                  right_index=True)'''
    box['plot'] = 0
    df = df.reset_index().set_index('chemical_code')#.index..droplevel(level=0)
    for idx in df.index:
        #print idx
        '''for p in df.practice:
            print p
        #print df.loc[idx]
            #print df.loc[df.index == idx&(df.practice==p)]#.loc[df.practice==p]
            #print df.loc[idx].query('practice==p')
        #df.loc[idx, 'plot'].loc[df.practice==p] = boxline(box.loc[idx],df.loc[idx].loc[df.practice==p])'''
        df.loc[idx, 'plot'] = boxline(box.loc[idx],df.loc[idx])    
    return HTML(df.to_html(escape=False))

In [32]:
selected = combined.loc[((combined.pct=='99P')
                         &(combined.denom>200)
                         &(combined['mean_nat']>0.01)),['pct', 'num', 'denom', 'ratio',
                                                                  'stds_from_mean_nat','stds_from_mean_ccg',
                                                                  'chemical name', 'subpara name']].sort_values('stds_from_mean_nat',
                                                                   ascending=False).head(10)
make_table(selected, data2[['chemical','ratio']])
#table[['pct','num','denom','ratio','stds_from_mean_nat','stds_from_mean_ccg','plot','chemical name','subpara name']]

Unnamed: 0_level_0,practice,pct,num,denom,ratio,stds_from_mean_nat,stds_from_mean_ccg,chemical name,subpara name,plot
chemical_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0205052Q0,L83012,99P,293.0,478.0,0.612971,17.233945,8.519005,Telmisartan,Angiotensin-II Receptor Antagonists,
0206020R0,L83643,99P,136.0,281.0,0.483986,10.091244,6.629179,Nifedipine,Calcium-Channel Blockers,
0403030P0,L83663,99P,164.0,561.0,0.292335,9.909595,8.92574,Paroxetine Hydrochloride,Selective Serotonin Re-Uptake Inhibitors,
0206020L0,L83134,99P,711.0,1087.0,0.654094,8.071075,5.193195,Lercanidipine Hydrochloride,Calcium-Channel Blockers,
0205052Q0,L83023,99P,186.0,653.0,0.284839,7.73679,3.815964,Telmisartan,Angiotensin-II Receptor Antagonists,
0206020L0,L83099,99P,807.0,1375.0,0.586909,7.170764,4.564845,Lercanidipine Hydrochloride,Calcium-Channel Blockers,
0208020X0,L83136,99P,185.0,765.0,0.24183,6.802342,6.695251,Dabigatran Etexilate,Oral Anticoagulants,
1202010T0,L83098,99P,38.0,244.0,0.155738,6.258055,4.742131,Triamcinolone Acetonide,Drugs Used In Nasal Allergy,
100302010,L83050,99P,326.0,662.0,0.492447,6.25754,5.864096,Ketoprofen,"Rubefacients,Top NSAIDS,Capsaicin & Poul",
0601012S0,L83007,99P,176.0,229.0,0.768559,6.183209,2.04221,Isophane Insulin,Intermediate And Long-Acting Insulins,


In [None]:
data2#[['chemical','ratio']]