In [1]:

import pandas as pd
import numpy as np
GBQ_PROJECT_ID = '620265099307'

q = '''SELECT * FROM ebmdatalab.outlier_detection.chem_by_subpara_by_ccg_juntoaug17
WHERE SUBSTR(pct,1,1) NOT BETWEEN 'A' AND 'Z' -- exclude non-standard CCG codes
AND SUBSTR(pct,3,1) BETWEEN 'A' AND 'Z'  -- exclude non-standard CCG codes'''

df1 = pd.io.gbq.read_gbq(q, GBQ_PROJECT_ID, dialect='standard',verbose=False)
# rows: pct, chemical, subpara, num, denom, ratio (num and denom are items not quantity)

q2 = '''SELECT DISTINCT chemical, chemical_code from ebmdatalab.hscic.bnf'''
chem = pd.io.gbq.read_gbq(q2, GBQ_PROJECT_ID, dialect='standard',verbose=False)

q3 = '''SELECT DISTINCT subpara, subpara_code from ebmdatalab.hscic.bnf'''
subp = pd.io.gbq.read_gbq(q3, GBQ_PROJECT_ID, dialect='standard',verbose=False)

In [2]:
# need to flag where ccgs have not prescribed any items of the denominator in order to clean the data. 

# Step 1: amend the datafrome to include a line for every CCG and every chemical and subparagraph.

# list all subpara-chemical combinations 
a = df1[["subpara", "chemical"]].drop_duplicates()

#list all ccgs
b = df1[["pct"]].drop_duplicates()

# cross join to make table of all CCGs and all subpara combinations 
a['tmp'] = 1
b['tmp'] = 1
c = b.merge(a, on="tmp").drop('tmp', axis=1) # 237,636 rows

# join to data - need to list every possible chemical against every CCG
data = c.merge(df1, how="left", on=["pct","subpara","chemical"])  # 237,636 rows
data

Unnamed: 0,pct,subpara,chemical,num,denom,ratio
0,05X,1104020,1104020M0,30.0,1305.0,0.022989
1,05X,1104020,1104020N0,126.0,1305.0,0.096552
2,05X,1104020,1104020T0,1006.0,1305.0,0.770881
3,05X,1104020,1104020W0,32.0,1305.0,0.024521
4,05X,1104020,1104020Z0,77.0,1305.0,0.059004
5,05X,1104020,1104020AC,,,
6,05X,1104020,1104020AE,,,
7,05X,1104020,1104020B0,,,
8,05X,1104020,1104020Y0,,,
9,05X,1104020,110402000,,,


In [4]:
# Step 2: identify those with zero subparas
# subpara totals by ccg
subpara = df1[["pct","subpara","denom"]].groupby(["subpara","pct"]).max().reset_index() # 42,917 rows

#list all possible subparagraphs and all ccgs
a2 = df1[["subpara"]].drop_duplicates()
a2['tmp'] = 1

# cross join to CCGs to make table of all CCGs and all subpara combinations 
c2 = b.merge(a2, on="tmp").drop('tmp', axis=1) # 56,097 rows

# join to subpara data by ccg to identify subparas prescribed by each ccg.  
d = c2.merge(subpara,how="left", on=["subpara","pct"])

# for subparas never prescribed, replace NAs with zeros so that there is data present to indicate this
d = d.fillna(0)

# join back to original dataset
d2 = d.merge(data, how="left", on=["subpara","pct"], suffixes=("_subpara",""))
# check how many have zero denominators:
# data.loc[(data["denom_subpara"]==0)] # 19,665 rows 

# exclude combinations where denominators are zero THEN replace NAs with 0:
data2 = d2.loc[(d2["denom_subpara"]!=0)]
data2 = data2.fillna(0)
data2

Unnamed: 0,pct,subpara,denom_subpara,chemical,num,denom,ratio
0,05X,1104020,1305.0,1104020M0,30.0,1305.0,0.022989
1,05X,1104020,1305.0,1104020N0,126.0,1305.0,0.096552
2,05X,1104020,1305.0,1104020T0,1006.0,1305.0,0.770881
3,05X,1104020,1305.0,1104020W0,32.0,1305.0,0.024521
4,05X,1104020,1305.0,1104020Z0,77.0,1305.0,0.059004
5,05X,1104020,1305.0,1104020AC,0.0,0.0,0.000000
6,05X,1104020,1305.0,1104020AE,0.0,0.0,0.000000
7,05X,1104020,1305.0,1104020B0,0.0,0.0,0.000000
8,05X,1104020,1305.0,1104020Y0,0.0,0.0,0.000000
9,05X,1104020,1305.0,110402000,0.0,0.0,0.000000


In [5]:
#select columns of interest and get key stats
df2 = data2[["chemical","subpara", "ratio"]].groupby(["chemical","subpara"]).describe()
df2 = df2.unstack()
df2.columns = df2.columns.droplevel()
df2 = df2.reset_index()

#limit to chemicals prescribed by at least 10 CCGs??
df3 = df2#.loc[df2["count"]>9].reset_index()
df3["range"] = df3["max"] - df3["min"]
df3 = df3[["chemical","subpara","count","50%","min","max","range","std"]].rename(columns={"50%":"median"})

df3

Unnamed: 0,chemical,subpara,count,median,min,max,range,std
0,0101010C0,0101010,192.0,0.000000,0.000000,0.340426,0.340426,0.057828
1,0101010F0,0101010,192.0,0.000000,0.000000,0.185629,0.185629,0.013397
2,0101010G0,0101010,192.0,0.441441,0.000000,0.912442,0.912442,0.212878
3,0101010I0,0101010,192.0,0.000000,0.000000,0.294872,0.294872,0.044221
4,0101010J0,0101010,192.0,0.000000,0.000000,0.129032,0.129032,0.015764
5,0101010L0,0101010,192.0,0.000000,0.000000,0.411765,0.411765,0.055342
6,0101010N0,0101010,192.0,0.000000,0.000000,0.118644,0.118644,0.012905
7,0101010Q0,0101010,192.0,0.000000,0.000000,0.773087,0.773087,0.147875
8,0101010R0,0101010,192.0,0.240833,0.000000,0.864865,0.864865,0.172471
9,0101012B0,0101012,28.0,1.000000,0.956522,1.000000,0.043478,0.008217


In [18]:
# reshape data to put CCGs in columns
df5 = data2.pivot(index="chemical",columns='pct', values='ratio')

#sum numerators to find total volume for each chemical
num = pd.DataFrame(df1["num"].groupby(df1["chemical"]).sum()).reset_index()

#calculate kurtosis and skew for each chemical
import scipy.stats as stats
k = pd.Series(stats.kurtosis(df5, axis=1,nan_policy="omit"),name="kurtosis")
sk =  pd.Series(stats.skew(df5, axis=1,nan_policy="omit"),name="skew")

num["num centile"] = pd.qcut(num["num"], 10, labels=np.arange(1,11,1))
num

Unnamed: 0,chemical,num,num centile
0,0101010C0,724,3
1,0101010F0,31,1
2,0101010G0,19555,6
3,0101010I0,416,2
4,0101010J0,228,2
5,0101010L0,939,3
6,0101010N0,164,2
7,0101010Q0,2410,3
8,0101010R0,9243,5
9,0101012B0,996,3


In [19]:

#count non-zero values to indicate how many CCGs have prescribed each chemical. 
#count = pd.Series(df5.count(axis=1),name="CCG count")

# replace nulls with zeros to take into account CCGs prescribing none in the summary stats
'''df6 = df5#.fillna(0)
df6 = pd.DataFrame(df6.stack()).reset_index().rename(columns={0:"ratio"})
df6 = df6.groupby("chemical").describe().unstack()
df6 = df6.reset_index(col_level=1)
df6.columns = df6.columns.droplevel()
smry = df6[["50%", "min","max","std"]].rename(columns={"50%":"median","min":"abs_min","std":"std_inc_zeros","max":"max2"})
smry["abs_range"] = smry["max2"]- smry["abs_min"]'''


#compile all results together
result = pd.concat([df3, k, sk], axis=1).sort_values(by="kurtosis",ascending=False)
result = result.merge(num, on="chemical")
#result[["chemical","subpara","num","count","median","abs_min","min","max","range","abs_range", "std","std_inc_zeros","kurtosis","skew"]].round(2)
result = result[["chemical","subpara","num","num centile", "count","median","min","max","range","std","kurtosis","skew"]].round(2)

In [20]:
# Lookup chemical and subparagraph names
df4 = result.merge(chem, how="left", left_on = "chemical",right_on="chemical_code",suffixes=(""," name"))
df4 = df4.merge(subp, how="left", left_on = "subpara",right_on="subpara_code",suffixes=(""," name"))
#df3 = df3[["chemical","chemical name","subpara","subpara name","min","max","range","std"]]
df4 = df4[["chemical","chemical name","subpara","subpara name","num","num centile", "count","median","min","max","range", "std","kurtosis","skew"]].round(2)

In [21]:
# sort by range first

df4.sort_values(by=["range","kurtosis"],ascending=False).head(50)

Unnamed: 0,chemical,chemical name,subpara,subpara name,num,num centile,count,median,min,max,range,std,kurtosis,skew
253,0605010S0,Somatropin,605010,Hypothalamic&Ant Pituit Hormone&Antioest,14380,6,157.0,0.92,0.0,1.0,1.0,0.14,20.94,-3.9
270,0801030L0,Mercaptopurine,801030,Antimetabolites,16323,6,189.0,0.96,0.0,1.0,1.0,0.12,17.64,-3.44
428,0408020W0,Midazolam Hydrochloride,408020,Drugs Used In Status Epilepticus,15886,6,204.0,0.82,0.0,1.0,1.0,0.13,5.93,-1.56
466,0410010A0,Acamprosate Calcium,410010,Alcohol Dependence,29082,6,192.0,0.82,0.0,1.0,1.0,0.19,4.91,-1.91
472,0607010C0,Cabergoline,607010,Bromocriptine & Other Dopaminergic Drugs,16179,6,200.0,0.8,0.0,1.0,1.0,0.13,4.85,-1.42
496,091101000,Other Amino Acid&Nutritional Agent Preps,911010,Amino Acids & Nutritional Agents,2436,4,61.0,1.0,0.0,1.0,1.0,0.3,4.29,-2.43
555,0906011D0,Vitamin A,906011,Vitamin A,14106,5,190.0,0.8,0.0,1.0,1.0,0.19,2.95,-1.48
563,0905041R0,Zinc Sulfate Monohydrate,905041,Zinc,5892,4,115.0,0.69,0.0,1.0,1.0,0.21,2.79,-1.44
614,0607020C0,Danazol,607020,Drugs affecting Gonadotrophins,341,2,14.0,0.96,0.0,1.0,1.0,0.35,1.96,-1.95
652,0106050S0,Sodium Picosulfate,106050,Bowel Cleansing Preparations,1292,3,49.0,0.8,0.0,1.0,1.0,0.29,1.54,-1.63


In [22]:
#limit to items with at least 1% range 

df4.loc[(df4["range"] >0.1)].sort_values(by=["kurtosis"],ascending=False)


Unnamed: 0,chemical,chemical name,subpara,subpara name,num,num centile,count,median,min,max,range,std,kurtosis,skew
48,1310011AA,Retapamulin,1310011,Antibacterial Preps Only Used Topically,575,2,202.0,0.00,0.00,0.68,0.68,0.05,197.00,14.11
49,1002010M0,Neostigmine Bromide,1002010,Dgs Which Enhance Neuromus'ar Transmi'on,29,1,202.0,0.00,0.00,0.31,0.31,0.02,197.00,14.11
52,0101010F0,Magnesium Carbonate,0101010,Antacids and Simeticone,31,1,192.0,0.00,0.00,0.19,0.19,0.01,187.01,13.75
53,0902011L0,Calcium Polystyrene Sulfonate,0902011,Oral Potassium,138,2,207.0,0.00,0.00,0.43,0.43,0.03,185.25,13.50
55,0410030D0,Lofexidine Hydrochloride,0410030,Opioid Dependence,59,1,188.0,0.00,0.00,0.53,0.53,0.04,183.00,13.60
61,0410010D0,Nalmefene,0410010,Alcohol Dependence,125,2,192.0,0.00,0.00,0.22,0.22,0.02,172.72,13.05
64,0310000E0,Ephedrine Hydrochloride,0310000,Systemic Nasal Decongestants,21,1,166.0,0.00,0.00,0.24,0.24,0.02,161.01,12.77
77,090602700,Other Vitamin B Compound Preps,0906027,Vitamin B Compound,533,2,207.0,0.00,0.00,0.17,0.17,0.01,136.68,11.28
78,0501070N0,Sodium Fusidate,0501070,Some Other Antibacterials,247,2,179.0,0.00,0.00,0.66,0.66,0.05,127.88,10.89
80,0906027G0,Vitamin B Compound,0906027,Vitamin B Compound,563923,10,207.0,1.00,0.83,1.00,0.17,0.01,126.43,-10.67


In [23]:
df4.loc[(df4["range"] >0.1 & df4["num centile"]>1)].sort_values(by=["skew"],ascending=False)

Unnamed: 0,chemical,chemical name,subpara,subpara name,num,num centile,count,median,min,max,range,std,kurtosis,skew
48,1310011AA,Retapamulin,1310011,Antibacterial Preps Only Used Topically,575,2,202.0,0.00,0.00,0.68,0.68,0.05,197.00,14.11
49,1002010M0,Neostigmine Bromide,1002010,Dgs Which Enhance Neuromus'ar Transmi'on,29,1,202.0,0.00,0.00,0.31,0.31,0.02,197.00,14.11
52,0101010F0,Magnesium Carbonate,0101010,Antacids and Simeticone,31,1,192.0,0.00,0.00,0.19,0.19,0.01,187.01,13.75
55,0410030D0,Lofexidine Hydrochloride,0410030,Opioid Dependence,59,1,188.0,0.00,0.00,0.53,0.53,0.04,183.00,13.60
53,0902011L0,Calcium Polystyrene Sulfonate,0902011,Oral Potassium,138,2,207.0,0.00,0.00,0.43,0.43,0.03,185.25,13.50
61,0410010D0,Nalmefene,0410010,Alcohol Dependence,125,2,192.0,0.00,0.00,0.22,0.22,0.02,172.72,13.05
64,0310000E0,Ephedrine Hydrochloride,0310000,Systemic Nasal Decongestants,21,1,166.0,0.00,0.00,0.24,0.24,0.02,161.01,12.77
77,090602700,Other Vitamin B Compound Preps,0906027,Vitamin B Compound,533,2,207.0,0.00,0.00,0.17,0.17,0.01,136.68,11.28
78,0501070N0,Sodium Fusidate,0501070,Some Other Antibacterials,247,2,179.0,0.00,0.00,0.66,0.66,0.05,127.88,10.89
106,0905022W0,Sucroferric Oxyhydroxide,0905022,Phosphate Binding Agents,85,1,202.0,0.00,0.00,0.11,0.11,0.01,96.65,9.92
