In [518]:
import pandas as pd

# Filter out warnings per https://stackoverflow.com/a/40846742/559140
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")


In [519]:
# Load the DMD products table. The CSV was created from the output of this view:
# https://gist.github.com/sebbacon/c495b58803f85ce70dd26a311db7b432
dmd = pd.read_csv(filepath_or_buffer='products.csv')
# Display the first record to show which columns are available.
dmd.iloc[0]

bnf_code                           010101000BBAJA0
name                           Indigestion mixture
vpid                             11772211000001100
form                               Oral suspension
form_indicator                            Discrete
form_size                                        5
form_units                                      ml
unit_of_measure                           spoonful
numerator                                       35
numerator_unit_of_measure                       mg
denominator                                      1
denominator_unit_of_measure                     ml
ingredient_count                                 3
Name: 0, dtype: object

In [520]:
# squs.csv was generated by running pca_quantity_fetcher.py.
squs = pd.read_csv(filepath_or_buffer='squs.csv')
# Inner join on bnf_code (pandas' default, spelled out here): products whose
# bnf_code has no row in squs.csv are dropped from dmd.
# NOTE(review): this rebinds dmd, so re-running the cell merges the SQU
# columns in a second time (pandas then suffixes the duplicates).
dmd = pd.merge(dmd, squs, how='inner', on='bnf_code')

In [521]:
import adq_lib

# Add a quantity_denominator column, computed one product row at a time.
# The SQU from the data is preferred; where it is missing, a heuristic is
# used to guess (the logic lives in adq_lib.compute_quantity_units).
dmd['quantity_denominator'] = dmd.apply(
    func=adq_lib.compute_quantity_units,
    axis='columns',
)


In [522]:
import functools

# Import ADQs (which we got from an FOI) and give the columns stable names.
adqs = pd.read_csv('adqs_2017_06_01.csv')
adqs.columns = ['bnf_code', 'bnf_name', 'adq_value', 'adq_denominator']

# Keep only the rows that actually carry an ADQ value.
adqs = adqs[adqs['adq_value'].notnull()]

# Per-row dose multiplier, derived from the BNF name by adq_lib.dose_from_name.
adqs['dose_multiplier'] = adqs['bnf_name'].apply(adq_lib.dose_from_name)

# Normalise ADQ units to grammes, ml, units, or unit doses, one row at a time.
normalise_adq = functools.partial(
    adq_lib.normalise,
    number_name='adq_value',
    unit_name='adq_denominator',
)
adqs = adqs.apply(normalise_adq, axis=1)


In [523]:
# Normalise the DMD numerators and denominators with the same
# adq_lib.normalise routine that is used for the ADQs.
normalise_numerator = functools.partial(
    adq_lib.normalise,
    number_name='numerator',
    unit_name='numerator_unit_of_measure',
)
normalise_denominator = functools.partial(
    adq_lib.normalise,
    number_name='denominator',
    unit_name='denominator_unit_of_measure',
)
# Two row-wise passes over the frame: numerator first, then denominator.
dmd = (
    dmd
    .apply(normalise_numerator, axis=1)
    .apply(normalise_denominator, axis=1)
)

In [524]:
# Attach the ADQ columns to every DMD product row that shares a bnf_code.
# This is pandas' default inner join, so products without an ADQ row (and ADQs
# without a product row) are dropped.
merged = dmd.merge(adqs, on='bnf_code')
# Preview the first rows only, rather than dumping the whole frame into the
# notebook output.
merged.head(10)

Unnamed: 0,bnf_code,name,vpid,form,form_indicator,form_size,form_units,unit_of_measure,numerator,numerator_unit_of_measure,denominator,denominator_unit_of_measure,ingredient_count,squ,quantity_denominator,bnf_name,adq_value,adq_denominator,dose_multiplier
0,0101010J0AAAAAA,Boots Magnesium trisilicate mixture,14609911000001108,Oral suspension,Discrete,5.0,ml,spoonful,0.05000,g,1.0,ml,3.0,ml,ml,Mag Trisil_Mix,30.00000,ml,1.0
1,0101010J0AAAAAA,Magnesium trisilicate oral suspension,14609911000001108,Oral suspension,Discrete,5.0,ml,spoonful,0.05000,g,1.0,ml,3.0,ml,ml,Mag Trisil_Mix,30.00000,ml,1.0
2,0101010L0BBABAA,Asilone oral suspension,3391711000001101,Oral suspension,Discrete,5.0,ml,spoonful,0.01400,g,1.0,ml,3.0,ml,ml,Asilone_Susp S/F,30.00000,,1.0
3,0101010L0BBABAA,Asilone oral suspension,3391711000001101,Oral suspension,Discrete,5.0,ml,spoonful,0.02700,g,1.0,ml,3.0,ml,ml,Asilone_Susp S/F,30.00000,,1.0
4,0101010L0BBABAA,Asilone oral suspension,3391711000001101,Oral suspension,Discrete,5.0,ml,spoonful,0.08400,g,1.0,ml,3.0,ml,ml,Asilone_Susp S/F,30.00000,,1.0
5,0101010L0BBAEAK,Asilone Antacid liquid,3391711000001101,Oral suspension,Discrete,5.0,ml,spoonful,0.01400,g,1.0,ml,3.0,ml,ml,Asilone Antacid_Liq S/F,30.00000,,1.0
6,0101010L0BBAEAK,Asilone Antacid liquid,3391711000001101,Oral suspension,Discrete,5.0,ml,spoonful,0.02700,g,1.0,ml,3.0,ml,ml,Asilone Antacid_Liq S/F,30.00000,,1.0
7,0101010L0BBAEAK,Asilone Antacid liquid,3391711000001101,Oral suspension,Discrete,5.0,ml,spoonful,0.08400,g,1.0,ml,3.0,ml,ml,Asilone Antacid_Liq S/F,30.00000,,1.0
8,0101010L0BEAAAI,Maalox Plus oral suspension,3545911000001109,Oral suspension,Discrete,5.0,ml,spoonful,0.00500,g,1.0,ml,3.0,ml,ml,Maalox Plus_Susp S/F,30.00000,,1.0
9,0101010L0BEAAAI,Maalox Plus oral suspension,3545911000001109,Oral suspension,Discrete,5.0,ml,spoonful,0.03900,g,1.0,ml,3.0,ml,ml,Maalox Plus_Susp S/F,30.00000,,1.0


# Now check the computed ADQs against NHS Digital data

In [257]:
# Pull ONE sample prescribing row per BNF code (restricted to rows that have a
# non-zero ADQ_Usage) from the May 2018 raw prescribing table.
#
# The sample row is chosen with ROW_NUMBER(). Ordering inside the window by
# BNF_Code alone (the partition key) would leave the choice of row arbitrary,
# so the QA counts could change from run to run; the extra ordering columns
# make the pick reproducible.
sql = """
WITH
  numbered AS (
  SELECT
    Practice_Code,
    BNF_Code,
    BNF_Description,
    Items,
    Quantity,
    ADQ_Usage,
    ROW_NUMBER() OVER (PARTITION BY BNF_Code ORDER BY Practice_Code, Quantity, Items, ADQ_Usage) AS rownum
  FROM
    tmp_eu.raw_prescribing_data_2018_05
  WHERE
    ADQ_Usage > 0 )
SELECT
  Practice_Code,
  BNF_Code,
  BNF_Description,
  Items,
  Quantity,
  ADQ_Usage
FROM
  numbered
WHERE
  rownum = 1
"""
# `verbose` is a deprecated read_gbq keyword that newer pandas no longer
# accepts, so it is not passed; verbosity is controlled through the logging
# module instead.
data_with_adqs = pd.read_gbq(sql, project_id='ebmdatalab', dialect='standard')



In [525]:
# Add an adq_per_quantity column: adq_lib.adq_per_quantity is called once per
# merged row and its return value is stored for that row.
merged['adq_per_quantity'] = merged.apply(
    func=adq_lib.adq_per_quantity,
    axis='columns',
)

In [527]:
# Do QA check against NHS Digital data: join the sampled prescribing rows to
# our merged DMD/ADQ table on BNF code.
df = data_with_adqs.merge(merged, left_on="BNF_Code", right_on="bnf_code")
# Computed ADQ usage = Quantity * Items * adq_per_quantity.
# This is a vectorised column product rather than a row-wise apply: same
# arithmetic in the same order, but faster, and it still produces a column
# when the merge yields no rows (a row-wise apply on an empty frame returns a
# DataFrame, which cannot be assigned to a single column).
df['computed_adq'] = df['Quantity'] * df['Items'] * df['adq_per_quantity']


In [528]:
# Flag rows where the published ADQ_Usage and our computed value disagree once
# both are rounded to one decimal place. A missing computed value never
# compares equal, so such rows are flagged as well.
adq_differs = df['ADQ_Usage'].round(1).ne(df['computed_adq'].round(1))
df2 = df.loc[adq_differs].set_index("BNF_Code")

# Report how many of the sampled rows disagree.
print(
    "There are approximately {}/{} ADQs that are possibly incorrect".format(
        len(df2), len(df)
    )
)
print(
    "Legit reasons include ADQs having been changed since the last spreadsheet was FOIed, DMD data wrong or missing.  See notes.md for some examples"
)
      

There are approximately 103/3818 ADQs that are possibly incorrect
Legit reasons include ADQs having been changed since the last spreadsheet was FOIed, DMD data wrong or missing.  See notes.md for some examples
