In [1]:
import pandas as pd

# Plan

* Check if detailed dataset ADQ coverage matches spreadsheet coverage
* Implement our proposed system for working it out


## Products with form info from postgres

In [2]:
# CSV created with output of this view: https://gist.github.com/sebbacon/c495b58803f85ce70dd26a311db7b432
dmd = pd.read_csv('products.csv')
dmd.head()

Unnamed: 0,bnf_code,name,vpid,form,form_indicator,form_size,form_units,unit_of_measure,numerator,numerator_unit_of_measure,denominator,denominator_unit_of_measure,ingredient_count
0,010101000BBAJA0,Indigestion mixture,11772211000001100,Oral suspension,Discrete,5.0,ml,spoonful,35.0,mg,1.0,ml,3.0
1,010101000BBAJA0,Indigestion mixture,11772211000001100,Oral suspension,Discrete,5.0,ml,spoonful,45.0,mg,1.0,ml,3.0
2,0101010A0AAAAAA,Alexitol sodium 360mg tablets,4859911000001108,Tablet,Discrete,1.0,tablet,tablet,360.0,mg,,,1.0
3,0101010A0BBAAAA,Actal 360mg tablets,4859911000001108,Tablet,Discrete,1.0,tablet,tablet,360.0,mg,,,1.0
4,0101010C0AAAAAA,Aluminium hydroxide 475mg capsules,316937008,Capsule,Discrete,1.0,capsule,capsule,475.0,mg,,,1.0


## ADQs from FOId spreadsheet

In [3]:
adqs = pd.read_csv('adqs_2017_06_01.csv')
adqs = adqs[~pd.isnull(adqs['ADQ Value'])]

In [4]:
adqs.head()

Unnamed: 0,BNF Code,BNF Name,ADQ Value,ADQ Unit
2,0101010C0AAACAC,Alum Hydrox_Oral Susp S/F,30.0,ML
40,0101010J0AAAAAA,Mag Trisil_Mix,30.0,ML
42,0101010L0AAAAAA,Gppe Susp_Asilone S/F,30.0,
43,0101010L0AAAIAI,Gppe Susp_Maalox Plus S/F,30.0,
44,0101010L0AAAKAK,Gppe Liq_Asilone S/F,30.0,


In [5]:
print("There are {} ADQs in the spreadsheet".format(len(adqs)))

There are 5822 ADQs in the spreadsheet


## ADQs and quantity data for April from "detailed" data

In [6]:
sql = """
WITH
  numbered AS (
  SELECT
    Practice_Code,
    BNF_Code,
    BNF_Description,
    Items,
    Quantity,
    ADQ_Usage,
    ROW_NUMBER() OVER (PARTITION BY BNF_Code ORDER BY BNF_Code) AS rownum
  FROM
    tmp_eu.raw_prescribing_data_2018_04
  WHERE
    ADQ_Usage > 0 )
SELECT
  Practice_Code,
  BNF_Code,
  BNF_Description,
  Items,
  Quantity,
  ADQ_Usage
FROM
  numbered
WHERE
  rownum = 1
"""
data_with_adqs = pd.io.gbq.read_gbq(sql, 'ebmdatalab', dialect='standard',verbose=False)

In [7]:
print("There are {} ADQs in the data".format(len(data_with_adqs)))
data_with_adqs.head()

There are 3785 ADQs in the data


Unnamed: 0,Practice_Code,BNF_Code,BNF_Description,Items,Quantity,ADQ_Usage
0,P92015,0102000A0AAABAB,Alverine Cit_Cap 120mg,1,168,112.0
1,P92038,0102000P0AAABAB,Mebeverine HCl_Tab 135mg,1,100,33.33333
2,L81015,0103010D0AAADAD,Cimetidine_Oral Soln 200mg/5ml,2,200,20.0
3,B86032,0103030W0AAAAAA,Tripot Dicitratobismuthate_Tab 120mg,1,48,12.0
4,P81218,0103050L0AAAJAJ,Lansoprazole_Oral Soln 30mg/5ml,1,150,45.0


## Are there any ADQs in the detailed data not in the spreadsheet?

In [8]:
df = data_with_adqs.merge(adqs, how="left", left_on="BNF_Code", right_on="BNF Code")
df2 = data_with_adqs.merge(adqs, how="right", left_on="BNF_Code", right_on="BNF Code")
not_in_mapping = df[pd.isnull(df['BNF Code'])][['BNF_Code','BNF_Description']]
not_in_data = df2[pd.isnull(df2['BNF Code'])][['BNF_Code','BNF_Description']]
print("There are {} BNF Codes with ADQs in the FOI'd mapping which are not in the detailed prescribing data".format(
    len(not_in_data)))
print("There are {} BNF Codes with ADQs in the detailed prescribing data which are not in the FOI'd mapping".format(
    len(not_in_mapping)))
print("These appear to either be things which have had code changes since July 2017 or where the drug is there but not in that strength (possibly also new additions?")
print("Examples of generic ones that are missing:")
not_in_mapping[not_in_mapping['BNF_Code'].str.contains("^.{9}AA")].head()


There are 0 BNF Codes with ADQs in the FOI'd mapping which are not in the detailed prescribing data
There are 74 BNF Codes with ADQs in the detailed prescribing data which are not in the FOI'd mapping
These appear to either be things which have had code changes since July 2017 or where the drug is there but not in that strength (possibly also new additions?
Examples of generic ones that are missing:


Unnamed: 0,BNF_Code,BNF_Description
138,0403010V0AAAGAG,Nortriptyline_Oral Susp 10mg/5ml
394,0403010X0AAANAN,Trazodone HCl_Oral Soln 100mg/5ml S/F
399,0403030E0AAAPAP,Fluoxetine HCl_Tab 10mg
551,0103010T0AABUBU,Ranitidine HCl_Oral Soln 150mg/5ml S/F
923,0301030S0AABKBK,Theophylline_Oral Soln 50mg/5ml


## How do back-calculated ADQs differ from forward-calculated ones?

In [9]:
# Forward-calculated first
merged = dmd.merge(adqs, left_on='bnf_code', right_on='BNF Code')
merged = merged[~pd.isnull(merged['ADQ Value'])]

What we need to do:

1. Calculate "active quantity"
2. Divide that by ADQ
3. This gives "percent of adq" per quantity


The ADQ value is a measure of prescribing volume based on prescribing behaviour in England. It represents the assumed average maintenance dose per day for a drug used for its main indication in adults. The ADQ is not a recommended dose but an analytical unit to compare prescribing activity.

Please note if N/A in ADQ usage column this indicates that there is no ADQ value available and therefore no ADQ usage can be calculated.

If a strength is u/ml and the amount of units is equal to or more than 10,000u/ml the strength field on MDR cannot hold it. At this point the TE is used which is the equivalent of 1,000u. For example 10,000u/ml would be represented as 10.000TE/ml and 20,000u/ml would be 20.000TE/ml, the TE (Therapeutic Equivalent) does not affect the ADQ value.


In [10]:
# work out quantity_units
def compute_quantity_units(row):
    # Definition of solid-continuous products from https://github.com/ebmdatalab/openprescribing/issues/937
    solid_continuous = [
    'Wash',
    'Granules',
    'Paste',
    'Rectal ointment',
    'Stick',
    'Cream',
    'Cutaneous emulsion',
    'Oromucosal gel',
    'Oral gel',
    'Powder',
    'Nasal ointment',
    'Poultice',
    'Eye gel',
    'Vaginal gel',
    'Eye ointment',
    'Powder for solution for iontophoresis',
    'Effervescent powder',
    'Impregnated dressing',
    'Effervescent granules',
    'Oral emulsion',
    'Gel',
    'Ointment',
    'Foam'
    ]
    unit = None
    if row.form_indicator == 'Continuous' and row.form in solid_continuous:
        unit = 'g'
    elif row.form_indicator == 'Not applicable' or (row.form_indicator == 'Discrete' and row.unit_of_measure != 'spoonful'):
        unit = row.unit_of_measure
    else:
        unit = 'ml'
    return unit
merged['quantity_units'] = merged.apply(compute_quantity_units, axis=1)

In [113]:
def normalise(number, unit):
    # for numerator, can be ['mg', 'microgram', 'microlitre', 'ml', 'gram', 'mmol', nan, 'unit']
    if isinstance(number, str):
        number = float(number.replace(',',''))
    if unit == 'mega u' or unit == 'u':
        unit = 'unit dose'
        number *= 1000 # poss 1000000?
    elif unit == 'mcg' or unit == 'microgram':
        unit = 'g'
        number = number / 1000.0 / 1000
    elif unit == 'mg':
        unit = 'g'
        number = number / 1000.0
    elif unit == 'gramme' or unit == 'gram':
        unit = 'g'
    elif unit == 'te':
        unit = 'unit dose'
        number = number * 1000
    elif unit == 'puffs':
        unit = 'dose'  # to match dm+d terminology
    elif unit == 'microlitre':
        number = number / 1000.0
        unit = 'ml'
    elif unit == 'litre':
        number = number * 1000
        unit = 'ml'
    return number, unit
        
debug_name = 'Co-Amilofruse_Tab 5mg/40mg'    
# normalise units to all be either g, ml, dose or unit
def pick_dose_from_liquids(row):   
    # won't work for compounds:
    if row.ingredient_count > 1:
        return row
    import re
    if row['BNF Name'] == debug_name:
        debug = True
        #print(row)
    else:
        debug = False
    unit = None
    units_per_quantum = None
    # if row.unit_of_measure == 'unit dose' or # <<< adding this broke some  sachets XXX
    if row.unit_of_measure == 'unit dose' or (pd.isnull(row.unit_of_measure) and "/" in row['BNF Name']):
        # the ADQ may be expressed in numerator OR denominator units (or "dose")
        matches = re.findall(r"[^0-9.,]([0-9.,]+)(u|g|ml|mg|mcg|mega u)\b", row['BNF Name'])
        if matches:
            units_per_quantum = row.normalised_num
        for match in matches:
            number, unit = match          
            number, unit = normalise(number, unit)
            if debug:
                print(number, unit, row.normalised_adq_units)
            if unit == row.normalised_adq_units:
                # if we're talking unit dose, set the value
                # otherwise, pick it from num/denom
                if row.unit_of_measure == 'unit dose':
                    units_per_quantum = number  # XXX why?
                break
            units_per_quantum = row.normalised_denom  # second time round the loop
        if not units_per_quantum:
            # error: debug
            print("no units_per_q", row['BNF Name'], matches, row.normalised_adq_units)
        elif debug:
            #print(row)
            print("!", row['BNF Name'], matches, row.normalised_adq_units, units_per_quantum)
    row['compound_liquid_val'] = units_per_quantum  # or `number`?
    row['compound_liquid_unit'] = unit
    return row

    
def normalise_units(row):
    # everything must be g, ml, or units
    #row.index = merged.columns
    adq_val = row['ADQ Value']
    adq_unit = row['ADQ Unit'].strip().lower()
    adq_val, adq_unit = normalise(adq_val, adq_unit)
    if adq_unit == '':
        # By comparison with this old PDF, we can see that the ADQ unit of measure appears always to be
        # the same as the quantity units, when unspecified. http://webarchive.nationalarchives.gov.uk/20180328130852tf_/http://content.digital.nhs.uk/media/9376/Average-daily-quantity-ADQ-values-2012-13/pdf/adqs_2012_13.pdf/
        adq_unit = row['quantity_units']
    denom_val = row['denominator']
    denom_unit = row['denominator_unit_of_measure']
    num_val = row['numerator']
    num_unit = row['numerator_unit_of_measure']
    if denom_unit == 'mg':
        denom_val = denom_val / 1000.0
    elif denom_unit == 'litre':
        denom_val = denom_val * 1000
    # units
    num_val, num_unit = normalise(num_val, num_unit)

    # for items measured in volume and weight, pick which of the denominator or numerator 
    # contains the ADQ units
    
    row['normalised_adq_units'] = adq_unit
    row['normalised_adq_values'] = adq_val
    row['normalised_denom'] = denom_val
    row['normalised_num'] = num_val
    row['normalised_num_unit'] = num_unit
    row = pick_dose_from_liquids(row)
    return row
merged = merged.apply(normalise_units, axis=1)


('no units_per_q', 'Ispag Husk_Gran 90% S/F', [], 'g')
('no units_per_q', 'Sterculia_Gran 62% G/F', [], 'g')
('no units_per_q', 'ColiFin 1 MIU_Neb Pdr', [], 'unit dose')


In [114]:
merged[merged['BNF Name'] == debug_name]

Unnamed: 0,bnf_code,name,vpid,form,form_indicator,form_size,form_units,unit_of_measure,numerator,numerator_unit_of_measure,...,ADQ Unit,quantity_units,normalised_adq_units,normalised_adq_values,normalised_denom,normalised_num,normalised_num_unit,compound_liquid_val,compound_liquid_unit,units_per_quantum
927,0202040B0AAAAAA,Co-amilofruse 5mg/40mg tablets,318136009,Tablet,Discrete,1.0,tablet,tablet,5.0,mg,...,,tablet,tablet,1.0,,0.005,g,,,0.005
928,0202040B0AAAAAA,Co-amilofruse 5mg/40mg tablets,318136009,Tablet,Discrete,1.0,tablet,tablet,40.0,mg,...,,tablet,tablet,1.0,,0.04,g,,,0.04


In [115]:
def compute_units_per_quantum(row):
    import re
    units_per_quantum = None
    if row['BNF Name'] == debug_name:
        print(row)
        debug = True
    else:
        debug = False
    if row.form_units == 'unit dose' or (row.normalised_adq_units == row.quantity_units) or not row.normalised_adq_units:
        if row.form_units == 'sachet':
            units_per_quantum = 1
        elif row.form_indicator == 'Discrete':
            units_per_quantum = row.normalised_num 
        elif row.ingredient_count == 1:
            units_per_quantum = row.normalised_num / row.normalised_denom #dies this fix current prb?
        else:
            # for compounds, fall back to assuming it's 1
            units_per_quantum = 1
        if debug:
            print ("0***", units_per_quantum, row.normalised_adq_units, row.quantity_units, row.form_indicator)
            print(row)
    else:

        if row.quantity_units == 'ml':
            # if it's in ml, then it's something in g/ml
            # but spoonfuls may be denominated in numerator or denominator. Pick the one that matches
            if row.normalised_num_unit == row.normalised_adq_units:
                units_per_quantum = row.normalised_num
            else:
                # but what is compound_liquid_unit = should check
                units_per_quantum = row.compound_liquid_val
        else:
            # it must be in discrete units - there should be no denominator
            units_per_quantum = row.normalised_num 
            if debug:
                print ("1***", units_per_quantum)
        if not pd.isnull(row.form_size) and row.unit_of_measure not in ('spoonful', 'unit dose'):
            units_per_quantum *= row.form_size  # for example, a liquid at 50mcg/ml comes in 5ml vials
            if debug:
                print ("2***", units_per_quantum)
        if row.denominator_unit_of_measure == 'hour':
            units_per_quantum *= 24
    # deal with pack sizes. See https://github.com/ebmdatalab/openprescribing/issues/940
    pack_size = None
    pack_size_match = re.search(r'\b(\d+) ?D\b', row['BNF Name'])
    if pack_size_match:
        pack_size = int(pack_size_match.group(1))
    if not pack_size_match:
        # Also in the form of `Mesalazine_Foam Aero Enem 1g/D 14g`
        pack_size_match = re.search(r'\b1(g|mg|ml|mg)/D (\d+)(g|mg|ml|mg)', row['BNF Name'])
        if pack_size_match:
            assert pack_size_match.group(1) == pack_size_match.group(3)
            pack_size = int(pack_size_match.group(2))
    if pack_size:
        units_per_quantum *= pack_size
        if debug:
            print ("3. ***", units_per_quantum, pack_size)
    row['units_per_quantum'] = units_per_quantum
    if debug:
        print ("4. ***", units_per_quantum)
    return row
merged = merged.apply(compute_units_per_quantum, axis=1)

bnf_code                                      0202040B0AAAAAA
name                           Co-amilofruse 5mg/40mg tablets
vpid                                                318136009
form                                                   Tablet
form_indicator                                       Discrete
form_size                                                   1
form_units                                             tablet
unit_of_measure                                        tablet
numerator                                                   5
numerator_unit_of_measure                                  mg
denominator                                               NaN
denominator_unit_of_measure                               NaN
ingredient_count                                            2
BNF Code                                      0202040B0AAAAAA
BNF Name                           Co-Amilofruse_Tab 5mg/40mg
ADQ Value                                                   1
ADQ Unit

# Now compare with released data 

**This is as far as I've got!**  Pick up again from here.

In [111]:

df = data_with_adqs.merge(merged, left_on="BNF_Code", right_on="bnf_code")
#df.head()
cols = ['BNF_Code', 'BNF_Description', 'Quantity', 'ADQ_Usage', 'computed_adq', 'ADQ Value', 'ADQ Unit']
df['computed_adq'] = df.apply(lambda x: (x.Quantity * x.Items * x.units_per_quantum) / x.normalised_adq_values, axis=1)
df2 = df[df['ADQ_Usage'].round(2) != df['computed_adq'].round(2)]
# 124 don't match
# address where df2.unit_of_measure = 'unit dose', 'patch', 'pre-filled disposable injection'
len(df2)


684

In [112]:
unexplained_1 = df2[df2.unit_of_measure == 'spoonful']
df2[df2.bnf_code == '0407020B0BDAAAE'].iloc[0]
unexplained_2 = df2[df2.unit_of_measure == 'patch']  # mostly out by a factor of 4
df2[(df2.unit_of_measure != 'patch') & (df2.unit_of_measure != 'spoonful')].iloc[2]

# cannot explain discrepancy in :
#  patches 0407020B0BDAAAE, 0407020B0BDACAG, 0407020B0BIACAG, 0407020B0BNABAF (all out by factor of 4 too small)
# nebuliser liquid 0301020I0BEABAC they give double what I think
# sachets 0407010H0AADEDE they also seem wrong
#  bnf code 040801060AAA4A4 appears more than once in our dmd mapping to different VMPs / strengths
#  spray 1202010M0AAADAD - I think I'm right and they're saying per-dose. This is a pack/spray discrepancy
#  enema 0105010B0BDADAL - I think I'm right
#  capsule with 2 ingredients. They have "assumed" the capsule itself weighs one gram, or that the ADQ units is in capsules.
#  actuation foam 0105020B0BBAAAA - I think I'm right
#  Normacol granules 0106010N0BBAAAA I think they're right - we should be using numerator


Practice_Code                                          E84025
BNF_Code                                      0202040B0AAAAAA
BNF_Description                    Co-Amilofruse_Tab 5mg/40mg
Items                                                       1
Quantity                                                   84
ADQ_Usage                                                  84
bnf_code                                      0202040B0AAAAAA
name                           Co-amilofruse 5mg/40mg tablets
vpid                                                318136009
form                                                   Tablet
form_indicator                                       Discrete
form_size                                                   1
form_units                                             tablet
unit_of_measure                                        tablet
numerator                                                  40
numerator_unit_of_measure                                  mg
denomina

In [398]:
df2.iloc[2]

Practice_Code                                                          P92620
BNF_Code                                                      0407020B0BDACAG
BNF_Description                         Transtec_T/Derm Patch 70mcg/hr (40mg)
Items                                                                       1
Quantity                                                                    8
ADQ_Usage                                                                  24
bnf_code                                                      0407020B0BDACAG
name                           Transtec 70micrograms/hour transdermal patches
vpid                                                                407897008
form                                                        Transdermal patch
form_indicator                                                       Discrete
form_size                                                                   1
form_units                                                      

In [87]:
cols = ['bnf_code', 'BNF Name', 'quantity_units', 'ADQ Value', 'normalised_adq_units', 'units_per_quantum', 'normalised_adq_values']
# We see ADQ units ml, gramme, puffs, and null for compound products
compound = merged[merged.ingredient_count > 1]
df = compound[(compound.quantity_units != 'dose') & (compound['ADQ Unit'].str.strip() != "") & (compound['normalised_adq_units'] != compound.quantity_units)][cols]
print("There are {} compound, non-dose-based products where the ADQ units are named and don't match the quantity units".format(len(df.bnf_code.unique())))
print("These should be excluded from ADQ calculations as it's not possible to compute with different units")
df2 = compound[(compound.quantity_units != 'dose') & (compound['ADQ Unit'].str.strip() == "")][cols]
print("There are {} compound, non-dose-based products with no ADQ units".format(len(df2.bnf_code.unique())))
print("These include things like 'Ispag/Mebeverine_Gran Eff 3.5g/135mg S/F' (0102000X0AAAAAA). which actually has units specified in this file: http://webarchive.nationalarchives.gov.uk/20180328130852tf_/http://content.digital.nhs.uk/media/9376/Average-daily-quantity-ADQ-values-2012-13/pdf/adqs_2012_13.pdf/")
print("Hence we set the normalised_adq_units to match the quantity_units in these cases")
# Where ADQ Unit is null, it looks like the unit might be the same as the quantity_units
df3 = compound[(compound.quantity_units == 'dose') & (compound['ADQ Unit'].str.strip() == "") & (compound['normalised_adq_units'] != compound.quantity_units)][cols]
print("There are {} compound products which come in 'doses', and have ADQ units".format(len(df3)))
df4 = compound[(compound.quantity_units == 'dose') & (compound['ADQ Unit'].str.strip() != "") & (compound['normalised_adq_units'] != compound.quantity_units)][cols]
print("There are {} compound products which come in 'doses', and have no ADQ units".format(len(df4)))
df4
print("All of the above two are types of inhaler. ")
print("In fact every compound product issued in 'doses' is an inhalaler; and every product in 'doses' is either an inhaler or nasal spray. And they all have 40D or similar in name")
df5 = compound[(compound.quantity_units == 'dose')]
df5
df6 = merged[merged.quantity_units == 'dose']
dose_doses = df6[(df6['BNF Name'].str.contains('\d ?D')) & (df6.normalised_adq_units == 'dose')]
print("There are {} dose-based presentations where the ADQ is measured in grammes".format(len(df6[(df6['BNF Name'].str.contains('\d ?D')) & (df6.normalised_adq_units == 'g')])))
print("and {} dose-based presentations where the ADQ is measured in doses".format(len(df6[(df6['BNF Name'].str.contains('\d ?D')) & (df6.normalised_adq_units == 'dose')])))
gram_doses = df6[(df6['BNF Name'].str.contains('\d ?D')) & (df6.normalised_adq_units != 'dose')]
gram_doses.head()


There are 13 compound, non-dose-based products where the ADQ units are named and don't match the quantity units
These should be excluded from ADQ calculations as it's not possible to compute with different units
There are 325 compound, non-dose-based products with no ADQ units
These include things like 'Ispag/Mebeverine_Gran Eff 3.5g/135mg S/F' (0102000X0AAAAAA). which actually has units specified in this file: http://webarchive.nationalarchives.gov.uk/20180328130852tf_/http://content.digital.nhs.uk/media/9376/Average-daily-quantity-ADQ-values-2012-13/pdf/adqs_2012_13.pdf/
Hence we set the normalised_adq_units to match the quantity_units in these cases
There are 0 compound products which come in 'doses', and have ADQ units
There are 0 compound products which come in 'doses', and have no ADQ units
All of the above two are types of inhaler. 
In fact every compound product issued in 'doses' is an inhalaler; and every product in 'doses' is either an inhaler or nasal spray. And they all hav

Unnamed: 0,bnf_code,name,vpid,form,form_indicator,form_size,form_units,unit_of_measure,numerator,numerator_unit_of_measure,...,BNF Code,BNF Name,ADQ Value,ADQ Unit,quantity_units,normalised_adq_units,normalised_adq_values,normalised_denom,normalised_num,units_per_quantum
2248,0301011E0AAABAB,Formoterol 6micrograms/dose dry powder inhaler,320263003,Inhalation powder,Discrete,1.0,dose,dose,6.0,microgram,...,0301011E0AAABAB,Formoterol Fumar_Pdr For Inh 6mcg (60 D),24.0,MCG,dose,g,2.4e-05,1.0,6e-06,0.00036
2249,0301011E0AAACAC,Formoterol 12micrograms/dose dry powder inhaler,320264009,Inhalation powder,Discrete,1.0,dose,dose,12.0,microgram,...,0301011E0AAACAC,Formoterol Fumar_Pdr For Inh 12mcg(60 D),24.0,MCG,dose,g,2.4e-05,1.0,1.2e-05,0.00072
2250,0301011E0AAADAD,Formoterol 12micrograms/dose inhaler CFC free,9652711000001107,Pressurised inhalation,Discrete,1.0,dose,dose,12.0,microgram,...,0301011E0AAADAD,Formoterol Fumar_Inh 12mcg (100D) CFF,24.0,MCG,dose,g,2.4e-05,1.0,1.2e-05,0.0012
2252,0301011E0BCAAAB,Oxis 6 Turbohaler,320263003,Inhalation powder,Discrete,1.0,dose,dose,6.0,microgram,...,0301011E0BCAAAB,Oxis 6_Turbohaler 6mcg (60 D),24.0,MCG,dose,g,2.4e-05,1.0,6e-06,0.00036
2253,0301011E0BCABAC,Oxis 12 Turbohaler,320264009,Inhalation powder,Discrete,1.0,dose,dose,12.0,microgram,...,0301011E0BCABAC,Oxis 12_Turbohaler 12mcg (60 D),24.0,MCG,dose,g,2.4e-05,1.0,1.2e-05,0.00072


In [88]:
dose_doses.head()

Unnamed: 0,bnf_code,name,vpid,form,form_indicator,form_size,form_units,unit_of_measure,numerator,numerator_unit_of_measure,...,BNF Code,BNF Name,ADQ Value,ADQ Unit,quantity_units,normalised_adq_units,normalised_adq_values,normalised_denom,normalised_num,units_per_quantum
2420,0301040M0AAAAAA,Fenoterol 100micrograms/dose / Ipratropium 40m...,320440005,Pressurised inhalation,Discrete,1.0,dose,dose,40.0,microgram,...,0301040M0AAAAAA,Fenoterol/Ipratrop_Inha 100/40mcg (200D),3.0,,dose,dose,3.0,1.0,4e-05,200.0
2421,0301040M0AAAAAA,Fenoterol 100micrograms/dose / Ipratropium 40m...,320440005,Pressurised inhalation,Discrete,1.0,dose,dose,100.0,microgram,...,0301040M0AAAAAA,Fenoterol/Ipratrop_Inha 100/40mcg (200D),3.0,,dose,dose,3.0,1.0,0.0001,200.0
2424,0301040M0BBAAAA,Duovent inhaler,320440005,Pressurised inhalation,Discrete,1.0,dose,dose,40.0,microgram,...,0301040M0BBAAAA,Duovent_Inha (200 D),3.0,,dose,dose,3.0,1.0,4e-05,200.0
2425,0301040M0BBAAAA,Duovent inhaler,320440005,Pressurised inhalation,Discrete,1.0,dose,dose,100.0,microgram,...,0301040M0BBAAAA,Duovent_Inha (200 D),3.0,,dose,dose,3.0,1.0,0.0001,200.0
2428,0301040R0AAAAAA,Salbutamol 100micrograms/dose / Ipratropium 20...,320442002,Pressurised inhalation,Discrete,1.0,dose,dose,20.0,microgram,...,0301040R0AAAAAA,Salbutamol/Ipratropium_Inh 100/20 (200D),4.0,,dose,dose,4.0,1.0,2e-05,200.0


In [177]:
merged.form_size

0       5.0
1       5.0
2       5.0
3       5.0
4       5.0
5       5.0
6       5.0
7       5.0
8       5.0
9       5.0
10      5.0
11      5.0
12      5.0
13      5.0
14      5.0
15      5.0
16      5.0
17      5.0
18      5.0
19      5.0
20      1.0
21      1.0
22      1.0
23      1.0
24      1.0
25      1.0
26      1.0
27      NaN
28      NaN
29      NaN
       ... 
6692    1.0
6693    1.0
6694    1.0
6695    1.0
6696    1.0
6697    1.0
6698    1.0
6699    1.0
6700    1.0
6701    1.0
6702    1.0
6703    NaN
6704    1.0
6705    1.0
6706    1.0
6707    NaN
6708    NaN
6709    1.0
6710    1.0
6711    1.0
6712    1.0
6713    1.0
6714    1.0
6715    1.0
6716    1.0
6717    1.0
6718    1.0
6719    1.0
6720    1.0
6721    1.0
Name: form_size, Length: 6722, dtype: float64

In [99]:
df2.head()

Unnamed: 0,bnf_code,BNF Name,quantity_units,ADQ Value,normalised_adq_units,units_per_quantum,normalised_adq_values
3,0101010L0AAAAAA,Gppe Susp_Asilone S/F,ml,30.0,ml,1.0,30.0
4,0101010L0AAAAAA,Gppe Susp_Asilone S/F,ml,30.0,ml,1.0,30.0
5,0101010L0AAAAAA,Gppe Susp_Asilone S/F,ml,30.0,ml,1.0,30.0
6,0101010L0AAAIAI,Gppe Susp_Maalox Plus S/F,ml,30.0,ml,1.0,30.0
7,0101010L0AAAIAI,Gppe Susp_Maalox Plus S/F,ml,30.0,ml,1.0,30.0


In [93]:
dose_nondoses = df2[~(df2['BNF Name'].str.contains('\d ?D')) & (df2.normalised_adq_units == 'dose')]
dose_nondoses.head()

Unnamed: 0,bnf_code,BNF Name,quantity_units,ADQ Value,normalised_adq_units,units_per_quantum,normalised_adq_values


So we have discovered:
* Things whose `unit_of_measure` is `dose` are always those packs of inhaler or similar
* Things without ADQ Units are always things which come in discrete packages, and are counted in those packages

Things which come in doses:
* Are prescribed as 1 = 120D 
* ADQs are either in mg or puffs/doses.  The ones measured in doses are compound products; the ones in grams are not.
* When in puffs, the total adq units (units_per_quantum) in one quantity is just the number (200D) - so multiply quantity by 200
* When in grammes, the units_per_quantum would be num/denom * pack_size

In [None]:
merged[merged.name.str.contains("statin")]

In [None]:
merged[merged.unit_of_measure == 'tablet'].iloc[0]

In [None]:
merged.loc[543]

In [None]:
merged.loc[544]

In [None]:
merged.loc[545]

In [None]:
merged[merged.bnf_code == '0102000X0AAAAAA']

In [26]:
merged[merged['BNF Name'].str.contains('\(\d+ ?D')].form.unique()

array(['Foam', 'Inhalation powder', 'Pressurised inhalation',
       'Solution for injection', 'Spray'], dtype=object)

In [106]:
merged.merge(not_in_mapping, how='right',left_on='bnf_code', right_on='BNF_Code')

Unnamed: 0,bnf_code,name,vpid,form,form_indicator,form_size,form_units,unit_of_measure,numerator,numerator_unit_of_measure,...,ADQ Value,ADQ Unit,quantity_units,normalised_adq_units,normalised_adq_values,normalised_denom,normalised_num,units_per_quantum,BNF_Code,BNF_Description
0,,,,,,,,,,,...,,,,,,,,,0403010X0AAANAN,Trazodone HCl_Oral Soln 100mg/5ml S/F
1,,,,,,,,,,,...,,,,,,,,,0403030E0AAAPAP,Fluoxetine HCl_Tab 10mg
2,,,,,,,,,,,...,,,,,,,,,0407020ADBQACAH,Onexila XL_Tab 40mg
3,,,,,,,,,,,...,,,,,,,,,0407020B0BSABAF,Turgeon_Transdermal Patch 52.5mcg/hr
4,,,,,,,,,,,...,,,,,,,,,0102000T0BKAAAA,PepperMinn_Cap G/R 0.2ml


In [128]:
# To check later against detailed prescribing data
weird1 = merged[(merged.denominator_unit_of_measure != merged.quantity_units) & (merged.form_indicator != 'Discrete') & (merged.ingredient_count == 1)]

In [None]:
merged[merged.ingredient_count == 1 & (merged.normalised_adq_units != merged.quantity_units) & (merged.quantity_units == 'ml')]

In [238]:
for name in merged[merged.unit_of_measure == 'unit dose']['BNF Name']:
    match = re.findall(r"[^0-9.]([0-9.]+)(ml|mg|mcg|mega u)", name)
    # Also 1mega u
    if match:
        val, unit = match[-1]
        val = float(val)
        if unit == 'mega u':
            unit = 'unit dose'
            val *= 1000
        elif unit == 'mcg':
            unit = 'g'
            val = val / 1000.0 / 1000
        elif unit == 'mg':
            unit = 'g'
            val = val / 1000.0
        print(name, val, unit)
    else:
        print("----", name)

('Salbutamol_Inh Soln 2.5mg/2.5ml Ud', 2.5, 'ml')
('Salbutamol_Inh Soln 5mg/2.5ml Ud', 2.5, 'ml')
('Ventolin_Nebules Soln 2.5mg/2.5ml Ud', 2.5, 'ml')
('Ventolin_Nebules Soln 5mg/2.5ml Ud', 2.5, 'ml')
('Salamol_Steri-Neb Soln 2.5mg/2.5ml Ud', 2.5, 'ml')
('Salamol_Steri-Neb Soln 5mg/2.5ml Ud', 2.5, 'ml')
('Salbutamol Steripoule_2.5mg/2.5ml Ud', 2.5, 'ml')
('Salbutamol Steripoule_5mg/2.5ml Ud', 2.5, 'ml')
('Terbut Sulf_Inh Soln 5mg/2ml Ud', 2.0, 'ml')
('Bricanyl_Respule 2.5mg/ml 2ml Ud', 2.0, 'ml')
('Ipratrop Brom_Inh Soln 500mcg/2ml Ud', 2.0, 'ml')
('Ipratrop Brom_Inh Soln 250mcg/1ml Ud', 1.0, 'ml')
('Atrovent UDVs_Neb Soln 500mcg/2ml Ud', 2.0, 'ml')
('Atrovent UDVs_Neb Soln 250mcg/1ml Ud', 1.0, 'ml')
('Ipratrop_Steri-Neb Soln 250mcg/1ml Ud', 1.0, 'ml')
('Ipratrop_Steri-Neb Soln 500mcg/2ml Ud', 2.0, 'ml')
('Respontin_Nebules 250mcg/1ml Ud', 1.0, 'ml')
('Respontin_Nebules 500mcg/2ml Ud', 2.0, 'ml')
('Ipratrop Steripoule_250mcg/1ml Ud', 1.0, 'ml')
('Ipratrop Steripoule_500mcg/2ml Ud', 2.0,

In [239]:
merged[merged.unit_of_measure == 'unit dose'].iloc[0]

bnf_code                                                         0301011R0AAAXAX
name                           Salbutamol 2.5mg/2.5ml nebuliser liquid unit d...
vpid                                                                   320177008
form                                                            Nebuliser liquid
form_indicator                                                          Discrete
form_size                                                                    2.5
form_units                                                                    ml
unit_of_measure                                                        unit dose
numerator                                                                      1
numerator_unit_of_measure                                                     mg
denominator                                                                    1
denominator_unit_of_measure                                                   ml
ingredient_count            