## Code used to create ppu table
- table saved at ebmdatalab.outlier_detection.ppu_national_presentation_level

## Import data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# BigQuery project ID handed to read_gbq below.
GBQ_PROJECT_ID = '620265099307'

# One row per (month, bnf_code) with summed ppu and summed quantity,
# ordered by BNF code and then by month.
q = '''
SELECT
  month,
  bnf_code,
  SUM(ppu) as ppu,
  SUM(quantity) as quantity
FROM
  ebmdatalab.outlier_detection.ppu_national_presentation_level
GROUP BY
  month, bnf_code
ORDER BY
  bnf_code, month
'''

# Run the query (standard SQL dialect) and load the result into a DataFrame.
ppu = pd.io.gbq.read_gbq(
    query=q,
    project_id=GBQ_PROJECT_ID,
    verbose=False,
    dialect='standard',
)
ppu.head(3)

Unnamed: 0,month,bnf_code,ppu,quantity
0,2013-12-01,23965609635,5.666,850
1,2014-01-01,23965609635,5.666,470
2,2014-02-01,23965609635,5.666,590


In [2]:
# Reshape the long query result into wide form: one row per BNF code and
# one ('ppu', month) column per month.
df = (
    ppu.loc[:, ['month', 'bnf_code', 'ppu']]
    .sort_values(by=['bnf_code', 'month'])
    .set_index(['bnf_code', 'month'])
    .unstack(level='month')
)
df.head(3)

Unnamed: 0_level_0,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu,ppu
month,2010-08-01,2010-09-01,2010-10-01,2010-11-01,2010-12-01,2011-01-01,2011-02-01,2011-03-01,2011-04-01,2011-05-01,...,2016-11-01,2016-12-01,2017-01-01,2017-02-01,2017-03-01,2017-04-01,2017-05-01,2017-06-01,2017-07-01,2017-08-01
bnf_code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0101010C0AAAAAA,0.031264,0.031261,0.031264,0.031262,0.031263,0.038091,0.038092,0.038091,0.03809,0.038091,...,0.114256,0.114256,0.114257,0.114257,0.114256,0.114258,0.11426,0.114256,0.114256,0.114256
0101010C0AAACAC,,,,,,,,,,,...,,,,,,,,,,
0101010C0AAAHAH,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Same wide reshape as for ppu, but holding the monthly quantity.
quantity = (
    ppu.loc[:, ['bnf_code', 'month', 'quantity']]
    .sort_values(by=['bnf_code', 'month'])
    .set_index(['bnf_code', 'month'])
    .unstack(level='month')
)
# Label the month columns by position (0, 1, 2, ...) instead of by date.
quantity.columns = range(len(quantity.columns))
quantity.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,75,76,77,78,79,80,81,82,83,84
bnf_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0101010C0AAAAAA,31967.0,42326.0,34780.0,38484.0,35777.0,35160.0,31664.0,34220.0,31867.0,35046.0,...,16290.0,26066.0,19605.0,17784.0,21539.0,17888.0,18461.0,19701.0,18034.0,18559.0
0101010C0AAACAC,,,,,,,,,,,...,,,,,,,,,,
0101010C0AAAHAH,,,,,,,,,,,...,,,,,,,,,,


## CUSUM

In [4]:
# Tunable CUSUM parameters.
months_smoothing = 12  # months used for the reference mean and standard deviation
sensitivity = 5        # threshold = sensitivity * standard deviation

# Replace the date column labels with integer positions so that months can be
# addressed as i-1, i, ... in the loop below.
# Guarded so the cell can be re-run safely: the freshly unstacked frame has
# two-level (MultiIndex) columns, the relabelled one does not. Without the
# guard a second run would overwrite `dates` with the integer labels.
if isinstance(df.columns, pd.MultiIndex):
    dates = df.columns  # save date labels for later
    df.columns = range(df.shape[1])

# create necessary tables with same index (one column is added per month)
smax = pd.DataFrame(index=df.index)
smin = pd.DataFrame(index=df.index)
reference_value = pd.DataFrame(index=df.index)
threshold = pd.DataFrame(index=df.index)

# set initial values for s
smax[0] = 0
smin[0] = 0

# set initial reference value for each row: mean of the first
# `months_smoothing` months (.loc label slices include both end points)
reference_value[0] = df.loc[:, 0:months_smoothing-1].mean(axis=1)
# set initial threshold: population standard deviation of the same window,
# scaled by `sensitivity`
threshold[0] = df.loc[:, 0:months_smoothing-1].std(axis=1, ddof=0) * sensitivity

# loop over months of data
for i in range(1, df.shape[1]):

    # logic to determine alerts from previous month
    alert = (smax[i-1] > threshold[i-1]) | (smin[i-1] < -threshold[i-1])
    # `not_alert` is computed explicitly rather than as `~alert`: comparisons
    # involving NaN are False, so rows with missing values are in neither mask.
    not_alert = (smax[i-1] <= threshold[i-1]) & (smin[i-1] >= -threshold[i-1])

    # reference value and threshold:
    ## carry the previous value forward if no alert
    reference_value.loc[not_alert, i] = reference_value.loc[not_alert, i-1]
    threshold[i] = threshold[i-1]

    ## reset reference after alert: mean of the preceding window of months
    reference_value.loc[alert, i] = df.loc[alert, i-months_smoothing:i-1].mean(axis=1)

    # CUSUM calculation (adding on previous s values is done further down)
    ## threshold[i] / sensitivity is a way of deriving stdev without generating it again
    smax[i] = df[i] - (reference_value[i] + (0.5 * threshold[i] / sensitivity))
    smin[i] = df[i] - (reference_value[i] - (0.5 * threshold[i] / sensitivity))

    # retrigger logic: previous month was over the threshold and this month
    # still deviates in the same direction
    retrigger_pos = (smax[i] > 0) & (smax[i-1] > threshold[i-1])
    retrigger_neg = (smin[i] < 0) & (smin[i-1] < -threshold[i-1])
    cont_cumulative = not_alert | retrigger_pos | retrigger_neg
    # `|` binds tighter than `!=`, so the parentheses only make the existing
    # evaluation order explicit: alerted rows that did not retrigger.
    thres_res = alert != (retrigger_pos | retrigger_neg)

    # reset threshold
    threshold.loc[thres_res, i] = df.loc[thres_res, i-months_smoothing:i-1].std(axis=1, ddof=0) * sensitivity

    # add s from previous month if no trigger, or retrigger
    smax.loc[cont_cumulative, i] = smax.loc[cont_cumulative, i] + smax.loc[cont_cumulative, i-1]
    smin.loc[cont_cumulative, i] = smin.loc[cont_cumulative, i] + smin.loc[cont_cumulative, i-1]

    # set baselines of zero: smax never goes below 0, smin never above 0
    smax.loc[smax[i] < 0, i] = 0
    smin.loc[smin[i] > 0, i] = 0

# determine & export alerts (CSV files are written to the working directory)
alert_pos = (smax > threshold)
alert_pos.to_csv('alert_pos_pandas.csv')

alert_neg = (smin < -threshold)
alert_neg.to_csv('alert_neg_pandas.csv')

df.head(3)
# other potentially useful outputs (for drawing graphs etc)
#smax
#smin
#df (the monthly values)
#reference_value
#threshold

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,75,76,77,78,79,80,81,82,83,84
bnf_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0101010C0AAAAAA,0.031264,0.031261,0.031264,0.031262,0.031263,0.038091,0.038092,0.038091,0.03809,0.038091,...,0.114256,0.114256,0.114257,0.114257,0.114256,0.114258,0.11426,0.114256,0.114256,0.114256
0101010C0AAACAC,,,,,,,,,,,...,,,,,,,,,,
0101010C0AAAHAH,,,,,,,,,,,...,,,,,,,,,,


## Determine size and duration of change

In [5]:
# Per-month tables describing each detected change, all sharing df's index.
change_from = reference_value.copy()   # reference value the change is measured from
change = pd.DataFrame(index=df.index)  # reference shift multiplied by that month's quantity
change[0] = 0.0
change_proportion = pd.DataFrame(index=df.index)  # reference shift relative to the previous reference
change_proportion[0] = 0.0
duration = pd.DataFrame(index=df.index)  # consecutive months with a non-zero CUSUM
duration[0] = 0

for i in range(1, df.shape[1]):
    # While either cumulative sum is non-zero, keep the previous month's
    # reference. Assigned through a single .loc call on the frame: the chained
    # form `change_from[i].loc[mask] = ...` writes to an intermediate Series and
    # is not guaranteed to update `change_from` (it never does under pandas
    # Copy-on-Write).
    # NOTE(review): NaN != 0 evaluates True, so rows with missing s values are
    # included in the mask - confirm this is intended.
    in_progress = (smax[i] != 0) | (smin[i] != 0)
    change_from.loc[in_progress, i] = change_from.loc[in_progress, i-1]

    change[i] = (change_from[i] * quantity[i]) - (change_from[i-1] * quantity[i])
    change_proportion[i] = (change_from[i] - change_from[i-1]) / change_from[i-1]

    # Duration restarts at 0 and is extended by one month wherever the
    # previous month's cumulative sums were non-zero.
    duration[i] = 0
    was_in_progress = (smax[i-1] != 0) | (smin[i-1] != 0)
    duration.loc[was_in_progress, i] = duration.loc[was_in_progress, i-1] + 1

## Results

In [6]:
# Put the saved date labels back on the columns so the stacked rows carry
# their month.
change.columns = dates
duration.columns = dates

# Long format: one row per (bnf_code, month), with the cost change joined to
# the duration for the same key.
output = change.stack().reset_index()
output = output.merge(duration.stack().reset_index(),
                      how='inner',
                      on=['bnf_code','month'])
output.columns = ['bnf_code','month','cost_change','duration']

pd.options.display.float_format = '{:,.2f}'.format
# Only the last expression of a cell is displayed, so the former
# `duration < 5` filter statement (whose result was discarded) has been
# removed. Filter `output` on 'duration' before sorting to see short-lived
# changes only.
output.sort_values(by='cost_change')

Unnamed: 0,bnf_code,month,cost_change,duration
147143,0212000B0AAABAB,2017-07-01,-36543247.51,75
147205,0212000B0AAACAC,2015-08-01,-17554616.11,56
543379,090401000BBMZA0,2012-03-01,-14756129.57,18
147034,0212000B0AAAAAA,2015-07-01,-7623230.81,51
350770,0411000D0AAABAB,2014-09-01,-6258676.20,32
147281,0212000B0AAADAD,2014-11-01,-5608633.09,34
488302,0704050Z0AAACAC,2015-07-01,-4267710.85,51
209081,040201060AAACAC,2013-05-01,-4155314.09,16
177495,0303020G0AAABAB,2014-12-01,-3994660.17,21
501907,0803041L0AAAAAA,2013-09-01,-3605885.70,29
