# Primary outcomes
**P1.  Cost per 1,000 patients for all 18 pre-specified “low-priority” treatments combined.**

**P2. Total items per 1,000 patients across all 18 pre-specified “low-priority” treatments combined.**


In [1]:
%autosave 0

Autosave disabled


In [52]:
import datetime
import pandas as pd
import numpy as np
from lp_measure_conditions import tables

from analysis import compute_regression
from analysis import trim_5_percentiles

import logging
# Silence pandas_gbq log records below ERROR level.
logger = logging.getLogger('pandas_gbq')
logger.setLevel(logging.ERROR)

# BigQuery project id used when running the queries.
GBQ_PROJECT_ID = '620265099307'


# Study period boundaries (ISO dates, each the first day of a month).
baseline_start, mid_start, followup_start, post_followup_start = (
    '2017-01-01',  # first month of the baseline period
    '2017-07-01',  # first month after the baseline period ends
    '2018-01-01',  # first month of the follow-up period
    '2018-07-01',  # first month after the follow-up period ends
)

# Build the two BigQuery queries used to load the dataset.

# P1 - costs: totals that are not split into individual measures.
# NOTE(review): the year below is hard-coded rather than derived from
# `baseline_start`; keep the two in sync if the baseline period moves.
costs_sql = '''SELECT * FROM ebmdatalab.measures.ccg_data_lpzomnibus
WHERE EXTRACT (YEAR from month)  >= 2017
'''

# P2 - items: item counts summed per CCG (`pct`) and month over every
# prescribing row that matches at least one low-priority measure condition.
# Each value in `tables` is used as a SQL predicate; they are OR-ed together.
where = list(tables.values())
any_measure_condition = " OR ".join(where)
items_sql = '''SELECT pct AS pct_id, month, sum(items) AS items FROM
  `ebmdatalab.hscic.normalised_prescribing_standard`
  WHERE  month >= '{}' AND ({})
  GROUP BY pct, month'''.format(baseline_start, any_measure_condition)
# Costs (P1): run the query, parse `month` as a datetime, keep a CSV copy.
# NOTE(review): the file names are spelled "lowpriory"; left untouched because
# other code may read these exact paths.
data = pd.read_gbq(costs_sql, project_id=GBQ_PROJECT_ID, dialect='standard')
data["month"] = pd.to_datetime(data["month"])
data.to_csv("../data/lowpriory_costs.csv")

# Items (P2): same three steps for the item counts.
items = pd.read_gbq(items_sql, project_id=GBQ_PROJECT_ID, dialect='standard')
items["month"] = pd.to_datetime(items["month"])
items.to_csv("../data/lowpriory_items.csv")


In [53]:
# Preview the first two rows returned by the items query.
items.head(2)

Unnamed: 0,pct_id,month,items
0,03A,2018-08-01,1995
1,02T,2018-08-01,1697


In [54]:
# Preview the first two rows returned by the costs query.
data.head(2)

Unnamed: 0,pct_id,stp_id,month,numerator,denominator,calc_value,percentile
0,02N,E54000005,2017-06-01,28251.89183,158.713,178.006161,0.5
1,02N,E54000005,2017-07-01,28719.04243,158.745,180.913052,0.551546


In [56]:
# Attach the item counts to the cost rows. The outer join keeps any
# CCG/month that appears in only one of the two query results; the combined
# rows are then ordered by CCG and month.
merge_keys = ["pct_id", "month"]
data = pd.merge(data, items, how="outer", on=merge_keys)
data = data.sort_values(by=merge_keys)
data.head()

Unnamed: 0,pct_id,stp_id,month,numerator,denominator,calc_value,percentile,items
3309,00C,E54000049,2017-01-01,20288.48891,107.615,188.528448,0.592784,939
3311,00C,E54000049,2017-02-01,18554.53583,107.615,172.415888,0.489691,799
3302,00C,E54000049,2017-03-01,22876.98454,107.615,212.581745,0.680412,967
3304,00C,E54000049,2017-04-01,20770.54861,107.776,192.719609,0.731959,872
3296,00C,E54000049,2017-05-01,21191.8976,107.886,196.428615,0.634021,955


In [57]:
### label every row with the study period that its month falls in

# Rules run from the latest window to the earliest: np.select uses the first
# rule that matches, so each month gets the most recent period whose start
# date it has reached.
period_rules = [
    ('after', data['month'] >= post_followup_start),
    ('follow-up', data['month'] >= followup_start),
    ('mid', data['month'] >= mid_start),
    ('baseline', data['month'] >= baseline_start),
    ('before', data['month'] < baseline_start),
]
choices = [label for label, _mask in period_rules]
conditions = [mask for _label, mask in period_rules]

# Rows that match no rule at all are labelled '0'.
data['period'] = np.select(conditions, choices, default='0')

In [58]:
# Preview the first two rows, which now carry the `period` label column.
data.head(2)

Unnamed: 0,pct_id,stp_id,month,numerator,denominator,calc_value,percentile,items,period
3309,00C,E54000049,2017-01-01,20288.48891,107.615,188.528448,0.592784,939,baseline
3311,00C,E54000049,2017-02-01,18554.53583,107.615,172.415888,0.489691,799,baseline


In [59]:
### keep only the baseline and follow-up rows, indexed by CCG / period / month

# Label each row with its study period. Rules run from the latest window to
# the earliest because np.select uses the first rule that matches.
period_rules = [
    ('after', data['month'] >= post_followup_start),
    ('follow-up', data['month'] >= followup_start),
    ('mid', data['month'] >= mid_start),
    ('baseline', data['month'] >= baseline_start),
    ('before', data['month'] < baseline_start),
]
choices = [label for label, _mask in period_rules]
conditions = [mask for _label, mask in period_rules]
data['period'] = np.select(conditions, choices, default='0')

# Keep the columns of interest and drop every row outside the two analysed
# periods, then move the identifying columns into the index.
index_columns = ["pct_id", "period", "month"]
value_columns = ["numerator", "denominator", "items"]
in_study_period = data['period'].isin(["baseline", "follow-up"])
data = (
    data.loc[in_study_period, index_columns + value_columns]
    .set_index(index_columns)
)

data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,numerator,denominator,items
pct_id,period,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00C,baseline,2017-01-01,20288.48891,107.615,939
00C,baseline,2017-02-01,18554.53583,107.615,799
00C,baseline,2017-03-01,22876.98454,107.615,967
00C,baseline,2017-04-01,20770.54861,107.776,872
00C,baseline,2017-05-01,21191.8976,107.886,955


In [60]:
### Aggregate the monthly rows to one row per CCG per period:
# cost (`numerator`) and `items` are totalled over the months of the period,
# while `denominator` is averaged over those months.
#
# The aggregations are given by name ("sum") rather than as the builtin `sum`:
# pandas >= 2.1 warns when builtin callables are passed to `.agg` and will
# later apply them as plain Python functions, whereas the string always
# selects the groupby sum used today.
agg_6m = data.groupby(["pct_id", "period"]).agg(
    {"numerator": "sum", "items": "sum", "denominator": "mean"})
agg_6m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,items,denominator
pct_id,period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00C,baseline,124312.24379,5435,107.7325
00C,follow-up,102119.89023,4054,108.279167
00D,baseline,315054.76387,15334,290.8515
00D,follow-up,237929.15765,10188,292.0395
00J,baseline,355985.80688,13532,256.521833


In [61]:
### Load the **allocated** trial groups and attach the aggregated measures.
allocations = pd.read_csv('../data/randomisation_group.csv')

# joint team information
team = pd.read_csv('../data/joint_teams.csv')

# Left-join the joint-team table onto the allocations via `joint_team`, then
# derive `pct_id`: the row's `ccg_id` where there is one, otherwise its
# `joint_id`. Only the three columns needed downstream are kept.
rct_ccgs = (
    pd.merge(allocations, team, how="left", on="joint_team")
    .assign(pct_id=lambda df: df["ccg_id"].combine_first(df["joint_id"]))
    .loc[:, ["joint_id", "allocation", "pct_id"]]
)

# Bring in the per-CCG, per-period aggregates. The left join keeps every
# allocated row even when no aggregate exists for its `pct_id`.
ccg_period_totals = agg_6m.reset_index()
rct_agg_6m = pd.merge(rct_ccgs, ccg_period_totals, how="left", on="pct_id")
rct_agg_6m.head()


Unnamed: 0,joint_id,allocation,pct_id,period,numerator,items,denominator
0,01X,con,01X,baseline,402230.98221,14688,196.474
1,01X,con,01X,follow-up,347631.36917,12087,197.501167
2,99K,con,99K,baseline,397320.37086,8691,169.696667
3,99K,con,99K,follow-up,289294.82914,7659,170.398
4,01J,con,01J,baseline,287300.90109,14316,163.463333


In [62]:
# Roll the per-CCG aggregates up to joint-team level, one row per joint team
# with a separate column for each measure/period combination.
# note: SUM both numerator and population denominator across geographies
#
# The numeric columns are selected explicitly before summing. The frame still
# carries the string column `pct_id`; pandas >= 2.0 no longer drops such
# columns in `.sum()` but concatenates the strings, which would leave stray
# `pct_id_baseline` / `pct_id_follow-up` columns in the result.
measure_columns = ["numerator", "items", "denominator"]
rct_agg_6m = (
    rct_agg_6m
    .groupby(["joint_id", "allocation", "period"])[measure_columns]
    .sum()
    .unstack()
    .reset_index()
)

# Flatten the two-level column index to "<measure>_<period>"; the key columns
# have an empty second level, so the trailing "_" is stripped from them.
rct_agg_6m.columns = rct_agg_6m.columns.map('_'.join).map(lambda x: x.strip("_"))

# Numeric indicator for the regressions: `con` -> 0, `I` -> 1.
rct_agg_6m['intervention'] = rct_agg_6m.allocation.map({'con': 0, 'I': 1})
rct_agg_6m.head()

Unnamed: 0,joint_id,allocation,numerator_baseline,numerator_follow-up,items_baseline,items_follow-up,denominator_baseline,denominator_follow-up,intervention
0,00J,I,355985.80688,300047.52211,13532,9905,256.521833,259.612833,1
1,00Y,con,321697.29913,263617.28942,17934,15059,252.313167,255.081167,0
2,01F,con,222717.46981,204807.24274,9072,8001,130.523333,131.222333,0
3,01J,con,287300.90109,264583.05225,14316,12915,163.463333,164.991,0
4,01V,I,473560.42907,441433.56403,20077,18749,278.8905,280.012,1


In [63]:
# Calculate the aggregated measure values (cost and items divided by the
# population denominator) for the baseline and follow-up periods.
# NOTE(review): the numerators are period totals (summed over the months of
# each period when `agg_6m` was built) whereas the denominators are monthly
# means, so these are rates per period rather than per month -- confirm that
# this is the intended definition.
rate_columns = [
    # (new column, numerator column, denominator column)
    ("baseline_calc_value", "numerator_baseline", "denominator_baseline"),
    ("follow_up_calc_value", "numerator_follow-up", "denominator_follow-up"),
    ("baseline_items_thou", "items_baseline", "denominator_baseline"),
    ("follow_up_items_thou", "items_follow-up", "denominator_follow-up"),
]
for rate_col, top_col, bottom_col in rate_columns:
    rct_agg_6m[rate_col] = rct_agg_6m[top_col] / rct_agg_6m[bottom_col]

rct_agg_6m.head()

Unnamed: 0,joint_id,allocation,numerator_baseline,numerator_follow-up,items_baseline,items_follow-up,denominator_baseline,denominator_follow-up,intervention,baseline_calc_value,follow_up_calc_value,baseline_items_thou,follow_up_items_thou
0,00J,I,355985.80688,300047.52211,13532,9905,256.521833,259.612833,1,1387.740771,1155.749961,52.751845,38.152968
1,00Y,con,321697.29913,263617.28942,17934,15059,252.313167,255.081167,0,1274.992119,1033.464339,71.078336,59.036111
2,01F,con,222717.46981,204807.24274,9072,8001,130.523333,131.222333,0,1706.342185,1560.765135,69.504814,60.972853
3,01J,con,287300.90109,264583.05225,14316,12915,163.463333,164.991,0,1757.586214,1603.62112,87.579274,78.276997
4,01V,I,473560.42907,441433.56403,20077,18749,278.8905,280.012,1,1698.015634,1576.480879,71.988827,66.957845


In [64]:
# Time-series chart of intervention versus control -- currently DISABLED.
# The whole snippet below is wrapped in a triple-quoted string, so none of it
# executes; the cell simply evaluates to that string.
# NOTE(review): the snippet refers to names such as `prac` and `month_no` that
# are not defined in the visible part of this notebook -- presumably carried
# over from a practice-level analysis. Rewrite or delete before re-enabling.

'''# merge MONTHLY data with practice allocations
dfp = data.loc[data.month_no>0]
dfp = dfp.loc[~pd.isnull(dfp.calc_value)]
dfp = prac.merge(dfp, how='left', on='practice_id')#.set_index('allocation')
dfp = dfp[['practice_id','month_no','allocation','calc_value']]

dfp2 = dfp.groupby(['month_no','allocation']).count()
#dfp = pd.DataFrame(dfp.to_records())#.set_index('month_no')
dfp2

import seaborn as sns#; sns.set(color_codes=True)
import matplotlib.pyplot as plt
sns.set(style="darkgrid")

g = sns.tsplot(data=dfp, time="month_no",  value="calc_value", unit="practice_id",condition="allocation")
plt.ylim((0, 0.17))
plt.show()'''


'# merge MONTHLY data with practice allocations\ndfp = data.loc[data.month_no>0]\ndfp = dfp.loc[~pd.isnull(dfp.calc_value)]\ndfp = prac.merge(dfp, how=\'left\', on=\'practice_id\')#.set_index(\'allocation\')\ndfp = dfp[[\'practice_id\',\'month_no\',\'allocation\',\'calc_value\']]\n\ndfp2 = dfp.groupby([\'month_no\',\'allocation\']).count()\n#dfp = pd.DataFrame(dfp.to_records())#.set_index(\'month_no\')\ndfp2\n\nimport seaborn as sns#; sns.set(color_codes=True)\nimport matplotlib.pyplot as plt\nsns.set(style="darkgrid")\n\ng = sns.tsplot(data=dfp, time="month_no",  value="calc_value", unit="practice_id",condition="allocation")\nplt.ylim((0, 0.17))\nplt.show()'

In [65]:
### Primary Outcome P1 ########################
# Cost per 1,000 patients for all 18 pre-specified “low-priority” treatments
# combined, compared between intervention and control groups: the follow-up
# value is modelled on the baseline value plus the intervention indicator
# (`con` -> 0, `I` -> 1). The model fitting happens inside
# `compute_regression`.
#
# NOTE(review): the `data[...]` terms presumably resolve to a variable called
# `data` inside `compute_regression`, not to this notebook's own `data`
# frame (which has no such columns) -- confirm against the helper.
formula = 'data["follow_up_calc_value"] ~ data["baseline_calc_value"] + intervention'
compute_regression(rct_agg_6m, formula=formula)

Unnamed: 0_level_0,coefficient,p value,conf_int_low,conf_int_high
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,228.020097,,-21.531602,477.571795
"data[""baseline_calc_value""]",0.689449,2.317439e-11,0.541058,0.837839
intervention,-11.040542,0.7664716,-85.810741,63.729658


In [66]:
### Primary Outcome P2 ########################
# ITEMS per 1,000 patients for all 18 pre-specified “low-priority” treatments
# combined, compared between intervention and control groups: the follow-up
# value is modelled on the baseline value plus the intervention indicator
# (`con` -> 0, `I` -> 1). The model fitting happens inside
# `compute_regression`.
#
# NOTE(review): the `data[...]` terms presumably resolve to a variable called
# `data` inside `compute_regression`, not to this notebook's own `data`
# frame (which has no such columns) -- confirm against the helper.
formula = 'data["follow_up_items_thou"] ~ data["baseline_items_thou"] + intervention'
compute_regression(rct_agg_6m, formula=formula)

Unnamed: 0_level_0,coefficient,p value,conf_int_low,conf_int_high
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,-4.328017,,-11.035453,2.379419
"data[""baseline_items_thou""]",0.902156,4.976385e-18,0.786442,1.01787
intervention,-0.934645,0.6130421,-4.647518,2.778228
