# Primary outcomes
**P1.  Cost per 1,000 patients for all 18 pre-specified “low-priority” treatments combined.**

**P2. Total items per 1,000 patients for all 18 pre-specified “low-priority” treatments combined.**


In [38]:
# Turn off the notebook's periodic autosave for this session.
%autosave 0

Autosave disabled


In [39]:
import os
import requests
import pandas as pd
import numpy as np

from analysis import compute_regression

import logging

# Only let ERROR-level (and above) messages from pandas_gbq through.
logger = logging.getLogger('pandas_gbq')
logger.setLevel(logging.ERROR)

# BigQuery project that the queries are run against.
GBQ_PROJECT_ID = '620265099307'

# When True, a previously cached CSV is reused (if present) instead of
# querying BigQuery. Set to False for the real analysis.
DUMMY_RUN = True

# Period boundaries (first day of month, ISO format).
baseline_start = '2017-01-01'       # first month of the baseline period
mid_start = '2017-07-01'            # first month after the baseline period
followup_start = '2018-01-01'       # first month of the follow-up period
post_followup_start = '2018-07-01'  # first month after the follow-up period

# Measure ids whose definitions are downloaded and queried one by one.
# NOTE(review): this list has 17 entries, whereas the notebook text refers to
# 18 pre-specified treatments -- confirm whether a measure is missing or was
# deliberately left out.
all_measures = [
    'lpcoprox',
    'lpdosulepin',
    'lpdoxazosin',
    'lpfentanylir',
    'lpglucosamine',
    'lphomeopathy',
    'lplidocaine',
    'lpliothyronine',
    'lplutein',
    'lpomega3',
    'lpoxycodone',
    'lpperindopril',
    'lprubefacients',
    'lptadalafil',
    'lptramadolpara',
    'lptravelvacs',
    'lptrimipramine',
]

# URL template for a measure's JSON definition; `{commit}` pins the revision
# of the openprescribing repository and `{measure}` is one of `all_measures`.
definition_url = "/".join([
    "https://raw.githubusercontent.com/ebmdatalab/openprescribing",
    "{commit}",
    "openprescribing/frontend/management/commands/measure_definitions",
    "{measure}.json",
])
commit_for_measure_definitions = "6f949660fee06401102136926eaba075d963511d"



In [40]:
# Import data from BigQuery
# (Specifically, per-measure cost/items numerators, and population denominators)
#
# With DUMMY_RUN set and a cached CSV present, the cache is reused. Otherwise
# each measure's JSON definition is downloaded at the pinned commit, its
# `numerator_where` clause is substituted into the SQL template, the query is
# run, and the combined results are written back to the cache file.
#
# NOTE: the cached branch yields `month` as strings (no date parsing on read)
# while the fresh branch converts it with pd.to_datetime.
if DUMMY_RUN and os.path.exists("../data/all_measure_data.csv"):
    # The cache is written with its index, which comes back as 'Unnamed: 0'.
    rawdata = pd.read_csv("../data/all_measure_data.csv").drop(['Unnamed: 0'], axis=1)
else:
    # Context manager so the template's file handle is closed after reading.
    with open("measure.sql", "r") as sql_file:
        sql_template = sql_file.read()
    # Collect one frame per measure and concatenate once at the end, rather
    # than growing a DataFrame with repeated (removed-in-pandas-2) .append().
    measure_frames = []
    for measure in all_measures:
        response = requests.get(definition_url.format(
            commit=commit_for_measure_definitions, measure=measure))
        # Surface a missing/unavailable definition as an HTTP error instead of
        # an opaque JSON decoding failure.
        response.raise_for_status()
        measure_definition = response.json()
        where_condition = " ".join(measure_definition['numerator_where'])
        sql = sql_template.format(
            date_from=baseline_start,
            date_to=post_followup_start,
            where_condition=where_condition)
        df = pd.read_gbq(sql, GBQ_PROJECT_ID, dialect='standard')
        df["month"] = pd.to_datetime(df.month)
        df["measure"] = measure
        measure_frames.append(df)
    rawdata = pd.concat(measure_frames)
    rawdata.to_csv("../data/all_measure_data.csv")
rawdata.head(1)

Unnamed: 0,month,pct_id,items,cost,denominator,measure
0,2017-10-01,RXA,0,0.0,4.35,lpcoprox


In [41]:
# Collapse the per-measure rows into one row per CCG (pct_id) per month:
# items and cost are summed over the measures, while the denominator is taken
# from the first row of each group (presumably it is identical for every
# measure within a CCG/month -- confirm). Cost becomes the "numerator".
data = (
    rawdata
    .groupby(["pct_id", "month"])
    .agg({'items': 'sum', 'cost': 'sum', 'denominator': 'first'})
    .reset_index()
    .rename(columns={"cost": "numerator"})
)
data['calc_value'] = data['numerator'].div(data['denominator'])
data.head(2)

Unnamed: 0,pct_id,month,items,numerator,denominator,calc_value
0,00C,2017-01-01,546,20288.48891,107.615,188.528448
1,00C,2017-02-01,508,18554.53583,107.615,172.415888


In [42]:
# Label each row with the study period its month falls in, then keep only the
# baseline and follow-up rows.

# np.select uses the first condition that matches, so the cut-offs are listed
# from latest to earliest.
period_rules = [
    ('after', data['month'] >= post_followup_start),
    ('follow-up', data['month'] >= followup_start),
    ('mid', data['month'] >= mid_start),
    ('baseline', data['month'] >= baseline_start),
    ('before', data['month'] < baseline_start),
]
choices = [label for label, _ in period_rules]
conditions = [mask for _, mask in period_rules]
data['period'] = np.select(conditions, choices, default='0')

# Keep the columns of interest for the two analysed periods, indexed by
# CCG, period and month.
in_analysed_period = data['period'].isin(["baseline", "follow-up"])
data = data.loc[
    in_analysed_period,
    ["pct_id", "period", "month", "numerator", "denominator", "items"],
].set_index(["pct_id", "period", "month"])

data.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,numerator,denominator,items
pct_id,period,month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00C,baseline,2017-01-01,20288.48891,107.615,546
00C,baseline,2017-02-01,18554.53583,107.615,508
00C,baseline,2017-03-01,22876.98454,107.615,596


In [43]:
# One row per CCG per period: cost (numerator) and items are totalled over the
# period's months, and the denominator is averaged over them.
period_aggregations = {"numerator": "sum", "items": "sum", "denominator": "mean"}
agg_6m = data.groupby(["pct_id", "period"]).agg(period_aggregations)
agg_6m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,numerator,items,denominator
pct_id,period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00C,baseline,124312.24379,3288,107.7325
00C,follow-up,102119.89023,2845,108.279167
00D,baseline,315174.39124,9571,290.8515
00D,follow-up,237929.15765,7202,292.0395
00J,baseline,355985.80688,8919,256.521833


In [44]:
# Joint Team information (which CCGs work together in Joint Teams)
team = pd.read_csv('../data/joint_teams.csv')

# CCGs that have been allocated to the RCT, each mapped to its Joint Team
# members (left join, so every allocation row is kept)
rct_ccgs = (
    pd.read_csv('../data/randomisation_group.csv')
    .merge(team, on="joint_team", how="left")
)

# Where ccg_id is blank, fall back to joint_id so that every row has a pct_id
# to join the measure data on (presumably these are CCGs that are not part of
# a Joint Team -- confirm against the input files).
rct_ccgs = rct_ccgs.assign(
    pct_id=lambda frame: frame["ccg_id"].combine_first(frame["joint_id"])
)[["joint_id", "allocation", "pct_id"]]

# Attach the per-CCG, per-period measure totals to the allocation info
measure_totals = agg_6m.reset_index()
rct_agg_6m = rct_ccgs.merge(measure_totals, on="pct_id", how="left")
rct_agg_6m.head(3)


Unnamed: 0,joint_id,allocation,pct_id,period,numerator,items,denominator
0,01X,con,01X,baseline,402230.98221,13063,196.474
1,01X,con,01X,follow-up,347634.04107,11186,197.501167
2,99K,con,99K,baseline,397320.37086,7476,169.696667


In [45]:
# Aggregate up to Joint Team groups: one row per joint_id/allocation, with a
# baseline and a follow-up column for each summed quantity.
#
# Both the numerators and the population denominators are summed across a
# team's CCGs, so that dividing one by the other later gives a rate for the
# team's pooled population.
# XXX: flagged by the original author for confirmation - is summing right?
#
# The columns to sum are selected explicitly. A bare `.sum()` relied on pandas
# silently dropping the non-numeric `pct_id` column; pandas 2.x instead
# concatenates the strings and would add unwanted `pct_id_*` columns.
rct_agg_6m = rct_agg_6m\
             .groupby(["joint_id", "allocation", "period"])\
             [["numerator", "items", "denominator"]]\
             .sum()\
             .unstack()\
             .reset_index()
# Flatten the two-level column names left by unstack, e.g.
# ("numerator", "baseline") -> "numerator_baseline", ("joint_id", "") -> "joint_id"
rct_agg_6m.columns = rct_agg_6m.columns.map('_'.join).map(lambda x: x.strip("_"))
# Create binary "intervention" column for later regression
# (allocations other than 'con'/'I' would map to NaN)
rct_agg_6m['intervention'] = rct_agg_6m.allocation.map({'con': 0, 'I': 1})
rct_agg_6m.head(3)

Unnamed: 0,joint_id,allocation,numerator_baseline,numerator_follow-up,items_baseline,items_follow-up,denominator_baseline,denominator_follow-up,intervention
0,00J,I,355985.80688,300047.52211,8919,7160,256.521833,259.612833,1
1,00Y,con,321697.29913,263617.28942,14662,11884,252.313167,255.081167,0
2,01F,con,222717.46981,204807.24274,6635,5873,130.523333,131.222333,0


In [46]:
# Derive the outcome rates for the baseline and follow-up periods by dividing
# each total (cost or items) by the matching period's denominator.
# (new column, total column, denominator column)
rate_columns = [
    ("baseline_calc_value", "numerator_baseline", "denominator_baseline"),
    ("follow_up_calc_value", "numerator_follow-up", "denominator_follow-up"),
    ("baseline_items_thou", "items_baseline", "denominator_baseline"),
    ("follow_up_items_thou", "items_follow-up", "denominator_follow-up"),
]
for rate_col, total_col, population_col in rate_columns:
    rct_agg_6m[rate_col] = rct_agg_6m[total_col] / rct_agg_6m[population_col]

rct_agg_6m.head(3)

Unnamed: 0,joint_id,allocation,numerator_baseline,numerator_follow-up,items_baseline,items_follow-up,denominator_baseline,denominator_follow-up,intervention,baseline_calc_value,follow_up_calc_value,baseline_items_thou,follow_up_items_thou
0,00J,I,355985.80688,300047.52211,8919,7160,256.521833,259.612833,1,1387.740771,1155.749961,34.76897,27.57953
1,00Y,con,321697.29913,263617.28942,14662,11884,252.313167,255.081167,0,1274.992119,1033.464339,58.110325,46.589092
2,01F,con,222717.46981,204807.24274,6635,5873,130.523333,131.222333,0,1706.342185,1560.765135,50.833823,44.756101


# Primary Outcome P1

Cost per 1,000 patients for all 18 pre-specified “low-priority” treatments combined, compared between intervention and control groups by applying a multivariable linear regression model.


In [47]:
# P1: regress follow-up cost rate on baseline cost rate and allocation.
formula = 'data["follow_up_calc_value"] ~ data["baseline_calc_value"] + intervention'
compute_regression(rct_agg_6m, formula=formula)

Unnamed: 0_level_0,coefficient,p value,conf_int_low,conf_int_high
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,232.123038,,-18.855035,483.10111
"data[""baseline_calc_value""]",0.68672,2.958585e-11,0.537538,0.835903
intervention,-10.955806,0.7693495,-86.106563,64.194951


# Primary Outcome P2 
Items per 1,000 patients for all 18 pre-specified “low-priority” treatments combined, compared between intervention and control groups by applying a multivariable linear regression model.


In [48]:
# P2: regress follow-up items rate on baseline items rate and allocation.
formula = 'data["follow_up_items_thou"] ~ data["baseline_items_thou"] + intervention'
compute_regression(rct_agg_6m, formula=formula)

Unnamed: 0_level_0,coefficient,p value,conf_int_low,conf_int_high
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,-0.71888,,-4.89683,3.45907
"data[""baseline_items_thou""]",0.854177,2.827826e-21,0.766722,0.941632
intervention,-0.545711,0.6363668,-2.865146,1.773724
