In [80]:
# Imports and variables
import os

from analysis import compute_regression
from analysis import trim_5_percentiles

import pandas as pd
import numpy as np
import analytics

import logging
# Raise the threshold of the 'pandas_gbq' logger so that only messages at
# ERROR level or above are emitted.
logger = logging.getLogger('pandas_gbq')
logger.setLevel(logging.ERROR)

In [81]:
# Run-wide configuration.
# DUMMY_RUN: when True and ../data/pageview_stats.csv exists, the page-view
# cell reads that cached file instead of querying the analytics API.
DUMMY_RUN = True  # Change this to False when the analysis is run for real
# Sent as 'viewId' in the analytics report request.
ANALYTICS_VIEW_ID = '101677264'
# Project id passed to pd.read_gbq for the BigQuery queries.
GBQ_PROJECT_ID = '620265099307'
# Turn off notebook autosave (interval 0).
%autosave 0

Autosave disabled


# Engagement outcomes

In [82]:
# Import page views data
#
# Timepoints:
# - 1 month before/after
# - April-Sept 2018 vs April-Sept 2019
#
# One row per (page path, date) for CCG / practice pages whose path matches
# "lowp" and does not contain "analyse".  The live result is cached as CSV
# and re-used when DUMMY_RUN is set and the cache file exists.
from importlib import reload
reload(analytics)  # pick up edits to the local analytics module

PAGEVIEW_CACHE = "../data/pageview_stats.csv"
colnames = ["Date", "Page", "Pageviews", "Unique Pageviews"]

if DUMMY_RUN and os.path.exists(PAGEVIEW_CACHE):
    # CCG-level data (the index column written by to_csv is skipped):
    all_stats = pd.read_csv(PAGEVIEW_CACHE, usecols=colnames)
else:
    ccg_query = [
        {
            'viewId': ANALYTICS_VIEW_ID,
            "samplingLevel": "LARGE",
            'dateRanges': [
                {'startDate': '2018-04-01',
                 'endDate': '2019-09-30'}
            ],
            'metrics': [
                {'expression': 'ga:pageViews'},
                {'expression': 'ga:uniquePageViews'},
            ],
            "dimensions": [
                {"name": "ga:pagePath"},
                {"name": "ga:date"},
            ],
            "dimensionFilterClauses": [{
                "operator": "AND",
                "filters": [
                    {
                        "dimensionName": "ga:pagePath",
                        "operator": "REGEXP",
                        "expressions": ["^/(ccg|practice).*lowp"]
                    },
                    {
                        "dimensionName": "ga:pagePath",
                        "not": True,
                        "operator": "PARTIAL",
                        "expressions": ["analyse"]
                    }
                ]
            }]
        }]
    # NOTE(review): the dimensions are requested as (pagePath, date) while
    # colnames lists Date before Page -- confirm that query_analytics assigns
    # columns by name rather than by position.
    all_stats = analytics.query_analytics(ccg_query, columns=colnames)
    all_stats.to_csv(PAGEVIEW_CACHE)

# Parse the dates on BOTH paths so later cells can subtract visit dates from
# them whichever branch ran (previously only the cached branch was parsed).
# This is a no-op if the column is already datetime.
# Assumes query_analytics returns Date as datetimes or date strings -- TODO confirm.
all_stats['Date'] = pd.to_datetime(all_stats['Date'])

In [83]:
# Derive the organisation code and organisation type from each page path,
# e.g. "/ccg/00C/?tags=lowpriority" -> ("00C", "ccg") and
# "/practice/M83024/measures/?tags=lowpriority" -> ("M83024", "practice").
#
# Classify on the path *prefix*: a substring test for "ccg" also matches a
# practice URL that merely mentions "ccg" further along (e.g. in its query
# string), which would then be labelled a CCG with a bogus 3-character id.
CCG_PREFIX = "/ccg/"
PRACTICE_PREFIX = "/practice/"

is_ccg_page = all_stats["Page"].str.startswith(CCG_PREFIX)

# The code is the first 3 (CCG) or 6 (practice) characters after the prefix.
ccg_code = all_stats["Page"].str[len(CCG_PREFIX):len(CCG_PREFIX) + 3]
practice_code = all_stats["Page"].str[len(PRACTICE_PREFIX):len(PRACTICE_PREFIX) + 6]

all_stats["org_id"] = np.where(is_ccg_page, ccg_code, practice_code)
all_stats["org_type"] = np.where(is_ccg_page, "ccg", "practice")

# NOTE(review): the recorded output of this cell shows two identical rows;
# if (Date, Page) pairs are duplicated in the cached data, every downstream
# sum is inflated -- confirm and de-duplicate at source if so.
all_stats.head(2)

Unnamed: 0,Date,Page,Pageviews,Unique Pageviews,org_id,org_type
0,2018-08-31,/ccg/00C/?tags=lowpriority,2,1,00C,ccg
1,2018-08-31,/ccg/00C/?tags=lowpriority,2,1,00C,ccg


In [84]:
### CCGs that have been allocated to the RCT

# Joint Team information (which CCGs work together in Joint Teams)
team = pd.read_csv('../data/joint_teams.csv')

# Built as a single chain so that every step returns a fresh frame: adding
# the `intervention` column to a column-subset of the merged frame by item
# assignment triggers pandas' SettingWithCopyWarning.
rct_ccgs = (
    pd.read_csv('../data/randomisation_group.csv')
    # Map CCGs to Joint Teams
    .merge(team, on="joint_team", how="left")
    # Where ccg_id is blank, fall back to joint_id, so that every row has a
    # value for pct_id
    .assign(pct_id=lambda df: df["ccg_id"].combine_first(df["joint_id"]))
    .loc[:, ["joint_id", "allocation", "pct_id"]]
    # Add numerical intervention field (control = 0, intervention = 1);
    # any other allocation label maps to NaN
    .assign(intervention=lambda df: df["allocation"].map({'con': 0, 'I': 1}))
)

rct_ccgs.head(2)

Unnamed: 0,joint_id,allocation,pct_id,intervention
0,01X,con,01X,0
1,99K,con,99K,0


In [85]:
## Map practices to joint teams, for practice-level analysis

# Current practice -> CCG mapping from BigQuery.
# NOTE(review): `setting = 4` / `status_code != 'C'` presumably select
# standard GP practices that are not closed -- confirm against the schema.
practice_to_ccg_sql = '''select distinct ccg_id, code
from `ebmdatalab.hscic.practices`
where setting = 4 and status_code != 'C'
'''

# Honour DUMMY_RUN in the same way as the page-view import: re-use the cached
# CSV when it exists instead of hitting BigQuery on every run.  The SQL text
# and the resulting frame also no longer share one variable name.
PRACTICE_TO_CCG_CACHE = "../data/practice_to_ccg.csv"

if DUMMY_RUN and os.path.exists(PRACTICE_TO_CCG_CACHE):
    # usecols skips the index column written by to_csv; dtype=str keeps the
    # codes as text.
    practice_to_ccg = pd.read_csv(
        PRACTICE_TO_CCG_CACHE, usecols=["ccg_id", "code"], dtype=str)
else:
    practice_to_ccg = pd.read_gbq(practice_to_ccg_sql, GBQ_PROJECT_ID, dialect='standard')
    practice_to_ccg.to_csv(PRACTICE_TO_CCG_CACHE)

In [86]:
# Practices that belong to a CCG taking part in the RCT
rct_practices = pd.merge(
    rct_ccgs[["pct_id"]],
    practice_to_ccg,
    left_on="pct_id",
    right_on="ccg_id",
    how="left")

# Look up the parent CCG of every practice-level page-view row
practice_lookup = rct_practices[["ccg_id", "code"]]
all_stats_with_ccg = pd.merge(
    all_stats,
    practice_lookup,
    left_on="org_id",
    right_on="code",
    how="left")
all_stats_with_ccg = all_stats_with_ccg.drop("code", axis=1)

# Rows whose org_id is 3 characters long are CCG pages: the CCG is its own ccg_id
ccg_rows = all_stats_with_ccg["org_id"].str.len() == 3
all_stats_with_ccg.loc[ccg_rows, "ccg_id"] = all_stats_with_ccg["org_id"]

# Attach joint team id and allocation; the left join keeps every RCT CCG
# even when it has no page-view rows
stats_with_allocations = pd.merge(
    rct_ccgs,
    all_stats_with_ccg,
    left_on="pct_id",
    right_on="ccg_id",
    how="left")

In [87]:
# import CCG population sizes (total list size per pct_id for August 2018)

# NOTE(review): the table is referenced without a project prefix, unlike the
# practices query (`ebmdatalab.hscic.practices`) -- confirm it resolves to the
# intended dataset under GBQ_PROJECT_ID.
query = '''select pct_id, sum(total_list_size) as list_size
from `hscic.practice_statistics` as stats
where CAST(month AS DATE) = '2018-08-01'
group by pct_id
'''

# Honour DUMMY_RUN in the same way as the page-view import: re-use the cached
# CSV when it exists instead of hitting BigQuery on every run.
POPULATION_CACHE = "../data/practice_statistics.csv"

if DUMMY_RUN and os.path.exists(POPULATION_CACHE):
    # usecols skips the index column written by to_csv; pct_id stays text.
    pop = pd.read_csv(
        POPULATION_CACHE, usecols=["pct_id", "list_size"], dtype={"pct_id": str})
else:
    pop = pd.read_gbq(query, GBQ_PROJECT_ID, dialect='standard')
    pop.to_csv(POPULATION_CACHE)

In [88]:
# List size of every RCT CCG, labelled with its joint team
ccg_populations = pd.merge(rct_ccgs, pop, on="pct_id", how="left")
ccg_populations = ccg_populations.loc[:, ["joint_id", "list_size"]]

# Total list size per joint team
population_by_team = ccg_populations.groupby("joint_id").sum()
joint_team_populations = population_by_team.reset_index()
joint_team_populations.head()


Unnamed: 0,joint_id,list_size
0,00J,258773
1,00Y,256146
2,01F,131772
3,01J,165657
4,01V,280649


In [89]:
# import dates of interventions
visit_dates = pd.read_csv('../data/allocated_ccgs_visit_timetable.csv')
visit_dates["date"] = pd.to_datetime(visit_dates.date)

# merge with rct_ccgs/joint teams: one row per joint team once the CCG-level
# pct_id column is dropped and duplicates are removed
allocations_with_dates = (
    rct_ccgs.merge(visit_dates, on="joint_id", how="left")
    .drop("pct_id", axis=1)
    .drop_duplicates())
team_sizes = allocations_with_dates.merge(joint_team_populations, on="joint_id")

# rank by size within each arm, to allow us to pair similar interventions
# and controls.  method="first" guarantees distinct whole-number ranks: the
# default ("average") gives tied teams a fractional rank (e.g. 2.5), which
# would silently leave them and their counterpart unpaired in the merge below.
team_sizes["size_rank"] = (
    team_sizes.groupby("allocation").list_size.rank(method="first"))

# assign dummy intervention dates to control teams by pairing on size rank:
# every team receives the visit date of the intervention team of equal rank
# (for an intervention team this is its own date).  A control ranked beyond
# the number of intervention teams gets NaT.
i_group = team_sizes.loc[team_sizes.allocation == "I", ["date", "size_rank"]]

allocations_with_dates_and_sizes = (
    team_sizes.merge(i_group, on="size_rank", how="left", suffixes=("", "_int"))
    .drop("date", axis=1)
    .sort_values(by=["size_rank", "allocation"]))
allocations_with_dates_and_sizes.head()

Unnamed: 0,joint_id,allocation,intervention,list_size,size_rank,date_int
22,05V,I,1,149403,1.0,2018-09-17
14,02G,con,0,113244,1.0,2018-09-17
23,03E,I,1,163530,2.0,2018-10-09
39,10D,con,0,113816,2.0,2018-10-09
8,05G,I,1,217452,3.0,2018-09-20


In [90]:
# Combine team-level information (allocation, list size, paired visit date)
# with the page-view rows of each joint team
team_info = allocations_with_dates_and_sizes.drop("size_rank", axis=1)

# These columns already come from team_info, or are CCG-level keys no longer needed
page_stats = stats_with_allocations.drop(
    ["allocation", "pct_id", "ccg_id", "intervention"], axis=1)

all_data = pd.merge(team_info, page_stats, how='left', on='joint_id')
all_data.head(2)

Unnamed: 0,joint_id,allocation,intervention,list_size,date_int,Date,Page,Pageviews,Unique Pageviews,org_id,org_type
0,05V,I,1,149403,2018-09-17,2018-10-12,/practice/M83024/measures/?tags=lowpriority,1,1,M83024,practice
1,05V,I,1,149403,2018-09-17,2018-10-12,/practice/M83024/measures/?tags=lowpriority,1,1,M83024,practice


In [91]:
# NOTE(review): this only reports the container type (Series, per the recorded
# output), which says nothing about whether the values are datetimes;
# `all_data.Date.dtype` was probably intended, since Date has date_int
# subtracted from it below -- confirm, or remove this debugging cell.
type(all_data.Date)

pandas.core.series.Series

In [92]:
# Label each page-view row by when it happened relative to the team's (real or
# paired) visit date, taking one month as 28 days:
#   "after"  - more than 0 and at most 28 days after the visit
#   "before" - more than 0 and at most 28 days before the visit
#   "none"   - anything else (including a difference of exactly 0, or a missing date)
one_month = pd.Timedelta("28 days")
no_gap = pd.Timedelta("0 days")

all_data["datediff"] = all_data["Date"] - all_data["date_int"]
elapsed = all_data["datediff"]

all_data["timing"] = np.select(
    [(elapsed > no_gap) & (elapsed <= one_month),
     (elapsed < no_gap) & (elapsed >= -one_month)],
    ["after", "before"],
    default="none")

# Teams without any page-view rows have a missing count after the left join
all_data["Unique Pageviews"] = all_data["Unique Pageviews"].fillna(0)
all_data.head(2)

Unnamed: 0,joint_id,allocation,intervention,list_size,date_int,Date,Page,Pageviews,Unique Pageviews,org_id,org_type,datediff,timing
0,05V,I,1,149403,2018-09-17,2018-10-12,/practice/M83024/measures/?tags=lowpriority,1,1,M83024,practice,25 days,after
1,05V,I,1,149403,2018-09-17,2018-10-12,/practice/M83024/measures/?tags=lowpriority,1,1,M83024,practice,25 days,after


In [93]:
# group up page views data to joint teams and sum page views before
# and after interventions.  For each (team, org_type) this gives the total
# unique page views and the number of distinct pages, with one column per
# timing value.
#
# The aggregation is named by string ("sum"): passing the Python builtin
# `sum` is deprecated in recent pandas, which will stop translating it to the
# optimised group-by sum.
#
# NOTE(review): groupby drops rows whose key is NaN, so a joint team with no
# page-view rows for an org_type gets no row here rather than a row of
# zeros -- confirm this is what the regressions below should see.
all_data_agg = (
    all_data
    .groupby(["intervention", "joint_id", "org_type", "list_size", "timing"])
    .agg({"Unique Pageviews": "sum", "Page": "nunique"})
    .unstack()
    .fillna(0)
    .rename(columns={"Page": "No_of_Pages"})
    .reset_index())

# flatten the (measure, timing) column pairs into "measure_timing" names and
# drop the columns for activity outside both windows
all_data_agg.columns = all_data_agg.columns.map('_'.join).map(lambda x: x.strip("_"))
all_data_agg = all_data_agg.drop(["Unique Pageviews_none", "No_of_Pages_none"], axis=1)
all_data_agg.head()

Unnamed: 0,intervention,joint_id,org_type,list_size,Unique Pageviews_after,Unique Pageviews_before,No_of_Pages_after,No_of_Pages_before
0,0,00Y,ccg,256146,0.0,0.0,0.0,0.0
1,0,00Y,practice,256146,2.0,0.0,1.0,0.0
2,0,01F,practice,131772,2.0,0.0,1.0,0.0
3,0,01J,ccg,165657,0.0,2.0,0.0,1.0
4,0,01J,practice,165657,4.0,2.0,2.0,1.0


## Engagement outcome E1
Number of page views over one month on CCG pages showing low-priority measures, before vs after intervention, between intervention and control groups.



In [94]:
# E1 uses CCG-level pages only
is_ccg_row = all_data_agg["org_type"] == "ccg"
ccg_data_agg = all_data_agg.loc[is_ccg_row]
ccg_data_agg_trimmed = trim_5_percentiles(ccg_data_agg, debug=False)

# Outcome after the visit, adjusted for the value before it, by trial arm.
# The proxy_pageviews_* columns are not created in this notebook; presumably
# trim_5_percentiles adds them -- TODO confirm.
formula = 'data["proxy_pageviews_after"]  ~ data["proxy_pageviews_before"] + intervention'
compute_regression(ccg_data_agg_trimmed, formula=formula)

Unnamed: 0_level_0,coefficient,p value,conf_int_low,conf_int_high
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,2.304825,,0.292834,4.316816
"data[""proxy_pageviews_before""]",0.179167,0.542431,-0.412111,0.770444
intervention,-0.136842,0.904636,-2.438872,2.165188


## Engagement outcome E2
Number of page views over one month on practice pages showing low-priority measures, grouped up to CCGs

In [95]:
# E2 uses practice-level pages, already grouped up to joint teams.
#
# The model is stated here rather than inherited from the E1 cell: `formula`
# is reassigned further down for the alert models, so re-running this cell
# out of order would otherwise silently fit the wrong model.
formula = ('data["proxy_pageviews_after"] '
           ' ~ data["proxy_pageviews_before"] + intervention')
practice_data_agg = all_data_agg.loc[all_data_agg.org_type == "practice"]
practice_data_agg_trimmed = trim_5_percentiles(practice_data_agg, debug=False)
compute_regression(
    practice_data_agg_trimmed,
    formula=formula)

Unnamed: 0_level_0,coefficient,p value,conf_int_low,conf_int_high
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,4.267224,,1.652123,6.882325
"data[""proxy_pageviews_before""]",0.263491,0.244478,-0.187925,0.714908
intervention,-0.263491,0.871156,-3.532446,3.005463


# Engagement outcomes E3 and E4: Alert sign-ups


## Prepare data

In [96]:
# Alert sign-ups exported from the Django admin, filtered there for confirmed
# sign-ups only (no date filter applied)
ALERTS_EXPORT = '../data/OrgBookmark-2018-11-02.csv'

alerts = pd.read_csv(ALERTS_EXPORT)
alerts = alerts.assign(created_at=pd.to_datetime(alerts["created_at"]))

alerts.head()


Unnamed: 0,id,user,pct,practice,created_at,approved
0,2333,2815,00C,,2018-11-02 09:23:07,1
1,2331,2813,,P81770,2018-11-01 13:00:49,1
2,2330,2569,,J82102,2018-11-01 11:23:36,1
3,2329,2783,,N81087,2018-10-31 16:17:25,1
4,2327,2809,,J83029,2018-10-31 09:39:51,1


In [97]:
# Give every practice alert the CCG of its practice. Only practices of RCT
# CCGs are in rct_practices, so other practices keep a missing ccg_id.
practice_ccgs = rct_practices[["ccg_id", "code"]]
alerts = pd.merge(
    alerts,
    practice_ccgs,
    left_on="practice",
    right_on="code",
    how="left")
alerts = alerts.drop("code", axis=1)

# CCG alerts have no practice: take their ccg_id from the pct column instead
alerts["ccg_id"] = alerts["ccg_id"].combine_first(alerts["pct"])
alerts.head()

Unnamed: 0,id,user,pct,practice,created_at,approved,ccg_id
0,2333,2815,00C,,2018-11-02 09:23:07,1,00C
1,2331,2813,,P81770,2018-11-01 13:00:49,1,
2,2330,2569,,J82102,2018-11-01 11:23:36,1,
3,2329,2783,,N81087,2018-10-31 16:17:25,1,
4,2327,2809,,J83029,2018-10-31 09:39:51,1,


In [98]:
# Start from the RCT allocations so that every allocated CCG is kept, even
# one with no alerts at all
alerts = pd.merge(rct_ccgs, alerts, left_on="pct_id", right_on="ccg_id", how="left")

# An alert with a 3-character pct is a CCG alert; one with a 6-character
# practice is a practice alert; anything else is labelled "none"
is_ccg_alert = alerts["pct"].str.len() == 3
is_practice_alert = alerts["practice"].str.len() == 6

alerts['org_type'] = np.select(
    [is_ccg_alert, is_practice_alert],
    ['ccg', 'practice'],
    default='none')
alerts.head()

Unnamed: 0,joint_id,allocation,pct_id,intervention,id,user,pct,practice,created_at,approved,ccg_id,org_type
0,01X,con,01X,0,1615.0,2042.0,01X,,2018-04-27 10:11:30,1.0,01X,ccg
1,01X,con,01X,0,799.0,997.0,,N83060,2017-10-02 11:03:46,1.0,01X,practice
2,99K,con,99K,0,1933.0,2481.0,99K,,2018-07-25 08:45:38,1.0,99K,ccg
3,99K,con,99K,0,1753.0,2243.0,99K,,2018-06-07 09:48:30,1.0,99K,ccg
4,99K,con,99K,0,1562.0,1984.0,,G81100,2018-04-22 10:28:17,1.0,99K,practice


In [99]:
# Attach each joint team's list size and (real or paired) visit date to its
# alerts. Allocation and intervention are dropped from the team frame
# because the alerts frame already carries them.
team_dates_and_sizes = allocations_with_dates_and_sizes.drop(
    ["size_rank", "allocation", "intervention"], axis=1)
confirmed_alerts = alerts.drop(["approved"], axis=1)

alerts_with_dates_and_stats = pd.merge(
    team_dates_and_sizes, confirmed_alerts, how='left', on='joint_id')
alerts_with_dates_and_stats.head()

Unnamed: 0,joint_id,list_size,date_int,allocation,pct_id,intervention,id,user,pct,practice,created_at,ccg_id,org_type
0,05V,149403,2018-09-17,I,05V,1,,,,,NaT,,none
1,02G,113244,2018-09-17,con,02G,0,2144.0,2658.0,02G,,2018-09-27 17:58:16,02G,ccg
2,02G,113244,2018-09-17,con,02G,0,686.0,864.0,02G,,2017-08-17 08:30:02,02G,ccg
3,03E,163530,2018-10-09,I,03E,1,1407.0,1782.0,,B82030,2018-03-05 20:49:26,03E,practice
4,03E,163530,2018-10-09,I,03E,1,1254.0,1558.0,,B82027,2018-01-23 15:04:58,03E,practice


In [100]:
# Time elapsed between the team's visit date and each alert sign-up
alerts_with_dates_and_stats["datediff"] = (
    alerts_with_dates_and_stats["created_at"]
    - alerts_with_dates_and_stats["date_int"])

# Every row starts as "none"; sign-ups made before the visit date are marked
# "before" and serve as a co-variable
alerts_with_dates_and_stats["timing"] = "none"
signed_up_before = alerts_with_dates_and_stats["datediff"] < pd.Timedelta("0 days")
alerts_with_dates_and_stats.loc[signed_up_before, "timing"] = "before"

In [101]:
# Main outcome: sign-ups from the visit date up to and including 84 days
# later (taken as 3 months) are marked "after"
in_follow_up = alerts_with_dates_and_stats["datediff"].between(
    pd.Timedelta("0 days"), pd.Timedelta("84 days"))
alerts_with_dates_and_stats.loc[in_follow_up, "timing"] = "after"

In [102]:
# Count sign-ups per joint team, split by organisation type and timing.
# NOTE(review): this counts distinct users, not alert rows -- confirm that
# is the intended definition of a "registration".
group_keys = ["intervention", "joint_id", "list_size", "timing", "org_type"]
users_by_group = alerts_with_dates_and_stats.groupby(group_keys).agg({"user": "nunique"})

# First spread org_type across the columns, then timing; absent combinations count as 0
by_org_type = users_by_group.unstack().fillna(0).rename(columns={"user": "alerts"})
alerts_wide = by_org_type.unstack().reset_index().fillna(0)

# Collapse the column levels into flat "alerts_<org_type>_<timing>" names
alerts_wide.columns = ["_".join(levels).rstrip("_") for levels in alerts_wide.columns]

# Express list size per 100,000 patients and keep only the model variables
model_columns = [
    "intervention",
    "joint_id",
    "list_size_100k",
    "alerts_ccg_after",
    "alerts_ccg_before",
    "alerts_practice_after",
    "alerts_practice_before",
]
alerts_agg = alerts_wide.assign(
    list_size_100k=alerts_wide["list_size"] / 100000)[model_columns]

alerts_agg.head()

Unnamed: 0,intervention,joint_id,list_size_100k,alerts_ccg_after,alerts_ccg_before,alerts_practice_after,alerts_practice_before
0,0,00Y,2.56146,0.0,3.0,0.0,3.0
1,0,01F,1.31772,0.0,0.0,1.0,1.0
2,0,01J,1.65657,0.0,0.0,0.0,1.0
3,0,01X,1.9782,0.0,1.0,0.0,1.0
4,0,02G,1.13244,1.0,1.0,0.0,0.0


In [103]:
# summary data: mean list size and alert counts per trial arm.
# numeric_only=True keeps the text column joint_id out of the mean; pandas
# 2.0 and later raise a TypeError on it instead of silently dropping it.
alerts_agg.groupby("intervention").mean(numeric_only=True)

Unnamed: 0_level_0,list_size_100k,alerts_ccg_after,alerts_ccg_before,alerts_practice_after,alerts_practice_before
intervention,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2.595924,0.15,2.4,0.2,2.2
1,3.318947,0.1,1.5,0.15,4.1


### E3 Number of registrations to OpenPrescribing CCG email alerts

In [104]:
# CCG alert sign-ups after the visit, adjusted for sign-ups before the visit
# and for list size (per 100k), by trial arm
formula = 'data["alerts_ccg_after"] ~ data["alerts_ccg_before"] + data["list_size_100k"] + intervention'
compute_regression(alerts_agg, formula=formula)

Unnamed: 0_level_0,coefficient,p value,conf_int_low,conf_int_high
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.146551,,-0.113023,0.406126
"data[""alerts_ccg_before""]",-0.0367,0.255036,-0.10105,0.027651
"data[""list_size_100k""]",0.035258,0.48178,-0.065341,0.135858
intervention,-0.108522,,-0.356804,0.139759



### E4 Number of registrations to OpenPrescribing Practice email alerts grouped up to CCG
(New sign-ups within 3 months of intervention. The CCG registered population and number of sign-ups prior to the intervention will be co-variables.)

In [105]:
# Practice alert sign-ups after the visit, adjusted for sign-ups before the
# visit and for list size (per 100k), by trial arm
formula = 'data["alerts_practice_after"] ~ data["alerts_practice_before"] + data["list_size_100k"] + intervention'
compute_regression(alerts_agg, formula=formula)

Unnamed: 0_level_0,coefficient,p value,conf_int_low,conf_int_high
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.24315,,-0.107206,0.593506
"data[""alerts_practice_before""]",0.036423,0.212792,-0.021815,0.094661
"data[""list_size_100k""]",-0.04749,0.460147,-0.176493,0.081512
intervention,-0.084867,,-0.391637,0.221902
