In [1]:
#import libraries required for analysis
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.dates import  DateFormatter
%matplotlib inline

### Data from ePACT2 tags

In [2]:
# Google Cloud project ID handed to pandas.read_gbq for every query below.
projectid = "ebmdatalab"

# Practice-level totals (items and actual cost) for low value medicines,
# classified through the ePACT2 tag table (`richard.lpp_epact2`).
# Rows are restricted to rx.month between 2018-01-01 and 2018-03-01
# (BETWEEN is inclusive at both ends) and summed per lpp_code/practice/pct.
lpp_epact2_df = pd.read_gbq(
    """
SELECT
  epact2.lpp_code,
  practice,
  pct,
  SUM(items) AS items,
  SUM(actual_cost) AS actual_cost
FROM
  `hscic.normalised_prescribing_standard` AS rx
JOIN
  `richard.lpp_epact2` AS epact2
ON
  epact2.bnf_code=rx.bnf_code
WHERE
  rx.month BETWEEN '2018-01-01 00:00:00'
  AND '2018-03-01 00:00:00'
GROUP BY
  epact2.lpp_code,
  practice,
  pct
""",
    project_id=projectid,
    dialect='standard',
)

In [3]:
# Preview the first ten rows of the ePACT2-tagged extract.
lpp_epact2_df.head(n=10)

Unnamed: 0,lpp_code,practice,pct,items,actual_cost
0,16,A83005,00C,14,969.16921
1,2,A83005,00C,57,124.60941
2,9,A83005,00C,2,939.64363
3,8,A83005,00C,37,2999.55726
4,4,A83005,00C,9,1755.9981
5,12,A83005,00C,26,385.50497
6,15,A83005,00C,34,135.20843
7,11,A83006,00C,3,237.73484
8,2,A83006,00C,14,17.12599
9,3,A83006,00C,3,27.84132


### Data from OpenPrescribing tags

In [4]:
# Practice-level totals (items and actual cost) for low value medicines,
# classified through the OpenPrescribing tag view (`richard.lpp_op_bnf_vw`).
# Same month window and grouping as the ePACT2 query so the two extracts can
# be compared row for row.
# NOTE(review): the join to `richard.lpp_codes` contributes no selected
# columns; as an inner join its only effect is to drop OP rows whose lpp_code
# is absent from that table (and to multiply rows if lpp_code is not unique
# there) - confirm this filtering is intended.
lpp_op_df = pd.read_gbq(
    """
SELECT
  op.lpp_code,
  practice,
  pct,
  SUM(items) AS items,
  SUM(actual_cost) AS actual_cost
FROM
  `hscic.normalised_prescribing_standard` AS rx
JOIN
  `richard.lpp_op_bnf_vw` AS op
ON
  op.bnf_code=rx.bnf_code
JOIN
  richard.lpp_codes as lpp
ON
  op.lpp_code=lpp.lpp_code
WHERE
  rx.month BETWEEN '2018-01-01 00:00:00'
  AND '2018-03-01 00:00:00'
GROUP BY
  op.lpp_code,
  practice,
  pct
""",
    project_id=projectid,
    dialect='standard',
)

In [5]:
# Fetch the full `richard.lpp_codes` table, used as the lookup for the
# area behind each lpp_code.
lpp_area_df = pd.read_gbq(
    """
SELECT * from richard.lpp_codes
""",
    project_id=projectid,
    dialect='standard',
)

In [6]:
# Preview the first ten rows of the OpenPrescribing-tagged extract.
lpp_op_df.head(n=10)

Unnamed: 0,lpp_code,practice,pct,items,actual_cost
0,9,A83005,00C,2,939.64363
1,2,A83005,00C,57,124.60941
2,12,A83005,00C,26,385.50497
3,15,A83005,00C,34,135.20843
4,4,A83005,00C,9,1755.9981
5,10,A83005,00C,2,35.04789
6,16,A83005,00C,14,969.16921
7,8,A83005,00C,37,2999.55726
8,3,A83006,00C,3,27.84132
9,15,A83006,00C,4,25.57282


### LPP codes

In [7]:
# Show up to the first twenty rows of the lpp_codes lookup.
lpp_area_df.head(n=20)

Unnamed: 0,lpp_code,lpp_name
0,1,Co-proxamol
1,2,Dosulepin
2,3,Prolonged-release Doxazosin
3,4,Immediate Release Fentanyl
4,5,Glucosamine and Chondroitin
5,6,Herbal Treatments
6,7,Homeopathy
7,8,Lidocaine Plasters
8,9,Liothyronine (including Armour Thyroid and lio...
9,10,Lutein and Antioxidants


So, how do we merge the two dataframes (`lpp_op_df` and `lpp_epact2_df`) with `lpp_area_df` into a single dataframe that has `lpp_code`, `lpp_name`, `practice` and `pct`, plus separate columns for the `lpp_op_df` and `lpp_epact2_df` values of `items` and `actual_cost`, without excluding any practices or areas (i.e. showing nulls where one source has no data)?

In [17]:
# Combine the OpenPrescribing and ePACT2 extracts into one comparison frame.
#
# * how='outer' keeps a row that exists in only one source; the other
#   source's items/actual_cost columns are NaN for that row.
# * `pct` is part of the join key because both queries group by it. Leaving
#   it out splits it into pct_op/pct_epact and cross-multiplies rows whenever
#   a practice appears under more than one pct in the period.
# * The second merge attaches `lpp_name`. validate='many_to_one' raises a
#   MergeError if lpp_code is not unique in the lookup, instead of silently
#   duplicating rows.
merged = (
    lpp_op_df
    .merge(
        lpp_epact2_df,
        on=['lpp_code', 'practice', 'pct'],
        how='outer',
        suffixes=('_op', '_epact'),
    )
    .merge(
        lpp_area_df[['lpp_code', 'lpp_name']],
        on='lpp_code',
        how='left',
        validate='many_to_one',
    )
)


In [27]:
# Round both cost columns to 4 decimal places so that floating-point noise in
# the summed costs does not register as a difference when the two sources are
# compared. Series.round is vectorised (no per-element Python lambda), and
# bracket assignment makes it explicit that existing columns are overwritten.
merged['actual_cost_epact'] = merged['actual_cost_epact'].round(4)
merged['actual_cost_op'] = merged['actual_cost_op'].round(4)


In [28]:
# Keep the rows where the two sources report different costs.
# NaN never compares equal to anything, so rows that exist in only one source
# (NaN on the other side) are selected here too.
not_matching = merged.loc[merged['actual_cost_epact'].ne(merged['actual_cost_op'])]

In [31]:
# Summary statistics for the numeric columns of the mismatching rows; a lower
# `count` on one side indicates rows with no value from that source.
not_matching.describe()

Unnamed: 0,items_op,actual_cost_op,items_epact,actual_cost_epact
count,5886.0,5886.0,4288.0,4288.0
mean,19.294937,351.485975,19.991138,290.681657
std,32.169105,1016.855606,34.228883,708.485083
min,1.0,0.3908,1.0,0.5662
25%,3.0,33.8016,3.0,22.43115
50%,7.0,84.4067,7.0,83.8019
75%,21.0,289.166625,22.0,289.890575
max,578.0,34964.8114,569.0,21651.3069
