In [1]:
#import libraries required for analysis
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.dates import  DateFormatter
%matplotlib inline

### Data from ePACT2 tags

In [2]:
# Project identifier handed to pd.read_gbq as `project_id`
projectid = "ebmdatalab"

# Practice-level totals (items and actual cost) for low value medicines,
# picked out by joining the prescribing table to the ePACT2 tag table on
# BNF code. Only rows with rx.month between 2018-01-01 and 2018-03-01
# (both bounds inclusive) are aggregated.
lpp_epact2_sql = """
SELECT
  epact2.lpp_code,
  practice,
  pct,
  SUM(items) AS items,
  SUM(actual_cost) AS actual_cost
FROM
  `hscic.normalised_prescribing_standard` AS rx
JOIN
  `richard.lpp_epact2` AS epact2
ON
  epact2.bnf_code=rx.bnf_code
WHERE
  rx.month BETWEEN '2018-01-01 00:00:00'
  AND '2018-03-01 00:00:00'
GROUP BY
  epact2.lpp_code,
  practice,
  pct
"""
lpp_epact2_df = pd.read_gbq(
    lpp_epact2_sql,
    project_id=projectid,
    dialect='standard',
)

Requesting query... ok.
Job ID: job_qKuq4pz4gTT63lgJ-tbYuA9FLY5i
Query running...
Query done.
Cache hit.

Retrieving results...
Got 60307 rows.

Total time taken 7.18 s.
Finished at 2018-07-10 09:29:55.


In [3]:
# Preview the first ten rows of the ePACT2-tagged result
lpp_epact2_df.head(n=10)

Unnamed: 0,lpp_code,practice,pct,items,actual_cost
0,4,A83005,00C,9,1755.9981
1,9,A83005,00C,2,939.64363
2,2,A83005,00C,57,124.60941
3,12,A83005,00C,26,385.50497
4,15,A83005,00C,34,135.20843
5,16,A83005,00C,14,969.16921
6,8,A83005,00C,37,2999.55726
7,17,A83006,00C,1,12.04016
8,8,A83006,00C,14,1141.34138
9,11,A83006,00C,3,237.73484


### Data from OpenPrescribing tags

In [4]:
# Practice-level totals (items and actual cost) for low value medicines,
# this time picked out by joining the prescribing table to the
# OpenPrescribing tag view (`richard.lpp_op_bnf_vw`) on BNF code.
# The additional inner join to `richard.lpp_codes` keeps only rows whose
# lpp_code is present in that table. Only rows with rx.month between
# 2018-01-01 and 2018-03-01 (both bounds inclusive) are aggregated.
lpp_op_sql = """
SELECT
  op.lpp_code,
  practice,
  pct,
  SUM(items) AS items,
  SUM(actual_cost) AS actual_cost
FROM
  `hscic.normalised_prescribing_standard` AS rx
JOIN
  `richard.lpp_op_bnf_vw` AS op
ON
  op.bnf_code=rx.bnf_code
JOIN
  richard.lpp_codes as lpp
ON
  op.lpp_code=lpp.lpp_code
WHERE
  rx.month BETWEEN '2018-01-01 00:00:00'
  AND '2018-03-01 00:00:00'
GROUP BY
  op.lpp_code,
  practice,
  pct
"""
lpp_op_df = pd.read_gbq(
    lpp_op_sql,
    project_id=projectid,
    dialect='standard',
)

Requesting query... ok.
Job ID: job_26Om8_Dovn6HIQZi_pc2yhyNFIr_
Query running...
Query done.
Cache hit.

Retrieving results...
Got 61905 rows.

Total time taken 6.97 s.
Finished at 2018-07-10 09:30:24.


In [5]:
# Load every row and column of the `richard.lpp_codes` table
# (the head() shown later in the notebook lists lpp_code and lpp_name).
lpp_area_sql = """
SELECT * from richard.lpp_codes
"""
lpp_area_df = pd.read_gbq(
    lpp_area_sql,
    project_id=projectid,
    dialect='standard',
)

Requesting query... ok.
Job ID: job_YvHvFQ45DwaE1SilR9XexFO-TZmz
Query running...
Query done.
Cache hit.

Retrieving results...
Got 18 rows.

Total time taken 1.05 s.
Finished at 2018-07-10 09:30:47.


In [6]:
# Preview the first ten rows of the OpenPrescribing-tagged result
lpp_op_df.head(n=10)

Unnamed: 0,lpp_code,practice,pct,items,actual_cost
0,12,A83005,00C,26,385.50497
1,15,A83005,00C,34,135.20843
2,16,A83005,00C,14,969.16921
3,2,A83005,00C,57,124.60941
4,4,A83005,00C,9,1755.9981
5,10,A83005,00C,2,35.04789
6,9,A83005,00C,2,939.64363
7,8,A83005,00C,37,2999.55726
8,17,A83006,00C,1,12.04016
9,8,A83006,00C,14,1141.34138


### LPP codes

In [7]:
# Show up to the first twenty rows of the LPP code lookup table
lpp_area_df.head(n=20)

Unnamed: 0,lpp_code,lpp_name
0,1,Co-proxamol
1,2,Dosulepin
2,3,Prolonged-release Doxazosin
3,4,Immediate Release Fentanyl
4,5,Glucosamine and Chondroitin
5,6,Herbal Treatments
6,7,Homeopathy
7,8,Lidocaine Plasters
8,9,Liothyronine (including Armour Thyroid and lio...
9,10,Lutein and Antioxidants


So, how do we merge the two dataframes (`lpp_op_df` and `lpp_epact2_df`) with `lpp_area_df` to produce a single dataframe containing `lpp_code`, `lpp_name`, `practice` and `pct`, with separate columns for the `items` and `actual_cost` from each of `lpp_op_df` and `lpp_epact2_df`, without excluding any practices or areas (i.e. showing nulls where one source has no matching row)?