generated from ebmdatalab/notebook-template
/
march-impact.py
127 lines (101 loc) · 3.01 KB
/
march-impact.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# ---
# jupyter:
# jupytext:
# cell_metadata_filter: all
# notebook_metadata_filter: all,-language_info
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.3.3
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# ## Impact of covid on prescribing in March
#
# - [Largest absolute increase in items](#abs)
# - [Percentage diff of high volume items](#per)
# - [Antibiotics](#abx)
# - [Overall trends](#overall)
import pandas as pd
import os as os
import numpy as np
from ebmdatalab import bq, maps, charts
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# +
sql = '''
WITH
bnf_tab AS (
SELECT
DISTINCT chemical,
chemical_code
FROM
ebmdatalab.hscic.bnf )
SELECT
SUBSTR(presc.bnf_code, 0, 9) AS chemical_code,
chemical,
SUM(case when month = "2019-03-01" then items else 0 END) as items_2019,
SUM(case when month = "2020-03-01" then items else 0 END) as items_2020
FROM
ebmdatalab.hscic.normalised_prescribing AS presc
LEFT JOIN
bnf_tab
ON
chemical_code=SUBSTR(presc.bnf_code,0,9)
WHERE
month BETWEEN TIMESTAMP('2019-03-01')
AND TIMESTAMP('2020-03-01')
GROUP BY
chemical_code,
chemical
ORDER BY
items_2020 DESC
'''
df_chemical = bq.cached_read(sql, csv_path=os.path.join('..','data','df_chemical.csv'))
df_chemical.head(5)
# +
df_march_diff = df_chemical.copy()
df_march_diff["increase"] = (df_march_diff.items_2020 - df_march_diff.items_2019).fillna(0)
df_march_diff["per_diff"] = 100*((df_march_diff.items_2020 - df_march_diff.items_2019)/df_march_diff.items_2019)
df_march_diff.head(5)
# -
# ## Largest absolute increases in items <a id='abs'></a>
df_march_diff.sort_values("increase", ascending=False).head(26)
# ## Percentage difference <a id='per'></a>
high_volume_diff = df_march_diff.loc[(df_march_diff["items_2020"] >= 50000)].sort_values("per_diff", ascending=False)
high_volume_diff.head(26)
# ## Antimicrobial Stewardship <a id='abx'></a>
# +
df_abx_a = df_march_diff[df_march_diff["chemical_code"].str.startswith("050")].sort_values("increase", ascending=False)
df_abx = df_abx_a.loc[(df_abx_a["items_2020"] >= 5)]
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
df_abx
# -
# ## Overall Trends <a id='overall'></a>
# +
sql2 = '''
SELECT
month,
SUM(items) AS items,
SUM(actual_cost) AS cost
FROM
ebmdatalab.hscic.normalised_prescribing
WHERE
month BETWEEN TIMESTAMP('2015-01-01')
AND TIMESTAMP('2020-03-01') #2014 seems to be duplicated so setting argument to eliminate
GROUP BY
month
ORDER BY
items DESC
'''
df_overall = bq.cached_read(sql2, csv_path=os.path.join('..','data','df_overall.csv'))
df_overall.head(5)
# -
df_overall.groupby("month").sum().plot(kind='line', title="Trens in items and cost per month since 2015")
df_march_2020 = df_overall.loc[(df_overall["month"] == "2020-03-01 00:00:00+00:00")]
df_march_2019 = df_overall.loc[(df_overall["month"] == "2019-03-01 00:00:00+00:00")]
df_march_2020
df_march_2019