In [1]:
from lib.outliers import Runner
from datetime import date
import pandas as pd

In [2]:
### Date range handed to the Runner (1 April 2021 and 1 August 2021)
from_date = date(2021, 4, 1)
to_date = date(2021, 8, 1)

### NOTE(review): the third and fifth positional arguments (5 and False) are
### defined by lib.outliers.Runner and are not explained here -- confirm their
### meaning against that class. The list names the entity (organisation) types.
r = Runner(
    from_date,
    to_date,
    5,
    ["practice", "ccg", "pcn", "stp"],
    False,
)

In [None]:
### Execute the configured Runner.
### NOTE(review): presumably this populates `r.build.results`, which the
### following cell reads -- confirm in lib.outliers.
r.run()

In [4]:
### Gather the stored per-entity results (z scores etc.) into one frame.
### Each frame in `r.build.results` is tagged with its key in a new
### `entity` column so that summary statistics can be computed per
### organisation type.

e_data = pd.concat(
    [
        frame.assign(entity=entity_name)
        for entity_name, frame in r.build.results.items()
    ]
)

## Entity counts

Counts of each kind of entity (i.e., organisation).

In [5]:
### Summarising the number of each kind of entity (organisation).
### Each distinct ("practice", "entity") pair is kept once and the pairs are
### then counted per entity type.
### NOTE(review): "practice" is selected after reset_index(), so it presumably
### holds the organisation identifier for every entity type -- confirm.

e_counts = (
    e_data.reset_index()[["practice", "entity"]]
    .drop_duplicates()["entity"]
    .value_counts()
    # Name the column explicitly instead of renaming an "entity" column:
    # pandas >= 2.0 names the value_counts() result "count", so the old
    # rename silently did nothing there and the "n" column went missing.
    .to_frame(name="n")
)

e_counts

Unnamed: 0,n
practice,6499
pcn,1257
ccg,106
stp,42


## Chemical counts

The number of unique chemicals for which we have data (z scores etc.)
within each type of organisation.

In [6]:
### Summarising the number of unique chemicals analysed within
### each type of organisation.
### Each distinct ("chemical", "entity") pair is kept once and the pairs are
### then counted per entity type.

c_counts = (
    e_data.reset_index()[["chemical", "entity"]]
    .drop_duplicates()["entity"]
    .value_counts()
    # Name the column explicitly instead of renaming an "entity" column:
    # pandas >= 2.0 names the value_counts() result "count", so the old
    # rename silently did nothing there and the "chemicals" column went
    # missing.
    .to_frame(name="chemicals")
)

c_counts

Unnamed: 0,chemicals
pcn,1294
practice,1274
ccg,706
stp,364


In [7]:
### One table holding both counts: the chemical counts are attached to the
### entity counts by matching on the index (a left join, keeping the rows
### and row order of e_counts).

all_counts = e_counts.join(other=c_counts, how="left")

In [8]:
### Descriptive statistics for the ratio and the z score within each
### entity type. describe() produces one block of columns per measure;
### stacking column level 0 moves the measure name into the row index, so
### rows are keyed by (entity, measure) with one column per statistic.

all_summary = (
    e_data
    .groupby("entity")[["ratio", "z_score"]]
    .describe()
    # fixed row order for the tables shown below
    .reindex(["stp", "ccg", "pcn", "practice"])
    .stack(level=0)
    # describe() labels the median "50%"
    .rename(columns={"50%": "median"})
)

### Columns (and their order) displayed in the summary tables below
metrics_to_show = ["n", "chemicals", "median", "max", "min", "IQR"]

## Summary statistics for the z score in each organisation type

In [9]:
### Keep only the rows of the summary whose second index level is "z_score"
z_tmp = all_summary.loc[all_summary.index.get_level_values(1) == "z_score"]

### Add the interquartile range (75% - 25%), drop the now-constant second
### index level and round everything to 2 decimal places
z_summary = (
    z_tmp
    .assign(IQR=lambda stats: stats["75%"] - stats["25%"])
    .droplevel(level=1)
    .round(2)
)

### Attach the entity/chemical counts and show the selected metrics
z_summary.join(all_counts).loc[:, metrics_to_show]

Unnamed: 0_level_0,n,chemicals,median,max,min,IQR
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
stp,42,364,3.82,6.33,-6.33,8.9
ccg,106,706,3.57,10.2,-10.2,10.14
pcn,1257,1294,2.58,543.19,-141.33,9.86
practice,6499,1274,0.0,5512.02,-711.87,9.72


## Summary statistics for the ratio in each organisation type

In [10]:
### Keep only the rows of the summary whose second index level is "ratio"
ratio_tmp = all_summary.loc[all_summary.index.get_level_values(1) == "ratio"]

### Add the interquartile range (75% - 25%), drop the now-constant second
### index level and round everything to 2 decimal places
ratio_summary = (
    ratio_tmp
    .assign(IQR=lambda stats: stats["75%"] - stats["25%"])
    .droplevel(level=1)
    .round(2)
)

### Attach the entity/chemical counts and show the selected metrics
ratio_summary.join(all_counts).loc[:, metrics_to_show]

Unnamed: 0_level_0,n,chemicals,median,max,min,IQR
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
stp,42,364,0.09,1.0,0.0,0.64
ccg,106,706,0.12,1.0,0.0,0.61
pcn,1257,1294,0.13,1.0,0.0,0.49
practice,6499,1274,0.14,1.0,0.0,0.44
