In [16]:
from lib.outliers import Runner
from datetime import date
import pandas as pd

In [17]:
from_date = date(year=2021,month=6,day=1)
to_date = date(year=2021,month=12,day=1)
r = Runner(from_date,to_date,10,["practice","ccg","pcn","stp"],False)

In [18]:
r.build.run()
r.build.fetch_results()

Downloading: 100%|██████████| 1/1 [00:00<00:00,  7.05rows/s]


In [19]:
### Extracting all the stored z scores etc across organisations
### so that summary statistics can be calculated

e_data = pd.concat(
    (d.assign(entity=e) for e, d in r.build.results.items())
)

## Entity counts

Counts of each kind of entity (i.e., organisation).

In [20]:
### Summarising the number of each kind of entity (organisation)

e_counts = ( e_data.reset_index()[["practice","entity"]]
            .drop_duplicates()['entity']
            .value_counts()
            .to_frame()
            .rename( columns={'entity':'n'} ) )

e_counts

Unnamed: 0,n
practice,6476
pcn,1257
ccg,106
stp,42


## Chemical counts

Counts of the number of unique outlying chemicals (i.e., those identified in the top/bottom
5 z scores) amongst all organisations of the given type.

In [21]:
### Summarising the number of unique chemicals identified in the
### top/bottom five outliers amongst all organisations of the given type

c_counts = ( e_data.reset_index()[["chemical","entity"]]
            .drop_duplicates()['entity']
            .value_counts()
            .to_frame()
            .rename( columns={'entity':'chemicals'} ) )

c_counts

Unnamed: 0,chemicals
pcn,1416
practice,1346
ccg,1138
stp,680


In [22]:
### Combining the entity and chemical counts

all_counts = e_counts.join( c_counts )

In [23]:
### Defining which metrics will be displayed in the summary tables
metrics_to_show = [ "n", "chemicals", "median","max","min","IQR" ]

In [29]:
### Calculating summary statistics for the Z scores for those chemicals
### identified in the TOP 5 in at least one organisation of the entity type.
### There are the chemicals displayed in the 'Higher than most' table.

overused_summary = e_data.query('rank_high<=10').query('z_score>0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
overused_summary = overused_summary.rename( columns={"50%":"median"}, inplace=False )

In [37]:
### Calculating summary statistics for the Z scores for those chemicals
### identified in the BOTTOM 5 in at least one organisation of the entity type.
### There are the chemicals displayed in the 'Lower than most' table.

underused_summary = e_data.query('rank_low<=10').query('z_score<0').groupby( "entity" )[["z_score"]].describe().reindex(['stp', 'ccg', 'pcn', 'practice']).stack(level=0)
underused_summary = underused_summary.rename( columns={"50%":"median"}, inplace=False )

## Summary statistics for outlying Z scores in each organisation type

### Higher than most chemicals

The table below summarises the Z scores for the high outlying (i.e., top 5) chemicals
in each type of organisation. These are chemicals are seen to be used more often
in a particular organisation than its peers.

In [38]:
underused_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,median,75%,max
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
stp,z_score,420.0,-2.650387,0.932614,-6.326451,-2.872841,-2.352648,-2.08166,-1.47326
ccg,z_score,1060.0,-2.66701,1.228738,-10.198503,-2.813851,-2.302282,-1.990392,-1.325057
pcn,z_score,12570.0,-2.609923,2.474167,-159.768459,-2.673923,-2.183928,-1.900201,-1.296016
practice,z_score,64760.0,-2.495317,3.925005,-307.234735,-2.569772,-2.076544,-1.756406,-0.987765


In [32]:
### Extracting the summary statistics for the z scores
overused_tmp = overused_summary[overused_summary.index.isin(["z_score"], level=1)]

### Calculating IQR, removing the row index and rounding to 2dp
overused_toprint = ( overused_tmp
         .assign( IQR = overused_tmp["75%"]-overused_tmp["25%"] )
         .droplevel(level=1)
         .round(2) )

overused_toprint.join( all_counts )[metrics_to_show]

Unnamed: 0_level_0,n,chemicals,median,max,min,IQR
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
stp,42,680,5.42,6.33,2.68,1.64
ccg,106,1138,5.79,10.2,2.76,3.18
pcn,1257,1416,5.28,2528.09,2.26,3.39
practice,6476,1346,5.23,6825.5,1.21,3.84


### Lower than most chemicals

The table below summarises the Z scores for the low outlying (i.e., bottom 5) chemicals
in each type of organisation. These are chemicals are seen to be used less often
in a particular organisation than its peers.

In [33]:
### Extracting the summary statistics for the z scores
underused_tmp = underused_summary[underused_summary.index.isin(["z_score"], level=1)]

### Calculating IQR, removing the row index and rounding to 2dp
underused_toprint = ( underused_tmp
         .assign( IQR = underused_tmp["75%"]-underused_tmp["25%"] )
         .droplevel(level=1)
         .round(2) )

underused_toprint.join( all_counts )[metrics_to_show]

Unnamed: 0_level_0,n,chemicals,median,max,min,IQR
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
stp,42,680,-2.35,-1.47,-6.33,0.79
ccg,106,1138,-2.3,-1.33,-10.2,0.82
pcn,1257,1416,-2.18,-1.3,-159.77,0.77
practice,6476,1346,-2.08,-0.99,-307.23,0.81


### Summary

Below is a summary table that combines the 'Higher than most' and 'Lower than most'
results displayed above.

In [34]:
pd.concat([overused_toprint.join( all_counts )[metrics_to_show],
           underused_toprint[metrics_to_show[2:]]],
          keys=["Higher than most", "Lower than most"],axis=1)

Unnamed: 0_level_0,Higher than most,Higher than most,Higher than most,Higher than most,Higher than most,Higher than most,Lower than most,Lower than most,Lower than most,Lower than most
Unnamed: 0_level_1,n,chemicals,median,max,min,IQR,median,max,min,IQR
entity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
stp,42,680,5.42,6.33,2.68,1.64,-2.35,-1.47,-6.33,0.79
ccg,106,1138,5.79,10.2,2.76,3.18,-2.3,-1.33,-10.2,0.82
pcn,1257,1416,5.28,2528.09,2.26,3.39,-2.18,-1.3,-159.77,0.77
practice,6476,1346,5.23,6825.5,1.21,3.84,-2.08,-0.99,-307.23,0.81
