In [1]:
import pandas as pd
import numpy as np
import os
import re
import json
import pytz
import datetime
from simple_salesforce import Salesforce, SalesforceLogin, SFType

import presto  # import prestosql python client
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
warnings.simplefilter('ignore', InsecureRequestWarning)

In [2]:
# Load the Salesforce credentials from a JSON file kept outside the notebook.
# The context manager closes the file handle as soon as the JSON is parsed
# (the bare open() used previously left the handle open).
with open('/ghds/groups/labdesk/bshih/salesforce_login.json') as login_file:
    logininfo = json.load(login_file)

username = logininfo['username']
password = logininfo['password']
security_token = logininfo['security_token']
# NOTE(review): `domain` is defined but never passed to Salesforce() below;
# kept so the notebook namespace is unchanged -- confirm whether it is needed.
domain = 'login'

# Authenticated Salesforce client used by the site-name lookups later on.
sf = Salesforce(username=username, password=password, security_token=security_token)

In [3]:
# Presto service-account credentials.
# The password must not live in the notebook: it is read from the
# PRESTO_PASSWORD environment variable and a KeyError is raised if it is unset.
# NOTE(review): the password that used to be embedded in this cell should be
# rotated, since it has been stored in plain text.
uid = "_svc_acs_presto"
pwd = os.environ['PRESTO_PASSWORD']

# NOTE(review): verify=False disables TLS certificate verification for this
# connection -- confirm that this is still required.
conn = presto.dbapi.connect(host='acs-exports.bi.ghdna.io',
                            port=8446,
                            user=uid,
                            catalog='hive',
                            schema='default',
                            http_scheme='https',
                            verify=False,
                            auth=presto.auth.BasicAuthentication(uid, pwd))
# Shared cursor reused by every Presto query in this notebook.
cur = conn.cursor()

In [4]:
# Fetch the (RNA buffer plate, retain plate) pair stored in the input data of
# every 'Scanning Swab' operation.
rbp_srp_query = '''SELECT CAST(json_extract(acs_default_cv19.public.operation_operation.input_data, '$.inventory["labware"]["RNA Buffer Plate ID"]') AS VARCHAR),
                  CAST(json_extract(acs_default_cv19.public.operation_operation.input_data, '$.retain_data["retain_plate"]') AS VARCHAR)
           FROM acs_default_cv19.public.operation_operation
           WHERE name = 'Scanning Swab'
        '''
rbp_srp = list(cur.execute(rbp_srp_query))

# RBP -> SRP lookup table: incomplete pairs and repeated pairs are discarded.
plate_pairs = pd.DataFrame(rbp_srp, columns=['RBP', 'SRP'])
link = plate_pairs.dropna().drop_duplicates()

## All Negative Plates

In [5]:
# Load the per-sample call table.
# NOTE: pickle files can execute arbitrary code on load; only read trusted files.
master = pd.read_pickle(
    '/ghds/groups/labdesk/bshih/c19dash/c19_dashboard/c19_call.pickle')

# Row filters, all evaluated against the freshly loaded table:
#  * drop rows whose pos_tube_rack matches the A6/B5 pattern; rows with a
#    missing pos_tube_rack are dropped as well (na=True marks them as matches)
#  * keep run_sample_id values starting with 'G' or 'H' ...
#  * ... except those starting with 'Ht'
# NOTE(review): the pattern matches 'A6'/'B5' anywhere in the string, so a
# plate ID containing those characters is excluded too -- confirm intent.
excluded_position = master['pos_tube_rack'].str.match(r'.*?(A6|B5)', na=True)
wanted_prefix = master['run_sample_id'].str.startswith(('G', 'H'), na=False)
ht_prefix = master['run_sample_id'].str.startswith('Ht', na=False)

# .copy() makes the filtered table independent of the loaded one, so the
# column assignments below do not trigger SettingWithCopyWarning.
master = master[~excluded_position & wanted_prefix & ~ht_prefix].copy()

# RBP is the text before the first ':' in pos_tube_rack; Date is parsed from
# the first six characters of runid.
master['RBP'] = master['pos_tube_rack'].str.extract(r'(.*?):.*', expand=False)
master['Date'] = pd.to_datetime(master['runid'].str[:6], yearfirst=True)

# Attach the retain plate (SRP) and keep only rows that have one.
# NOTE(review): `link` is de-duplicated on (RBP, SRP) pairs, so an RBP linked
# to several SRPs would duplicate sample rows here -- confirm this cannot occur.
master = master.merge(link, on='RBP', how='left').dropna(subset=['SRP'])

# Per-row indicator columns that are summed per plate below.
master['total_not_detected'] = master['call'].map({'not_detected': 1,
                                                   'detected': 0,
                                                   'no_call': 0})
master['num_of_samples'] = master['sample_type'].map({'Sample': 1})

srp = master.groupby('RBP', sort=False).agg({'num_of_samples': 'sum',
                                             'total_not_detected': 'sum',
                                             'SRP': 'max',
                                             'Date': 'max'})

# Keep plates where every counted sample is not_detected and there are more
# than 70 of them, newest first.
all_negative = srp['num_of_samples'] == srp['total_not_detected']
enough_samples = srp['total_not_detected'] > 70
srp = (srp[all_negative & enough_samples]
       .reset_index()
       .sort_values(['Date', 'SRP'], ascending=[False, True]))

srp

Unnamed: 0,RBP,num_of_samples,total_not_detected,SRP,Date
18,RBP201202A239,92,92,SRP200909A414,2021-01-31
16,RBP201202A071,92,92,SRP200909A415,2021-01-31
17,RBP201202A206,92,92,SRP200909A417,2021-01-31
215,RBP201202A212,92,92,SRP200909A409,2021-01-29
214,RBP201202A059,92,92,SRP200909A444,2021-01-29
...,...,...,...,...,...
187,RBP200702A081,92,92,SRP200728A027,2020-08-18
186,RBP200702A054,92,92,SRP200728A029,2020-08-18
103,RBP200727A040,72,72,SRP200728A084,2020-08-11
102,RBP200727A074,82,82,SRP200728A107,2020-08-11


#### SRP with Run Sample ID for PQ

In [6]:
# Retain plates (SRP IDs) whose run sample IDs are listed in the next cell.
SRP = [
    'SRP200910A107',
    'SRP200910A104',
    'SRP200910A210',
    'SRP200910A198',
    'SRP200910A204',
    'SRP200910A208',
    'SRP200910A203',
    'SRP200910A133',
    'SRP200901A344',
    'SRP200910A170',
    'SRP200910A185',
    'SRP200910A175',
]

In [7]:
# Run sample IDs on the selected retain plates, ordered by retain plate and
# then by buffer plate.
on_pq_plate = master['SRP'].isin(SRP)
master.loc[on_pq_plate].sort_values(by=['SRP', 'RBP'])[['SRP', 'run_sample_id']]

Unnamed: 0,SRP,run_sample_id
60726,SRP200901A344,HBR0723
60727,SRP200901A344,HBT0053
60728,SRP200901A344,HBJ0816
60729,SRP200901A344,HBQ0211
60730,SRP200901A344,HBT0389
...,...,...
80390,SRP200910A210,HBQ0872
80391,SRP200910A210,HBK1148
80392,SRP200910A210,HBT0901
80393,SRP200910A210,HBX0595


# >20K G19 Score Samples Various Sites
 - SRP200909A025 was not used because the plate could not be found.

In [8]:
# The 70 most recent samples whose median_covid_ratio exceeds 20000; columns
# that are entirely empty for that subset are removed.
high_g19 = (
    master
    .query("median_covid_ratio > 20000")
    .dropna(axis=1, how='all')
    .sort_values(by='Date', ascending=False)
    .head(70)
)

In [9]:
# Look up the Salesforce Order record(s) for each high-scoring sample,
# one SOQL query per distinct run_sample_id.
site_names = []
for i in high_g19['run_sample_id'].unique():
    # `or []` keeps the loop going if a response carries no 'records' entry.
    site_names.extend(sf.query_all(f"SELECT GH_Sample_ID__c, Site_Name__c\
                                     FROM Order WHERE GH_Sample_ID__c = '{i}'").get('records') or [])

# Select the two queried fields explicitly instead of dropping 'attributes':
# the result has the same two columns, and an empty result set yields an empty
# frame with the expected columns rather than a KeyError.
dataframe = pd.DataFrame(site_names, columns=['GH_Sample_ID__c', 'Site_Name__c'])
df = dataframe.rename(columns={'GH_Sample_ID__c': 'run_sample_id',
                               'Site_Name__c': 'Site Name'})

In [10]:
# For every high-scoring sample, read its entry in the sample_retain_map of
# the operation row(s) recorded for its retain plate.
barcode = {}
for i,j in zip(high_g19['run_sample_id'], high_g19['SRP']):
    for r in cur.execute(
            f'''SELECT json_extract(acs_default_cv19.public.operation_operation.input_data, '$.retain_data.sample_retain_map.{i}')
               FROM acs_default_cv19.public.operation_operation
               WHERE CAST(json_extract(acs_default_cv19.public.operation_operation.input_data, '$.retain_data["retain_plate"]') AS VARCHAR) = '{j}'
            '''):
        # A row without a usable value for this sample yields None here;
        # skip it instead of letting json.loads(None) raise TypeError.
        if r[0] is None:
            continue
        # If several rows match, the last one wins.
        barcode[i] = json.loads(r[0])

# One row per sample: the keys of each decoded JSON object become columns.
retain_barcode = pd.DataFrame.from_dict(barcode).transpose().reset_index().rename(columns={'index':'run_sample_id'})

In [11]:
# Add the site name and the retain barcode details to each high-scoring
# sample, ordered by site name.
samples_with_site = high_g19.merge(df, on='run_sample_id', how='left')
sites = (samples_with_site
         .merge(retain_barcode, on='run_sample_id', how='left')
         .sort_values(by='Site Name'))

# Every site except Healing Grove, newest run first.
not_healing_grove = sites['Site Name'] != 'Healing Grove'
sites[not_healing_grove].sort_values(by=['runid', 'SRP'], ascending=[False, True])

Unnamed: 0,runid,run_sample_id,sample_type,pos_tube_rack,parameter_set,replicates_count,replicates_detected,replicates_not_detected,replicates_no_call,median_covid_ratio,call,flags,RBP,Date,SRP,total_not_detected,num_of_samples,Site Name,retain_barcode,retain_position
0,210128_NB552482_0011_AHNY2FBGXG,HD20780,Sample,RBP201202A200:D2,Guardant19-RTPCR,3.0,3.0,0.0,0.0,24307.16667,detected,pass,RBP201202A200,2021-01-28,SRP200909A446,0,1,Impossible Foods,4052446217,D2
1,210127_NB552470_0008_AHNY2GBGXG,HDD0016,Sample,RBP201202A050:F7,Guardant19-RTPCR,3.0,3.0,0.0,0.0,38756.5,detected,pass,RBP201202A050,2021-01-27,SRP200909A041,0,1,Delaware State University,4052463526,F7
21,210115_NB552482_0009_AHJ2VYBGXG,HCB0333,Sample,RBP201202A108:G3,Guardant19-RTPCR,3.0,3.0,0.0,0.0,29601.0,detected,pass,RBP201202A108,2021-01-15,SRP200909A368,0,1,Delaware State University,4052436942,G3
26,210114_NB552478_0012_AHJ2V7BGXG,HD90487,Sample,RBP201202A218:E10,Guardant19-RTPCR,3.0,3.0,0.0,0.0,30139.0,detected,pass,RBP201202A218,2021-01-14,SRP200909A021,0,1,QRMD,4052270269,E10
62,210105_NB552393_0054_AHHYTTBGXG,HCG0635,Sample,RBP201112A160:B8,Guardant19-RTPCR,3.0,3.0,0.0,0.0,52799.0,detected,pass,RBP201112A160,2021-01-05,SRP201026A379,0,1,Lincoln University,4052443223,B8


In [12]:
# Healing Grove samples, leaving out retain plate SRP200909A025, newest run first.
healing_grove = sites[sites['Site Name'] == 'Healing Grove']
(healing_grove
 .query("SRP != 'SRP200909A025'")
 .sort_values(by=['runid', 'SRP', 'retain_barcode'], ascending=[False, True, True]))

Unnamed: 0,runid,run_sample_id,sample_type,pos_tube_rack,parameter_set,replicates_count,replicates_detected,replicates_not_detected,replicates_no_call,median_covid_ratio,call,flags,RBP,Date,SRP,total_not_detected,num_of_samples,Site Name,retain_barcode,retain_position
3,210126_NB552478_0014_AHNV7JBGXG,HDE0017,Sample,RBP201202A257:C7,Guardant19-RTPCR,3.0,3.0,0.0,0.0,39462.0,detected,pass,RBP201202A257,2021-01-26,SRP200909A382,0,1,Healing Grove,4052435362,C7
2,210126_NB552478_0014_AHNV7JBGXG,HDE0059,Sample,RBP201202A257:E8,Guardant19-RTPCR,3.0,3.0,0.0,0.0,22451.0,detected,pass,RBP201202A257,2021-01-26,SRP200909A382,0,1,Healing Grove,4052435387,E8
4,210126_NB552478_0014_AHNV7JBGXG,HDD0686,Sample,RBP201202A255:A3,Guardant19-RTPCR,3.0,2.0,0.0,1.0,23095.39285,detected,pass,RBP201202A255,2021-01-26,SRP200909A383,0,1,Healing Grove,4052435430,A3
5,210121_NB552470_0007_AHNV5VBGXG,HD80065,Sample,RBP201202A029:C8,Guardant19-RTPCR,3.0,3.0,0.0,0.0,23006.66667,detected,pass,RBP201202A029,2021-01-21,SRP200909A422,0,1,Healing Grove,4052469155,C8
10,210121_NB552470_0007_AHNV5VBGXG,HD90230,Sample,RBP201202A028:B9,Guardant19-RTPCR,3.0,3.0,0.0,0.0,43364.0,detected,pass,RBP201202A028,2021-01-21,SRP200909A424,0,1,Healing Grove,4052468952,B9
8,210121_NB552470_0007_AHNV5VBGXG,HDD0752,Sample,RBP201202A028:B12,Guardant19-RTPCR,3.0,3.0,0.0,0.0,37462.0,detected,pass,RBP201202A028,2021-01-21,SRP200909A424,0,1,Healing Grove,4052468955,B12
9,210121_NB552470_0007_AHNV5VBGXG,HD90329,Sample,RBP201202A028:H11,Guardant19-RTPCR,3.0,3.0,0.0,0.0,51863.0,detected,pass,RBP201202A028,2021-01-21,SRP200909A424,0,1,Healing Grove,4052469026,H11
7,210121_NB552470_0007_AHNV5VBGXG,HD80534,Sample,RBP201202A183:D5,Guardant19-RTPCR,3.0,3.0,0.0,0.0,30176.0,detected,pass,RBP201202A183,2021-01-21,SRP200909A496,0,1,Healing Grove,4052462060,D5
6,210121_NB552470_0007_AHNV5VBGXG,HDA0035,Sample,RBP201202A183:E7,Guardant19-RTPCR,3.0,3.0,0.0,0.0,29202.0,detected,pass,RBP201202A183,2021-01-21,SRP200909A496,0,1,Healing Grove,4052462074,E7
17,210119_NB552398_0061_AHNTLMBGXG,HD90146,Sample,RBP201202A113:A4,Guardant19-RTPCR,3.0,3.0,0.0,0.0,29395.0,detected,pass,RBP201202A113,2021-01-19,SRP200909A356,0,1,Healing Grove,4052458759,A4


## >30% Positive FAST Clinical Concordance

In [13]:
# Per-sample table that is merged onto the positive samples in a later cell.
salesforce = pd.read_csv(
    '/ghds/groups/labdesk/bshih/c19dash/c19_dashboard/c19_tat.csv')

# 1 for a 'detected' call, 0 for 'not_detected' / 'no_call'.
master['total_detected'] = master['call'].map({'not_detected': 0,
                                               'detected': 1,
                                               'no_call': 0})

# Bucket median_covid_ratio into five bins and one-hot encode the bucket.
labels = ['0-0.01', '0.01-1', '1-10', '10-100', '>100']
master['category'] = pd.cut(master['median_covid_ratio'],
                        [-np.inf, 0.01, 1, 10, 100, np.inf], labels=labels)
dummies = pd.get_dummies(master.category)

# Remove bin columns left over from an earlier execution of this cell before
# appending the fresh ones; otherwise a re-run duplicates the bin columns.
# On a first run the drop is a no-op.
master = pd.concat([master.drop(columns=labels, errors='ignore'), dummies],
                   axis=1).dropna(how='all', axis=1)

# Per-plate totals: sample count, detected count and the count in each bin.
plate_aggregations = {'num_of_samples': 'sum',
                      'total_detected': 'sum',
                      'SRP': 'max',
                      'Date': 'max'}
plate_aggregations.update({label: 'sum' for label in labels})
binning = master.groupby('RBP', sort=False).agg(plate_aggregations)

# Plates with more than 10 detected samples, dated strictly between
# 2020-10-01 and 2020-12-31 (both bounds exclusive).
# NOTE(review): the section heading says ">30% Positive" but the filter is an
# absolute count (> 10 detected) -- confirm which criterion is intended.
many_detected = binning['total_detected'] > 10
in_window = (binning['Date'] > '2020-10-01') & (binning['Date'] < '2020-12-31')
binning = (binning[many_detected & in_window]
           .reset_index()
           .sort_values(['Date', 'SRP'], ascending=[False, True]))

In [14]:
# Column totals of the per-sample indicator and bin columns.
# The columns are named explicitly rather than sliced by position (15 onward):
# the position depends on how many columns earlier steps dropped, and the
# positional slice also includes the categorical `category` column, which
# sum() cannot add up.
indicator_columns = ['total_not_detected', 'num_of_samples', 'total_detected',
                     '0-0.01', '0.01-1', '1-10', '10-100', '>100']
master[indicator_columns].sum()

total_not_detected    75770
num_of_samples        79242
total_detected         2666
0-0.01                76576
0.01-1                  911
1-10                    313
10-100                  267
>100                   1175
dtype: int64

In [15]:
# Display the unfiltered call table straight from the pickle for reference.
call_pickle_path = '/ghds/groups/labdesk/bshih/c19dash/c19_dashboard/c19_call.pickle'
pd.read_pickle(call_pickle_path)

Unnamed: 0,runid,run_sample_id,sample_type,pos_tube_rack,parameter_set,replicates_count,replicates_detected,replicates_not_detected,replicates_no_call,median_covid_ratio,...,barcode,covid_count,rnase_count,spikein_count,unknown,lamp_barcode,coverage,read_count,covid_n2_count,covid_s2_count
0,201212_NB552398_0053_AHHGFGBGXG,HBU0161,Sample,RBP201112A099:G6,Guardant19-RTPCR,3.0,0.0,3.0,0.0,0.00000,...,,,,,,,,,,
1,201212_NB552398_0053_AHHGFGBGXG,HBU0410,Sample,RBP201112A099:C3,Guardant19-RTPCR,3.0,0.0,2.0,1.0,0.00000,...,,,,,,,,,,
2,201212_NB552398_0053_AHHGFGBGXG,HBU0517,Sample,RBP201112A099:C9,Guardant19-RTPCR,3.0,0.0,3.0,0.0,0.00000,...,,,,,,,,,,
3,201212_NB552398_0053_AHHGFGBGXG,HBV0045,Sample,RBP201112A099:A4,Guardant19-RTPCR,3.0,1.0,2.0,0.0,0.00018,...,,,,,,,,,,
4,201212_NB552398_0053_AHHGFGBGXG,HCB0688,Sample,RBP201112A099:C7,Guardant19-RTPCR,3.0,0.0,3.0,0.0,0.00004,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,200831_NB551650_0118_AHH3LKBGXG,N2007220302,NTC,RBP200727B222:A1,Guardant19-RTPCR,3.0,0.0,3.0,0.0,0.00000,...,,,,,,,,,,
40,200831_NB551650_0118_AHH3LKBGXG,P2007220491,Positive,RBP200727B222:B1,Guardant19-RTPCR,3.0,3.0,0.0,0.0,3.29434,...,,,,,,,,,,
41,200831_NB551650_0118_AHH3LKBGXG,P2008190026,Sample,RBP200727B222:C1,Guardant19-RTPCR,3.0,0.0,0.0,3.0,0.00000,...,,,,,,,,,,
42,200831_NB551650_0118_AHH3LKBGXG,P2008200227,Sample,RBP200727B222:D1,Guardant19-RTPCR,3.0,0.0,0.0,3.0,0.00000,...,,,,,,,,,,


In [20]:
# Selected plates in chronological order, ties broken by buffer plate ID.
binning.sort_values(['Date', 'RBP'], ascending=True)

Unnamed: 0,RBP,num_of_samples,total_detected,SRP,Date,0-0.01,0.01-1,1-10,10-100,>100
3,RBP200828A305,87,12,SRP200901A349,2020-10-20,75,2,2,0,8
45,RBP200828A123,92,13,SRP200910A127,2020-10-27,79,3,0,4,6
26,RBP201022A161,92,12,SRP201026A134,2020-11-07,80,4,5,1,2
25,RBP201022A115,87,30,SRP201026A097,2020-11-10,57,15,2,2,11
44,RBP201022A028,63,14,SRP201026A224,2020-11-11,49,2,1,3,8
43,RBP201022A118,92,16,SRP201026A091,2020-11-14,76,5,4,2,5
38,RBP201022A047,92,13,SRP200910A022,2020-11-17,79,4,3,2,4
39,RBP201022A050,92,20,SRP200910A162,2020-11-17,72,4,2,1,13
40,RBP201022A040,92,13,SRP200910A010,2020-11-19,79,1,3,2,7
41,RBP201022A063,92,16,SRP200910A066,2020-11-21,76,6,4,1,5


In [17]:
# Buffer plates that passed the binning filter; used to select samples below.
RBP = binning['RBP'].unique()

# Total number of detected samples across those plates. Only the needed column
# is summed: summing the whole frame also tries to add up the string and
# datetime columns, which is wasteful and fails on recent pandas versions.
binning['total_detected'].sum()

912

In [18]:
# Detected samples dated after 2020-10-01 that sit on one of the selected
# buffer plates, with the Salesforce table joined on run_sample_id.
is_detected = master['total_detected'] == 1
after_cutoff = master['Date'] > '2020-10-01'
on_selected_plate = master['RBP'].isin(RBP)

only_pos = (master[is_detected & after_cutoff & on_selected_plate]
            .sort_values(by=['run_sample_id', 'Date'], ascending=False)
            .merge(salesforce, on='run_sample_id', how='left'))
# Optional export (disabled): only_pos.to_csv('retain_plates.csv', index=False)

In [19]:
# Row count of the positive-sample table.
len(only_pos.index)

912