# load library

In [1]:
# auto reloading of local scripts under dev
%load_ext autoreload
%autoreload 2

In [2]:
# relying on these stdlib anyway
import http
import re
import os
import sys
import pandas as pd

In [3]:
# Google Cloud and FISS
from firecloud import api as fapi

from google.cloud import storage
storage_client = storage.Client()

In [4]:
# load local lib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from terra.future.src.table_utils import *

# load data

In [5]:
primary_namespace = 'production-long-reads'
primary_workspace = 'broad-gp-pacbio'
root_data_type='sample'
root_table = \
  fetch_existing_root_table(ns=primary_namespace,
                            ws=primary_workspace,
                            etype=root_data_type)

In [6]:
categorical_columns = {'type': 'category',
                       'columns': ['application', 'experiment_type', 'instrument', 'workspace']}

date_time_columns = {'type': 'datetime64',
                     'columns': []}

boolean_columns = {'type': 'bool',
                   'columns': ['is_ccs', 'is_corrected', 'is_isoseq']}

int_type_columns = {'type': 'int64',
                    'columns': ['insert_size']}

float_type_columns = {'type': 'float64',
                      'columns': ['lod_expected_sample']}

string_type_columns = {'type': 'str',
                       'columns': ['flowcell_id', 'bio_sample', 'description', 'well_sample', 'movie_name', 'well_name', 'sample']}

In [7]:
for n in boolean_columns['columns']:
    root_table[n] = root_table[n].astype('bool')

In [8]:
for n in categorical_columns['columns']:
    root_table[n] = root_table[n].astype('category')

In [9]:
for n in int_type_columns['columns']:
    root_table[n] = root_table[n].astype('int64')

In [10]:
for n in string_type_columns['columns']:
    root_table[n] = root_table[n].astype('str')

In [11]:
for n in float_type_columns['columns']:
    root_table[n] = root_table[n].astype('float64')

In [12]:
desired_columns_in_order = ['flowcell_id', 'bio_sample', 'description', 'well_sample',
                            'raw_est_fold_cov',
                            'lod_expected_sample', 'fingerprint_vcf', 'aligned_bam',
                            'application', 'experiment_type',
                            'is_ccs', 'is_corrected', 'is_isoseq',
                            'instrument', 'movie_name', 'well_name', 'insert_size',
                            'sample', 'workspace']

In [13]:
gmkf_flowcells = root_table.loc[root_table['workspace'].apply(lambda e: 'GMKF' in str(e)),:]

In [16]:
needed_samples = ['SM-JPU9W', 'SM-K6CQJ']

In [18]:
swap_1120 = root_table.loc[root_table['well_sample'].isin(needed_samples),:]

In [23]:
swap_1120[desired_columns_in_order]

Unnamed: 0,flowcell_id,bio_sample,description,well_sample,raw_est_fold_cov,lod_expected_sample,fingerprint_vcf,aligned_bam,application,experiment_type,is_ccs,is_corrected,is_isoseq,instrument,movie_name,well_name,insert_size,sample,workspace
339,DA105374,1-02032,CG0037-9194,SM-JPU9W,1.288426949405146,,,gs://broad-gp-pacbio-outgoing/results/PBFlowce...,hifiReads,CCS,True,True,True,64271e,m64271e_210810_163247,D01,20000,84403350-6cfc-496a-91a3-97bb6fee00bf,
444,DA105098,1-02032,CG0037-9194,SM-JPU9W,4.82098361925469,,,gs://broad-gp-pacbio-outgoing/results/PBFlowce...,hifiReads,CCS,True,True,True,64271e,m64271e_210903_223116,D01,20000,b05ed615-3c54-47b8-af0b-a3cb2ba00201,
655,DA105356,1-09889,CG0037-9170,SM-K6CQJ,8.232344372112829,0.0,,gs://broad-gp-pacbio-outgoing/results/PBFlowce...,hifiReads,CCS,True,True,True,64271e,m64271e_210809_053602,C01,16000,fe968017-2b78-481b-93be-0ec33dfd9d8e,Gabriel_GMKFLRP_Gelb_PacBio_FY20


In [20]:
missing_mercury_sample_ids = \
    swap_1120[['bio_sample', 'description', 'well_sample', 'flowcell_id']]\
        .rename({'bio_sample': 'Collab_Part_ID',
                 'description': 'Collab_SM_ID',
                 'well_sample': 'Broad_LSID',
                 'flowcell_id': 'FCID'}, axis=1)\
        .sort_values(by=['Broad_LSID', 'Collab_Part_ID', 'FCID'], axis=0)
missing_mercury_sample_ids['Broad_LSID'] = missing_mercury_sample_ids['Broad_LSID'].apply(lambda s: re.sub('^SM-', '', s))
missing_mercury_sample_ids

Unnamed: 0,Collab_Part_ID,Collab_SM_ID,Broad_LSID,FCID
444,1-02032,CG0037-9194,JPU9W,DA105098
339,1-02032,CG0037-9194,JPU9W,DA105374
655,1-09889,CG0037-9170,K6CQJ,DA105356


In [21]:
missing_mercury_sample_ids.to_csv('/Users/shuang/Desktop/swap_1120.csv',
                                  sep=',', index=False, header=False)

In [24]:
where_to_upload = swap_1120[['flowcell_id', 'input_bam']].rename({'input_bam': 'upload_location'}, axis=1).reset_index(drop=True)
where_to_upload['upload_location'] = where_to_upload['upload_location'].apply(lambda s: '/'.join(s.split('/')[:-1]))
where_to_upload

Unnamed: 0,flowcell_id,upload_location
0,DA105374,gs://broad-gp-pacbio/r64271e_20210806_181631/4...
1,DA105098,gs://broad-gp-pacbio/r64271e_20210830_132900/4...
2,DA105356,gs://broad-gp-pacbio/r64271e_20210806_181631/3...


In [25]:
where_to_upload.to_csv(f'/Users/shuang/Desktop/fingerprint_crosscheck/SWAP-1120/upload_locations.tsv',
                       sep='\t', header=False, index=False)
os.system(f"cd /Users/shuang/Desktop/fingerprint_crosscheck/SWAP-1120/ && bash upload.sh")

DA105098
DA105374
DA105356


0

In [27]:
fp_vcf_gs_paths = pd.read_csv("/Users/shuang/Desktop/fingerprint_crosscheck/SWAP-1120/uploaded_gs_paths.txt", sep='\t')
fc_2_vcf = dict(zip(fp_vcf_gs_paths.flowcell_id, fp_vcf_gs_paths.fp_vcf))
fc_2_tbi = dict(zip(fp_vcf_gs_paths.flowcell_id, fp_vcf_gs_paths.fp_vcf_tbi))
fc_2_terra_uuid = dict(zip(root_table['flowcell_id'].tolist(), root_table['sample'].tolist()))

In [28]:
fp_vcf_gs_paths


Unnamed: 0,flowcell_id,fp_vcf,fp_vcf_tbi
0,DA105098,gs://broad-gp-pacbio/r64271e_20210830_132900/4...,gs://broad-gp-pacbio/r64271e_20210830_132900/4...
1,DA105374,gs://broad-gp-pacbio/r64271e_20210806_181631/4...,gs://broad-gp-pacbio/r64271e_20210806_181631/4...
2,DA105356,gs://broad-gp-pacbio/r64271e_20210806_181631/3...,gs://broad-gp-pacbio/r64271e_20210806_181631/3...


In [29]:
fc_2_terra_uuid

{'DA103955': '01299f20-7ca3-4fdb-acb7-e98b304fe69b',
 'DA047218': '01300ed1-77fe-41bb-92c2-7907dfb910c5',
 'DA074276': '02acd991-eb6b-4820-bf38-ce4fbff6a6d6',
 'DA005858': '036a73ba-d612-4e3b-834d-738972ca1b30',
 'DA074064': '03777c72-3603-4314-9776-c52e60958340',
 'DA005915': '03f1ba7b-674c-4405-9c4d-9cf9467099ae',
 'DA134109': '04793461-05fc-4361-ba38-e8589e920c13',
 'DA134006': '047d1630-2c74-40ca-827e-4f21c9f9d289',
 'DA105301': '0483a603-24ce-4aa5-b77c-c24de3a3a745',
 'DA105359': '050d60d6-d3d5-45c2-a6ac-6bfbb3e427e3',
 'DA126120': '05162b6b-5e83-4d62-b630-0c1ae71ad9bf',
 'DA112873': '056fd028-5f56-4773-8109-9cff7615a34c',
 'DA026373': '05b4068e-ece1-4238-9cd2-3e6c50fe7bb3',
 'DA012510': '05cc4cac-5c2b-4d0b-950e-d589c69b8908',
 'DA074012': '05e1ae49-033d-4ed8-97f6-074ef7998c7f',
 'DA007812': '05eb7421-22d0-4cc2-afd0-1f8c279b7a4b',
 'DA013045': '0621700a-2a13-4a96-885e-72d7f694d378',
 'DA126144': '066f0d1f-8fb3-4e4e-b25b-5407b029da6c',
 'DA134004': '06fcfb2a-495d-40db-b72c-d237bbe2

In [30]:
def fill_in_new_column(ns: str, ws: str,
                       etype: str, ename: str,
                       attribute_name: str, attribute_value,
                       dry_run: bool = False) -> None:

    response = fapi.get_entity(ns, ws, etype, ename)
    if not response.ok:
        raise FireCloudServerError(response.status_code, response.text)

    cov = {"op":                 "AddUpdateAttribute",
           "attributeName":      attribute_name,
           "addUpdateAttribute": attribute_value}
    operations = [cov]
    if dry_run:
        print(operations)
        return

    response = fapi.update_entity(ns, ws,
                                  etype=etype,
                                  ename=ename,
                                  updates=operations)
    if not response.ok:
        raise FireCloudServerError(response.status_code, response.text)

In [31]:
do_it = True
fp_vcf_gs_paths['flowcell_id'].apply(lambda fcid: fill_in_new_column(primary_namespace, primary_workspace,
                                                                     etype='sample',
                                                                     ename=fc_2_terra_uuid.get(fcid),
                                                                     attribute_name='fingerprint_vcf',
                                                                     attribute_value=fc_2_vcf.get(fcid),
                                                                     dry_run=(not do_it)) if fc_2_vcf.get(fcid) is not None else print(fcid))

0    None
1    None
2    None
Name: flowcell_id, dtype: object

In [32]:
do_it = True
fp_vcf_gs_paths['flowcell_id'].apply(lambda fcid: fill_in_new_column(primary_namespace, primary_workspace,
                                                                     etype='sample',
                                                                     ename=fc_2_terra_uuid.get(fcid),
                                                                     attribute_name='fingerprint_vcf_tbi',
                                                                     attribute_value=fc_2_tbi.get(fcid),
                                                                     dry_run=(not do_it)) if fc_2_tbi.get(fcid) is not None else print(fcid))

0    None
1    None
2    None
Name: flowcell_id, dtype: object

# play

In [14]:
class GcsPath:
    """
    given a gs-path string, return a dict: {'bucket': bucket, 'prefix': prefix, 'file': file}
    """

    def __init__(self, gs_path: str):

        if not gs_path.startswith("gs://"):
            raise ValueError(f"Provided gs path isn't valid: {gs_path}")

        arr = re.sub("^gs://", '', gs_path).split('/')
        self.bucket = arr[0]
        self.prefix = '/'.join(arr[1:-1])
        self.file = arr[-1]

    def exists(self, client: storage.client.Client) -> bool:
        return self.is_file(client=client) or self.is_emulate_dir(client=client)

    def is_file(self, client: storage.client.Client) -> bool:
        return storage.Blob(bucket=client.bucket(self.bucket), name=f'{self.prefix}/{self.file}').exists(client)

    def is_emulate_dir(self, client: storage.client.Client) -> bool:
        if self.is_file(client=client):
            return False
        return any(True for _ in client.list_blobs(client.bucket(self.bucket), prefix=f'{self.prefix}/{self.file}'))

In [15]:
columns_to_check = ['ccs_bam', 'ccs_pbi', 'ccs_report',
                    'fq', 'gcs_input_dir',
                    'input_bam', 'input_pbi',
                    'subreads_bam', 'subreads_pbi']

In [17]:
for col in columns_to_check:
    id_x = gmkf_flowcells[col].apply(lambda gs: True if gs is None or gs=='None' or gs=='nan' else GcsPath(gs).exists(storage_client))
    if not id_x.all():
        print(f"=========================================\n{col}")
        print(gmkf_flowcells.loc[~id_x][['sample', 'bio_sample', 'well_sample', col]])

input_bam
                                  sample bio_sample well_sample  \
38  0ec460ef-b86b-42c7-8e71-b820c7557fab    1-00151    SM-K6JDW   

                                            input_bam  
38  gs://broad-gp-pacbio/r64020e_20211114_004036/1...  
input_pbi
                                  sample bio_sample well_sample  \
38  0ec460ef-b86b-42c7-8e71-b820c7557fab    1-00151    SM-K6JDW   

                                            input_pbi  
38  gs://broad-gp-pacbio/r64020e_20211114_004036/1...  
subreads_bam
                                  sample bio_sample well_sample  \
38  0ec460ef-b86b-42c7-8e71-b820c7557fab    1-00151    SM-K6JDW   

                                         subreads_bam  
38  gs://broad-gp-pacbio/r64020e_20211114_004036/1...  
subreads_pbi
                                  sample bio_sample well_sample  \
38  0ec460ef-b86b-42c7-8e71-b820c7557fab    1-00151    SM-K6JDW   

                                         subreads_pbi  
38  gs://broad-gp-pacb

In [19]:
gmkf_flowcells[gmkf_flowcells['sample'] == '0ec460ef-b86b-42c7-8e71-b820c7557fab']

Unnamed: 0,sample,aligned_bai,aligned_bam,aligned_est_fold_cov,aligned_frac_bases,aligned_num_bases,aligned_num_reads,aligned_pbi,aligned_read_length_N50,aligned_read_length_mean,...,read_qual_mean,read_qual_median,subread_read_length_N50,subread_read_length_mean,subreads_bam,subreads_pbi,total_length,well_name,well_sample,workspace
38,0ec460ef-b86b-42c7-8e71-b820c7557fab,,,,,,,,,,...,,,,,gs://broad-gp-pacbio/r64020e_20211114_004036/1...,gs://broad-gp-pacbio/r64020e_20211114_004036/1...,496777375438,A01,SM-K6JDW,Gabriel_GMKFLRP_Gelb_PacBio_FY20
