# Introduction

Brian wants to see a list of PCR duplicates for some samples sequenced by Hudson Alpha.

In [1]:
import pandas
from pathlib import Path
import sys

In [2]:
# Make the local htsworkflow checkout importable. The sys.path change has to
# happen before the htsworkflow import below, so keep these statements in order.
HTSW = str(Path('~/proj/htsworkflow').expanduser())
if HTSW not in sys.path:
    sys.path.append(HTSW)

from htsworkflow.submission.encoded import ENCODED 

# Client object for the ENCODE portal host.
# NOTE(review): load_netrc() presumably reads credentials for this host from
# the user's netrc file -- confirm against htsworkflow.
# NOTE(review): `server` is not referenced by any of the cells shown here.
server = ENCODED('www.encodeproject.org')
server.load_netrc()

In [3]:
# Input spreadsheets: Brian's PCR-duplicate request and the stranded-bulk
# submission workbook. Both live under the user's home directory, so expand '~'.
pcr_request_xlsx, stranded_bulk_xlsx = (
    Path(location).expanduser()
    for location in (
        '~/woldlab/ENCODE/HudsonAlpha_sequencing_run_April_2020_brian_pcr_request.xlsx',
        '~/woldlab/ENCODE/stranded-bulk_hudson_alpha_April22_2020-files-created.xlsx',
    )
)

In [4]:
# Brian's request: read with pandas' default sheet and header settings.
pcr_request = pandas.read_excel(pcr_request_xlsx)

# Submission workbook: only the 'Library' worksheet is needed, with its first
# row used as the column names.
stranded_bulk = pandas.read_excel(
    stranded_bulk_xlsx,
    sheet_name='Library',
    header=0,
)

In [20]:
# Scratch check: label of the first entry in `stranded` that has non-missing data.
# NOTE(review): in the cells shown here, `stranded` is first assigned inside the
# lookup loop of a later cell (In [23]), so this cell raises NameError under
# Restart Kernel -> Run All. The recorded output presumably came from earlier
# kernel state -- move this cell below the loop or delete it.
stranded.first_valid_index()

98

In [23]:
# Resolve every cDNA sample in the PCR request to a library id taken from the
# stranded-bulk 'Library' sheet.
#
# For each request row, select the stranded_bulk rows whose 'cdna_sample:skip'
# equals the request's 'Wold cDNA #', take the first row that has data, and
# keep the part of its 'aliases:array' value that follows the ':'.
#
# Raises KeyError (naming the sample) when a requested cDNA sample has no
# matching row in stranded_bulk.
library_ids = []
for _, row in pcr_request.iterrows():
    cdna_id = row['Wold cDNA #']
    stranded = stranded_bulk[stranded_bulk['cdna_sample:skip'] == cdna_id]
    first_index = stranded.first_valid_index()
    # first_valid_index() returns None when nothing matched; report the
    # offending sample instead of the opaque KeyError that .loc[None] raises.
    if first_index is None:
        raise KeyError(
            'No stranded_bulk row has cdna_sample:skip == {!r}'.format(cdna_id)
        )
    stranded = stranded.loc[first_index]
    alias = stranded['aliases:array']
    # The alias is unpacked as exactly two ':'-separated parts; the second
    # part is used as the library id.
    _, library_id = alias.split(':')
    library_ids.append(library_id)

In [34]:
# Emit a shell one-liner that reports, for every library id, whether its
# GRCh38-V24-male genome BAM exists under <id>_100/. Doubled braces are
# str.format escapes that come out as literal ${a} in the printed command.
bam_check_command = 'for a in {}; do f=${{a}}_100/${{a}}_100-GRCh38-V24-male_genome.bam ; if [ -e $f ] ; then echo $f present; else echo $f missing; fi; done'
shell_word_list = ' '.join(library_ids)
print(bam_check_command.format(shell_word_list))

for a in SL428079_C1 SL428090_C2 SL428097_C3 SL428098_C4 SL428099_C5 SL428100_C6 SL428101_C7 SL428102_C8 SL428103_C9 SL428069_C10 SL428070_C11 SL428071_C12 SL428072_E1 SL428073_E2 SL428074_E3 SL428075_E4 SL428076_E5 SL428077_E7 SL428080_E9 SL428081_E10 SL428082_E11 SL428083_E12 SL428084_F1 SL428085_F2 SL428086_F3 SL428087_F4 SL428088_F5 SL428089_F6 SL428091_F7 SL428092_F8 SL428093_F9 SL428094_F10 SL428095_F11 SL428096_F12; do f=${a}_100/${a}_100-GRCh38-V24-male_genome.bam ; if [ -e $f ] ; then echo $f present; else echo $f missing; fi; done


In [35]:
# Per-library text fragment: one JOB line pointing at the picard-markdup
# condor submit file, followed by the VARS lines that parameterise that job.
template = """JOB {library_id}-ENC11_picard-markdup /woldlab/loxcyc/home/diane/proj/long-rna-seq-condor/woldrnaseq/picard-markdup.condor


VARS {library_id}-ENC11_picard-markdup  analysis_name="{library_id}"
VARS {library_id}-ENC11_picard-markdup  curdir="{library_id}"
VARS {library_id}-ENC11_picard-markdup  genome_root="/woldlab/loxcyc/home/diane/proj/genome/"
VARS {library_id}-ENC11_picard-markdup  genome="GRCh38" annotation="V24" sex="male" 

"""

# One rendered fragment per library, in the same order as library_ids.
dagman = [template.format(library_id=name) for name in library_ids]


In [37]:
# Write the rendered fragments back to back (no separator) so the output can
# be copied as a single block of text.
print(*dagman, sep='')

JOB SL428079_C1-ENC11_picard-markdup /woldlab/loxcyc/home/diane/proj/long-rna-seq-condor/woldrnaseq/picard-markdup.condor


VARS SL428079_C1-ENC11_picard-markdup  analysis_name="SL428079_C1"
VARS SL428079_C1-ENC11_picard-markdup  curdir="SL428079_C1"
VARS SL428079_C1-ENC11_picard-markdup  genome_root="/woldlab/loxcyc/home/diane/proj/genome/"
VARS SL428079_C1-ENC11_picard-markdup  genome="GRCh38" annotation="V24" sex="male" 

JOB SL428090_C2-ENC11_picard-markdup /woldlab/loxcyc/home/diane/proj/long-rna-seq-condor/woldrnaseq/picard-markdup.condor


VARS SL428090_C2-ENC11_picard-markdup  analysis_name="SL428090_C2"
VARS SL428090_C2-ENC11_picard-markdup  curdir="SL428090_C2"
VARS SL428090_C2-ENC11_picard-markdup  genome_root="/woldlab/loxcyc/home/diane/proj/genome/"
VARS SL428090_C2-ENC11_picard-markdup  genome="GRCh38" annotation="V24" sex="male" 

JOB SL428097_C3-ENC11_picard-markdup /woldlab/loxcyc/home/diane/proj/long-rna-seq-condor/woldrnaseq/picard-markdup.condor


VARS SL428097_C3-E