In [1]:
# script: create_runbatch_config
# authors: Olga Botvinnik & Lincoln Harris
# date: 10.11.18
#
# Trying to build the input file to give tracer_pipeline.rf (required for batch mode run)

In [1]:
# get all of the run prefixes w/in immuneCells_9.27
bucketPrefixes = 's3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031'
f = 'nonImmune_bams_9.27_prefixes.txt'
! aws s3 ls $bucketPrefixes > $f
! cat $f

                           PRE 181031/


In [2]:
# read run names into a dataframe
#     with pandas!!
import pandas as pd
pd.options.display.max_colwidth = 500 # module config? 

runs_df = pd.read_table(f, delim_whitespace=True, header=None, names=['is_prefix', 'run_name'])
runs_df

Unnamed: 0,is_prefix,run_name
0,PRE,181031/


In [3]:
# can i add a full_path col? 
runs_df['full_path'] = 's3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031' + runs_df['run_name']
runs_df

Unnamed: 0,is_prefix,run_name,full_path
0,PRE,181031/,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031181031/


In [4]:
# get all of the cells in a given run directory
prefix = 's3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031'
txt = 'runX_cells.txt'
! aws s3 ls $prefix > $txt
! cat $txt

                           PRE 181031/


In [5]:
# read 180226 cell names into a dataframe
cells_df = pd.read_table(txt, delim_whitespace=True, header=None, names=['is_prefix', 'cell_name'])
cells_df

Unnamed: 0,is_prefix,cell_name
0,PRE,181031/


In [9]:
# ls one of our s3 cell directories
test_files = ! aws s3 ls $prefix\P5_B001799/ # what does backslash do? 
test_files

[]

In [13]:
# get full s3 paths for fastq file (R1), then add them to a new col in cells_df

def get_fastqs_R1(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    fq_line = [x for x in lines if x.endswith('R1_001.fastq.gz')][0] # get the fastq files, specifically
    fq_basename = fq_line.split()[-1]
    return f'{s3_location}{fq_basename}'


cells_df['input_fq_1'] = cells_df['cell_name'].map(get_fastqs_R1) # applying function, and assigning output to new col in cells_df
cells_df.head()

Unnamed: 0,is_prefix,cell_name,input_fq,input_fq_1
0,PRE,A1_B001800/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz
1,PRE,A21_1001000366/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz
2,PRE,A21_B003049/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz
3,PRE,A2_B001797/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz
4,PRE,A2_B001798/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz


In [19]:
# get full s3 paths for fastq file (R2), then add them to a new col in cells_df

def get_fastqs_R2(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    try:
        fq_line = [x for x in lines if x.endswith('R2_001.fastq.gz')][0] # get the fastq files, specifically
        fq_basename = fq_line.split()[-1]
        #print(s3_location)
        return f'{s3_location}{fq_basename}'
    except IndexError:
        return

cells_df['input_fq_2'] = cells_df['cell_name'].map(get_fastqs_R2) # applying function, and assigning output to new col in cells_df
cells_df.head()

Unnamed: 0,is_prefix,cell_name,input_fq,input_fq_1,sample_id,input_fq_2
0,PRE,A1_B001800/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,A1_B001800,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R2_001.fastq.gz
1,PRE,A21_1001000366/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,A21_1001000366,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R2_001.fastq.gz
2,PRE,A21_B003049/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,A21_B003049,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R2_001.fastq.gz
3,PRE,A2_B001797/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,A2_B001797,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R2_001.fastq.gz
4,PRE,A2_B001798/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,A2_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R2_001.fastq.gz


In [20]:
# add a sample_id col
cells_df['sample_id'] = cells_df.cell_name.str.strip('/') # getting rid of the forward slashes
cells_df.head()

Unnamed: 0,is_prefix,cell_name,input_fq,input_fq_1,sample_id,input_fq_2
0,PRE,A1_B001800/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,A1_B001800,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R2_001.fastq.gz
1,PRE,A21_1001000366/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,A21_1001000366,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R2_001.fastq.gz
2,PRE,A21_B003049/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,A21_B003049,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R2_001.fastq.gz
3,PRE,A2_B001797/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,A2_B001797,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R2_001.fastq.gz
4,PRE,A2_B001798/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,A2_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R2_001.fastq.gz


In [22]:
# building the output vcf string
import os

cells_df['output_prefix'] = 's3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/'
cells_df.head()

Unnamed: 0,is_prefix,cell_name,input_fq,input_fq_1,sample_id,input_fq_2,output_prefix
0,PRE,A1_B001800/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,A1_B001800,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
1,PRE,A21_1001000366/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,A21_1001000366,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
2,PRE,A21_B003049/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,A21_B003049,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
3,PRE,A2_B001797/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,A2_B001797,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
4,PRE,A2_B001798/,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,A2_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/


In [23]:
# subset cells_df by only what we want
cols_to_keep = ['sample_id', 'input_fq_1', 'input_fq_1', 'output_prefix']

samples_df = cells_df[cols_to_keep]
samples_df

Unnamed: 0,sample_id,input_fq_1,input_fq_1.1,output_prefix
0,A1_B001800,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
1,A21_1001000366,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
2,A21_B003049,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
3,A2_B001797,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
4,A2_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
5,A2_B001799,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001799/A2_B001799_S194_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001799/A2_B001799_S194_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
6,A2_B003785,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B003785/A2_B003785_S146_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B003785/A2_B003785_S146_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
7,A3_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A3_B001798/A3_B001798_S111_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A3_B001798/A3_B001798_S111_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
8,A4_1001000362,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A4_1001000362/A4_1001000362_S4_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A4_1001000362/A4_1001000362_S4_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
9,A4_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A4_B001798/A4_B001798_S112_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A4_B001798/A4_B001798_S112_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/


In [30]:
# rename cols? 
samples_df.columns = ['sample_id','input_fq1','input_fq2', 'output_dir']
samples_df.head()

Unnamed: 0,sample_id,input_fq1,input_fq2,output_dir,id
0,A1_B001800,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A1_B001800
1,A21_1001000366,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A21_1001000366
2,A21_B003049,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A21_B003049
3,A2_B001797,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A2_B001797
4,A2_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A2_B001798


In [31]:
# create ID col
#samples_df['id'] = samples_df['sample_id']
samples_df.head()

Unnamed: 0,sample_id,input_fq1,input_fq2,output_dir,id
0,A1_B001800,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A1_B001800
1,A21_1001000366,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A21_1001000366
2,A21_B003049,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A21_B003049
3,A2_B001797,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A2_B001797
4,A2_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/,A2_B001798


In [35]:
samples_df = samples_df[['id', 'sample_id', 'input_fq1', 'input_fq2', 'output_dir']]
samples_df.head()

Unnamed: 0,id,sample_id,input_fq1,input_fq2,output_dir
0,A1_B001800,A1_B001800,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A1_B001800/A1_B001800_S277_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
1,A21_1001000366,A21_1001000366,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_1001000366/A21_1001000366_S57_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
2,A21_B003049,A21_B003049,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A21_B003049/A21_B003049_S81_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
3,A2_B001797,A2_B001797,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001797/A2_B001797_S26_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
4,A2_B001798,A2_B001798,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/180226/A2_B001798/A2_B001798_S110_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/


In [21]:
## CAN RUN EVERYTHING FROM RIGHT HERE!
## can we make a function or a class to do everything we just did? 
import os
import pandas as pd
pd.options.display.max_colwidth = 500 # module config? 
pd.options.mode.chained_assignment = None  # disable warning message? 

# get_fastqs_R1()
#      get full s3 paths for fastq file (R1), then add them to a new col in cells_df
def get_fastqs_R1(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    try:
        fq_line = [x for x in lines if x.endswith('R1_001.fastq.gz')][0] # get the fastq files, specifically
        fq_basename = fq_line.split()[-1]
        return f'{s3_location}{fq_basename}'
    except IndexError:
        return
    
# get_fastqs_R2()
# get full s3 paths for fastq file (R2), then add them to a new col in cells_df
def get_fastqs_R2(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    try:
        fq_line = [x for x in lines if x.endswith('R2_001.fastq.gz')][0] # get the fastq files, specifically
        fq_basename = fq_line.split()[-1]
        #print(s3_location)
        return f'{s3_location}{fq_basename}'
    except IndexError:
        return

# driver()
#     Gets cell names given a prefix, and sets up dataframe
def driver(prefix): 
     
    # get all of the cells in a given run directory
    txt = 'runX_cells.txt'
    ! aws s3 ls $prefix > $txt

    # read 180226 cell names into a dataframe
    cells_df = pd.read_table(txt, delim_whitespace=True, header=None, names=['is_prefix', 'cell_name'])

    # applying function, and assigning output to new col in cells_df
    cells_df['input_fq_1'] = cells_df['cell_name'].map(get_fastqs_R1) 

    # applying function, and assigning output to new col in cells_df
    cells_df['input_fq_2'] = cells_df['cell_name'].map(get_fastqs_R2)
    
    # add a sample_id col
    cells_df['sample_id'] = cells_df.cell_name.str.strip('/') # getting rid of the forward slashes
    
    # building the output vcf string
    cells_df['output_prefix'] = 's3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/'
    
    # subset cells_df by only what we want
    cols_to_keep = ['sample_id', 'input_fq_1', 'input_fq_2', 'output_prefix']
    samples_df = cells_df[cols_to_keep]
    
    # rename cols and add ID col
    samples_df.columns = ['sample_id','input_fq1','input_fq2', 'output_dir']
    samples_df['id'] = samples_df['sample_id']

    # rearrange cols
    samples_df = samples_df[['id', 'sample_id', 'input_fq1', 'input_fq2', 'output_dir']]
    
    return samples_df
    

In [22]:
# call this our Main() i guess
#       run driver function

# get all of the run prefixes w/in immuneCells_9.27
bucketPrefixes = 's3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031'
f = 'nonImmune_bams_9.27_prefixes.txt'
! aws s3 ls $bucketPrefixes > $f
    
# read run prefixes into a pandas df
runs_df = pd.read_table(f, delim_whitespace=True, header=None, names=['is_prefix', 'run_name'])
    
# add a full_path col
runs_df['full_path'] = 's3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/'
    
big_df = pd.DataFrame() # init empty dataframe

for i in range(0, len(runs_df.index)):
    global prefix # dont like this
    prefix = runs_df['full_path'][i]
    print(prefix)
    curr_df = driver(prefix)
    toConcat = [big_df, curr_df]
    big_df = pd.concat(toConcat)
    
big_df.head()

s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/


Unnamed: 0,id,sample_id,input_fq1,input_fq2,output_dir
0,A12_B001554,A12_B001554,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A12_B001554/A12_B001554_S216_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A12_B001554/A12_B001554_S216_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
1,A12_B003528,A12_B003528,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A12_B003528/A12_B003528_S96_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A12_B003528/A12_B003528_S96_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
2,A13_B001554,A13_B001554,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A13_B001554/A13_B001554_S217_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A13_B001554/A13_B001554_S217_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
3,A13_B002572,A13_B002572,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A13_B002572/A13_B002572_S13_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A13_B002572/A13_B002572_S13_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/
4,A13_B003528,A13_B003528,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A13_B003528/A13_B003528_S97_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immune/immune_fastqs_9.27/181031/A13_B003528/A13_B003528_S97_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/immuneCells_9.27/trinity_out/


In [23]:
# write this guy to a file
import json

out_dir = '../tracer/181029'

# write samples_df to file
! mkdir -p $out_dir
big_df.to_csv(f'{out_dir}/samples.csv', index=False)

# write a config file
config =     {
    "program": "../../reflow/tracer_pipeline.rf",
    "runs_file": "samples.csv"
}

with open(f'{out_dir}/config.json', 'w') as f:
    json.dump(config, f)
    
! head -n 3 $out_dir/samples_big.csv $out_dir/config.json

head: ../tracer/181029/samples_big.csv: No such file or directory
==> ../tracer/181029/config.json <==
{"program": "../../reflow/tracer_pipeline.rf", "runs_file": "samples.csv"}