In [1]:
# script: create_runbatch_config
# authors: Olga Botvinnik & Lincoln Harris
# date: 10.3.18
#
# Trying to build the input file to give gatk_pipeline.rf (required for batch mode run)
# Lets see if we can get this working for some of Angela's cells

In [6]:
# try to get all of the run prefixes w/in nonImmune_bams_9.27
bucketPrefixes = 's3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/'
f = 'nonImmune_bams_9.27_prefixes.txt'
! aws s3 ls $bucketPrefixes > $f
! cat $f

                           PRE 170125/
                           PRE 170202/
                           PRE 170215/
                           PRE 170504/
                           PRE 170508/
                           PRE 170510/
                           PRE 171120_concat1/
                           PRE 180226/
                           PRE 180307/
                           PRE 180319/
                           PRE 180320/
                           PRE 180405/
                           PRE 180423/
                           PRE 180516/
                           PRE 180519/
                           PRE 180601/
                           PRE 180604/
                           PRE 180711/
                           PRE 180831/
                           PRE 180911/
                           PRE 181029/
                           PRE 181120/
                           PRE vcf/
                           PRE vcf1/


In [7]:
# read run names into a dataframe
#     with pandas!!
import pandas as pd
pd.options.display.max_colwidth = 500 # module config? 

runs_df = pd.read_table(f, delim_whitespace=True, header=None, names=['is_prefix', 'run_name'])
runs_df

Unnamed: 0,is_prefix,run_name
0,PRE,170125/
1,PRE,170202/
2,PRE,170215/
3,PRE,170504/
4,PRE,170508/
5,PRE,170510/
6,PRE,171120_concat1/
7,PRE,180226/
8,PRE,180307/
9,PRE,180319/


In [8]:
# can i add a full_path col? 
runs_df['full_path'] = 's3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/' + runs_df['run_name']
runs_df

Unnamed: 0,is_prefix,run_name,full_path
0,PRE,170125/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/
1,PRE,170202/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170202/
2,PRE,170215/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/
3,PRE,170504/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170504/
4,PRE,170508/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170508/
5,PRE,170510/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170510/
6,PRE,171120_concat1/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/171120_concat1/
7,PRE,180226/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180226/
8,PRE,180307/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180307/
9,PRE,180319/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180319/


In [9]:
# get rid of the vcf row
runs_df = runs_df.drop([22,23])
runs_df

Unnamed: 0,is_prefix,run_name,full_path
0,PRE,170125/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/
1,PRE,170202/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170202/
2,PRE,170215/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/
3,PRE,170504/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170504/
4,PRE,170508/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170508/
5,PRE,170510/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170510/
6,PRE,171120_concat1/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/171120_concat1/
7,PRE,180226/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180226/
8,PRE,180307/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180307/
9,PRE,180319/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180319/


In [4]:
# get all of the cells in a given run directory
prefix = 's3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/'
txt = 'runX_cells.txt'
! aws s3 ls $prefix > $txt
! cat $txt

                           PRE C3_1001000403/
                           PRE E3_1001000362/
                           PRE F1_1001000362/
                           PRE G5_1001000362/
                           PRE H1_1001000362/
                           PRE H7_1001000377/
                           PRE J11_1001000367/
                           PRE L2_1001000362/
                           PRE M10_1001000365/
                           PRE M2_1001000413/
                           PRE M3_1001000362/
                           PRE N2_1001000380/
                           PRE O23_1001000377/
                           PRE O24_1001000377/
                           PRE O7_B001797/


In [5]:
# read 180226 cell names into a dataframe
cells_df = pd.read_table(txt, delim_whitespace=True, header=None, names=['is_prefix', 'cell_name'])
cells_df

Unnamed: 0,is_prefix,cell_name
0,PRE,C3_1001000403/
1,PRE,E3_1001000362/
2,PRE,F1_1001000362/
3,PRE,G5_1001000362/
4,PRE,H1_1001000362/
5,PRE,H7_1001000377/
6,PRE,J11_1001000367/
7,PRE,L2_1001000362/
8,PRE,M10_1001000365/
9,PRE,M2_1001000413/


In [6]:
# ls one of our s3 cell directories
test_files = ! aws s3 ls $prefix\C3_1001000403/ # what does backslash do? 
test_files

['2018-10-05 10:58:28   10894185 C3_1001000403.vcf',
 '2018-09-28 17:55:52 1693079358 C3_1001000403_S123.homo.Aligned.out.sorted.bam',
 '2018-09-28 17:57:43    2447520 C3_1001000403_S123.homo.Aligned.out.sorted.bam.bai']

In [10]:
# get full s3 paths for bam files, then add them to a new col in cells_df

def get_bam(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    bam_line = [x for x in lines if x.endswith('bam')][0] # get the bam file, specifically
    bam_basename = bam_line.split()[-1]
    return f'{s3_location}{bam_basename}'


cells_df['input_bam'] = cells_df['cell_name'].map(get_bam) # applying function, and assigning output to new col in cells_df
cells_df.head()

Unnamed: 0,is_prefix,cell_name,input_bam
0,PRE,C3_1001000403/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/C3_1001000403/C3_1001000403_S123.homo.Aligned.out.sorted.bam
1,PRE,E3_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/E3_1001000362/E3_1001000362_S99.homo.Aligned.out.sorted.bam
2,PRE,F1_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/F1_1001000362/F1_1001000362_S121.homo.Aligned.out.sorted.bam
3,PRE,G5_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/G5_1001000362/G5_1001000362_S149.homo.Aligned.out.sorted.bam
4,PRE,H1_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/H1_1001000362/H1_1001000362_S169.homo.Aligned.out.sorted.bam


In [11]:
# sanity check
cells_df['input_bam']

0       s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/C3_1001000403/C3_1001000403_S123.homo.Aligned.out.sorted.bam
1        s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/E3_1001000362/E3_1001000362_S99.homo.Aligned.out.sorted.bam
2       s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/F1_1001000362/F1_1001000362_S121.homo.Aligned.out.sorted.bam
3       s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/G5_1001000362/G5_1001000362_S149.homo.Aligned.out.sorted.bam
4       s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/H1_1001000362/H1_1001000362_S169.homo.Aligned.out.sorted.bam
5       s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/H7_1001000377/H7_1001000377_S295.homo.Aligned.out.sorted.bam
6     s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/J11_1001000367/J11_1001000367_S179.homo.Aligned.out.sorted.bam
7       s3://darmanis-group/singlecell_lu

In [14]:
# add a sample_id col
cells_df['sample_id'] = cells_df.cell_name.str.strip('/') # getting rid of the forward slashes
cells_df.head()

Unnamed: 0,is_prefix,cell_name,input_bam,sample_id,id
0,PRE,C3_1001000403/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/C3_1001000403/C3_1001000403_S123.homo.Aligned.out.sorted.bam,C3_1001000403,C3_1001000403
1,PRE,E3_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/E3_1001000362/E3_1001000362_S99.homo.Aligned.out.sorted.bam,E3_1001000362,E3_1001000362
2,PRE,F1_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/F1_1001000362/F1_1001000362_S121.homo.Aligned.out.sorted.bam,F1_1001000362,F1_1001000362
3,PRE,G5_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/G5_1001000362/G5_1001000362_S149.homo.Aligned.out.sorted.bam,G5_1001000362,G5_1001000362
4,PRE,H1_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/H1_1001000362/H1_1001000362_S169.homo.Aligned.out.sorted.bam,H1_1001000362,H1_1001000362


In [5]:
# building the output vcf string
import os

cells_df['output_prefix'] = cells_df['input_bam'].map(os.path.dirname)
cells_df.head()

NameError: name 'cells_df' is not defined

In [16]:
# building the output vcf string
cells_df['output_vcf'] = cells_df.apply( # not sure how this works
    lambda x: '{output_prefix}/{sample_id}.vcf'.format(**x), axis=1)
cells_df.head()

Unnamed: 0,is_prefix,cell_name,input_bam,sample_id,id,output_prefix,output_vcf
0,PRE,C3_1001000403/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/C3_1001000403/C3_1001000403_S123.homo.Aligned.out.sorted.bam,C3_1001000403,C3_1001000403,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/C3_1001000403,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/C3_1001000403/C3_1001000403.vcf
1,PRE,E3_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/E3_1001000362/E3_1001000362_S99.homo.Aligned.out.sorted.bam,E3_1001000362,E3_1001000362,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/E3_1001000362,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/E3_1001000362/E3_1001000362.vcf
2,PRE,F1_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/F1_1001000362/F1_1001000362_S121.homo.Aligned.out.sorted.bam,F1_1001000362,F1_1001000362,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/F1_1001000362,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/F1_1001000362/F1_1001000362.vcf
3,PRE,G5_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/G5_1001000362/G5_1001000362_S149.homo.Aligned.out.sorted.bam,G5_1001000362,G5_1001000362,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/G5_1001000362,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/G5_1001000362/G5_1001000362.vcf
4,PRE,H1_1001000362/,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/H1_1001000362/H1_1001000362_S169.homo.Aligned.out.sorted.bam,H1_1001000362,H1_1001000362,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/H1_1001000362,s3://darmanis-group/singlecell_lungadeno/nonImmune_bams_9.27/180226/H1_1001000362/H1_1001000362.vcf


In [15]:
# subset cells_df by only what we want
cols_to_keep = ['input_bam', 'sample_id', 'output_vcf']

samples_df = cells_df[cols_to_keep]
samples_df

KeyError: "['sample_id' 'output_vcf'] not in index"

In [9]:
# writeFunc()
#     write this guy to a file
#def writeFunc():
import json

out_dir = '../gatk/11.29.18_redo'

# write samples_df to file
! mkdir -p $out_dir
big_df.to_csv(f'{out_dir}/samples.csv', index=False)

# write a config file
config =     {
    "program": "../../reflow/gatk_pipeline.rf",
    "runs_file": "samples.csv"
}

with open(f'{out_dir}/config.json', 'w') as f:
    json.dump(config, f)
    
! head -n 3 $out_dir/samples.csv $out_dir/config.json

==> ../gatk/11.29.18_redo/samples.csv <==
id,input_bam,sample_id,output_vcf
G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/G12_1001000292/G12_1001000292_S72.homo.Aligned.out.sorted.bam,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/G12_1001000292.vcf
H2_1001000293,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/H2_1001000293/H2_1001000293_S144.homo.Aligned.out.sorted.bam,H2_1001000293,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/H2_1001000293.vcf

==> ../gatk/11.29.18_redo/config.json <==
{"program": "../../reflow/gatk_pipeline.rf", "runs_file": "samples.csv"}

In [10]:
## can we make a function or a class to do everything we just did? 

# get_bam()
#     gets full paths to bam files
def get_bam(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    bam_line = [x for x in lines if x.endswith('bam')][0] # get the bam file, specifically
    bam_basename = bam_line.split()[-1]
    return f'{s3_location}{bam_basename}'

# driver()
#     Gets cell names given a prefix, and sets up dataframe
def driver(prefix): 
    #print("in driver")
    txt = 'runX_cells.txt'
    ! aws s3 ls $prefix > $txt
    
    # read into a pandas dataframe
    cells_df = pd.read_table(txt, delim_whitespace=True, header=None, names=['is_prefix', 'cell_name'])

    # call get_bam() and add 'input_bam' col
    cells_df['input_bam'] = cells_df['cell_name'].map(get_bam) # how does this map thing work? 
    
    # get rid of forward slashes and add 'sample_id' col
    cells_df['sample_id'] = cells_df.cell_name.str.strip('/')
    
    # add output_prefix col
    cells_df['output_vcf'] = 's3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/' + cells_df['sample_id'] + '.vcf'               
    
    # subset cells_df by only what we want
    cols_to_keep = ['input_bam', 'sample_id', 'output_vcf']
    samples_df = cells_df[cols_to_keep]
    
    return(samples_df)


In [11]:
# call this our Main() i guess
#       run driver function

big_df = pd.DataFrame() # init empty dataframe

for i in range(0, len(runs_df.index)-2):
    global prefix # dont like this
    prefix = runs_df['full_path'][i]
    print(prefix)
    curr_df = driver(prefix)
    toConcat = [big_df, curr_df]
    big_df = pd.concat(toConcat)

s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170202/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170504/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170508/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170510/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/171120_concat1/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180226/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180307/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180319/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180320/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/180405/
s3://darmanis-group/singlecell_l

IndexError: list index out of range

In [13]:
big_df.head()

Unnamed: 0,input_bam,sample_id,output_vcf
0,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/G12_1001000292/G12_1001000292_S72.homo.Aligned.out.sorted.bam,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/G12_1001000292.vcf
1,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/H2_1001000293/H2_1001000293_S144.homo.Aligned.out.sorted.bam,H2_1001000293,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/H2_1001000293.vcf
0,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170202/G12_1001000292/G12_1001000292_S51.homo.Aligned.out.sorted.bam,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/G12_1001000292.vcf
0,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/B10_1001000301/B10_1001000301_S7.homo.Aligned.out.sorted.bam,B10_1001000301,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/B10_1001000301.vcf
1,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/B10_1001000302/B10_1001000302_S45.homo.Aligned.out.sorted.bam,B10_1001000302,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/B10_1001000302.vcf


In [15]:
len(big_df.index)

8262

In [7]:
# add id col? 
big_df["id"] = big_df["sample_id"]
big_df.head()

Unnamed: 0,input_bam,sample_id,output_vcf,id
0,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/G12_1001000292/G12_1001000292_S72.homo.Aligned.out.sorted.bam,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/G12_1001000292.vcf,G12_1001000292
1,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/H2_1001000293/H2_1001000293_S144.homo.Aligned.out.sorted.bam,H2_1001000293,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/H2_1001000293.vcf,H2_1001000293
0,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170202/G12_1001000292/G12_1001000292_S51.homo.Aligned.out.sorted.bam,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/G12_1001000292.vcf,G12_1001000292
0,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/B10_1001000301/B10_1001000301_S7.homo.Aligned.out.sorted.bam,B10_1001000301,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/B10_1001000301.vcf,B10_1001000301
1,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/B10_1001000302/B10_1001000302_S45.homo.Aligned.out.sorted.bam,B10_1001000302,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/B10_1001000302.vcf,B10_1001000302


In [8]:
# need to fix col order
big_df = big_df[['id', 'input_bam', 'sample_id', 'output_vcf']]
big_df.head()

Unnamed: 0,id,input_bam,sample_id,output_vcf
0,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/G12_1001000292/G12_1001000292_S72.homo.Aligned.out.sorted.bam,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/G12_1001000292.vcf
1,H2_1001000293,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170125/H2_1001000293/H2_1001000293_S144.homo.Aligned.out.sorted.bam,H2_1001000293,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/H2_1001000293.vcf
0,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170202/G12_1001000292/G12_1001000292_S51.homo.Aligned.out.sorted.bam,G12_1001000292,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/G12_1001000292.vcf
0,B10_1001000301,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/B10_1001000301/B10_1001000301_S7.homo.Aligned.out.sorted.bam,B10_1001000301,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/B10_1001000301.vcf
1,B10_1001000302,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/170215/B10_1001000302/B10_1001000302_S45.homo.Aligned.out.sorted.bam,B10_1001000302,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_bams_9.27/vcf1/B10_1001000302.vcf


In [3]:
# messing around
import pandas as pd
database = pd.read_csv("/Users/lincoln.harris/Desktop/CosmicGenomeScreensMutantExport.tsv", delimiter = '\t')
database.head()

Unnamed: 0,Gene name,Accession Number,Gene CDS length,HGNC ID,Sample name,ID_sample,ID_tumour,Primary site,Site subtype 1,Site subtype 2,...,Mutation strand,SNP,FATHMM prediction,FATHMM score,Mutation somatic status,Pubmed_PMID,ID_STUDY,Sample Type,Tumour origin,Age
0,TRIO,ENST00000344204,9294,12303,MZ7-mel,753596,672566,skin,NS,NS,...,+,n,PATHOGENIC,0.98488,Confirmed somatic variant,,30.0,cell-line,metastasis,
1,ABCC4_ENST00000376887,ENST00000376887,3978,,05-165K5,2575184,2436521,prostate,NS,NS,...,-,,,,Confirmed somatic variant,26928463.0,,autopsy - NOS,metastasis,69.00
2,TP53_ENST00000413465,ENST00000413465,858,11998,PD11327a,1744637,1649978,breast,NS,NS,...,-,n,PATHOGENIC,0.99623,Confirmed somatic variant,,652.0,NS,NS,
3,EXD3,ENST00000340951,2274,26023,HUB-02-C2-098,2607142,2466854,large_intestine,NS,NS,...,-,n,NEUTRAL,0.02736,Confirmed somatic variant,,670.0,organoid culture,NS,
4,ELMO2,ENST00000396391,2163,17233,HUB-02-B2-113,2607139,2466851,large_intestine,NS,NS,...,-,n,NEUTRAL,0.22837,Confirmed somatic variant,,670.0,organoid culture,NS,
5,C6orf165_ENST00000369562,ENST00000369562,1869,,LOVO,2301996,2167279,large_intestine,NS,NS,...,+,n,PATHOGENIC,0.96754,Variant of unknown origin,24755471.0,,cell-line,NS,
6,LYST,ENST00000389793,11406,1968,TCGA-05-4427-01,1780073,1684072,lung,NS,NS,...,-,n,PATHOGENIC,0.93522,Variant of unknown origin,,417.0,fresh/frozen - NOS,primary,65.00
7,TP53_ENST00000269305,ENST00000269305,1182,,TCGA-EE-A29M-06,2121640,1995876,skin,NS,NS,...,-,n,PATHOGENIC,0.99744,Confirmed somatic variant,,540.0,NS,NS,33.00
8,APBB1IP,ENST00000376236,2001,17379,LAU50_2,1675357,1588296,skin,leg,NS,...,+,y,PATHOGENIC,0.83812,Variant of unknown origin,22197931.0,,cell-line,metastasis,
9,MKS1_ENST00000337050,ENST00000337050,1566,7121,P04-2599,1691461,1599728,prostate,NS,NS,...,-,n,PATHOGENIC,0.87460,Variant of unknown origin,22610119.0,,surgery fresh/frozen,primary,68.00


In [4]:
list(database)

['Gene name',
 'Accession Number',
 'Gene CDS length',
 'HGNC ID',
 'Sample name',
 'ID_sample',
 'ID_tumour',
 'Primary site',
 'Site subtype 1',
 'Site subtype 2',
 'Site subtype 3',
 'Primary histology',
 'Histology subtype 1',
 'Histology subtype 2',
 'Histology subtype 3',
 'Genome-wide screen',
 'Mutation ID',
 'Mutation CDS',
 'Mutation AA',
 'Mutation Description',
 'Mutation zygosity',
 'LOH',
 'GRCh',
 'Mutation genome position',
 'Mutation strand',
 'SNP',
 'FATHMM prediction',
 'FATHMM score',
 'Mutation somatic status',
 'Pubmed_PMID',
 'ID_STUDY',
 'Sample Type',
 'Tumour origin',
 'Age']

In [5]:
database.head()

Unnamed: 0,Gene name,Accession Number,Gene CDS length,HGNC ID,Sample name,ID_sample,ID_tumour,Primary site,Site subtype 1,Site subtype 2,...,Mutation strand,SNP,FATHMM prediction,FATHMM score,Mutation somatic status,Pubmed_PMID,ID_STUDY,Sample Type,Tumour origin,Age
0,TRIO,ENST00000344204,9294,12303.0,MZ7-mel,753596,672566,skin,NS,NS,...,+,n,PATHOGENIC,0.98488,Confirmed somatic variant,,30.0,cell-line,metastasis,
1,ABCC4_ENST00000376887,ENST00000376887,3978,,05-165K5,2575184,2436521,prostate,NS,NS,...,-,,,,Confirmed somatic variant,26928463.0,,autopsy - NOS,metastasis,69.0
2,TP53_ENST00000413465,ENST00000413465,858,11998.0,PD11327a,1744637,1649978,breast,NS,NS,...,-,n,PATHOGENIC,0.99623,Confirmed somatic variant,,652.0,NS,NS,
3,EXD3,ENST00000340951,2274,26023.0,HUB-02-C2-098,2607142,2466854,large_intestine,NS,NS,...,-,n,NEUTRAL,0.02736,Confirmed somatic variant,,670.0,organoid culture,NS,
4,ELMO2,ENST00000396391,2163,17233.0,HUB-02-B2-113,2607139,2466851,large_intestine,NS,NS,...,-,n,NEUTRAL,0.22837,Confirmed somatic variant,,670.0,organoid culture,NS,


In [30]:
#database.loc[['Carcinoma'], ['Primary histology']]
#database.loc[['TRIO'], ['ID_sample']]
#database.where('TRIO', database)

#database['Primary histology'] == 'carcinoma'
#database['Primary site'] == 'lung'

pHistList = database.index[database['Primary histology'] == 'carcinoma'].tolist()
pSiteList = database.index[database['Primary site'] == 'lung'].tolist()

shared = list(set(pHistList) & set(pSiteList))
shared

[1048577,
 1048580,
 6,
 2097159,
 3145737,
 3145738,
 3145739,
 2097166,
 3145744,
 18,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 1048598,
 1048601,
 1048606,
 1048607,
 32,
 1048608,
 34,
 35,
 2097184,
 37,
 38,
 3145764,
 40,
 41,
 2097193,
 43,
 44,
 45,
 3145773,
 4194348,
 4194349,
 5242929,
 4194355,
 5242931,
 2097206,
 1048631,
 2097207,
 57,
 2097211,
 3145787,
 4194366,
 3145793,
 66,
 67,
 2097220,
 4194371,
 4194372,
 2097223,
 4194373,
 5242947,
 5242951,
 2097227,
 76,
 1048652,
 4194381,
 5242952,
 1048663,
 3145816,
 5242969,
 2097243,
 1048668,
 4194396,
 1048671,
 5242977,
 4194405,
 3145835,
 1048687,
 5242993,
 5242996,
 4194422,
 4194424,
 2097273,
 1048703,
 127,
 3145858,
 1048707,
 3145859,
 135,
 4194449,
 1048724,
 3145877,
 1048726,
 1048727,
 3145883,
 3145884,
 3145885,
 4194460,
 159,
 1048737,
 4194468,
 4194469,
 4194470,
 2097319,
 4194471,
 5243047,
 5243051,
 5243052,
 5243053,
 3145903,
 3145904,
 4194481,
 4194484,
 3145912,
 2097337,
 3145916,
 31

In [27]:
database['Primary histology'] == 'carcinoma'
database['Primary histology'] == 'carcinoma'

0          False
1           True
2           True
3           True
4           True
5           True
6           True
7          False
8          False
9           True
10          True
11          True
12         False
13          True
14          True
15          True
16          True
17          True
18          True
19          True
20          True
21          True
22          True
23          True
24          True
25          True
26          True
27          True
28          True
29          True
           ...  
5925510    False
5925511     True
5925512     True
5925513     True
5925514     True
5925515    False
5925516     True
5925517     True
5925518    False
5925519     True
5925520     True
5925521    False
5925522     True
5925523     True
5925524    False
5925525     True
5925526    False
5925527     True
5925528     True
5925529     True
5925530     True
5925531     True
5925532     True
5925533     True
5925534     True
5925535     True
5925536     True
5925537     Tr

In [33]:
database_laud = database.iloc[shared]

In [35]:
database_laud['Primary histology']
database_laud['Primary site']

1048577    lung
1048580    lung
6          lung
2097159    lung
3145737    lung
3145738    lung
3145739    lung
2097166    lung
3145744    lung
18         lung
20         lung
21         lung
22         lung
23         lung
24         lung
25         lung
26         lung
27         lung
1048598    lung
1048601    lung
1048606    lung
1048607    lung
32         lung
1048608    lung
34         lung
35         lung
2097184    lung
37         lung
38         lung
3145764    lung
           ... 
5242807    lung
1048506    lung
4194235    lung
5242813    lung
1048512    lung
3145664    lung
4194241    lung
1048517    lung
1048518    lung
2097094    lung
2097095    lung
4194249    lung
5242821    lung
2097099    lung
1048529    lung
4194258    lung
5242834    lung
4194262    lung
5242845    lung
2097118    lung
1048548    lung
2097125    lung
4194277    lung
3145704    lung
3145705    lung
3145706    lung
4194288    lung
4194293    lung
4194294    lung
2097149    lung
Name: Primary site, Leng

In [38]:
def getGenomePos(sample):
    chr = sample[0]
    chr = chr.replace("chr", "")
    pos = sample[1]
    genomePos = chr + ':' + str(pos) + '-' + str(pos)

    return(genomePos)

In [46]:
# test
import VCF
df = VCF.dataframe("/Users/lincoln.harris/Desktop/vcf/all/A10_1001000408.vcf")

out = df.apply(getGenomePos, axis=1)
out
out[10]

'1:633714-633714'