In [15]:
import pandas as pd
import numpy as np
import s3fs
import tqdm
import subprocess
import shutil, os, glob


In [16]:
targetfn_df = pd.read_csv('s3://darmanis-group/danle/botryllus/botryllus_samples.csv')
targetfn_df = pd.melt(targetfn_df, id_vars=['Chimeras set','Time chimera sampled following fusion'])
targetfn_df['translated_value'] = [ x.replace(' ', '_') \
                                     .replace('  ', '_') \
                                     .replace('.', '_') \
                                     .replace('..', '_') \
                                     .replace('-', '_') \
                                     .replace('--', '_') \
                                     .replace('__', '_') \
                                     .replace('ï','i') if x is not np.nan \
                                   else x \
                                   for x in targetfn_df['value']
                                  ]
# create sample id conversion
value2id_dict = {}
for val_val, key_val in enumerate(targetfn_df['translated_value'].drop_duplicates().dropna()):
    value2id_dict[key_val] = f'sample_{val_val}'
value2id_dict[np.nan] = np.nan
targetfn_df['sample_id'] = [value2id_dict.get(x) for x in targetfn_df['translated_value']]


In [17]:
fs_df = pd.read_csv('/home/daniel_le/data/botryllus/fs.txt', header=None)
fs_df.columns = ['path']
fs_df['fastq'] = [x.endswith('q.gz') for x in fs_df['path']]
fs_df = fs_df[fs_df['fastq'] == True]
fs_df['fn'] = [x.split('/')[-1] for x in fs_df['path']]
fs_df['d1_dir'] = [x.split('/')[-2] for x in fs_df['path']]
fs_df['d1_dir_translated'] = [ x.replace(' ', '_') \
                                 .replace('  ', '_') \
                                 .replace('.', '_') \
                                 .replace('..', '_') \
                                 .replace('-', '_') \
                                 .replace('--', '_') \
                                 .replace('__', '_') \
                               for x in fs_df['d1_dir']
                              ]
fs_df['fn_translated'] = [ x.replace(' ', '_') \
                                 .replace('  ', '_') \
                                 .replace('.', '_') \
                                 .replace('..', '_') \
                                 .replace('-', '_') \
                                 .replace('--', '_') \
                                 .replace('__', '_') \
                               for x in fs_df['fn']
                              ]


In [18]:
match_df = pd.DataFrame()
for target in tqdm.tqdm(targetfn_df['translated_value'].drop_duplicates().dropna()):
    if '944axByd196_6_x_sc109e_92' in target:
        word_len = 30
    else:
        word_len = len(target)
    fs_df_slice = fs_df[[x.startswith(target[:word_len]) or y.startswith(target[:word_len]) \
                         for x,y in zip(fs_df['fn_translated'], fs_df['d1_dir_translated'])]]
    fs_df_slice['target'] = target
    match_df = match_df.append(fs_df_slice)
match_df['sample_id'] = [value2id_dict.get(x) for x in match_df['target']]
match_df = match_df.drop_duplicates()

# metrics
targets = set(targetfn_df['translated_value'].dropna())
print('targets:', len(targets))

hits = set(match_df['target'])
print('hits:', len(hits))

missing = targets-hits
[print(x) for x in missing]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
100%|██████████| 98/98 [00:00<00:00, 256.22it/s]

targets: 98
hits: 97
3966_944axByd196_6_x_sc109e_92_resorbing_zooid_early_D_chimera_944_side_DCRS





[None]

In [19]:
# check if R1/R2 are properly paired
match_df['fn_translated_prefix'] = [(x
                                     .replace('-R1.','_R2_')
                                     .replace('_R1_','_R2_')
                                    ) for x in match_df['fn_translated']]
match_df_check = pd.DataFrame(match_df
                              .groupby('sample_id')['fn_translated_prefix']
                              .value_counts()
                              .rename('count')).reset_index()
match_df_check[match_df_check['count']!=2]


Unnamed: 0,sample_id,fn_translated_prefix,count


In [None]:
output_dir = '/home/daniel_le/data/botryllus/results/'
tmp_dir = '/home/daniel_le/data/botryllus/tmp/'
indexed_fasta = '/home/daniel_le/data/botryllus/genome/botznik-chr.fa'
n_threads = 32

# update queue based on completed targets
queued_targets = list(set(match_df['sample_id']))
completed_targets = [x.split('/')[-1].split('.sorted.bam.bai')[0] for x in glob.glob(f'{output_dir}*') if x.endswith('.bai')]
queued_targets = list(set(queued_targets)-set(completed_targets))

# iterate through queue
for sample in tqdm.tqdm(queued_targets):
# for sample in tqdm.tqdm(['sample_75']): # for testing
    print('starting', sample)
    # prep tmp folder
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
        os.mkdir(tmp_dir)
    else:
        os.mkdir(tmp_dir)
    # prep output foler    
    if os.path.exists(output_dir):
        pass
    else:
        os.mkdir(output_dir)
    
#     # set file names
    tmp_R1_fn = f'{tmp_dir}R1.fq.gz'
    tmp_R2_fn = f'{tmp_dir}R2.fq.gz'
    tmp_sam = f'{tmp_dir}{sample}.sam'
    tmp_bam = f'{tmp_dir}{sample}.bam'
    output_sortedbam = f'{output_dir}{sample}.sorted.bam'
    
    # rclone subprocess call to download files
    match_df_slice = match_df[match_df['sample_id'] == sample]
    input_fastqs = match_df_slice['path'].tolist()
    for input_path in input_fastqs:
        subprocess.call(['rclone',
                         'copy',
                         f'gdrive:Botryllus/{input_path}',
                         tmp_dir,
                        ])
    
    # extract files in tmp for cat
    R1_inputs = sorted([x for x in glob.glob(f'{tmp_dir}*') if ('-R1.' in x) or ('_R1_' in x)])
    R2_inputs = sorted([x for x in glob.glob(f'{tmp_dir}*') if ('-R2.' in x) or ('_R2_' in x)])
    for inputs,outfn in zip([R1_inputs,R2_inputs], [tmp_R1_fn,tmp_R2_fn]):
        for input_fn in inputs:
            with open(outfn, 'a') as outfile:
                subprocess.call(['cat', input_fn], stdout=outfile)
                
    # run bwa mem
    with open(tmp_sam, 'w') as outfile:
        subprocess.call(['bwa',
                         'mem',
                         '-t',
                         str(n_threads),
                         '-M',
                         indexed_fasta,
                         tmp_R1_fn,
                         tmp_R2_fn,
                        ], stdout=outfile)
        
    # samtools convert sam to bam
    with open(tmp_bam, 'w') as outfile:
        subprocess.call(['samtools',
                         'view',
                         '-S',
                         '-b',
                         tmp_sam,
                        ], stdout=outfile)
        
    # samtools sort bam
    with open(output_sortedbam, 'w') as outfile:
        subprocess.call(['samtools',
                         'sort',
                         tmp_bam,
                        ], stdout=outfile)
        
    # samtools index sorted bam
    subprocess.call(['samtools',
                     'index',
                     output_sortedbam,
                    ])
    
    print('completed')


  0%|          | 0/97 [00:00<?, ?it/s]

starting sample_83


  1%|          | 1/97 [29:07<46:35:14, 1747.03s/it]

completed
starting sample_27


  2%|▏         | 2/97 [38:28<36:43:11, 1391.49s/it]

completed
starting sample_40


  3%|▎         | 3/97 [43:48<27:56:11, 1069.91s/it]

completed
starting sample_89
