Make conf files for spacegraphcats queries

In [1]:
import yaml
import io
import pandas as pd
import re

In [2]:
# Load the working metadata table (tab-separated, header on the first row).
m = pd.read_csv(
    "../inputs/working_metadata.tsv",
    sep="\t",
    header=0,
)

# Run accessions, ordered by ascending read count.
SAMPLES = m.sort_values(by="read_count")["run_accession"]

# Distinct library names; the conf-writing loops below iterate over these.
LIBRARIES = m["library_name"].unique().tolist()

Example conf file

```
catlas_base: MSM79H58
input_sequences:
- inputs/cat/MSM79H58_1.fastq.gz
- inputs/cat/MSM79H58_2.fastq.gz
ksize: 31
radius: 1
search:
- ~/github/spacegraphcats/data/2.fa.gz
searchquick:
- ~/github/spacegraphcats/data/63.fa.gz

hashval_ksize: 51
hashval_queries: ../calc_cosmo_kmers/hmp_scaled2k_th138.labels.txt
```

In [3]:
# File names of the genomes that are used to build the search paths below.
# One entry per line so additions and removals show up cleanly in diffs.
genomes = [
    "ERS235530_10.fna.gz",
    "ERS235531_43.fna.gz",
    "ERS235603_16.fna.gz",
    "ERS396297_11.fna.gz",
    "ERS396519_11.fna.gz",
    "ERS473255_26.fna.gz",
    "ERS537218_9.fna.gz",
    "ERS537235_19.fna.gz",
    "ERS537328_30.fna.gz",
    "ERS537353_12.fna.gz",
    "ERS608524_37.fna.gz",
    "ERS608576_22.fna.gz",
    "GCF_000371685.1_Clos_bolt_90B3_V1_genomic.fna.gz",
    "GCF_000508885.1_ASM50888v1_genomic.fna.gz",
    "GCF_001405615.1_13414_6_47_genomic.fna.gz",
    "GCF_900036035.1_RGNV35913_genomic.fna.gz",
    "LeChatelierE_2013__MH0074__bin.19.fa.gz",
    "LiJ_2014__O2.UC28-1__bin.61.fa.gz",
    "LiSS_2016__FAT_DON_8-22-0-0__bin.28.fa.gz",
    "LoombaR_2017__SID1050_bax__bin.11.fa.gz",
    "NielsenHB_2014__MH0094__bin.44.fa.gz",
    "QinJ_2012__CON-091__bin.20.fa.gz",
    "SRR4305229_bin.5.fa.gz",
    "SRR5127401_bin.3.fa.gz",
    "SRR5558047_bin.10.fa.gz",
    "SRR6028281_bin.3.fa.gz",
    "SRS075078_49.fna.gz",
    "SRS103987_37.fna.gz",
    "SRS104400_110.fna.gz",
    "SRS143598_15.fna.gz",
    "SRS1719112_8.fna.gz",
    "SRS1719498_9.fna.gz",
    "SRS1719577_6.fna.gz",
    "SRS1735506_4.fna.gz",
    "SRS1735645_19.fna.gz",
    "SRS294916_20.fna.gz",
    "SRS476209_42.fna.gz",
    "VatanenT_2016__G80445__bin.9.fa.gz",
    "VogtmannE_2016__MMRS43563715ST-27-0-0__bin.70.fa.gz",
    "XieH_2016__YSZC12003_37172__bin.63.fa.gz",
    "ZeeviD_2015__PNP_Main_232__bin.27.fa.gz",
]

In [4]:
# Sanity check: display how many genome file names are in the list.
len(genomes)

41

In [5]:
# Prefix each genome file name with the gather-matches output directory;
# these paths become the `search` entries of every conf file.
paths = [
    "outputs/gather_matches_loso/" + fname
    for fname in genomes
]

In [6]:
# Write one radius-1 conf file per library into ../inputs/sgc_conf/.
for library in LIBRARIES:
    conf_path = "../inputs/sgc_conf/" + library + '_r1_conf.yml'

    # Key order matters: the dump below uses sort_keys=False, so the YAML
    # keys are emitted in exactly this insertion order.
    yml = {
        'catlas_base': library,
        'input_sequences': ['outputs/abundtrim/' + library + '.abundtrim.fq.gz'],
        'ksize': 31,
        'radius': 1,
        'search': paths,
        # NOTE(review): written as a single string (first search path), whereas
        # the example conf shows a list under `searchquick` -- confirm intended.
        'searchquick': paths[0],
    }

    with io.open(conf_path, 'w', encoding='utf8') as handle:
        yaml.dump(
            yml,
            handle,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )

In [7]:
# Build the prokka output path for every genome by appending ".ffn" to the
# genome file name under the prokka output directory.
paths2 = [
    "outputs/gather_matches_loso_prokka/" + fname + '.ffn'
    for fname in genomes
]

In [8]:
# Drop the ".gz" that sits immediately before the final ".ffn" extension
# (e.g. "X.fna.gz.ffn" -> "X.fna.ffn").
# The dot is escaped and the match is tied to the trailing ".ffn": an unescaped
# "." is a regex wildcard, so the pattern ".gz" would also delete any other
# character followed by "gz" anywhere in the path.
paths2 = [re.sub(r"\.gz(?=\.ffn$)", "", path) for path in paths2]

In [9]:
# Display the resulting paths to check that they end in ".fna.ffn" / ".fa.ffn".
paths2

['outputs/gather_matches_loso_prokka/ERS235530_10.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS235531_43.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS235603_16.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS396297_11.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS396519_11.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS473255_26.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS537218_9.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS537235_19.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS537328_30.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS537353_12.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS608524_37.fna.ffn',
 'outputs/gather_matches_loso_prokka/ERS608576_22.fna.ffn',
 'outputs/gather_matches_loso_prokka/GCF_000371685.1_Clos_bolt_90B3_V1_genomic.fna.ffn',
 'outputs/gather_matches_loso_prokka/GCF_000508885.1_ASM50888v1_genomic.fna.ffn',
 'outputs/gather_matches_loso_prokka/GCF_001405615.1_13414_6_47_genomic.fna.ffn',
 'outputs/gather_matches_los

In [10]:
# Write one radius-1 multifasta conf file per library into ../inputs/sgc_conf/.
# These carry the same base settings as the plain radius-1 conf files plus the
# three `multifasta_*` keys.
for library in LIBRARIES:
    conf_path = "../inputs/sgc_conf/" + library + '_r1_multifasta_conf.yml'

    # Key order matters: the dump below uses sort_keys=False, so the YAML
    # keys are emitted in exactly this insertion order.
    yml = {
        'catlas_base': library,
        'input_sequences': ['outputs/abundtrim/' + library + '.abundtrim.fq.gz'],
        'ksize': 31,
        'radius': 1,
        'search': paths,
        # NOTE(review): written as a single string (first search path), whereas
        # the example conf shows a list under `searchquick` -- confirm intended.
        'searchquick': paths[0],
        'multifasta_reference': paths2,
        'multifasta_scaled': 2000,
        'multifasta_query_sig': 'outputs/vita_rf/at_least_5_studies_vita_vars.sig',
    }

    with io.open(conf_path, 'w', encoding='utf8') as handle:
        yaml.dump(
            yml,
            handle,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )