Make conf files for HMP spacegraphcats queries

In [1]:
import io
import os

import pandas as pd
import yaml

In [2]:
# Working metadata table (tab-separated, header on the first row).
m = pd.read_csv("../inputs/working_metadata.tsv", sep="\t", header=0)

# Run accessions ordered by ascending read count.
SAMPLES = m.sort_values("read_count").loc[:, "run_accession"]
# Distinct library names, in order of first appearance in the table.
LIBRARIES = pd.unique(m["library_name"]).tolist()

# HMP2 metagenomics metadata table; its distinct External.ID values
# are used as the HMP sample identifiers.
h = pd.read_csv("../inputs/hmp2_mgx_metadata.tsv", sep="\t", header=0)
HMP = pd.unique(h["External.ID"]).tolist()

Example conf file

```
catlas_base: MSM79H58
input_sequences:
- fastq/MSM79H58.fastq.gz
ksize: 31
radius: 1
search:
- ~/github/spacegraphcats/data/2.fa.gz
searchquick:
- ~/github/spacegraphcats/data/63.fa.gz

hashval_ksize: 51
hashval_queries: ../calc_cosmo_kmers/hmp_scaled2k_th138.labels.txt
```

In [3]:
# FASTA filenames of the query genomes, one entry per line so additions and
# removals show up as single-line diffs. Order is preserved because the first
# entry is later used for the `searchquick` setting.
genomes = [
    "AP010890.fna",
    "BackhedF_2015__SID546_M__bin.6.fna",
    "ChengpingW_2017__AS73raw__bin.33.fna",
    "CosteaPI_2017__daisy-11-0-0__bin.22.fna",
    "CosteaPI_2017__SID713A045-11-90-0__bin.49.fna",
    "CP001361.fna",
    "CP003040.fna",
    "DRR042402_bin.21.fna",
    "ERR1190768_bin.5.fna",
    "ERR1578685_bin.2.fna",
    "ERR911961_bin.65.fna",
    "ERS235550_13.fna",
    "ERS235598_17.fna",
    "ERS396297_11.fna",
    "ERS396299_52.fna",
    "ERS396342_81.fna",
    "ERS396398_4.fna",
    "ERS396441_81.fna",
    "ERS396473_7.fna",
    "ERS396539_25.fna",
    "ERS473060_65.fna",
    "ERS473099_4.fna",
    "ERS473134_65.fna",
    "ERS473186_56.fna",
    "ERS473202_63.fna",
    "ERS473255_26.fna",
    "ERS473285_6.fna",
    "ERS473343_4.fna",
    "ERS537208_46.fna",
    "ERS537215_19.fna",
    "ERS537328_30.fna",
    "ERS608492_77.fna",
    "ERS608495_74.fna",
    "ERS608496_38.fna",
    "ERS608525_13.fna",
    "ERS608565_41.fna",
    "ERS608576_22.fna",
    "ERS608600_49.fna",
    "ERS608610_90.fna",
    "ERS608615_34.fna",
    "FTRU01000008.fna",
    "GCA_000153905.1_ASM15390v1_genomic.fna",
    "GCA_000159875.2_Bact_sp_1_1_6_V2_genomic.fna",
    "GCA_000162015.1_ASM16201v1_genomic.fna",
    "GCA_000162535.1_ASM16253v1_genomic.fna",
    "GCA_000225385.2_ASM22538v2_genomic.fna",
    "GCA_000357165.1_ASM35716v1_genomic.fna",
    "GCA_000435395.1_MGS94_genomic.fna",
    "GCA_000508885.1_ASM50888v1_genomic.fna",
    "GCA_000509145.1_ASM50914v1_genomic.fna",
    "GCA_000519105.1_ASM51910v1_genomic.fna",
    "GCA_000577815.1_JCC_genomic.fna",
    "GCA_001055105.1_ASM105510v1_genomic.fna",
    "GCA_001405435.1_14207_7_29_genomic.fna",
    "GCA_001406235.1_14207_7_41_genomic.fna",
    "GCA_001406495.1_14207_7_88_genomic.fna",
    "GCA_001808655.1_ASM180865v1_genomic.fna",
    "GCA_001813525.1_ASM181352v1_genomic.fna",
    "GCA_900036035.1_RGNV35913_genomic.fna",
    "GCA_900066745.1_14207_7_40_genomic.fna",
    "GCA_900066825.1_14207_7_54_genomic.fna",
    "GeversD_2014__SKBSTL034__bin.11.fna",
    "GL834357.fna",
    "HMP_2012__SRS024087__bin.40.fna",
    "HMP_2012__SRS051882__bin.7.fna",
    "JH126487.fna",
    "JH590840.fna",
    "JXXK01000001.fna",
    "KarlssonFH_2013__S434__bin.51.fna",
    "KarlssonFH_2013__S484__bin.30.fna",
    "KB851023.fna",
    "KB851045.fna",
    "KosticAD_2015__PRJNA231909.3108437__bin.1.fna",
    "LeChatelierE_2013__MH0289__bin.49.fna",
    "LiJ_2014__MH0270__bin.74.fna",
    "LiJ_2014__MH0311__bin.29.fna",
    "LiJ_2014__O2.UC1-1__bin.3.fna",
    "LiJ_2014__O2.UC13-1__bin.18.fna",
    "LiJ_2014__V1.CD32-4__bin.38.fna",
    "LiJ_2014__V1.CD44-4__bin.30.fna",
    "LiJ_2014__V1.CD45-0__bin.33.fna",
    "LiSS_2016__FAT_015-22-0-0__bin.6.fna",
    "NielsenHB_2014__V1_UC17_2__bin.25.fna",
    "Obregon-TitoAJ_2015__NO12__bin.20.fna",
    "QinJ_2012__CON-106__bin.8.fna",
    "QinJ_2012__T2D-098__bin.71.fna",
    "SRR1765354_bin.3.fna",
    "SRR3160442_bin.16.fna",
    "SRR4423656_bin.5.fna",
    "SRR4423685_bin.10.fna",
    "SRR5056810_bin.34.fna",
    "SRR5106332_bin.11.fna",
    "SRR5580002_bin.2.fna",
    "SRS011586_36.fna",
    "SRS011586_5.fna",
    "SRS015065_75.fna",
    "SRS018656_36.fna",
    "SRS024331_12.fna",
    "SRS043001_11.fna",
    "SRS053214_28.fna",
    "SRS054956_30.fna",
    "SRS077502_3.fna",
    "SRS142599_2.fna",
    "SRS143991_1.fna",
    "SRS147022_17.fna",
    "SRS147271_63.fna",
    "SRS1596853_50.fna",
    "SRS1719112_8.fna",
    "SRS1719295_14.fna",
    "SRS1719498_9.fna",
    "SRS1735649_24.fna",
    "SRS294916_20.fna",
    "SRS475589_63.fna",
    "SRS476013_30.fna",
    "SRS476121_69.fna",
    "SRS476520_32.fna",
    "VatanenT_2016__G80294__bin.12.fna",
    "VogtmannE_2016__MMRS16644320ST-27-0-0__bin.66.fna",
    "VogtmannE_2016__MMRS61683006ST-27-0-0__bin.27.fna",
    "VogtmannE_2016__MMRS72802364ST-27-0-0__bin.39.fna",
    "VogtmannE_2016__MMRS72980899ST-27-0-0__bin.41.fna",
    "WenC_2017__N030__bin.40.fna",
    "XieH_2016__YSZC12003_36012__bin.54.fna",
    "XieH_2016__YSZC12003_36588__bin.69.fna",
    "XieH_2016__YSZC12003_36694__bin.17.fna",
    "XieH_2016__YSZC12003_37297R1__bin.44.fna",
    "ZeeviD_2015__PNP_Main_124__bin.16.fna",
    "ZeeviD_2015__PNP_Main_409__bin.41.fna",
    "ZellerG_2014__CCIS87252800ST-4-0__bin.6.fna",
]

In [4]:
# Query paths: each genome filename placed under outputs/gather_genomes/.
paths = ["outputs/gather_genomes/{}".format(name) for name in genomes]

In [5]:
# Write one spacegraphcats config file per HMP sample, using the paired
# R1/R2 fastq files as input sequences and every genome path as a query.
conf_dir = "../inputs/sgc_conf"
# Create the output directory up front so a fresh checkout does not fail
# with FileNotFoundError on the first write.
os.makedirs(conf_dir, exist_ok=True)

for hmp in HMP:
    seq1 = 'inputs/hmp/' + hmp + '_R1.fastq.gz'
    seq2 = 'inputs/hmp/' + hmp + '_R2.fastq.gz'
    yml = {'catlas_base': hmp,
           'input_sequences': [seq1, seq2],
           'ksize': 31,
           'radius': 1,
           'search': paths,
           # Emit `searchquick` as a one-element list so the YAML matches the
           # example conf file shown in this notebook (a `- path` sequence)
           # rather than a bare scalar string.
           'searchquick': [paths[0]]}
    conf_path = os.path.join(conf_dir, hmp + '_r1_conf.yml')
    with open(conf_path, 'w', encoding='utf8') as outfile:
        # sort_keys=False keeps the keys in the order written above.
        yaml.dump(yml, outfile, default_flow_style=False, allow_unicode=True, sort_keys=False)

In [6]:
# Write one spacegraphcats config file per library, using the
# abundance-trimmed reads as the input sequence and every genome path as a query.
conf_dir = "../inputs/sgc_conf"
# Create the output directory up front so a fresh checkout does not fail
# with FileNotFoundError on the first write.
os.makedirs(conf_dir, exist_ok=True)

for library in LIBRARIES:
    yml = {'catlas_base': library,
           'input_sequences': ['outputs/abundtrim/' + library + '.abundtrim.fq.gz'],
           'ksize': 31,
           'radius': 1,
           'search': paths,
           # Emit `searchquick` as a one-element list so the YAML matches the
           # example conf file shown in this notebook (a `- path` sequence)
           # rather than a bare scalar string.
           'searchquick': [paths[0]]}
    conf_path = os.path.join(conf_dir, library + '_r1_conf.yml')
    with open(conf_path, 'w', encoding='utf8') as outfile:
        # sort_keys=False keeps the keys in the order written above.
        yaml.dump(yml, outfile, default_flow_style=False, allow_unicode=True, sort_keys=False)