In [1]:
import os
import sys
import json
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
from concurrent.futures import ProcessPoolExecutor
import pandas as pd

sys.path.append('../src')

from run import run_mlst
from parse import parse_mlst_result

In [2]:
# Location of the MLST database; passed to run_mlst as its `database` argument.
database = '/media/GenomicResearch/Tools/CGE/mlst_db'

In [3]:
# Species/scheme identifier forwarded to run_mlst.
species = 'kpneumoniae'

# Input directory: each entry is handed to run_mlst and its stem is matched against the accession list.
dirpath = Path("/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/Contigs")
# Output root: one sub-directory per input stem is used as the run_mlst output location.
outpath = Path("/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/MLST")

In [4]:
# Accessions selected for processing: one per line in the text file,
# collapsed into a set for fast membership tests.
selected_file = '/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/selected.txt'
with open(selected_file) as handle:
    accs = set(handle.read().splitlines())

In [5]:
# Run MLST in parallel for every selected accession that has no output
# directory yet. The Future returned by each submit() is kept: an exception
# raised inside run_mlst is stored on its Future and would otherwise vanish
# without any trace in the notebook.
mlst_jobs = {}
with ProcessPoolExecutor(64) as executor:
    for contig_file in dirpath.iterdir():
        acc = contig_file.stem
        outdir = outpath/acc
        # Set lookup first, so the filesystem is only touched for selected accessions.
        if acc in accs and not outdir.exists():
            mlst_jobs[acc] = executor.submit(run_mlst, contig_file, outdir, database, species)

# Leaving the `with` block waits for all submitted jobs, so every future is
# finished here and .exception() returns immediately.
failed = {
    acc: job.exception()
    for acc, job in mlst_jobs.items()
    if job.exception() is not None
}
print(f'{len(mlst_jobs)} MLST job(s) submitted, {len(failed)} failed')
for acc, error in sorted(failed.items()):
    print(f'  {acc}: {error!r}')

In [None]:
# Ad-hoc single-sample run of run_mlst (input file, output directory, database, species).
# NOTE(review): both paths point at Listeria_monocytogenes data, while `species`
# is the notebook-level variable set to 'kpneumoniae' earlier — confirm the
# intended species before executing this cell.
run_mlst(
    '/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Contigs/R21.1267.fa', 
    '/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Analysis/R21.1267/mlst/',
    database=database,
    species=species
)

In [6]:
# Collect the parsed MLST result of every sample directory under the output root.
# A dedicated name is used instead of rebinding `dirpath`: the submission cell
# iterates `dirpath` (the Contigs directory), so overwriting it here would make
# a later re-run of that cell scan the MLST output directory instead.
mlst_dir = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/MLST')

summaries = dict()
for sample_dir in mlst_dir.iterdir():
    summary = parse_mlst_result(sample_dir/'data.json')
    if summary['ST'] == '':
        # Empty ST: remove the output directory. Because the submission cell
        # skips samples whose output directory exists, removing it makes the
        # sample eligible for another run.
        shutil.rmtree(sample_dir)
    # The summary is recorded even when ST is empty; such rows are filtered
    # out when the table is exported.
    summaries[sample_dir.name] = summary

In [7]:
# One row per sample: transpose the dict-of-summaries frame, order rows by
# sample key and label the index 'Key'.
df = (
    pd.DataFrame(summaries)
    .T
    .sort_index()
    .rename_axis('Key')
)
df.head()

Unnamed: 0_level_0,ST,mdh,infB,phoE,tonB,pgi,rpoB,gapA
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DRR061405,258,1,3,1,79,1,1,3
DRR061406,1593,11,1,9,59,1,4,4
DRR061407,258,1,3,1,79,1,1,3
DRR061412,2252,1,3,4,4,20,1,2
DRR061421,258,1,3,1,79,1,1,3


In [8]:
# Export only the rows whose ST is non-empty, tab-separated, index included.
df.loc[df['ST'].ne('')].to_csv('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/MLST.tsv', sep='\t')

In [9]:
# Load the tab-separated profile table shipped with the MLST database
# and preview its first rows.
profile = pd.read_table('/media/GenomicResearch/Tools/CGE/mlst_db/kpneumoniae/kpneumoniae.tsv')
profile.head()

Unnamed: 0,ST,gapA,infB,mdh,pgi,phoE,rpoB,tonB
0,1,4,4,1,1,7,4,10
1,2,3,4,1,1,9,4,17
2,3,5,5,1,1,9,6,11
3,4,3,1,1,1,3,3,1
4,5,2,2,1,1,3,3,3


In [None]:
# Rebinds `df` to the tab-separated MLST table stored under the
# NCBI_Burkholderia_cenocepacia_SRA directory (a different dataset from the
# NCBI_Kp_SRA table built earlier in this notebook).
df = pd.read_table('/media/Central_Lab_Storage/NcbiSRA/NCBI_Burkholderia_cenocepacia_SRA/MLST.tsv')

In [None]:
# Annotate each row with the clonal complex of its ST, looked up in `profile`.
# STs absent from the lookup become NaN.
# NOTE(review): the `profile` preview shown earlier in this notebook lists only
# ST and seven locus columns; confirm `profile` actually has a 'clonal_complex'
# column before running, otherwise this raises KeyError.
cc_by_st = dict(zip(profile['ST'], profile['clonal_complex']))
df['clonal_complex'] = df['ST'].map(cc_by_st)

In [None]:
# Annotate each row with the lineage of its ST, looked up in `profile`.
# STs absent from the lookup become NaN.
# NOTE(review): the `profile` preview shown earlier in this notebook lists only
# ST and seven locus columns; confirm `profile` actually has a 'Lineage'
# column before running, otherwise this raises KeyError.
lineage_by_st = dict(zip(profile['ST'], profile['Lineage']))
df['Lineage'] = df['ST'].map(lineage_by_st)

In [None]:
# Display the first rows of `df` as the cell output.
df.head()

In [None]:
# Writes `df` back to the same MLST.tsv path it was read from earlier, replacing
# that file; index=False keeps the DataFrame index out of the output.
df.to_csv('/media/Central_Lab_Storage/NcbiSRA/NCBI_Burkholderia_cenocepacia_SRA/MLST.tsv', sep='\t', index=False)