In [6]:
import os
import shutil
import subprocess
from pathlib import Path
from multiprocessing import Pool

import pandas as pd

In [10]:
def _replace_symlink(target, link_path):
    """Point ``link_path`` at the absolute location of ``target``.

    Any existing entry at ``link_path`` is removed first so that the
    pipeline can be re-run into the same output directory.
    """
    # lexists (not exists) so that a dangling link from an earlier run is
    # detected and replaced as well.
    if os.path.lexists(link_path):
        os.remove(link_path)
    # A relative symlink target is resolved relative to the link's own
    # directory rather than the current working directory, so store the
    # absolute path.
    os.symlink(os.path.abspath(target), link_path)


def single_nucleotide_position(reference, query, outdir, threads=4):
    """Run the parsnp -> harvesttools -> snp-dists pipeline on two genomes.

    The two inputs are symlinked into ``outdir`` as ``reference.fa`` and
    ``query.fa`` and passed to ``parsnp``. ``harvesttools`` is then run twice
    on ``parsnp.ggr`` to write ``parsnp.snp`` and ``parsnp.vcf``, and the
    standard output of ``snp-dists`` on ``parsnp.snp`` is saved as
    ``matrix.tsv``.

    Parameters
    ----------
    reference : str or path-like
        FASTA file used as the parsnp reference (``-r``).
    query : str or path-like
        FASTA file compared against the reference.
    outdir : str or path-like
        Output directory; created if it does not exist.
    threads : int, default 4
        Value passed to ``parsnp -p``.

    Raises
    ------
    subprocess.CalledProcessError
        If any of the external tools exits with a non-zero status.
    """
    os.makedirs(outdir, exist_ok=True)

    symlink_reference = os.path.join(outdir, 'reference.fa')
    _replace_symlink(reference, symlink_reference)

    symlink_query = os.path.join(outdir, 'query.fa')
    _replace_symlink(query, symlink_query)

    # Argument lists (no shell) keep paths containing spaces or shell
    # metacharacters intact.
    subprocess.run(
        [
            'parsnp',
            '-r', symlink_reference,
            '-d', symlink_reference, symlink_query,
            '-p', str(threads),
            '-o', str(outdir),
            '--use-fasttree',
        ],
        check=True,
    )

    ggr_file = Path(outdir, 'parsnp.ggr')
    snp_file = Path(outdir, 'parsnp.snp')
    vcf_file = Path(outdir, 'parsnp.vcf')

    subprocess.run(['harvesttools', '-i', str(ggr_file), '-S', str(snp_file)], check=True)
    subprocess.run(['harvesttools', '-i', str(ggr_file), '-V', str(vcf_file)], check=True)

    # snp-dists prints the matrix on stdout; capture it straight into the file.
    with open(Path(outdir, 'matrix.tsv'), 'w') as matrix_handle:
        subprocess.run(['snp-dists', str(snp_file)], stdout=matrix_handle, check=True)

    # Scratch-directory cleanup is best effort: a missing 'tmp' must not make
    # an otherwise finished run fail.
    shutil.rmtree(Path(outdir, 'tmp'), ignore_errors=True)

In [8]:
# Sample IDs to process, one per line; each is used as a FASTA basename and
# as the name of the per-sample output directory.
samples_name = [
    'R20-0140',
    'R20-0158',
    'R20-0148',
    'R20-0145',
    'R20-0150',
    'R20-0088',
    'R20-0026',
    'R20-0160',
    'R20-0030',
    'R19-2905',
    'R20-0127',
    'R20-0131',
]

In [9]:
# Directory of reference assemblies; the run cell looks up `<sample>.fa` in here.
reference_path = Path('/media/NGS/Nanopore_1/20200925/contigs/hybrid')
# Directory of query assemblies, likewise read as `<sample>.fa`.
query_path = Path('/media/NGS/Nanopore_1/20200925/contigs/denovo')
# Root of the results; each sample gets its own sub-directory named after it.
outpath = Path('/media/NGS/Nanopore_1/20200925/wgSNP')

In [12]:
# Run the pipeline for every sample in parallel. Each job also passes its own
# thread count to parsnp (the function's default is 4), so 8 workers can keep
# up to 32 threads busy.
with Pool(8) as p:
    pending = []
    for sample_name in samples_name:
        outdir = outpath/sample_name
        reference_genome = reference_path/(sample_name + '.fa')
        query_genome = query_path/(sample_name + '.fa')
        async_result = p.apply_async(
            single_nucleotide_position,
            args=(reference_genome, query_genome, outdir),
        )
        pending.append((sample_name, async_result))
    p.close()

    # apply_async only re-raises a worker's exception when .get() is called;
    # without this step failed samples would go unnoticed. An interrupt raised
    # here leaves the `with` block, which terminates the pool.
    failures = {}
    for sample_name, async_result in pending:
        try:
            async_result.get()
        except Exception as error:
            failures[sample_name] = error
    p.join()

if failures:
    details = '; '.join(f'{name}: {error!r}' for name, error in failures.items())
    raise RuntimeError(f'wgSNP pipeline failed for {len(failures)} sample(s): {details}')

In [None]:
dirpath = Path('/media/NGS/Nanopore_1/20200925/wgSNP')

# Collect the reference-vs-query SNP distance of every sample directory.
whole_genome_snp = {}
# sorted() gives a reproducible row order; iterdir() order is arbitrary.
for subdir in sorted(dirpath.iterdir()):
    # Only per-sample directories hold a matrix; ignore stray files.
    if not subdir.is_dir():
        continue
    filepath = subdir/'matrix.tsv'
    df = pd.read_csv(filepath, sep='\t', index_col=0)
    # The pipeline feeds the tools symlinks named reference.fa / query.fa, so
    # the matrix is not labelled 'hybrid.fa' / 'denovo.fa'. Each run compares
    # exactly two genomes, so read the single off-diagonal cell by position
    # instead of depending on the exact labels the tools emit.
    if df.shape != (2, 2):
        raise ValueError(f'{filepath}: expected a 2x2 distance matrix, got {df.shape}')
    whole_genome_snp[subdir.name] = df.iloc[0, 1]

In [None]:
# One entry per sample directory name, holding the value collected for it.
s = pd.Series(data=whole_genome_snp)

In [None]:
# Label the index and the values of the series.
s.index.name = 'Key'
s.name = 'wgSNP'

In [None]:
# Write the series as a tab-separated file.
s.to_csv(path_or_buf='/media/NGS/Nanopore_1/20200925/wgSNP.tsv', sep='\t')