In [1]:
import os
import shutil
from pathlib import Path
from tempfile import NamedTemporaryFile
from concurrent.futures import ProcessPoolExecutor

import pandas as pd

In [2]:
import sys
# Extend the import search path with ../src (a relative entry, so it depends on
# the kernel's working directory) — presumably where the `application` module
# imported in the next cell lives; TODO confirm.
sys.path.append('../src')

In [3]:
from application import FastANICommandline

In [4]:
def ani_identify(query, output, refpath):
    """Compare one query against every ``*.fa`` file found in ``refpath``.

    The POSIX paths of the reference files are written, one per line, to a
    temporary list file. A ``FastANICommandline`` is then built with that
    list as ``rl``, ``query`` as ``q`` and ``output`` as ``output``, and is
    invoked while the temporary file still exists.

    Parameters
    ----------
    query
        Forwarded unchanged as the ``q`` argument.
    output
        Forwarded unchanged as the ``output`` argument.
    refpath : pathlib.Path
        Directory that is globbed (non-recursively) for ``*.fa`` files.

    Returns
    -------
    None
    """
    reference_list = '\n'.join(fasta.as_posix() for fasta in refpath.glob('*.fa'))
    with NamedTemporaryFile('w+t') as list_file:
        list_file.write(reference_list)
        # Seeking flushes the buffered text, so the list is readable through
        # `list_file.name` before the command line is invoked.
        list_file.seek(0)
        run_fastani = FastANICommandline('fastANI', rl=list_file.name, q=query, output=output)
        run_fastani()

In [5]:
# Directory whose entries are submitted as queries by the process-pool cell.
dirpath = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/Contigs')
# Directory that receives one `<query stem>.txt` result file per query.
outpath = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/ANI')
# Directory globbed for `*.fa` reference files by ani_identify.
refpath = Path('/media/Central_Lab_Storage/NcbiASM/Klebsiella_pneumoniae/RefSeq')

In [13]:
# One-off run of ani_identify on a single assembly. The query sits under an
# Enterococcus_faecalis directory while `refpath` points at Klebsiella_pneumoniae
# references — presumably a cross-species sanity check; TODO confirm.
ani_identify('/media/Central_Lab_Storage/NcbiASM/Enterococcus_faecalis/Contigs/GCA_000415185.1.fa',
             '/home/chen1i6c04/GCA_000415185.1.txt',
             refpath)

In [6]:
# Fan the per-query runs out over 32 worker processes. Every Future is kept
# (mapped to its query path) so that a failing run is reported afterwards
# instead of disappearing silently with the discarded Future.
futures = {}
with ProcessPoolExecutor(32) as executor:
    for i in dirpath.iterdir():
        outfile = outpath/(i.stem + '.txt')
        futures[executor.submit(ani_identify, i, outfile, refpath)] = i

# Leaving the `with` block waits for all submitted work, so each Future is
# finished here and `.exception()` returns immediately. Failures are printed
# rather than raised so one bad query does not abort the whole batch.
for future, query in futures.items():
    error = future.exception()
    if error is not None:
        print('ani_identify failed for', query, '->', repr(error))

In [4]:
# NOTE: this rebinds `dirpath`. From here on it names the directory holding the
# per-query result files (the same path as `outpath`), not the Contigs directory.
dirpath = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/ANI')
# The combined table goes next to that directory: the same path plus '.tsv'.
outfile = dirpath.as_posix() + '.tsv'

In [17]:
# Gather the first line of every file in `dirpath`, split on whitespace.
# Files without any line are deleted from disk.
ani_results = []
for result_file in dirpath.iterdir():
    with open(result_file) as handle:
        first_line = next(handle, None)
    if first_line is None:
        # Empty file: nothing to record, remove it.
        os.remove(result_file)
        continue
    ani_results.append(first_line.strip().split())

In [18]:
df = pd.DataFrame(ani_results)

# Columns 0 and 1 hold file paths; keep only the file stem of each.
for path_column in (0, 1):
    df[path_column] = df[path_column].map(lambda path_text: Path(path_text).stem)
# Columns 2-4 were parsed from text; convert them to floats in one step.
df = df.astype({2: float, 3: float, 4: float})

# Write the table as header-less, index-less TSV.
df.to_csv(outfile, sep='\t', index=False, header=False)

In [36]:
def pairwise_compare(in_dir, output, threads=1):
    """Run an all-against-all comparison of the FASTA files in ``in_dir``.

    Entries of ``in_dir`` whose suffix is ``.fa``, ``.fna`` or ``.fasta`` are
    written, one POSIX path per line, to a temporary list file. That single
    list is handed to ``FastANICommandline`` as both ``rl`` and ``ql``, and
    the resulting command line is invoked while the temporary file exists.

    Parameters
    ----------
    in_dir
        Directory to scan (non-recursively); anything accepted by ``Path``.
    output
        Forwarded unchanged as the ``output`` argument.
    threads : int, default 1
        Forwarded unchanged as the ``threads`` argument.

    Returns
    -------
    None
    """
    fasta_suffixes = {'.fa', '.fna', '.fasta'}
    genome_list = '\n'.join(
        entry.as_posix()
        for entry in Path(in_dir).iterdir()
        if entry.suffix in fasta_suffixes
    )
    with NamedTemporaryFile('w+t') as list_file:
        list_file.write(genome_list)
        # Seeking flushes the buffered text so the list can be read by name.
        list_file.seek(0)
        run_fastani = FastANICommandline(
            'fastANI',
            rl=list_file.name,
            ql=list_file.name,
            output=output,
            threads=threads,
        )
        run_fastani()

In [37]:
# All-against-all comparison of the FASTA files in the Burkholderia_cenocepacia
# RefSeq directory; results go to ani.txt and 64 is passed as `threads`.
pairwise_compare('/media/Central_Lab_Storage/NcbiASM/Burkholderia_cenocepacia/RefSeq', '/home/chen1i6c04/ani.txt', 64)

In [38]:
# Load the pairwise result table; only the first three tab-separated columns
# are kept (two path columns and a numeric score column).
df = pd.read_csv('/home/chen1i6c04/ani.txt', sep='\t', header=None, usecols=range(3))

# Reduce both path columns to the bare file stem. `Path(...).stem` strips only
# the final extension, so it is right for every suffix pairwise_compare accepts
# (.fa, .fna, .fasta). The previous `.replace('.fa', '')` removed '.fa' anywhere
# in the name and turned 'x.fasta' into 'xsta'.
for path_column in (0, 1):
    df[path_column] = df[path_column].map(lambda genome_path: Path(genome_path).stem)

# Drop self-comparisons (same genome in both columns).
df = df[df[0] != df[1]]

In [41]:
# Display the pairwise table (rows with identical values in columns 0 and 1
# were already filtered out).
df

Unnamed: 0,0,1,2
1,GCF_000009485.1,GCF_000236215.2,99.0586
2,GCF_000009485.1,GCF_001718895.1,98.9900
3,GCF_000009485.1,GCF_001606115.1,98.9415
4,GCF_000009485.1,GCF_001606135.1,98.8612
5,GCF_000009485.1,GCF_018223805.1,98.8514
...,...,...,...
76,GCF_018228625.1,GCF_000236215.2,99.0169
77,GCF_018228625.1,GCF_001606135.1,98.9934
78,GCF_018228625.1,GCF_001999885.1,98.9755
79,GCF_018228625.1,GCF_001484665.1,98.9693


In [None]:
import re

In [None]:
def extract_subsp(pattern):
    """Return the subspecies epithet found in an organism name.

    Looks for the literal text ``subsp. `` followed by a run of lowercase
    ASCII letters and returns those letters.

    Parameters
    ----------
    pattern
        The organism name to search. Despite the parameter name, this is the
        text being searched, not a regular expression.

    Returns
    -------
    str or None
        The subspecies epithet, or ``None`` when the name has no
        ``subsp. <epithet>`` part or is not a string (e.g. a NaN produced by
        pandas for a missing value).
    """
    # Missing values reach this function as float NaN when applied to a pandas
    # column; re.search would raise TypeError on them.
    if not isinstance(pattern, str):
        return None
    # The dot is escaped so that only a literal "subsp." matches; unescaped it
    # matched any character (e.g. "subspX abc" yielded "abc").
    result = re.search(r'subsp\. ([a-z]+)', pattern)
    return result.group(1) if result else None

In [None]:
# Read the accession and organism-name columns of the assembly summary; the
# column names are taken from the file's second line (header=1).
refseq_info = pd.read_csv('/media/Central_Lab_Storage/NcbiASM/assembly_summary.txt', sep='\t', usecols=['# assembly_accession', 'organism_name'], header=1)
# Keep only accessions present in column 0 of `df`. The explicit `.copy()`
# makes the filtered frame independent, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
refseq_info = refseq_info[refseq_info['# assembly_accession'].isin(set(df[0]))].copy()
# Derive the subspecies label from each organism name.
refseq_info['subspecies'] = refseq_info['organism_name'].apply(extract_subsp)

In [None]:
# Accessions whose derived `subspecies` value is exactly 'enterica'.
enterica_accs = set(
    refseq_info.loc[refseq_info['subspecies'].eq('enterica'), '# assembly_accession']
)

In [None]:
# Rows where both column 0 and column 1 are in `enterica_accs`, ordered by
# ascending column 2.
df.loc[df[0].isin(enterica_accs) & df[1].isin(enterica_accs)].sort_values(by=2)