In [10]:
import sys
from pathlib import Path
from multiprocessing import Pool

sys.path.append('../src')

from utils import run_cmd

In [11]:
# Profiling script and the resources interpolated into the commands built further down.
prog = '/media/Central_Lab_Storage/bo-han/Projects/Benga-2/profiling.py'
# scheme = "/media/GenomicResearch/Issue/Benga/Listeria_monocytogenes.faa"
# Passed to the script through its -d option.
database = "/media/GenomicResearch/Issue/Benga/Klebsiella_pneumoniae.db"
# Passed to the script through its --prodigaltf option.
prodigaltf = "/media/Central_Lab_Storage/bo-han/Projects/Benga/models/Klebsiella_pneumoniae.trn"

In [19]:
# Directory whose entries are used as the -i inputs of the profiling commands.
dirpath = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/Contigs')
# Root of the per-sample output directories; each profile is written to <outpath>/<input stem>/profile.tsv.
outpath = Path('/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/Analysis')

In [20]:
# One profiling command per input whose output directory already exists;
# the profile itself is written to <outpath>/<stem>/profile.tsv.
cmds = [
    f"python {prog} -i {contig} -o {outpath/contig.stem/'profile.tsv'} -d {database} --prodigaltf {prodigaltf} -t 2"
    for contig in dirpath.iterdir()
    if (outpath/contig.stem).exists()
]
len(cmds)

11326

In [None]:
%%time
# Run every command in `cmds` through run_cmd on a pool of 24 worker processes.
with Pool(24) as p:
    try:
        # map() blocks until every command has been handled by a worker.
        p.map(run_cmd, cmds)
        p.close()
        p.join()
    except KeyboardInterrupt:
        # Interrupting the cell stops the workers immediately instead of waiting for them.
        p.terminate()

In [18]:
%%time
# Time one run of the profiling script on a single assembly, using 16 threads,
# executed through `conda run` in the environment named "Benga".
asm = '/media/Central_Lab_Storage/NcbiSRA/NCBI_Kp_SRA/Contigs/SRR1197688.fa'
out = '/home/chen1i6c04/SRR1197688.tsv'
# IPython expands the {name} placeholders from the Python namespace before running the shell line.
!conda run -n Benga python {prog} -i {asm} -o {out} -d {database}  --prodigaltf {prodigaltf} -t 16

CPU times: user 327 ms, sys: 56.9 ms, total: 384 ms
Wall time: 19 s


In [9]:
import os
import sqlite3
from sqlalchemy import create_engine
import subprocess
import hashlib
from tempfile import TemporaryDirectory
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [None]:
def find_genes(input_fasta, training_file):
    """Predict genes with the ``prodigal`` executable and return them translated.

    ``prodigal`` is run on ``input_fasta`` with ``training_file`` and the FASTA
    file it writes through ``-d`` is parsed. For every parsed record the ``id``
    is replaced by ``sequence_encoder`` applied to the record's sequence, and
    the sequence itself is replaced by its translation with table 11.

    Args:
        input_fasta: Path (``str`` or ``os.PathLike``) of the input FASTA file.
        training_file: Path of the training file handed to ``prodigal -t``.

    Returns:
        list: The modified records, in the order they appear in the output file.

    Raises:
        subprocess.CalledProcessError: If ``prodigal`` exits with a non-zero status.
    """
    records = []
    with TemporaryDirectory() as tmpdir:
        prodigal_output = os.path.join(tmpdir, 'genes.fna')
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact; check=True surfaces a failed run immediately
        # instead of failing later while parsing a missing/partial file.
        subprocess.run(
            [
                'prodigal',
                '-i', str(input_fasta),
                '-d', prodigal_output,
                '-c', '-m', '-q',
                '-t', str(training_file),
            ],
            check=True,
        )
        for record in SeqIO.parse(prodigal_output, 'fasta'):
            # The id must be derived from the nucleotide sequence, so compute it
            # before the sequence is replaced by its translation.
            record.id = sequence_encoder(str(record.seq))
            record.seq = record.seq.translate(table=11)
            records.append(record)
    return records

def sequence_encoder(sequence):
    """Return the SHA-256 hex digest of ``sequence`` encoded as ASCII.

    Args:
        sequence: Text to hash; encoding fails with ``UnicodeEncodeError`` if it
            contains non-ASCII characters.

    Returns:
        str: The hexadecimal digest.
    """
    hasher = hashlib.sha256()
    hasher.update(sequence.encode("ascii"))
    return hasher.hexdigest()

def sequence_alignment(query_records, subject_records, threads):
    """Align query records against subject records with ``blastp``.

    Both record sets are written as FASTA into a temporary directory, a protein
    BLAST database is built from the subject file with ``makeblastdb``, and
    ``blastp`` is run with an e-value cutoff of 1e-6 and the tabular output
    columns ``qseqid sseqid pident length qlen slen``.

    Args:
        query_records: Iterable of records to use as queries.
        subject_records: Iterable of records to build the database from.
        threads: Value passed to ``blastp -num_threads``.

    Returns:
        list[str]: The lines ``blastp`` wrote to standard output; an empty list
        when there are no query records.

    Raises:
        subprocess.CalledProcessError: If ``makeblastdb`` or ``blastp`` exits
            with a non-zero status.
    """
    query_records = list(query_records)
    if not query_records:
        # Nothing to align: skip building the database and launching blastp.
        return []
    with TemporaryDirectory() as tmpdir:
        query = os.path.join(tmpdir, 'query.faa')
        subject = os.path.join(tmpdir, 'subject.faa')
        SeqIO.write(query_records, query, 'fasta')
        SeqIO.write(subject_records, subject, 'fasta')
        # Argument lists avoid shell interpretation of the paths, and check=True
        # prevents a failed tool run from being mistaken for "no hits".
        subprocess.run(['makeblastdb', '-in', subject, '-dbtype', 'prot'], check=True)
        child_process = subprocess.run(
            [
                'blastp',
                '-query', query,
                '-db', subject,
                '-evalue', '1e-6',
                '-outfmt', '6 qseqid sseqid pident length qlen slen',
                '-num_threads', str(threads),
            ],
            stdout=subprocess.PIPE,
            check=True,
        )
    return child_process.stdout.decode().splitlines()

In [None]:
def match_filter(lines):
    """Select (query id, subject id) pairs from tabular alignment lines.

    Each line must hold exactly six whitespace-separated fields in the order
    ``qseqid sseqid pident length qlen slen``. A pair is kept when the two ids
    differ, ``pident`` is at least 95, and both ``qlen/slen`` and
    ``qlen/length`` fall in the half-open interval [0.75, 1.25).

    Args:
        lines: Iterable of alignment lines.

    Returns:
        list[tuple[str, str]]: Accepted ``(qseqid, sseqid)`` pairs in input order.
    """
    accepted = []
    for raw_line in lines:
        query_id, subject_id, identity, aln_len, query_len, subject_len = raw_line.strip().split()
        identity = float(identity)
        aln_len = float(aln_len)
        query_len = float(query_len)
        subject_len = float(subject_len)
        # Guard clauses, evaluated in the same order as a chained `and`.
        if query_id == subject_id:
            continue
        if not identity >= 95:
            continue
        if not .75 <= query_len / subject_len < 1.25:
            continue
        if not .75 <= query_len / aln_len < 1.25:
            continue
        accepted.append((query_id, subject_id))
    return accepted

In [None]:
class SQLiteDatabase:
    """Small wrapper around a ``sqlite3`` connection to an allele database.

    It reads the ``scheme`` table and reads/writes the ``alleles`` table.
    Writes are committed when ``close`` is called.
    """

    # Maximum number of "?" placeholders bound in one statement. SQLite rejects
    # statements with too many bound parameters (999 by default on builds older
    # than 3.32), so lookups are split into chunks that stay below that limit.
    _MAX_PARAMETERS = 900

    def __init__(self, database):
        """Open a connection to the SQLite database at ``database``."""
        self._database_path = database
        self._connection = sqlite3.connect(database)

    def fetch_core_genome(self):
        """Return every row of the ``scheme`` table as a list of tuples."""
        return self._connection.execute("select * from scheme").fetchall()

    def search(self, query):
        """Return the ``alleles`` rows whose ``allele_id`` is in ``query``.

        Args:
            query: Iterable of allele ids. Duplicates are ignored, so each
                matching row is returned once, as with a single ``IN`` lookup.

        Returns:
            list[tuple]: Matching rows; an empty list when ``query`` is empty.
        """
        # Deduplicate while preserving order so that chunking cannot return the
        # same row twice when an id is repeated across chunks.
        unique_ids = list(dict.fromkeys(query))
        rows = []
        for start in range(0, len(unique_ids), self._MAX_PARAMETERS):
            chunk = unique_ids[start:start + self._MAX_PARAMETERS]
            placeholders = ','.join('?' * len(chunk))
            sql = f"select * from alleles where allele_id in ({placeholders})"
            rows.extend(self._connection.execute(sql, chunk).fetchall())
        return rows

    def update(self, query):
        """Insert or replace two-value rows in the ``alleles`` table.

        Args:
            query: Iterable of two-element rows, in the column order of the
                ``alleles`` table.
        """
        sql = 'insert or replace into alleles values (?, ?)'
        self._connection.executemany(sql, query)

    def close(self):
        """Commit any open transaction, then close the connection."""
        if self._connection.in_transaction:
            self._connection.commit()
        self._connection.close()

In [None]:
def profiling(genome, database, training_file, outfile, threads=2):
    """Build an allele profile for one genome and write it as a TSV file.

    Steps:
      1. Predict genes in ``genome`` with ``find_genes``.
      2. Look up the gene ids in the ``alleles`` table of ``database``.
      3. Align the genes that were not found against the translated ``scheme``
         sequences and keep the pairs accepted by ``match_filter``.
      4. Store the newly accepted pairs in the database.
      5. Write one row per scheme locus to ``outfile``.

    Args:
        genome: Path of the genome FASTA file.
        database: Path of the SQLite allele database.
        training_file: Training file passed on to ``find_genes``.
        outfile: Destination of the tab-separated profile.
        threads: Thread count passed on to ``sequence_alignment``.
    """
    sqlite_db = SQLiteDatabase(database)
    try:
        gene_records = find_genes(genome, training_file)
        search_result = sqlite_db.search([gene.id for gene in gene_records])
        known_allele_ids = {row[0] for row in search_result}
        unknown_gene_records = [record for record in gene_records
                                if record.id not in known_allele_ids]
        core_genome_records = [SeqRecord(Seq(dna_seq).translate(table=11), id=locus_tag)
                               for locus_tag, dna_seq in sqlite_db.fetch_core_genome()]
        match_result = match_filter(
            sequence_alignment(unknown_gene_records, core_genome_records, threads)
        )

        if match_result:
            sqlite_db.update(match_result)
    finally:
        # Always release the connection, even when gene finding or alignment
        # fails; worker processes are reused, so a leaked connection would
        # otherwise stay open for the lifetime of the worker.
        sqlite_db.close()

    df = pd.DataFrame(search_result + match_result, columns=['allele_id', 'locus_id'])
    # Stable sort, then keep the first row per locus, i.e. the smallest allele_id.
    df = df.sort_values('allele_id', kind='mergesort').drop_duplicates('locus_id')
    # Reindex to the scheme loci so loci without an allele still appear (as missing values).
    locus_ids = [record.id for record in core_genome_records]
    df = df.set_index('locus_id').reindex(locus_ids).sort_index()
    df.to_csv(outfile, sep='\t')

In [None]:
# NOTE(review): these rebind `dirpath`/`outpath`, which an earlier cell also assigns — confirm which values downstream cells expect.
# Directory whose sub-directories each provide a contigs.fasta input.
dirpath = Path('/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Analysis')
# Directory receiving one <sample>.tsv profile per input sub-directory.
outpath = Path('/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Profile_with_AlleleDB/')

In [None]:
# Path of the SQLite allele database handed to profiling() as its `database` argument.
sqlite_db = '/media/GenomicResearch/Issue/Benga/Listeria_monocytogenes.db'
# Training file handed to profiling() as its `training_file` argument.
training_file = '/media/Central_Lab_Storage/bo-han/Projects/Benga/models/Listeria_monocytogenes.trn'

In [None]:
%%time
with Pool(64) as p:
    try:
        for i in dirpath.iterdir():
            outfile = outpath/(i.name + '.tsv')
            if outfile.exists() is False:
                p.apply_async(profiling, (i/'contigs.fasta', sqlite_db, training_file, outfile, 1))
        p.close()
        p.join()
    except KeyboardInterrupt:
        p.terminate()

In [None]:
%%time
# Time a single profiling() call on one sample with one alignment thread,
# writing the profile to a scratch file.
profiling(
    '/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Analysis/R20.0002/contigs.fasta',
    sqlite_db, training_file, '/home/chen1i6c04/test.tsv', 1)

In [None]:
# Directory of the <sample>.tsv profiles to check.
dirpath_1 = Path('/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Profile_with_AlleleDB')
# Directory holding the <sample>/profile.tsv files they are compared against.
dirpath_2 = Path('/media/GenomicResearch/MiSeq/Listeria_monocytogenes/Analysis/')

In [None]:
# Print the name of every sample whose allele_id column differs between the
# profile in dirpath_1 and the corresponding <sample>/profile.tsv in dirpath_2.
for i in dirpath_1.iterdir():
    q = pd.read_csv(i, sep='\t')
    s = pd.read_csv(dirpath_2/i.stem/'profile.tsv', sep='\t')
    # Missing values are replaced by '' so that two missing alleles compare equal.
    # Series.all() returns a NumPy boolean, which is never the `False` singleton,
    # so an identity test (`is False`) can never succeed; test truthiness instead.
    if not (q.allele_id.fillna('') == s.allele_id.fillna('')).all():
        print(i.stem)

In [None]:
# Open a scratch SQLite connection using the special ":memory:" database name.
con = sqlite3.connect(":memory:")

In [None]:
# Close the connection currently bound to `con`.
con.close()

In [10]:
# Connect to the SQLite file named by `database`.
# NOTE(review): no matching con.close() is visible in this part of the notebook — confirm it is closed later.
con = sqlite3.connect(database)

In [11]:
# Cursor on `con` for running ad-hoc statements.
cur = con.cursor()