# 16S species check from WGS

In [18]:
import os,sys,subprocess,glob,re, shutil
import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
from importlib import reload
from Bio import AlignIO, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import Phylo
from snipgenie import tools, trees

In [3]:
def get_blast_coverage(bl, fasta):
    """Add subject-sequence coverage to a table of BLAST hits.

    Args:
        bl: DataFrame of BLAST hits. Must contain ``sseqid`` (subject id)
            and ``length`` (alignment length) columns.
        fasta: path to the FASTA file holding the subject sequences. It is
            loaded with ``tools.fasta_to_dataframe``, whose result is expected
            to provide ``name`` and ``length`` columns.

    Returns:
        A new DataFrame: ``bl`` left-joined to the FASTA table on
        ``sseqid == name``, with the subject length as ``sslen`` and
        ``perc_cov`` = alignment length / subject length * 100, rounded to
        2 decimals. Hits whose ``sseqid`` is missing from the FASTA get NaN.
    """

    # Use the caller-supplied reference file rather than a fixed path.
    subject_lengths = tools.fasta_to_dataframe(fasta)
    # Rename so the subject length does not clash with BLAST's own
    # 'length' (alignment length) column after the merge.
    subject_lengths = subject_lengths.rename(columns={'length': 'sslen'})
    bl = bl.merge(subject_lengths, left_on='sseqid', right_on='name', how='left')
    # Column arithmetic instead of a row-wise apply: faster, and it still
    # yields a proper (empty) column when there are no hits at all.
    bl['perc_cov'] = (bl['length'] / bl['sslen'] * 100).round(2)
    return bl

In [70]:
def blast_16S(filename, hits=100, pident=99.5, min_cov=80):
    """BLAST a sequence file against the 16S reference set and filter the hits.

    The reference FASTA is ``16S_ncbi.fa`` in the working directory.
    NOTE(review): the BLAST database for it is presumably expected to exist
    already (see ``tools.make_blast_database``) - confirm.

    Args:
        filename: FASTA file with the query sequence(s).
        hits: passed to ``tools.blast_fasta`` as ``maxseqs``.
        pident: minimum percent identity a hit must reach to be kept.
        min_cov: minimum ``perc_cov`` (alignment length as a percentage of
            the subject length) a hit must reach to be kept.

    Returns:
        DataFrame with one row per subject sequence (the first row for each
        ``sseqid``), extended with ``sslen``, ``perc_cov`` and ``species``
        columns, filtered by ``pident``/``min_cov`` and sorted by descending
        ``pident``.
    """
    reference = '16S_ncbi.fa'
    bl = tools.blast_fasta(reference, filename, maxseqs=hits)
    bl = get_blast_coverage(bl, reference)
    # Explicit copy so the column assignment below never writes into a view
    # of the merged table.
    bl = bl.drop_duplicates('sseqid').copy()
    # Species = 2nd and 3rd whitespace-separated words of the subject title;
    # assumes titles look like "<accession> <Genus> <species> ..." - TODO confirm.
    bl['species'] = bl.stitle.apply(lambda x: ' '.join(x.split()[1:3]))
    keep = (bl.pident >= pident) & (bl.perc_cov >= min_cov)
    return bl[keep].sort_values('pident', ascending=False)

In [71]:
def extract_sequences_by_ids(input_fasta, output_fasta, ids_to_extract):
    """Write the records of a FASTA file whose ids are in a given collection.

    Args:
        input_fasta: FASTA file (path or handle) to read from.
        output_fasta: FASTA file (path or handle) to write to.
        ids_to_extract: a single id string or any iterable of id strings.
            Ids not present in ``input_fasta`` are silently ignored.

    Returns:
        None. Records are written in the order they occur in
        ``input_fasta``, not in the order of ``ids_to_extract``.
    """

    # Normalise to a set once: O(1) lookups per record, works for one-shot
    # iterators, and a bare string is treated as one id rather than being
    # matched by substring.
    if isinstance(ids_to_extract, str):
        wanted = {ids_to_extract}
    else:
        wanted = set(ids_to_extract)

    records = SeqIO.parse(input_fasta, "fasta")
    # Generator keeps the copy streaming; nothing is held in memory.
    selected = (rec for rec in records if rec.id in wanted)
    SeqIO.write(selected, output_fasta, "fasta")
    return

def append_sequences_to_fasta(fasta_file, seqs):
    """Append SeqRecords to a FASTA file by rewriting the whole file.

    Args:
        fasta_file: path of an existing FASTA file; it is read in full and
            then overwritten with the old records followed by the new ones.
        seqs: a single ``SeqRecord`` or any iterable of ``SeqRecord`` objects
            (list, tuple, generator, ...).

    Returns:
        None.
    """

    # Wrap only a lone record; any other iterable is taken as a collection
    # of records (a `type(...) is list` test would mis-handle tuples etc.).
    if isinstance(seqs, SeqRecord):
        new_records = [seqs]
    else:
        new_records = list(seqs)

    # Materialise the existing records first: the same path is reopened for
    # writing below, so they cannot be streamed lazily.
    records = list(SeqIO.parse(fasta_file, "fasta"))
    records.extend(new_records)
    SeqIO.write(records, fasta_file, "fasta")
    return

def get_tree(fasta_file):
    """Align a FASTA file with mafft and build a tree from it with fasttree.

    Both programs must be on PATH. Intermediate and final results go to
    fixed file names in the working directory (``temp.aln`` and
    ``temp.newick``), so successive calls overwrite each other.

    Args:
        fasta_file: path of the FASTA file with the sequences to align.

    Returns:
        The name of the file fasttree's output was written to
        (``'temp.newick'``).

    Raises:
        subprocess.CalledProcessError: if either program exits non-zero
            (its captured stderr is available on the exception).
        FileNotFoundError: if either program is not installed.
    """

    aln = 'temp.aln'
    out = 'temp.newick'
    # Argument lists plus Python-side redirection instead of an interpolated
    # shell string: paths with spaces or shell metacharacters are passed
    # through verbatim. stderr is captured so the tools' progress output
    # does not clutter the notebook.
    with open(aln, 'w') as handle:
        subprocess.run(['mafft', fasta_file], stdout=handle,
                       stderr=subprocess.PIPE, check=True)
    with open(out, 'w') as handle:
        subprocess.run(['fasttree', aln], stdout=handle,
                       stderr=subprocess.PIPE, check=True)
    return out

In [72]:
# Prepare the BLAST database for the 16S reference FASTA, then search the
# query file (presumably the NCTC10135 genome assembly - confirm) against it
# with a relaxed identity threshold (90%) and at most 50 hits requested.
tools.make_blast_database('16S_ncbi.fa')
bl = blast_16S('M.alkalescens_NCTC10135.fa',pident=90,hits=50)

In [73]:
# Display the first five hits; blast_16S returns rows sorted by descending pident.
bl[['sseqid','species','pident']][:5]

Unnamed: 0,sseqid,species,pident
0,NR_025984.1,Metamycoplasma alkalescens,99.932
2,NR_026035.1,Metamycoplasma auris,98.973
4,NR_025988.1,Metamycoplasma canadense,98.704
14,NR_029180.1,Metamycoplasma gateae,98.545
10,NR_041743.1,Mycoplasmopsis arginini,98.494


In [74]:
#m.genus.value_counts()
# Build a tree of the query 16S sequence together with its closest references.
# Query sequence is taken from the qseq field of the first (highest pident) hit.
# NOTE(review): qseq is presumably the aligned query segment and may contain
# gap characters - confirm this is acceptable input for the aligner.
targetseq = SeqRecord(Seq(bl.iloc[0].qseq), id='NCTC10135')
# Keep at most the first 20 hits, indexed by subject id.
m=bl.set_index('sseqid')[:20]
names = list(m.index)
# Pull those reference sequences into temp.fa, add the query, then align and
# build the tree (get_tree returns the path of the tree file).
extract_sequences_by_ids('16S_ncbi.fa', 'temp.fa', names)
append_sequences_to_fasta('temp.fa',targetseq)
treefile = get_tree('temp.fa')

In [75]:
# Save the selected hit table (indexed by sseqid) to CSV.
m.to_csv('blast_result.csv')