## wgMLST for mbovis

* http://www3.ridom.de/seqsphere/cgmlst/
* https://www.cgmlst.org/ncs/schema/741110/
* https://jcm.asm.org/content/52/7/2479
* https://www.biorxiv.org/content/10.1101/172858v1.full
* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3980634/

## steps

* get MLST gene list
* assembly
* annotate
* assign allele numbers?

In [2]:
import sys,os,shutil,subprocess
import glob
from importlib import reload
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 150)
import pylab as plt
import seaborn as sns
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from io import StringIO
from snipgenie import app
import pathogenie as pg
import toytree

In [3]:
meta = pd.read_csv('/storage/btbgenie/mbovis_ireland/all_ireland_samples.csv')
vcf_file='/storage/btbgenie/wicklow_results/filtered.vcf.gz'

In [4]:
def get_samples_vcf(vcf_file):
    cmd = 'bcftools query -l %s' %vcf_file
    tmp = subprocess.check_output(cmd, shell=True)
    return tmp.decode().split('\n')
    
samplenames = get_samples_vcf(vcf_file)

In [5]:
path = '/storage/btbgenie/mbovis_ireland/Wicklow/'
files = glob.glob(os.path.join(path,'**', '*.fastq.gz'),recursive=True)
samp = app.get_samples(files,sep='_')
pairs = pd.pivot_table(samp,index=['sample'],columns=['pair'],values='filename',aggfunc='first').reset_index()
print (pairs[:2])

pair    sample                                                                                             1  \
0         1034                  /storage/btbgenie/mbovis_ireland/Wicklow/dog/1034_S91_L001-4_R1_001.fastq.gz   
1     13-11594  /storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_15-03-19/13-11594_S85_L001-4_R1_001.fastq.gz   

pair                                                                                             2  
0                     /storage/btbgenie/mbovis_ireland/Wicklow/dog/1034_S91_L001-4_R2_001.fastq.gz  
1     /storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_15-03-19/13-11594_S85_L001-4_R2_001.fastq.gz  


## create reference proteins for MLST

In [13]:
prots = pg.tools.genbank_to_dataframe('Mbovis-AF212297.2.gb',cds=True)
prots = prots.fillna('')
prots = prots.dropna(subset=['locus_tag'])
ref_proteins = 'Mbovis_AF212297_proteins.fa'
#get prokka type header for using in annotation
prots['header'] = prots.apply(lambda x: '~~~'.join([x.locus_tag,x.gene,x['product'],'none']),1)
pg.tools.dataframe_to_fasta(prots,idkey='header',outfile=ref_proteins)
print (len(prots))

3994


In [6]:
def get_nucleotide_sequences(gb_file,out_file,idkey='locus_tag'):
    """protein nucleotide seqs from genbank"""
    
    recs = SeqIO.to_dict(SeqIO.parse(gb_file,'genbank'))
    chroms = list(recs.keys())
    result = []
    for chrom in chroms[:10]:       
        rec = recs[chrom]        
        for f in rec.features[1:]:
            q=f.qualifiers
            if f.type != 'CDS':
                continue
            seq = rec.seq[f.location.start:f.location.end]
            try:         
                new = SeqRecord(seq,id=q[idkey][0])
                result.append(new)
            except:
                #print (q)
                pass
    SeqIO.write(result,out_file,format='fasta')
    return result

res=get_nucleotide_sequences('Mbovis-AF212297.2.gb','Mbovis_AF212297_nuc.fa')

## assembly

In [None]:
def spades(file1, file2, path, outfile=None, threads=4):
    """Run spades"""
    
    cmd = 'spades -t %s --pe1-1 %s --pe1-2 %s --careful -o %s' %(threads,file1,file2,path)    
    if not os.path.exists(path): 
        print (cmd)
        subprocess.check_output(cmd, shell=True)
    if outfile != None:
        shutil.copy(os.path.join(path,'scaffolds.fasta'),outfile)
    return outfile

assembly_path = '/storage/btbgenie/assembly/'

for i,r in pairs[5:10].iterrows():    
    name=r['sample']
    print (name)
    out = os.path.join(assembly_path,name+'.fa')
    print (out)
    spades(r[1], r[2], os.path.join(assembly_path,name), outfile=out, threads=12)

## get consensus sequence from alignment instead of assembly

In [8]:
def get_consensus(vcf_file, sample, out_file='consensus.fa'):
    """Get consensus sequence from vcf"""
      
    cmd='bcftools index -f %s' %vcf_file
    subprocess.check_output(cmd, shell=True)   
    cmd='cat {r} | bcftools consensus -s {s} {v} > {o}'.format(r=app.mbovis_genome,v=vcf_file,s=sample,o=out_file)
    #print (cmd)
    subprocess.check_output(cmd, shell=True)
    return

get_consensus(vcf_file, '1034','consensus.fa')


## annotate


In [None]:
seqfile='/storage/btbgenie/assembly/1034.fa'
#seqfile='consensus.fa'
featdf,recs = pg.run_annotation(seqfile,
                                threads=10, kingdom='bacteria', trusted=ref_proteins)
outfile='/storage/btbgenie/annotation_assembly/1034.fa'

#write out nucl sequences
SeqIO.write(recs,'temp.gb','genbank')
res = get_nucleotide_sequences('temp.gb',outfile,idkey='protein_id')

In [853]:
rec=res[3]
print (rec.id)
rec.seq.translate()

Mb3672c


Seq('MPQGTVKWFNAEKGFGFIAPEDGSADVFVHYTEIQGTGFRTLEENQKVEFEIGH...SL*')

# initialise MLST table from reference genome

## filter only Mb proteins in scheme

In [825]:
mlst=pd.read_csv('mlst_scheme.csv')
mlst=mlst[mlst['Ridom Usage']=='MLST+']
targets=list(mlst.Target)

In [826]:
mbmap=pd.read_csv('final_mbovis_mapping.csv')
mbmap = mbmap[mbmap.Rv_tag.isin(targets)]
mb_tags = list(mbmap.Mb_tag)
prots = prots[prots.locus_tag.isin(mb_tags)]
nucseqs = pg.tools.fasta_to_dataframe('Mbovis_AF212297_nuc.fa')
nucseqs = nucseqs[nucseqs.name.isin(mb_tags)]

In [827]:
cols = ['name','allele','sequence']
ref = nucseqs.copy()
ref['allele'] = 1
ref=ref.rename(columns={'translation':'sequence'})
ref[cols].to_csv('mlst_db.csv.gz',index=False,compression='gzip')
print (len(ref))

3184


In [635]:
fastafile='/storage/btbgenie/annotation_assembly/1034.fa'
df = pg.tools.fasta_to_dataframe(fastafile)
df[df.name=='Mb0014c']

Unnamed: 0,name,sequence,description,type
3240,Mb0014c,CTACTGGCCGAACCTCAGCGTGATGATGCCGTCCCGGTTGACGCCGGTCCCCGCCGGCGGGTTTTGATAGACGACCCGGTTGTGTTGGGAGCCACCGGCGTCGACGTCGGCCCCTTTGTCGAGCATCCCGGTCCAGCCCAGCGCGC...,Mb0014c <unknown description>,CDS


In [10]:
def find_alleles(fastafile):
    """Find allele by simple matches to the reference table of known sequences.
    Returns:
        dataframe with allele number for each gene
        dataframe with new alleles to add to db
    """
    
    db = pd.read_csv('mlst_db.csv.gz')
    names = db.name.unique()
    df = pg.tools.fasta_to_dataframe(fastafile).reset_index()
    #print (df.iloc[0])
    result=[]
    new=[]
    for name in names:
        #print (name)
        s = db[db.name==name]
        gene = df[df.name==name]
        #print (gene)
        if len(gene)==0:
            #print (name)
            #missing gene in target
            result.append((name,0))
            continue
        target = gene.iloc[0].sequence
        found = s[s.sequence==target]        
        #print (target,found)
        if len(found)>0:
            found = found.iloc[0]
            result.append((name,found.allele))
        else:
            #assign new allele            
            newallele = s.allele.max()+1
            result.append((name,newallele))
            new.append([name,newallele,target])
    prof = pd.DataFrame(result,columns=['name','allele'])
    prof['allele'] = prof.allele.astype(int)
    #new additions
    new = pd.DataFrame(new,columns=['name','allele','sequence'])
    return prof, new
    
def update_mlst_db(new):
    """Update the database of MLST profiles"""
    
    db = pd.read_csv('mlst_db.csv.gz')
    db = pd.concat([db,new])
    db.to_csv('mlst_db.csv.gz', index=False, compression='gzip')
    print ('added %s new alleles' %len(new))
    return

fastafile='26-Mbovis.fa'
prof,new = find_alleles(fastafile)
#update_mlst_db(new)
#print (prof[:100])

In [None]:
def type_sample(fastafile, outfile, threads=4, overwrite=False):
    """Type a single sample using wgMLST.
    Args:
        fastafile: fasta file to type from assembly or other
        
        path: output folder for annotations
    Returns:
        dataframe of MLST profile
    """
    
    if overwrite == True or not os.path.exists(outfile):
        #annotate
        featdf,recs = pg.run_annotation(fastafile, threads=threads,
                                        kingdom='bacteria', trusted=ref_proteins)
        #get nucl sequences from annotation    
        SeqIO.write(recs,'temp.gb','genbank')
        get_nucleotide_sequences('temp.gb',outfile,idkey='protein_id')
    
    #find alleles
    res,new = find_alleles(outfile)
    #print (res)
    #update db
    update_mlst_db(new)
    return res

get_consensus(vcf_file, '26-MBovis')
fastafile = 'consensus.fa'
res = type_sample(fastafile, '26-Mbovis.fa', threads=10)

In [15]:
def get_profile_string(df):
    return ''.join(df.allele.astype(str))

## profiles from assembly - not needed?

In [None]:
profs = {}
omit=['182-MBovis','19-4281','19-2438']
samples=glob.glob(os.path.join(assembly_path,'*.fa'))
pdf=pd.DataFrame()
for s in samples:
    print (s)
    sample = os.path.splitext(os.path.basename(s))[0]    
    if sample in omit:
        continue
    profile = type_sample(s, '/storage/btbgenie/annotation_assembly/%s.fa' %sample, threads=12)
    pdf[sample] = profile['allele']
    profs[sample] = get_profile_string(profile)

## profiles from consensus seqs

In [None]:
omit=['182-MBovis','19-4281','19-2438']

def run_samples(vcf_file, outdir, omit=[], **kwargs):
    """Run samples in a vcf file.
    Args:
        vcf_file: multi sample variant file from previous calling
        outdir: folder for writing intermediate files
    Returns:
        dict of mst profiles
    """

    profs = {}
    samplenames = get_samples_vcf(vcf_file)
    for s in samplenames[:15]:
        print (s)
        if s in omit:
            continue
        get_consensus(vcf_file, s)
        outfile = os.path.join(outdir, '%s.fa' %s)
        profile = type_sample('consensus.fa', outfile, **kwargs)
        profs[s] = get_profile_string(profile)
    return profs

profs = run_samples(vcf_file, '/storage/btbgenie/annotation_consensus', threads=12, omit=omit)

In [None]:
for i in profs:
    print (i,len(profs[i]))

## find distances between mlst profiles

In [21]:
def diff_profiles(s1, s2):
    #print (list(zip(list(s1), list(s2))))
    return sum(1 for a, b in zip(list(s1), list(s2)) if a != b)

diff_profiles('AAABBBB','AAABBCC')

2

In [19]:
def dist_matrix(profiles):
    """Distance matrix of a set of profiles"""
    
    dist=[]
    for s in profiles:
        x=profiles[s]
        row=[]
        for s in profiles:
            d = diff_profiles(x,profiles[s])
            row.append(d)
        dist.append(row)
    D = pd.DataFrame(dist,columns=profiles.keys(),index=profiles.keys())
    return D

In [23]:
def tree_from_distmatrix(D, treefile):
    
    from skbio import DistanceMatrix
    from skbio.tree import nj
    ids = list(D.index)
    dm = DistanceMatrix(D.values, ids)
    tree = nj(dm)
    #print(tree.ascii_art())
    tree.write(treefile, 'newick')
    return tree

In [None]:
import random
test = {'A':'1112233','B':'1112213','C':'2111213','D':'2111232','E':'212211','F':'2111222'}

d = dist_matrix(test)
tree_from_distmatrix(d)
tre = toytree.tree(treefile)
canvas,t,r=tre.draw(layout='r',scalebar=True,width=500)
sns.clustermap(d,figsize=(5,5))

In [44]:
D = dist_matrix(profs)
D.to_csv('dist_mlst.csv',index=False)
#sns.clustermap(D,xticklabels=True,yticklabels=True,figsize=(6,6))
treefile='temp.newick'
tree_from_distmatrix(D, treefile)
tre = toytree.tree(treefile)
tre=tre.root('43-MBovis')
canvas,t,r=tre.draw(layout='r',scalebar=True,height=400,width=500)
import toyplot.png
#toyplot.png.render(canvas, "mlst-tree.png")

In [43]:
tre2 = toytree.tree('/storage/btbgenie/wicklow_results/RAxML_bestTree.variants')
#tre2 = toytree.tree('snps.newick')
drop = list(set(tre2.get_tip_labels()) - set(tre.get_tip_labels()))
tre2=tre2.drop_tips(drop)
tre2=tre2.root('43-MBovis')
canvas,t,r=tre2.draw(layout='r',scalebar=True,height=400,width=500)
#toyplot.png.render(canvas, "snp-tree.png")

In [39]:
import dendropy
from dendropy.calculate import treecompare
tns = dendropy.TaxonNamespace()
tree1 = dendropy.Tree.get(path=treefile, schema='newick',taxon_namespace=tns)
tree2 = dendropy.Tree.get(path='/storage/btbgenie/wicklow_results/RAxML_bestTree.variants', schema='newick',taxon_namespace=tns)
core = pd.read_csv('/storage/btbgenie/wicklow_results/core.txt',sep=' ')
p = len(core)
tree=convert_branch_lengths(tree2, p)
tree.write(path="snps.newick",schema="newick")

In [40]:
print(treecompare.euclidean_distance(tree1, tree2))

16.37713653314643


## convert branch lengths to snps

In [38]:
def convert_branch_lengths(tree, n=1): 
    tree=dendropy.Tree(tree)
    for edge in tree.postorder_edge_iter():
        if edge.length is None:
            edge.length = 0
        else:
            edge.length = float(edge.length)*n
    newickstr=tree.as_string(schema='newick')
    return tree


## sensitivity analysis - SNPs vs MLST

* artificial data?

In [90]:
#simulate snps in genome

ref = SeqIO.read('Mbovis-AF212297.2.gb',format='gb')

class BTree(object):
    def __init__(self, name, seq):
        self.left = None
        self.right = None
        self.seq = seq  
        self.name= name

def mutate(seq, n=100):
    #create mutations
    from Bio.Seq import MutableSeq
    new = MutableSeq(seq)
    pos = random.sample(range(len(seq)),k=n)
    for i in pos:        
        m = random.sample(['A','G','C','T'],1)[0]        
        new[i] = m        
        #print (i,seq[i],new[i])
    return new, pos

def sim_pop(ref):
    #simulate related genomes - crude
    
    pop = BTree('ref', ref.seq)
    
    return

#mutate(str(ref.seq))
sim_pop(ref)
