## Albania isolates with reference genome set

* C. Loiseau et al., “An African origin for Mycobacterium bovis,” Evol. Med. Public Health, pp. 49–59, 2020.

In [1]:
import sys,os,shutil,subprocess
import glob
from importlib import reload
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
import pylab as plt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
sys.path.append('pathogenie')
from pathogenie import tools, aligners, app, trees

In [136]:
# Master metadata table (presumably SRA run metadata, going by the file name);
# it is merged onto the workflow summary through its 'Run' column in a later cell.
master = pd.read_csv('../mbovis_sra_master.csv')

In [None]:
# Pick up any edits made to the app module since it was first imported
reload(app)
# Workflow options: both read folders are processed together and results are
# written to ../test_results (no reference is supplied, overwrite disabled)
args = dict(
    threads=8,
    outdir='../test_results',
    labelsep='-',
    input=['/storage/btbgenie/mbovis_sra/',
           '/storage/btbgenie/albania/'],
    reference=None,
    overwrite=False,
)
W = app.WorkFlow(**args)
st = W.setup()
W.run()

## Create tree labels with meta data

In [None]:
from ete3 import CircleFace, TreeStyle, NodeStyle, RectFace, TextFace

# Load the per-sample summary written by the workflow and attach the master
# metadata to it (left merge, so samples without a metadata row are kept).
df = pd.read_csv('../test_results/summary.csv')
print (df)
reload(trees)
cols = ['Run','COUNTRY_ISOLATION','CLONAL_COMPLEX','LibraryLayout','Host','ReleaseDate','GENOME_COVERAGE','filename']
df = df.merge(master,left_on='sample',right_on='Run',how='left').drop_duplicates('bam_file')
#print (df.loc[0])

# Tag the rows labelled 0..2 (inclusive) as the Albanian isolates.
# .at only accepts a single scalar label; a label slice has to go through .loc.
# NOTE(review): assumes the Albanian samples are the first three rows - confirm.
df.loc[0:2,'COUNTRY_ISOLATION']='Albania'
df.loc[0:2,'CLONAL_COMPLEX']='Unknown2'

# Tip label (country) and tip colour (clonal complex), both keyed by bam file
labelmap = dict(zip(df.bam_file,df.COUNTRY_ISOLATION))
labelmap

colors = {'Eu1': '#D362DE', 'Eu2': 'Green','Af2':'#6DA0E4','Unknown2':'gray'}
# Strict lookup: a clonal complex missing from `colors` raises KeyError.
# (The former positional `1` was being taken as Series.apply's convert_dtype.)
df['color'] = df.CLONAL_COMPLEX.apply(lambda x: colors[x])
colormap = dict(zip(df.bam_file,df.color))
t,ts = trees.create_tree('../test_results/RAxML_bipartitions.variants', 'ref', labelmap, colormap)

# One legend entry per clonal complex, drawn in its own colour
for complex_name, color in colors.items():
    ts.legend.add_face(
            TextFace(text=complex_name,fgcolor=color,fsize=8),
            column=0)

t.render("%%inline",tree_style=ts,w=500)
#png=t.render('../tree.png',tree_style=ts,dpi=200,w=500)

## Assemble samples

In [10]:
# Assemble every sample with SPAdes, or collect the scaffolds of a sample whose
# assembly folder already exists. A freshly assembled sample only has its
# scaffolds copied when this cell is run again.
# The loop variables are deliberately not called `df`, so the notebook-level
# `df` is not overwritten by the last group.
for sample, reads in W.fastq_table.groupby('sample'):
    out=os.path.join('/storage/btbgenie/assembly',sample)
    if os.path.exists(out):
        shutil.copy(os.path.join(out,'scaffolds.fasta'), '/storage/btbgenie/scaffolds/%s.fa' %sample)
    else:
        # first two rows of the group are used as the read pair
        f1 = str(reads.iloc[0].filename); f2 = str(reads.iloc[1].filename)
        # argv list, no shell: file names are passed verbatim to SPAdes
        cmd = ['/local/SPAdes-3.13.0-Linux/bin/spades.py',
               '--pe1-1', f1, '--pe1-2', f2, '--careful', '-o', out]
        print (' '.join(cmd))
        subprocess.check_output(cmd)

## Run mtbdiff - not accurate enough

In [7]:
import mtbdiff

# Run mtbdiff over the scaffolds folder, writing its output to ../albania_mtbdiff
inpath = '/storage/btbgenie/scaffolds/'
names = mtbdiff.run_genomes(
    inpath,
    outpath='../albania_mtbdiff',
    options='--reloc_dist 20000',
)
# Load the results for the genomes that were just processed
struct, snp = mtbdiff.get_nucdiff_results('../albania_mtbdiff', names)
#struct = struct[(struct.Name=='deletion') | (struct.Name=='insertion')]
# Annotate each row with the value returned by mtbdiff.get_region, then build the RD matrix
struct['RD'] = struct.apply(mtbdiff.get_region, axis=1)
rdmat = mtbdiff.RD_matrix(struct)

## RD-Analyzer method - check for the presence of regions of difference using read alignments

https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3213-1

In [33]:
def create_rd_index(names=None):
    """Get RD region sequence from reference and make bwa index

    The regions listed in mtbdiff.RD are cut out of the NC_000962.3 record of
    ../MTB-H37Rv.fna, written to RD.fa and indexed with bwa.

    Args:
        names: optional collection of RD names (values of the RD_name column)
            to restrict the index to. None (default) uses every region.
    """

    from pyfaidx import Fasta

    df = mtbdiff.RD.set_index('RD_name')
    # Identity test: `names != None` is evaluated element-wise for array-like
    # arguments (numpy array, pandas Index) and cannot be used as a condition.
    if names is not None:
        df = df.loc[names]
    # Open the reference once instead of once per region
    rg = Fasta('../MTB-H37Rv.fna')
    chrom = rg['NC_000962.3']
    # NOTE(review): Start/Stop are used directly as Python-style slice bounds -
    # confirm this matches the coordinate convention of mtbdiff.RD.
    seqs = [SeqRecord(Seq(chrom[row.Start:row.Stop].seq), id=name)
            for name, row in df.iterrows()]
    SeqIO.write(seqs, 'RD.fa', 'fasta')
    aligners.build_bwa_index('RD.fa')

# Build the index for all regions (no name filter); writes RD.fa in the working directory
create_rd_index()

bwa index RD.fa


In [None]:
def align_regions(df):
    """Align reads to regions of difference

    For every sample the read pair is aligned to RD.fa (skipped when the bam
    file already exists), per-region coverage is read from `samtools coverage`
    and each region's mean depth is divided by the sample's estimated
    genome-wide depth.

    Args:
        df: table with 'sample' and 'filename' columns; the first two rows of
            each sample are used as the read pair (gzipped fastq, read via zcat).
    Returns:
        DataFrame concatenating the samtools coverage table of every sample,
        with added 'name' (sample) and 'ratio' (meandepth / average depth)
        columns.
    """

    import shlex
    from io import StringIO
    # Imported here: the only other import of Fasta in this notebook is local
    # to create_rd_index, so the name is not available at module level.
    from pyfaidx import Fasta

    ref = 'RD.fa'
    outdir = '../rd_aligned'
    os.makedirs(outdir, exist_ok=True)
    # Reference length is the same for every sample, so compute it once
    genome_len = len(Fasta('../MTB-H37Rv.fna'))
    res = []
    for i,g in df.groupby('sample'):
        out = os.path.join(outdir, i+'.bam')
        f1 = g.iloc[0].filename; f2 = g.iloc[1].filename
        if not os.path.exists(out):
            aligners.bwa_align(f1, f2, ref, out, threads=4, overwrite=False)

        # Character count of the sequence lines of the first read file, doubled
        # to account for the second file of the pair.
        # NOTE: wc -c also counts the newline after each read, so the total is
        # overestimated by one per read.
        cmd = 'zcat %s | paste - - - - | cut -f2 | wc -c' %shlex.quote(str(f1))
        tmp = subprocess.check_output(cmd,shell=True)
        avdepth = int(tmp)*2/genome_len
        print (avdepth)
        # argv list, no shell needed for a single command
        tmp = subprocess.check_output(['samtools', 'coverage', '--min-BQ', '1', out])
        s = pd.read_csv(StringIO(tmp.decode()),sep='\t')
        s['name'] = i
        #print (s)
        s['ratio'] = s.meandepth/avdepth
        res.append(s)
    res = pd.concat(res)
    return res

# Compute the RD depth ratios for every sample in the workflow's fastq table
df = W.fastq_table
res = align_regions(df)

In [None]:
# Show the coverage rows of region RD4 for all samples
res[res['#rname']=='RD4']

In [133]:
# Ratio matrix: one row per RD region, one column per sample
X = pd.pivot_table(res,index='name',columns=['#rname'],values='ratio').T
# Binarise the ratios: above 0.09 -> 1, otherwise 0; missing ratios stay NaN
X = (X > 0.09).astype(float).where(X.notna())