## Albania isolates with reference genome set

* C. Loiseau et al., “An African origin for Mycobacterium bovis,” Evol. Med. Public Health, pp. 49–59, 2020.

In [1]:
import sys,os,shutil,subprocess
import glob
from importlib import reload
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
import pylab as plt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
sys.path.append('pathogenie')
from pathogenie import tools, aligners, app, trees

In [2]:
# Master metadata table; a later cell joins it to the workflow summary
# on its 'Run' column.
master_csv = '../mbovis_sra_master.csv'
master = pd.read_csv(master_csv)

In [None]:
# Re-import the app module so that edits to it are picked up without
# restarting the kernel.
reload(app)

# Folders that are handed to the workflow as its 'input' list.
input_dirs = ['/storage/btbgenie/mbovis_sra/',
              '/storage/btbgenie/albania/']

# Workflow settings; 'reference' is left as None.
args = dict(
    threads=8,
    outdir='../test_results',
    labelsep='-',
    input=input_dirs,
    reference=None,
    overwrite=False,
)

# Build the workflow, run its setup step (result kept in `st`), then run it.
W = app.WorkFlow(**args)
st = W.setup()
W.run()

## Create tree labels with metadata

In [None]:
# Load the per-sample summary written by the workflow run.
summ = pd.read_csv('../test_results/summary.csv')
print(summ)
reload(trees)
cols = ['Run','COUNTRY_ISOLATION','CLONAL_COMPLEX','LibraryLayout','Host','ReleaseDate','GENOME_COVERAGE','filename']

# Left-join the SRA metadata onto the summary (sample == Run) and keep a
# single row per BAM file.
df = summ.merge(master, left_on='sample', right_on='Run', how='left').drop_duplicates('bam_file')

# Fill in metadata by hand for the rows labelled 0 and 1 (presumably the
# Albanian isolates, which are not in the SRA master table - confirm).
# `.loc` is required here: `.at` only accepts a single scalar label and
# rejects a slice on current pandas.
df.loc[0:1, 'COUNTRY_ISOLATION'] = 'Albania'
df.loc[0:1, 'CLONAL_COMPLEX'] = 'Unknown2'

# Leaf label = country of isolation, keyed by sample name.
labelmap = dict(zip(df['sample'], df.COUNTRY_ISOLATION))

# Leaf colour = clonal complex. A complex missing from `colors` raises
# KeyError, so unexpected values are noticed rather than silently coloured.
colors = {'Eu1': '#D362DE', 'Eu2': 'Green','Af2':'#6DA0E4','Unknown2':'gray'}
df['color'] = df.CLONAL_COMPLEX.apply(lambda x: colors[x])
colormap = dict(zip(df['sample'], df.color))

t, ts = trees.create_tree('../test_results/RAxML_bipartitions.variants', 'ref', labelmap, colormap)

# Add one legend entry per clonal complex, drawn in that complex's colour.
from ete3 import CircleFace, TreeStyle, NodeStyle, RectFace, TextFace
for complex_name, complex_color in colors.items():
    ts.legend.add_face(
        TextFace(text=complex_name, fgcolor=complex_color, fsize=8),
        column=0)

# Render inside the notebook; the commented line writes a PNG instead.
t.render("%%inline", tree_style=ts, w=500)
#png=t.render('../tree.png',tree_style=ts,dpi=200,w=500)

## Snippy comparison

https://github.com/tseemann/snippy

In [7]:
#tab file for multi snippy
# One row per sample, one column per value of 'pair', each cell holding
# the read file name.
df = W.fastq_table
x = df.reset_index().pivot(columns='pair', index='sample', values='filename')

# snippy-multi expects plain "ID<TAB>R1<TAB>R2" lines, so the pandas header
# row is suppressed; otherwise it would be parsed as a sample entry.
x.to_csv('samples.tab', sep='\t', header=False)

# Command that generates the snippy batch script. It is only stored here,
# not executed by this cell.
snippycmd = '/local/snippy/bin/snippy-multi samples.tab --ref Mbovis_AF212297.fa --cpus 10 > runme.sh'

In [None]:
#read snippy snps
snippycore = pd.read_csv('../snippy_run/core.tab', sep='\t')
print(snippycore[:4])

# SNP matrix from the pathogenie run; its index is compared against the
# snippy POS column below.
smat = pd.read_csv('../test_results/core.txt', sep=' ', index_col=0)

# A bare `print` is a no-op expression in Python 3; call it so the blank
# separator line is actually emitted.
print()

import matplotlib_venn

# Venn diagram of the site sets reported by each pipeline.
sets = [set(snippycore.POS), set(smat.index)]
matplotlib_venn.venn2(sets, set_labels=['Snippy','Pathogenie'])
matplotlib_venn.venn2_circles(sets, linewidth=1)
plt.title('SNP site overlap', fontsize=20)
plt.savefig('snp_overlap_snippy.png', dpi=100)

In [None]:
#snippy tree
# Run RAxML on the snippy core alignment, then draw the resulting tree
# with the label and colour maps built for the pathogenie tree.
snippy_alignment = '../snippy_run/core.aln'
treefile = trees.run_RAXML(snippy_alignment, outpath='.')

tree_and_style = trees.create_tree(treefile, 'Reference', labelmap, colormap)
t, ts = tree_and_style
t.render("%%inline", tree_style=ts, w=500)

## Assemble samples

In [10]:
# Assemble every sample with SPAdes and collect the scaffolds in one folder.
#
# Compared with the earlier version of this cell:
#   * a fresh assembly is copied to the scaffolds folder straight away
#     (previously the cell had to be run a second time to do the copy);
#   * completion is judged by the scaffolds file, not the output folder, so
#     a folder left behind by a failed run no longer breaks the copy;
#   * SPAdes is invoked with an argument list rather than a shell string, so
#     paths containing spaces or shell metacharacters are passed verbatim;
#   * the loop variable no longer overwrites the notebook-level `df`.
assembly_root = '/storage/btbgenie/assembly'
scaffolds_root = '/storage/btbgenie/scaffolds'
spades = '/local/SPAdes-3.13.0-Linux/bin/spades.py'

for i, reads in W.fastq_table.groupby('sample'):
    out = os.path.join(assembly_root, i)
    scaffolds = os.path.join(out, 'scaffolds.fasta')
    if not os.path.exists(scaffolds):
        # The first two rows of the group are used as the read pair.
        f1 = reads.iloc[0].filename
        f2 = reads.iloc[1].filename
        cmd = [spades, '--pe1-1', str(f1), '--pe1-2', str(f2),
               '--careful', '-o', out]
        print(' '.join(cmd))
        # Raises CalledProcessError if SPAdes exits with a non-zero status.
        subprocess.check_output(cmd)
    shutil.copy(scaffolds, os.path.join(scaffolds_root, '%s.fa' % i))

## RD-Analyzer method - check for regions of difference presence with read alignments

https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3213-1

In [29]:
# Import the RD analysis module and reload it so that recent edits to it
# are in effect, then build the RD index. The recorded cell output suggests
# this step runs `bwa index RD.fa`.
from pathogenie import rd_analysis
rd_analysis = reload(rd_analysis)
rd_analysis.create_rd_index()

bwa index RD.fa


In [None]:
# Align the workflow's fastq table against the RD regions; results are
# written under ../rd_aligned and the returned table is kept in `res`.
rd_outdir = '../rd_aligned'
df = W.fastq_table
res = rd_analysis.align_regions(df, rd_outdir)

In [None]:
# Display only the rows whose region name is RD4.
is_rd4 = res['#rname'] == 'RD4'
res[is_rd4]

In [133]:
# Region-by-sample matrix of the 'ratio' values, reduced to 0/1:
# ratios at or below the cutoff become 0, ratios above it become 1.
# Each clip pushes one side of the cutoff onto the cutoff value itself,
# and the following replace swaps that value for the final flag.
cutoff = 0.09
X = (
    pd.pivot_table(res, values='ratio', index='name', columns=['#rname'])
    .T
    .clip(lower=cutoff).replace(cutoff, 0)
    .clip(upper=cutoff).replace(cutoff, 1)
)