# Phylogeny of global M. bovis lineages

Build a phylogeny from global sequences (Ireland/UK and worldwide) to support strain naming.

* https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.000491?crawler=true
* https://open-research-europe.ec.europa.eu/articles/1-100/v2#f1

In [1]:
import sys,os,shutil,subprocess,time
import glob,random
from importlib import reload
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
import pylab as plt
import matplotlib as mpl
import seaborn as sns
from Bio import SeqIO
sys.path.append('pathogenie')
from snipgenie import tools, aligners, app, trees, plotting, clustering
import toytree

In [None]:
# Re-import the snipgenie modules so that code edited on disk is picked up by the kernel.
reload(app)
reload(tools)
reload(aligners)
# Options passed straight to app.WorkFlow. Their meaning is defined by snipgenie;
# presumably 'input' is the folder of reads and 'outdir' the results folder — confirm.
args = {'threads':12, 'outdir': '/storage/btbgenie/global_results', 'labelsep':'-',
        'input':['/storage/btbgenie/brites'],
         'species': 'Mbovis-AF212297', 'overwrite':False,
         'custom_filters': True, 'get_stats': False,
         'buildtree':True}
W = app.WorkFlow(**args)
# The return value of setup() is kept in st; run() is then called on the workflow.
st = W.setup()
W.run()

In [None]:
samples = pd.read_csv('/storage/btbgenie/global_results/samples.csv')
#app.clean_bam_files(samples,'/storage/btbgenie/global_results/mapped/',True)

In [None]:
treefile1 = '/storage/btbgenie/global_results/tree.newick'
tre = toytree.tree(treefile1)
mystyle = { "layout": 'c','node_sizes':1,'tip_labels_align':True}
tre.draw(**mystyle,height=700);

## spoligotypes

In [None]:
def get_spoligotypes(samples, spo=None):
    """Get spoligotypes for multiple M.bovis strains.

    Args:
        samples: dataframe with a 'sample' column and a 'filename1' column;
            duplicate 'sample' rows are ignored after the first
        spo: optional dataframe of earlier results with a 'sample' column;
            samples listed there are skipped
    Returns:
        dataframe with columns 'sample', 'SB' and 'code' holding one row per
        newly typed sample (earlier results in spo are not included)
    """

    # A set gives constant-time lookups while looping over many samples.
    done = set(spo['sample']) if spo is not None else set()
    samples = samples.drop_duplicates('sample')
    res = []
    for name, filename in zip(samples['sample'], samples['filename1']):
        if name in done:
            continue
        b = tools.get_spoligotype(filename)
        sb = tools.get_sb_number(b)
        print (name, sb, b)
        res.append([name, sb, b])

    return pd.DataFrame(res, columns=['sample','SB','code'])

# Previously computed spoligotypes; samples already listed here are skipped by get_spoligotypes.
spo = pd.read_csv('/storage/btbgenie/mbovis_ireland/spoligotypes.csv')
samples = pd.read_csv('/storage/btbgenie/global_results/samples.csv')
# res holds only the newly typed samples, not the rows already in spo.
res = get_spoligotypes(samples,spo)

In [95]:
#res=pd.concat([spo,res])
#res.to_csv('/storage/btbgenie/mbovis_ireland/spoligotypes.csv')

In [None]:
# RD analysis of the global samples with snipgenie.rdiff; results are written to rd.csv.
samples = pd.read_csv('/storage/btbgenie/global_results/samples.csv')
from snipgenie import rdiff
reload(rdiff)
rdiff.create_rd_index()
res = rdiff.run_samples(samples,'/storage/btbgenie/global_results/rd_aligned',threads=10)
# Meaning of cutoff is defined by rdiff.get_matrix — TODO confirm.
X = rdiff.get_matrix(res, cutoff=0.15)
# apply_rules is applied row-wise (axis=1); its result becomes the last column, 'ident'.
X['ident'] = X.apply(rdiff.apply_rules,1)
X.to_csv('/storage/btbgenie/global_results/rd.csv')

## merge to get metadata 

In [110]:
samples = pd.read_csv('/storage/btbgenie/global_results/samples.csv')
# NOTE: relative path, so this depends on the notebook's working directory.
meta = pd.read_csv('brites_table1.csv')
# Left join: every sample is kept, even when it has no row in the metadata table.
x = samples.merge(meta,right_on='Accession_Number',left_on='sample',how='left')
spotypes = pd.read_csv('/storage/btbgenie/mbovis_ireland/spoligotypes.csv')
R = pd.read_csv('/storage/btbgenie/global_results/rd.csv',index_col=0)
# No `how` is given, so this is an inner join: samples without a spoligotype row are dropped.
meta = x.merge(spotypes,on='sample')
# Keep one row per sample.
meta = meta.drop_duplicates('sample')
#rds=R[['RD1mic','RD149','RD3','RD5','RD5oryx*','RD122','RDAf2','RDWicklow']]
#meta = x.merge(rds,left_on='sample',right_index=True)
len(meta)

407

In [None]:
reload(clustering)
snpdist = pd.read_csv('/storage/btbgenie/global_results/snpdist.csv',index_col=0)
clusts,members = clustering.get_cluster_levels(snpdist)
meta = meta.merge(clusts,left_on='sample',right_index=True)

In [115]:
clusts.snp500.value_counts()
#clustering.nonredundant_samples(clusts, 'snp50')

snp500
2    398
1      9
Name: count, dtype: int64

In [108]:
meta.to_csv('/storage/btbgenie/global_results/metadata.csv',index=False)

## RD analysis

In [None]:
reload(rdiff)
rdiff.show_rd_coverage(samples[:5], 'LT708304.1',4330000,4340000,app.mbovis_genome,margin=1300)

In [None]:
complexcolors = {'La2':'blue', 'La1.8.1':'pink', 'La1.3':'', 'La1.6':'', 'La1.7.1':'', 'La1.5':'', 'La3':'',
                'La1.8.2':'', 'La1.7-unk4':'', 'La1.2':'', 'La1.4':'', 'La1.7-unk5':'', 'La1.1':''}

In [None]:
# Load the RD matrix without its last column ('ident' is appended last when rd.csv is written).
R = pd.read_csv('/storage/btbgenie/global_results/rd.csv',index_col=0).iloc[:,:-1]
# Keep only the RD columns whose column total lies strictly between 10 and 61.
# The two bounds have to be combined with `&`: with `|` every column passes,
# because any total is either below 61 or above 10.
totals = R.sum()
xcols = R.columns[(totals<61) & (totals>10)]
R[xcols]

In [None]:
import matplotlib.patches as mpatches

# RD matrix re-indexed by sample, with the clonal complex of each sample attached.
R = pd.read_csv('/storage/btbgenie/global_results/rd.csv', index_col=0)
#mapdict=dict(zip(meta['sample'],meta['SIB_Number']))
#X.index=X.index.map(mapdict)
scols = ['sample', 'Clonal_Complex']
R = R.merge(meta[scols], left_index=True, right_on='sample', how='left')
R = R.set_index('sample')

# One colour per clonal complex, used for the row annotation and the legend.
row_colors, colormap = plotting.get_color_mapping(R, 'Clonal_Complex', cmap='Paired', seed=18)

cols = ['RD1','RD1mic','RDWicklow','RD8','RD11','RD149','RD6','RD3','RDAf2']

g = sns.clustermap(
    R[cols],
    lw=0,
    cmap='gray_r',
    fmt='g',
    cbar_pos=None,
    col_cluster=False,
    yticklabels=False,
    row_colors=row_colors,
    figsize=(12,12),
)
# Legend with one patch per entry of the colour mapping.
pts = [mpatches.Patch(color=colormap[c], label=c) for c in colormap]
g.fig.legend(handles=pts, bbox_to_anchor=(1.15, .9))

## combine ireland + uk + global samples

Add the Ireland samples to the global tree.

In [None]:
outdir='/storage/btbgenie/combined_results'
# Create the output folder up front; the to_csv call below fails if it does not exist.
os.makedirs(outdir, exist_ok=True)

folders = {'ireland':'/storage/btbgenie/all_ireland_results/',
          'global':'/storage/btbgenie/global_results/'}

# Read the sample table of each run and stack them into a single table.
x=[]
for f, folder in folders.items():
    df = pd.read_csv(os.path.join(folder,'samples.csv'))
    x.append(df)
    print (f, len(df))
allsamples = pd.concat(x)
print (len(allsamples))
# Save the combined table, then pass the sample names to snipgenie.
allsamples.to_csv(os.path.join(outdir,'samples.csv'),index=False)
app.write_samples(allsamples[['sample']], outdir)

## run variant calling on combined files

In [None]:
reload(app)
# outdir is assigned before it is used to build gff_file, so this cell does not
# depend on a value left over from an earlier cell.
outdir='/storage/btbgenie/combined_results'
bam_files = allsamples.bam_file
gff_file=os.path.join(outdir,'mbovis.gff')
# Write the annotation to gff_file, which is then passed to the variant calling step.
tools.gff_bcftools_format(app.mbovis_gb, gff_file)
app.run_bamfiles(bam_files, app.mbovis_genome, outdir=outdir, threads=18,
                 mask=app.mbovis_mask, gff_file=gff_file,
                 custom_filters=True, overwrite=False)

In [4]:
reload(app)
#app.csq_call(app.mbovis_genome, gff_file,'/storage/btbgenie/combined_results/snps.vcf.gz', 
#             '/storage/btbgenie/combined_results/csq.tsv')
#smat = pd.read_csv('/storage/btbgenie/combined_results/core.txt',index_col=0,sep=' ')
#trees.convert_branch_lengths('/storage/btbgenie/combined_results/RAxML_bipartitions.variants',
#                             '/storage/btbgenie/combined_results/tree.newick', len(smat))


Unnamed: 0_level_0,ref,1034,13-11594,14-MBovis,15-11643,17-11662,17-MBovis,19-11957,19-MBovis,22-12200,...,SRR13888775,SRR1791891,SRR1792067,SRR1792410,SRR7236143,SRR7236174,SRR7236252,SRR7236262,SRR7236279,SRR7236421
pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
224,A,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
342,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G
387,C,C,C,C,C,C,C,C,C,C,...,C,C,C,C,C,C,C,C,C,C
467,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G
539,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4349078,C,C,C,C,C,C,C,C,C,C,...,C,C,C,C,C,C,C,C,C,C
4349107,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G
4349136,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G
4349603,A,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A


<div class="alert alert-block alert-info"><b>Start here for post analysis</b></div>

In [None]:
# NOTE(review): scols is assigned here but not used in this cell.
scols=['sample','SB','batch','county']
# The last 7 columns of the Ireland metadata are dropped — presumably so that its
# columns line up with `meta`; confirm which columns these are.
ireland = pd.read_csv('/storage/btbgenie/all_ireland_results/metadata.csv').iloc[:,:-7]
#ireland['Country']='Ireland'
# Stack the Irish and global rows; a column present in only one table is filled with NaN in the other.
master=pd.concat([ireland,meta])
print (len(ireland), len(meta), len(master))

In [None]:
def get_common(x, common):
    """Return x unchanged if it is a member of common, otherwise 'Other'."""

    return x if x in common else 'Other'

In [None]:
# The nine most frequent countries (value_counts sorts by descending count).
common = master.Country.value_counts()[:9]
# `x in common` inside get_common tests the Series index, i.e. the country names
# and not the counts; every other country is relabelled 'Other'.
master['Country1'] = master.Country.apply(lambda x: get_common(x, common))
#len(master)

In [None]:
#common = master.county.value_counts()[:4]
#master['county1'] = master.county.apply(lambda x: get_common(x, common))
#master['cc'] = master['Clonal_Complex'].fillna(master.Country)
#master['cc1'] = master['Clonal_Complex'].fillna(master.county1)

In [None]:
# The eight most frequent SB values; all other values are relabelled 'Other' by get_common.
common = master.SB.value_counts()[:8]
master['SB1'] = master.SB.apply(lambda x: get_common(x, common))

In [None]:
reload(clustering)
# Distance table for the combined run, read with its first column as the index.
snpdist = pd.read_csv('/storage/btbgenie/combined_results/snpdist.csv',index_col=0)
# Returns two tables; clusts has a 'snp500' column (used below). Presumably one
# column per SNP threshold — confirm in snipgenie.clustering.
clusts,members = clustering.get_cluster_levels(snpdist)
# Number of samples in each snp500 cluster, largest first.
cc = clusts.snp500.value_counts()#[1:20]
cc.plot(kind='bar',figsize=(12,5))
print (list(cc.index))
members.to_parquet("/storage/btbgenie/cluster_members.parquet")

In [None]:
st = clustering.generate_strain_names(clusts)

In [None]:
final = master.merge(st,left_on='sample',right_index=True)

In [None]:
#combine rd data
rd1=pd.read_csv('/storage/btbgenie/global_results/rd.csv',index_col=0)
rd2=pd.read_csv('/storage/btbgenie/mbovis_ireland/rd.csv',index_col=0)
# NOTE(review): if a sample occurs in both tables its index label is duplicated here
# and the merge below emits one row per match — confirm the two sets are disjoint.
rd=pd.concat([rd1,rd2])
rvar = ['RD149','RD3','RD5','RD5oryx*','RDAf2','RDWicklow']
# No `how` is given, so this is an inner join: samples without an RD row are dropped from final.
final = final.merge(rd[rvar],left_on='sample',right_index=True)

In [None]:
final.to_csv('/storage/btbgenie/combined_results/metadata.csv')

In [None]:
cols=['sample','SB']
X=final.set_index(cols)[rvar].fillna(0)
#X = X.iloc[:,1:]
#print (X)
sns.clustermap(X,xticklabels=True,yticklabels=[],cmap='gray',figsize=(12,12))

In [None]:
meta.Clonal_Complex.value_counts()

## representative samples

In [None]:
clustering.nonredundant_samples(clusts, 'snp500')

## missing sites

In [None]:
snprecs, smat = tools.core_alignment_from_vcf('/storage/btbgenie/global_results/snps.vcf.gz', missing=True)

In [None]:
missing = smat[smat=='N'].T.count().sort_values()
x=pd.DataFrame(missing).reset_index()
x

## SNPs and indels statistics

In [None]:
final = pd.read_csv('/storage/btbgenie/combined_results/metadata.csv',index_col=0)

In [None]:
csq = app.read_csq_file('/storage/btbgenie/combined_results/csq.tsv')
aamat = app.get_aa_snp_matrix(csq)
nucmat = pd.read_csv('/storage/btbgenie/combined_results/core.txt',sep=' ')
nucmat = nucmat.set_index('pos')

In [None]:
csqind = app.read_csq_file('/storage/btbgenie/combined_results/csq_indels.tsv')
#print (csqind)
#print (csqind.snp_type.value_counts())

ind_mat = app.get_aa_snp_matrix(csqind)
ind_mat.reset_index().snp_type.value_counts()
#ind_mat
#ind_mat[ind_mat.sum(1)>5]

In [None]:
#mat.index = mat.index.get_level_values(1)+'_'+mat.index.get_level_values(2)
stat = aamat.sum(1).reset_index()
stat.gene.value_counts()[:50]

In [None]:
C=aamat.T
cols=['sample','Country','snp500']
C=C.merge(final[cols],on='sample')
C=C.set_index(cols)
C

In [None]:
countries = C.index.get_level_values(1)
clust = C.index.get_level_values(2)

In [None]:
reload(trees)
# Colour lookups per country and per snp500 cluster. zip stops at the shorter
# input, so values beyond the length of trees.qcolors fall back to 'Black'.
colormap1 = dict(zip(countries.unique(), trees.qcolors))
colors1 = [colormap1.get(i, 'Black') for i in countries]
colormap2 = dict(zip(clust.unique(), trees.qcolors))
colors2 = [colormap2.get(i, 'Black') for i in clust]
#print (colors2)
# Only the first 2000 columns of C are plotted.
g=sns.clustermap(C.iloc[:,:2000],xticklabels=False,yticklabels=False,figsize=(17,12),cmap='gray_r',cbar_pos=None,row_colors=colors2)
# The rows are coloured by cluster (colors2), so the legend must be built from
# colormap2; colormap1 would label the same colours with country names.
p=plotting.make_legend(g.fig,colormap2)
#g.fig.savefig('/storage/btbgenie/global_results/indel_clusters.jpg')

In [None]:
# Small hand-made 0/1 matrix (rows A-D, columns 0-9) for trying out get_unique_snps.
# NOTE: get_unique_snps is defined in the following cell, which has to be run first.
# NOTE(review): x is random and is not used below.
x=[[random.choice([0,1]) for i in range(10)] for i in range(4)]
data=np.array([[1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
               [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
               [1, 1, 0, 0, 0, 1, 1, 1, 1, 1],
               [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])
df=pd.DataFrame(data, index=['A','B','C','D'])
print (df)
# Selects the rows in which none of columns 3-9 is set (here B and D) and shows columns 0-2.
get_unique_snps([0,1,2], df)


In [None]:
def get_unique_snps(names, df, present=True):
    """Get snps unique to one or more samples from a SNP matrix.

    Args:
        names: a sample name or a list of sample names; these must be
            column labels of df
        df: snp matrix from app.get_aa_snp_matrix(csq), with one row per snp
            and one column per sample; assumed to hold 0/1 values
        present: if True, select snps that are absent from every other
            sample; if False, select snps that are present in every other
            sample and absent from all of the named samples
    Returns:
        dataframe with the selected rows, restricted to the named samples
    """

    # isinstance also accepts str subclasses such as numpy.str_, which
    # an exact type comparison would treat as a list of names.
    if isinstance(names, str):
        names = [names]
    insamp = df[names]
    other = df.loc[:, ~df.columns.isin(names)]
    if present:
        # snps that none of the other samples carry
        u = other[other.sum(1) == 0]
        u = insamp.loc[u.index]
    else:
        # snps that all other samples carry ...
        u = other[other.sum(1) == len(other.columns)]
        #sns.clustermap(df.loc[u.index])
        u = insamp.loc[u.index]
        # ... and none of the named samples carries
        u = u[u.sum(1) == 0]
    return u

In [None]:
#names=['41-MBovis','45-MBovis','48-MBovis','35-MBovis']
x=pd.read_csv('/storage/btbgenie/wicklow_results/samples.csv')
names=list(x['sample'])
print (len(names))
get_unique_snps(names, aamat)

In [None]:
c = nucmat.T
c = c.loc[names]
c = c[[i for i in c if c[i].nunique()>1]]
c

In [None]:
pos = list(c.columns)
aamat.loc[pos,:,:,:]

## PCA snps

In [None]:
import sklearn
from sklearn import decomposition
from sklearn import manifold

pca = decomposition.PCA(n_components=3)
# NOTE(review): MDS is created without random_state, so the embedding may differ
# between runs — confirm against the scikit-learn documentation.
mds = manifold.MDS(n_components=3)
# MDS embeds the rows of C in 3 dimensions; PCA is then fitted on that embedding, not on C itself.
# NOTE: this rebinds pos and X, which earlier cells use for other data.
pos = mds.fit(C).embedding_
X = pca.fit_transform(pos)
#X = pca.transform(C)

In [None]:
fig,ax = plt.subplots(figsize=(7, 7))
ax.scatter(X[:, 0], X[:, 1], c=colors1, s=50, alpha=.4)

In [None]:
%matplotlib ipympl
plt.figure()
ax = plt.axes(projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=colors2, s=30, alpha=.3)
