# compare 2 methods of calling - concat vs individual

In [6]:
import sys,os,shutil,subprocess
import random
import glob, time
from importlib import reload
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
import pylab as plt
import seaborn as sns
from Bio import SeqIO, AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from gzip import open as gzopen
sys.path.append('snipgenie')
from snipgenie import tools, aligners, app, tools

In [None]:
reload(app)
ref = app.sarscov2_genome
args = {'threads':12, 'outdir': 'simdata/sim_results', 'labelsep':'.',
        'input':'simdata/sim_fastq/',        
        'reference': ref,
        'overwrite':False,
        'filters':'',
        'custom_filters': False, 'get_stats':False}
W = app.WorkFlow(**args)
st = W.setup()
W.run()

In [23]:
samples = pd.read_csv('simdata/sim_results/samples.csv')

In [116]:
def compare_results(c1, c2, sample=None):
    """
    Compare two runs of snipgenie.
    """

    x = c1[~c1.pos.isin(c2.pos)]
    y = c2[~c2.pos.isin(c1.pos)]
    print ('%s/%s sites not in second:' %(len(x),len(c1)))
    print (x)
    print ('-------------------------')
    print ('%s/%s sites not in first:' %(len(y),len(c2)))
    print (y)
    return x,y

In [106]:
import multiprocessing as mp

def worker(args):
    mpileup(args[0], args[1])
        
def mpileup_multiprocess(bam_files, ref, outpath, threads=4, callback=None):
    """Run mpileup in parallel over multiple files and make separate bcfs.
    Assumes alignment to a bacterial reference with a single chromosome."""

    bcftoolscmd = tools.get_cmd('bcftools')
    #size = len(bam_files) 
    #pool = mp.Pool(threads)
    outfiles = []
    st = time.time()
    bcfpath = os.path.join(outpath,'bcf')
    if not os.path.exists(bcfpath):
        os.mkdir(bcfpath)
    for bam_file in bam_files:
        name = os.path.splitext(os.path.basename(bam_file))[0]
        out = '{o}/{f}.bcf'.format(o=bcfpath,f=name) 
        outfiles.append(out)
   
    data = list(zip(bam_files,outfiles))
    #print (data)
   
    p = mp.Pool(threads)
    p.map_async(worker, data)
    p.close()
    p.join()
    
    t=time.time()-st
    print ('took %s seconds' %str(round(t,3)))
    rawbcf = os.path.join(outpath,'raw.bcf')
    bcf_files = ' '.join(outfiles)
    cmd = '{bc} merge --threads {t} -o {r} {b}'.format(b=bcf_files,r=rawbcf, bc=bcftoolscmd,t=threads)
    print (cmd)
    subprocess.check_output(cmd, shell=True)
    return rawbcf

def mpileup_files(bam_files, ref, outpath, threads=4):
    """mpileup bam files"""
    
    bcftoolscmd = tools.get_cmd('bcftools')
    outfiles = []
    st = time.time()
    bcfpath = os.path.join(outpath,'bcf')
    if not os.path.exists(bcfpath):
        os.mkdir(bcfpath)
    for bam_file in bam_files:
        name = os.path.splitext(os.path.basename(bam_file))[0]
        out = '{o}/{f}.bcf'.format(o=bcfpath,f=name) 
        outfiles.append(out)        
   
    data = list(zip(bam_files,outfiles)) 
    #print (data)
    for f,out in data:
        mpileup(f, out, ref, threads=threads)
        
    t=time.time()-st
    print ('took %s seconds' %str(round(t,3)))
    rawbcf = os.path.join(outpath,'raw.bcf')
    bcf_files = ' '.join(outfiles)
    #print (bcf_files)
    cmd = '{bc} merge --threads {t} -0 -o {o} {b}'.format(b=bcf_files,o=rawbcf, bc=bcftoolscmd,t=threads)
    print (cmd)
    #subprocess.check_output(cmd, shell=True)
    #cmd = '{bc} norm -f {r} -O b -o {o} tmp.bcf'.format(o=rawbcf, bc=bcftoolscmd, r=ref) 
    subprocess.check_output(cmd, shell=True)    
    return rawbcf

def mpileup(bam_file, out, ref, threads=4, overwrite=False):
    """Run bcftools for single file."""

    bcftoolscmd = tools.get_cmd('bcftools')
    if overwrite==False and os.path.exists(out):
        return
    cmd = '{bc} mpileup -a {a} -O b --max-depth 500 --min-MQ 60 -f {r} -o {o} {b}'\
            .format(r=ref, b=bam_file, o=out, bc=bcftoolscmd, a=app.annotatestr, t=threads)
    subprocess.check_output(cmd, shell=True)  
    print (cmd)
    cmd = 'bcftools index {o}'.format(o=out)
    subprocess.check_output(cmd, shell=True)
    return
   

In [103]:
reload(tools)

def run_file(bam_file, ref, outpath, filters=None, mask=None,
             overwrite=False, threads=4):
    """
    Run pileup/snp calling for one file
    """
    
    if not os.path.exists(outpath):
        os.makedirs(outpath, exist_ok=True)
    if filters == None:
        filters = app.default_filter        
    bcftoolscmd = tools.get_cmd('bcftools')
    rawbcf = os.path.join(outpath,'raw.bcf')
    mpileup(bam_file, rawbcf, ref, threads=threads, overwrite=overwrite)

    #call
    vcfout = os.path.join(outpath,'calls.vcf')
    tools.bcftools_call(rawbcf, vcfout)#, show_cmd=True)
    
    #filter snps
    filtered = os.path.join(outpath,'filtered.bcf')
    cmd = '{bc} filter -i "{f}" -o {o} -O b {i}'.format(bc=bcftoolscmd,i=vcfout,o=filtered,f=filters)
    #print (cmd)
    tmp = subprocess.check_output(cmd,shell=True)
    
    #print ('splitting snps and indels..')
    snpsout = os.path.join(outpath,'snps.bcf')
    cmd = '{bc} view -v snps -o {o} -O b {i}'.format(bc=bcftoolscmd,o=snpsout,i=vcfout)
    #print (cmd)
    subprocess.check_output(cmd,shell=True)
    cmd = 'bcftools index {o}'.format(o=snpsout)
    subprocess.check_output(cmd, shell=True)
    
    #also get indels only to separate file
    indelsout = os.path.join(outpath,'indels.bcf')
    #cmd = '{bc} call -V snps --ploidy 1 -m -v -o {o} {raw}'.format(bc=bcftoolscmd,o=indelsout,raw=rawbcf)
    cmd = '{bc} view -v indels -o {o} -O z {i}'.format(bc=bcftoolscmd,o=indelsout,i=filtered)
    #print (cmd)
    subprocess.check_output(cmd,shell=True)

    #apply mask if required
    if mask != None:
        mask_filter(snpsout, mask, outdir=outpath, overwrite=True)
        mask_filter(indelsout, mask, outdir=outpath, overwrite=True)

    #run proximity filter on all sites at once...
    
    return snpsout

path = 'simdata/sim_results3/S1'
bam_file = 'simdata/sim_results/mapped/S1.bam'
run_file(bam_file, app.sarscov2_genome, path, overwrite=True)


[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


bcftools mpileup -a "AD,ADF,ADR,DP,SP,INFO/AD,INFO/ADF,INFO/ADR" -O b --max-depth 500 --min-MQ 60 -f /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa -o simdata/sim_results3/S1/raw.bcf simdata/sim_results/mapped/S1.bam


'simdata/sim_results3/S1/snps.bcf'

In [112]:
def new_variant_calling(samples, outpath, overwrite=False, threads=4):
    """
    Run variant calling for multiple samples
    """
    
    bcftoolscmd = tools.get_cmd('bcftools')
    if samples is not None:
        #write out new samples.txt file for reheader step
        app.write_samples(samples[['sample']], outpath)
        sample_file = os.path.join(outpath,'samples.txt')    
    outfiles = []
    for i,r in samples.iterrows():    
        name = r['sample']
        path = os.path.join(outpath,name)
        snpfile = run_file(r.bam_file, app.sarscov2_genome, path)
        outfiles.append(snpfile)

    merged = os.path.join(outpath, 'merged.vcf.gz')
    bcf_files = ' '.join(outfiles)
    #print (bcf_files)
    cmd = '{bc} merge -m all --threads {t} -0 -O z -o {o} {f}'.format(f=bcf_files,o=merged, bc=bcftoolscmd,t=threads)
    print (cmd)
    subprocess.check_output(cmd, shell=True)  
    app.relabel_vcfheader(merged, sample_file)
    return
    
outdir='simdata/sim_results3/'
new_variant_calling(samples, outdir)

bcftools merge -m all --threads 4 -0 -O z -o simdata/sim_results3/merged.vcf.gz simdata/sim_results3/S1/snps.bcf simdata/sim_results3/S10/snps.bcf simdata/sim_results3/S11/snps.bcf simdata/sim_results3/S2/snps.bcf simdata/sim_results3/S3/snps.bcf simdata/sim_results3/S4/snps.bcf simdata/sim_results3/S5/snps.bcf simdata/sim_results3/S6/snps.bcf simdata/sim_results3/S7/snps.bcf simdata/sim_results3/S8/snps.bcf simdata/sim_results3/S9/snps.bcf
bcftools reheader --samples simdata/sim_results3/samples.txt -o /tmp/calls.vcf simdata/sim_results3/merged.vcf.gz


In [113]:
reload(tools)
#tools.core_alignment_from_vcf('simdata/sim_results3/merged.vcf.gz')
app.run_vcf('simdata/sim_results3/merged.vcf.gz', outdir, threads=8)

getting core alignment..
uninformative_sites False
found 176 sites for core snps
0 sites with at least one missing sample
0 uninformative sites
computing snp distance matrix..
raxmlHPC-PTHREADS -f a -N 100 -T 8 -m GTRCAT -V -p 79763833 -x 81235701 -n variants -w /home/farrell/gitprojects/snipgenie/notebooks/simdata/sim_results3 -s simdata/sim_results3/core.fa


## SNIPPY test

In [None]:
outdir='simdata/snippy_results'
ref=app.sarscov2_genome
for i, r in samples.iterrows():    
    out = os.path.join(outdir, r['sample'])
    cmd = '/local/snippy/bin/snippy --cpus 8 --outdir {o} --ref {r} --R1 {f1} --R2 {f2}'.format(
            o=out,r=ref,f1=r.filename1,f2=r.filename2)
    print (cmd)
    subprocess.check_output(cmd, shell=True)

In [None]:
cmd = "/local/snippy/bin/snippy-core --ref 'simdata/snippy_results/S1/ref.fa' S1 S10 S11 S2 S3 S4 S5 S6 S7 S8 S9"
subprocess.check_output(cmd, shell=True)

In [None]:
reload(app)

bam_files = list(samples.bam_file.unique())
mpileup_files(bam_files, ref, 'simdata/sim_results2', threads=8)
#app.run_bamfiles(bam_files, ref, outdir='simdata/sim_results2', custom_filters=False, filters='', threads=8)

#manually
cmd = 'bcftools call --ploidy 1 -m -v -o {o} {raw}'.format(o='simdata/sim_results2/calls2.vcf',raw='simdata/sim_results2/raw.bcf')
print (cmd)
subprocess.check_output(cmd, shell=True)
cmd = 'bcftools view -v snps -o {o} -O z {i}'.format(o='simdata/sim_results2/snps.vcf.gz',i='simdata/sim_results2/calls2.vcf')
subprocess.check_output(cmd, shell=True)
snprecs, smat = tools.core_alignment_from_vcf('simdata/sim_results2/snps.vcf.gz')
outfasta = 'simdata/sim_results2/core2.fa'
SeqIO.write(snprecs, outfasta, 'fasta')
aln = AlignIO.read(outfasta, 'fasta')
aln = aln[1:]
snp_dist = tools.snp_dist_matrix(aln)
snp_dist.to_csv('simdata/sim_results2/snpdist2.csv', sep=',')


In [None]:
reload(tools)
bcf_files = ' '.join(glob.glob('simdata/sim_results2/bcf/*.bcf'))
print(bcf_files)
tools.bcftools_merge(bcf_files, 'simdata/sim_results2/raw.bcf', threads=8)

In [8]:
reload(tools)
tools.bcftools_call('simdata/sim_results2/test.bcf', 'simdata/sim_results2/calls.vcf')

bcftools call --ploidy 1 -m -o simdata/sim_results2/calls.vcf simdata/sim_results2/test.bcf


In [118]:
c1=pd.read_csv('simdata/sim_results/core.txt',sep=' ')
c2=pd.read_csv('simdata/sim_results3/core.txt',sep=' ')
x1,y1=compare_results(c1,c2)

0/176 sites not in second:
Empty DataFrame
Columns: [pos, ref, S1, S10, S11, S2, S3, S4, S5, S6, S7, S8, S9]
Index: []
-------------------------
0/176 sites not in first:
Empty DataFrame
Columns: [pos, ref, S1, S10, S11, S2, S3, S4, S5, S6, S7, S8, S9]
Index: []


In [114]:
reload(tools)
pos=x1.pos
#pos2 = y1.pos[:12]
#pos = np.random.randint(1,2000,5)
pos = [189]#,252,457,855]
#pos = list(x1.pos[:6])
n=6

print (tools.bcftools_query('simdata/sim_results/snps.vcf.gz',pos).iloc[:,:])
print (tools.bcftools_query('simdata/sim_results3/raw.bcf',pos).iloc[:,:])

for s in ['S1','S2','S3','S9','S10','S11']:
    print (tools.bcftools_query(f'simdata/sim_results2/bcf/{s}.bcf',pos).iloc[:,:5])


NameError: name 'x1' is not defined

In [None]:
d1 = pd.read_csv('simdata/sim_results/snpdist.csv',index_col=0)
d2 = pd.read_csv('simdata/sim_results2/snpdist.csv',index_col=0)
print (d1)
print (d2)

In [90]:
c1=pd.read_csv('simdata/sim_results/core.txt',sep=' ')
c2=pd.read_csv('simdata/sim_results2/core.txt',sep=' ')
c3=pd.read_csv('simdata/snippy_results/core.tab',sep='\t')
c3=c3.rename(columns={'POS':'pos'})
x,y=compare_results(c1,c2)


30/176 sites not in second:
       pos ref S1 S10 S11 S2 S3 S4 S5 S6 S7 S8 S9
1      189   G  G   G   G  G  G  G  G  G  G  G  T
3      252   G  G   G   G  G  G  G  G  G  G  G  T
5      457   T  T   C   T  T  T  T  T  T  T  T  C
9      855   C  C   T   C  C  C  C  C  C  C  C  T
37    5724   C  C   T   T  C  C  C  C  C  C  C  T
56    7782   A  A   G   G  A  A  A  A  A  A  A  G
60    8615   G  G   G   G  G  G  G  G  G  G  G  T
64    8922   C  C   C   C  C  C  C  C  C  C  C  T
70   10226   C  C   T   T  C  C  C  C  C  C  C  T
74   10658   G  G   G   G  G  G  G  G  G  G  G  A
78   11820   G  G   G   G  G  G  G  G  G  G  G  T
81   12921   C  C   C   C  C  C  C  C  C  C  C  T
96   15601   C  C   T   C  C  C  C  C  C  C  C  T
97   15848   C  C   C   C  C  C  C  C  C  C  C  T
102  16508   G  G   G   G  G  G  G  G  G  G  G  T
103  17090   G  G   G   G  G  G  G  G  G  G  G  A
104  17104   C  C   C   C  C  C  C  C  C  C  C  T
120  19495   C  C   C   C  C  C  C  C  C  C  C  A
127  20283   C  C   C 

## test with wicklow data

In [None]:
samples = pd.read_csv('/storage/btbgenie/wicklow_results/samples.csv')
bam_files = list(samples.bam_file.unique())
ref = app.mbovis_genome
#mpileup_multiprocess(bam_files, ref, 'wicklow_temp', threads=8)

In [None]:
reload(app)
samples = pd.read_csv('/storage/btbgenie/wicklow_results/samples.csv')
bam_files = list(samples.bam_file.unique())
app.run_bamfiles(bam_files, app.mbovis_genome, outdir='wicklow_temp', samples=samples, custom_filters=True, threads=10)
#treefile = trees.run_RAXML('temp/core.fa', threads=8, outpath='temp')

In [None]:
v1 = tools.vcf_to_dataframe(os.path.join('wicklow_results','snps.vcf.gz'))
v1
print (tools.bcftools_query('wicklow_results/raw.bcf',[205557]))
print (tools.bcftools_query('wicklow_temp/raw.bcf',[205557]))

In [None]:
compare_results('/storage/btbgenie/wicklow_results','wicklow_temp')
compare_results('wicklow_temp','/storage/btbgenie/wicklow_results')

In [None]:
v1 = vcf_to_dataframe('result.vcf')
print (v1.var_type.value_counts())

In [None]:
v2 = vcf_to_dataframe('mapped/raw.bcf')
print (v2.var_type.value_counts())