# Spoligotype from WGS

* https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-016-0270-7


In [1]:
import sys,os,shutil,subprocess
import glob
from importlib import reload
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 150)
import pylab as plt
from Bio import SeqIO
from io import StringIO
from snipgenie import tools,app,aligners

## test files

In [2]:
#path = '/storage/btbgenie/mbovis_ireland/NI/'
path = '/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/'
files = glob.glob(os.path.join(path, '*.gz'))
samp = app.get_samples(files)
samp = app.get_pivoted_samples(samp)
samp

Unnamed: 0,sample,filename1,filename2,name1,name2
0,17,/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/17-MBovis_S21_L001-4_R1_001.fastq.gz,/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/17-MBovis_S21_L001-4_R2_001.fastq.gz,17-MBovis_S21_L001-4_R1_001,17-MBovis_S21_L001-4_R2_001
1,19,/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/19-MBovis_S32_L001-4_R1_001.fastq.gz,/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/19-MBovis_S32_L001-4_R2_001.fastq.gz,19-MBovis_S32_L001-4_R1_001,19-MBovis_S32_L001-4_R2_001
2,26,/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/26-MBovis_S43_L001-4_R1_001.fastq.gz,/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/26-MBovis_S43_L001-4_R2_001.fastq.gz,26-MBovis_S43_L001-4_R1_001,26-MBovis_S43_L001-4_R2_001


In [None]:
#convert reads to fasta
tools.fastq_to_fasta(f1, 'temp.fa', 500000)
#make blast db from reads
tools.make_blast_database('temp.fa')
#blast spacers to db
bl = tools.blast_fasta('temp.fa', '../dr_spacers.fa', evalue=0.1, 
                       maxseqs=100000, show_cmd=True)

In [None]:
#bl=bl.sort_values('qseqid','pident')
bl[['qseqid','pident','qcovs','length','mismatch']]

In [212]:
bl=bl[(bl.qcovs>95) & (bl.mismatch<2)]
x=bl.groupby('qseqid').agg({'pident':np.size}).reset_index()
x

Unnamed: 0,qseqid,pident
0,37,9.0
1,38,7.0


In [5]:
def get_spoligotype(filename, reads_limit=500000, threshold=0):
    """Get spoligotype from reads. Returns a binary string."""
    
    ref = '../snipgenie/data/dr_spacers.fa'
    #convert reads to fasta
    tools.fastq_to_fasta(filename, 'temp.fa', reads_limit)
    #make blast db from reads
    tools.make_blast_database('temp.fa')
    #blast spacers to db
    bl = tools.blast_fasta('temp.fa', ref, evalue=.1, 
                           maxseqs=reads_limit, show_cmd=True) 
    bl=bl[(bl.qcovs>95) & (bl.mismatch<3)]
    x = bl.groupby('qseqid').agg({'pident':np.size}).reset_index()
    #print (x)
    x = x[x.pident>=threshold]    
    found = list(x.qseqid)
    
    s=[]
    for i in range(1,44):
        if i in found:
            s.append('1')
        else:
            s.append('0')
    s =''.join(s)
    print (s)
    return s

s = get_spoligotype('/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_09-07-18/48-MBovis_S17_L001-4_R1_001.fastq.gz')
get_sb_number(s)

blastn -out ../snipgenie/data/dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart send evalue bitscore stitle" -query ../snipgenie/data/dr_spacers.fa -db temp.fa -evalue 0.1 -max_target_seqs 500000 -num_threads 4 -task blastn
1100101000000110111111111011101111111100000


In [4]:
def get_sb_number(binary_str):
    """Get SB number from binary pattern usinf database reference"""
    
    df = pd.read_csv('../snipgenie/data/Mbovis.org_db.csv')
    x = df[df['binary'] == str(binary_str)]
    if len(x) == 0:
        return 
    else:
        return x.iloc[0].SB

get_sb_number('1100101000001110111111111111111111111100000')

'SB0054'

## test thresholds, reads used

In [64]:
for t in range(0,4):
    b = get_spoligotype('/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_09-07-18/36-MBovis_S38_L001-4_R1_001.fastq.gz', threshold=t)
    print (t,get_sb_number(b))

blastn -out ../snipgenie/data/dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart send evalue bitscore stitle" -query ../snipgenie/data/dr_spacers.fa -db temp.fa -evalue 0.1 -max_target_seqs 500000 -num_threads 4 -task blastn
1100101000000110111011111011111011111100000
0 None
blastn -out ../snipgenie/data/dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart send evalue bitscore stitle" -query ../snipgenie/data/dr_spacers.fa -db temp.fa -evalue 0.1 -max_target_seqs 500000 -num_threads 4 -task blastn
1100101000000110111011111011111011111100000
1 None
blastn -out ../snipgenie/data/dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart send evalue bitscore stitle" -query ../snipgenie/data/dr_spacers.fa -db temp.fa -evalue 0.1 -max_target_seqs 500000 -num_threads 4 -task blastn
1000001000000010111010101001111011110100

In [6]:
res=[]
for f in files:
    s = get_spoligotype(f, threshold=0)
    sb = get_sb_number(s)
    print (f, sb)
    res.append([f,sb])    

blastn -out ../snipgenie/data/dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart send evalue bitscore stitle" -query ../snipgenie/data/dr_spacers.fa -db temp.fa -evalue 0.1 -max_target_seqs 500000 -num_threads 4 -task blastn
1100101000000010011111111101111111011100000
/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/17-MBovis_S21_L001-4_R1_001.fastq.gz None
blastn -out ../snipgenie/data/dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart send evalue bitscore stitle" -query ../snipgenie/data/dr_spacers.fa -db temp.fa -evalue 0.1 -max_target_seqs 500000 -num_threads 4 -task blastn
1100001000001100011010111111111111000100000
/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_07-01-18/26-MBovis_S43_L001-4_R1_001.fastq.gz None
blastn -out ../snipgenie/data/dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart 