# Spoligotype from WGS

* https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-016-0270-7


In [9]:
import sys,os,shutil,subprocess
import glob
from importlib import reload
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 150)
import pylab as plt
from Bio import SeqIO
from io import StringIO
from snpgenie import tools,app,aligners

## test files

In [257]:
# Collect every gzipped file in the test directory and tabulate the samples.
path = '/storage/btbgenie/mbovis_ireland/NI/'
files = glob.glob(os.path.join(path, '*.gz'))
# NOTE(review): glob returns files in arbitrary order. In the recorded table
# ERR125602_2 is labelled pair 1 and ERR125602_1 pair 2 -- presumably
# get_samples numbers mates by input order; consider sorting `files`. TODO confirm
samp = app.get_samples(files)
samp

Unnamed: 0,name,sample,filename,pair
0,ERR125601_1,ERR125601,/storage/btbgenie/mbovis_ireland/NI/ERR125601_1.fastq.gz,1
1,ERR125601_2,ERR125601,/storage/btbgenie/mbovis_ireland/NI/ERR125601_2.fastq.gz,2
2,ERR125602_2,ERR125602,/storage/btbgenie/mbovis_ireland/NI/ERR125602_2.fastq.gz,1
3,ERR125599_1,ERR125599,/storage/btbgenie/mbovis_ireland/NI/ERR125599_1.fastq.gz,1
4,ERR125602_1,ERR125602,/storage/btbgenie/mbovis_ireland/NI/ERR125602_1.fastq.gz,2
5,ERR125599_2,ERR125599,/storage/btbgenie/mbovis_ireland/NI/ERR125599_2.fastq.gz,2


In [186]:
# Manual walk-through of the steps that get_spoligotype wraps.
# NOTE(review): f1 is not assigned in any visible cell -- presumably one of
# the fastq paths in `files`; confirm before re-running.
#convert reads to fasta
tools.fastq_to_fasta(f1, 'temp.fa', 500000)
#make blast db from reads
tools.make_blast_database('temp.fa')
#blast spacers to db
bl = tools.blast_fasta('temp.fa', '../dr_spacers.fa', evalue=0.1, 
                       maxseqs=100000, show_cmd=True)

blastn -out ../dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart send evalue bitscore stitle" -query ../dr_spacers.fa -db temp.fa -evalue 0.1 -max_target_seqs 100000 -num_threads 4 -task blastn


In [None]:
# Show the identity, coverage and mismatch columns of the raw BLAST hits.
# NOTE(review): to sort on both columns the disabled call below would need
# them passed as a list, i.e. sort_values(['qseqid','pident']).
#bl=bl.sort_values('qseqid','pident')
bl[['qseqid','pident','qcovs','length','mismatch']]

In [212]:
# Keep only near-exact hits: qcovs above 95 and fewer than 2 mismatches.
# This overwrites bl, so the unfiltered hits are no longer available afterwards.
bl=bl[(bl.qcovs>95) & (bl.mismatch<2)]
# Count the surviving hits per query id (the spacers are the BLAST query);
# after np.size the 'pident' column holds that count, not an identity value.
x=bl.groupby('qseqid').agg({'pident':np.size}).reset_index()
x

Unnamed: 0,qseqid,pident
0,37,9.0
1,38,7.0


In [259]:
def get_spoligotype(filename, reads_limit=500000):
    """Derive a binary spoligotype pattern from a fastq file of reads.

    The reads are written out as fasta, turned into a BLAST database and the
    spacer sequences are searched against it. A spacer is called present when
    at least one of its hits has qcovs above 95 and fewer than 2 mismatches.

    Args:
        filename: fastq file holding the reads.
        reads_limit: forwarded to tools.fastq_to_fasta (presumably the maximum
            number of reads converted -- confirm against that helper).

    Returns:
        A 43 character string of '1' and '0'; position i (counting from 1)
        is '1' when spacer id i was found. The string is also printed.
    """

    spacers_fasta = '../snpgenie/data/dr_spacers.fa'
    reads_fasta = 'temp.fa'

    # Build a scratch BLAST database out of the reads
    tools.fastq_to_fasta(filename, reads_fasta, reads_limit)
    tools.make_blast_database(reads_fasta)

    # Query every spacer against that database
    hits = tools.blast_fasta(reads_fasta, spacers_fasta, evalue=0.1,
                             maxseqs=100000, show_cmd=False)
    good = hits[(hits.qcovs > 95) & (hits.mismatch < 2)]

    # Ids of the spacers that still have at least one hit after filtering
    present = set(good.qseqid.unique())
    pattern = ''.join('1' if n in present else '0' for n in range(1, 44))
    print (pattern)
    return pattern

# Type a single test sample, then display the binary pattern as an octal literal.
s = get_spoligotype('/storage/btbgenie/rd_test_data/ERR234151_2.fastq.gz')
oct(int(s,2))

1101111101111110111111111111111111111100000


'0o157576777777740'

In [260]:
def get_sb_number(binary_str, dbfile='../snpgenie/data/Mbovis.org_db.csv'):
    """Get SB number from binary pattern using database reference.

    Args:
        binary_str: spoligotype pattern as a string of '0'/'1' characters,
            e.g. the value returned by get_spoligotype.
        dbfile: csv file with at least a 'binary' and an 'SB' column.
            Defaults to the database shipped in the snpgenie data folder.

    Returns:
        The SB value of the first row whose 'binary' entry equals
        binary_str, or None when no row matches.
    """

    # Read 'binary' as text: left to type inference an all-digit pattern can be
    # parsed as a number, dropping leading zeros so it never equals binary_str.
    df = pd.read_csv(dbfile, dtype={'binary': str})
    matches = df[df['binary'] == binary_str]
    if matches.empty:
        return None
    return matches.iloc[0].SB

# Spot check of the database lookup with a fixed 43 character pattern.
get_sb_number('0000000000000000000000000000000000001100000')

'SB0118'

In [261]:
# Type every read file found earlier and collect [filename, SB number] rows in res.
# NOTE(review): `files` holds both mates of each pair, so every sample is
# typed (and reported) twice -- once per fastq file.
res=[]
for f in files:
    s = get_spoligotype(f)
    sb = get_sb_number(s)
    print (f, sb)
    res.append([f,sb])    

1101101000001110111111111111111111111100000
/storage/btbgenie/mbovis_ireland/NI/ERR125601_1.fastq.gz SB0140
1101101000001110111111111111111111111100000
/storage/btbgenie/mbovis_ireland/NI/ERR125601_2.fastq.gz SB0140
1101101000001110111111111111111111111100000
/storage/btbgenie/mbovis_ireland/NI/ERR125602_2.fastq.gz SB0140
1101101000001110111111111111111111111100000
/storage/btbgenie/mbovis_ireland/NI/ERR125599_1.fastq.gz SB0140
1101101000001110111111111111111111111100000
/storage/btbgenie/mbovis_ireland/NI/ERR125602_1.fastq.gz SB0140
1101101000001110111111111111111111111100000
/storage/btbgenie/mbovis_ireland/NI/ERR125599_2.fastq.gz SB0140
