# Spoligotype from WGS

* https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-016-0270-7


In [12]:
import sys,os,shutil,subprocess
import glob
from importlib import reload
import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 150)
import pylab as plt
from Bio import SeqIO
from io import StringIO
from snipgenie import tools,app,aligners

## test files

In [13]:
#path = '/storage/btbgenie/mbovis_ireland/NI/'
path = '/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/'
files = glob.glob(os.path.join(path, '*.gz'))
samp = app.get_samples(files)
samp

Unnamed: 0,name,sample,filename,pair
0,19-5224_S37_L001-4_R2_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-5224_S37_L001-4_R2_001.fastq.gz,1
1,19-4168_S66_L001-4_R2_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-4168_S66_L001-4_R2_001.fastq.gz,2
2,19-8294_S40_L001-4_R1_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-8294_S40_L001-4_R1_001.fastq.gz,3
3,19-7319_S58_L001-4_R2_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-7319_S58_L001-4_R2_001.fastq.gz,4
4,19-1469_S35_L001-4_R1_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-1469_S35_L001-4_R1_001.fastq.gz,5
...,...,...,...,...
157,19-598_S95_L001-4_R2_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-598_S95_L001-4_R2_001.fastq.gz,158
158,19-6277_S32_L001-4_R2_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-6277_S32_L001-4_R2_001.fastq.gz,159
159,19-1428_S57_L001-4_R2_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-1428_S57_L001-4_R2_001.fastq.gz,160
160,19-4803_S31_L001-4_R2_001,19,/storage/btbgenie/mbovis_ireland/Monaghan/Fastqs_16-12-19/19-4803_S31_L001-4_R2_001.fastq.gz,161


In [186]:
#convert reads to fasta
tools.fastq_to_fasta(f1, 'temp.fa', 500000)
#make blast db from reads
tools.make_blast_database('temp.fa')
#blast spacers to db
bl = tools.blast_fasta('temp.fa', '../dr_spacers.fa', evalue=0.1, 
                       maxseqs=100000, show_cmd=True)

blastn -out ../dr_spacers_blast.txt -outfmt "6 qseqid sseqid qseq sseq pident qcovs length mismatch gapopen qstart qend sstart send evalue bitscore stitle" -query ../dr_spacers.fa -db temp.fa -evalue 0.1 -max_target_seqs 100000 -num_threads 4 -task blastn


In [None]:
#bl=bl.sort_values('qseqid','pident')
bl[['qseqid','pident','qcovs','length','mismatch']]

In [212]:
bl=bl[(bl.qcovs>95) & (bl.mismatch<2)]
x=bl.groupby('qseqid').agg({'pident':np.size}).reset_index()
x

Unnamed: 0,qseqid,pident
0,37,9.0
1,38,7.0


In [18]:
def get_spoligotype(filename, reads_limit=500000, threshold=2):
    """Get spoligotype from reads"""
    
    ref = '../snpgenie/data/dr_spacers.fa'
    #convert reads to fasta
    tools.fastq_to_fasta(filename, 'temp.fa', reads_limit)
    #make blast db from reads
    tools.make_blast_database('temp.fa')
    #blast spacers to db
    bl = tools.blast_fasta('temp.fa', ref, evalue=0.1, 
                           maxseqs=100000, show_cmd=False) 
    bl=bl[(bl.qcovs>95) & (bl.mismatch<2)]
    x = bl.groupby('qseqid').agg({'pident':np.size}).reset_index()    
    x = x[x.pident>=threshold]
    print (x)
    found = list(x.qseqid)
    
    s=[]
    for i in range(1,44):
        if i in found:
            s.append('1')
        else:
            s.append('0')
    s =''.join(s)
    print (s)
    return s

s = get_spoligotype('/storage/btbgenie/mbovis_ireland/NI/ERR125598_1.fastq.gz')
oct(int(s,2))

    qseqid  pident
0        1     4.0
1        2     4.0
2        7    11.0
3       14     2.0
6       19     2.0
9       23     4.0
10      24     7.0
11      25     6.0
12      26     4.0
13      27     3.0
18      34     4.0
19      35     3.0
20      36     2.0
21      37     2.0
22      38     3.0
1100001000000100001000111110000001111100000


'0o141004107601740'

In [None]:
def get_sb_number(binary_str):
    """Get SB number from binary pattern usinf database reference"""
    
    df = pd.read_csv('../snpgenie/data/Mbovis.org_db.csv')
    x = df[df['binary'] == binary_str]
    if len(x) == 0:
        return 
    else:
        return x.iloc[0].SB

get_sb_number('0000000000000000000000000000000000001100000')

## test thresholds, reads used

In [None]:
for t in range(2,6):
    b = get_spoligotype('/storage/btbgenie/mbovis_ireland/Wicklow/Fastqs_09-07-18/45-MBovis_S22_L001-4_R1_001.fastq.gz', 500000, t)
    print (get_sb_number(b))

In [None]:
res=[]
for f in files:
    s = get_spoligotype(f)
    sb = get_sb_number(s)
    print (f, sb)
    res.append([f,sb])    