## SNP typing and lineage naming from WGS using reference population

References:

* https://www.nature.com/articles/ncomms5812
* https://www.frontiersin.org/articles/10.3389/fmicb.2020.00843/full
* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3502966/

Selection of reference population:

* Global
* European
* UK/Ireland
* All Ireland (republic + NI)

Requirements:

* We need to be able to add new samples to the existing **reference** population phylogeny without having to re-analyse everything together.
* We need a reference matrix that is sampled densely enough to cover most potential new inputs; otherwise those inputs will be treated as outgroups.


In [501]:
import sys,os
from importlib import reload
import numpy as np
import pandas as pd
import pylab as plt
import scipy.cluster.hierarchy as shc
from sklearn.preprocessing import normalize
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO,AlignIO
from Bio import Phylo
import seaborn as sns
import toytree
from snipgenie import app, trees, tools, snp_typing

In [486]:
def snps_to_fasta(snpmat, outfile):
    """Save the columns of a SNP matrix as sequences in a fasta file.

    Args:
        snpmat: dataframe of alleles; missing values are written as 'N'.
            The first column is left out, every other column becomes one
            record whose id is the column label and whose sequence is the
            column values joined together.
        outfile: target handed on to SeqIO.write
    """

    filled = snpmat.fillna('N')
    records = [SeqRecord(Seq(''.join(filled[label])), id=label)
               for label in filled.columns[1:]]
    SeqIO.write(records, outfile, 'fasta')
    return

def tree_from_snps(snpmat):
    """Build and draw a tree from a SNP matrix.

    The matrix is written to 'snps.fa' in the working directory, that file
    is passed to trees.run_fasttree and the result is loaded with toytree,
    then ladderized and drawn.

    Args:
        snpmat: SNP matrix in the form accepted by snps_to_fasta
    Returns:
        the tree object created by toytree.tree
    """

    fasta_file = 'snps.fa'
    snps_to_fasta(snpmat, fasta_file)
    # presumably the tree file / newick produced by FastTree -
    # TODO confirm against snipgenie.trees.run_fasttree
    tree_source = trees.run_fasttree(fasta_file)
    tre = toytree.tree(tree_source)
    draw_opts = dict(layout='r', node_sizes=1, tip_labels_align=False)
    tre.ladderize().draw(width=700, **draw_opts)
    return tre

In [487]:
#snps_to_fasta(nucmat, 'snps.fa')
tree_from_snps(nucmat)

<toytree.Toytree.ToyTree at 0x7f0af613f3a0>

In [60]:
# Load the space separated SNP table and use its 'pos' column as the index.
nucmat = pd.read_csv('../snipgenie/data/nuc_snps_ireland.txt',sep=' ')
nucmat = nucmat.set_index('pos')
# keep the 'ref' column as its own series
ref=nucmat['ref']

In [76]:
clusts=pd.read_csv('/storage/btbgenie/all_ireland_results/clusters.txt',sep='\t')
clusts[10:14]

Unnamed: 0,SequenceName,ClusterNumber
10,19-6359,3
11,19-7209,3
12,19-1603,3
13,4794,3


In [505]:
def make_ref_snps(snp_file='../snipgenie/data/nuc_snps_ireland.txt',
                  cluster_file='/storage/btbgenie/all_ireland_results/clusters.txt'):
    """Build the reference SNP matrix with samples labelled by cluster.

    Args:
        snp_file: space separated SNP table with a 'pos' column and one
            column of alleles per sample. Defaults to the all-Ireland table.
        cluster_file: tab separated table with 'SequenceName' and
            'ClusterNumber' columns. Defaults to the all-Ireland clusters.
    Returns:
        dataframe with one row per SNP position and one column per sample,
        each column being labelled with the ClusterNumber of its sample.
        Samples that are not listed in cluster_file are dropped because the
        merge is an inner join.
    """

    nucmat = pd.read_csv(snp_file, sep=' ')
    nucmat = nucmat.set_index('pos')
    clusts = pd.read_csv(cluster_file, sep='\t')
    # samples become rows so they can be matched to their cluster by name
    merged = nucmat.T.merge(clusts, left_index=True, right_on='SequenceName')
    # NOTE: only ClusterNumber is moved into the index, so the other columns
    # of the cluster table (e.g. SequenceName) remain as extra rows after
    # the transpose, below the SNP positions
    X = merged.set_index(['ClusterNumber']).T
    return X

In [508]:
X = make_ref_snps()
X[:4]

ClusterNumber,2,5,5.1,5.2,5.3,5.4,5.5,5.6,-1,5.7,...,7,7.1,7.2,7.3,7.4,7.5,7.6,7.7,7.8,7.9
687,C,C,C,C,C,C,C,C,C,C,...,C,C,C,C,C,C,C,C,C,C
937,A,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
1303,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G
1456,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G


In [None]:
print (X.loc[:,3])

## find snps unique to clusters

In [506]:
def get_clade_snps(refmat):
    """Get the SNPs that are unique to each clade of a SNP matrix.

    A position/allele pair is reported for a clade when every sample of the
    clade carries that one allele (missing values are ignored) and no sample
    outside the clade carries it. Clades with a single sample are supported.

    Args:
        refmat: dataframe with one row per position and one column per
            sample, where the column labels are the clade of each sample.
            Every row is treated as a site, so rows that do not hold
            alleles should be removed beforehand.
    Returns:
        a dataframe with unique positions/allele for each clade
        with this format
              clade      pos allele
           2   490878      G
           2   804997      T
           2   941068      A
           2  1124266      G
        The table is also printed.
    """

    res = []
    labels = refmat.columns
    for c in labels.unique():
        # boolean mask instead of r[c]: label lookup returns a scalar when
        # the clade has only one sample
        in_clade = labels == c
        for pos, r in refmat.iterrows():
            inside = r[in_clade].dropna().unique()
            # clade is polymorphic here, or has no called allele at all
            if len(inside) != 1:
                continue
            allele = inside[0]
            # allele is shared with at least one sample of another clade
            if (r[~in_clade] == allele).any():
                continue
            res.append((c, pos, allele))

    res = pd.DataFrame(res, columns=['clade', 'pos', 'allele'])
    print(res)
    return res

# Build the cluster labelled reference matrix, find the SNPs unique to each
# clade and save the resulting table to clade_snps.csv without its row index.
refmat = make_ref_snps()
res = get_clade_snps(refmat)

res.to_csv('clade_snps.csv',index=False)

     clade      pos allele
0        2   490878      G
1        2   804997      T
2        2   941068      A
3        2  1124266      G
4        2  1442194      T
..     ...      ...    ...
229      8  2944945      T
230      8  3191587      T
231      8  3256582      A
232      8  3440364      A
233      8  3772840      C

[234 rows x 3 columns]


In [70]:
#check results
X.loc[51709,6]

ClusterNumber
6    C
6    C
6    C
6    C
6    C
    ..
6    C
6    C
6    C
6    C
6    C
Name: 51709, Length: 72, dtype: object

## identify sample from clade-specific SNPs 

In [634]:
sample = '1579'

def lookup_sample(snptable, snps):
    """Find the clades whose specific SNPs are present in a sample.

    Args:
        snptable: reference lookup table with one clade specific SNP per
            row, given by its clade, pos and allele columns
        snps: series holding the allele of the sample at each position
            (positions are the index), e.g. a single row of the snp
            matrix produced by snipgenie
    Returns:
        the set of clades with at least one matching SNP, or None when
        nothing matches
    """

    positions = snps.index
    clades = {
        row.clade
        for _, row in snptable.iterrows()
        # positions the sample has no call for cannot match
        if row.pos in positions and snps[row.pos] == row.allele
    }
    return clades if clades else None
 
# re-import snp_typing so that edits made to the module are picked up
reload(snp_typing)

# NOTE(review): set_index('pos') needs nucmat to still have a 'pos' column;
# the loading cell already moves 'pos' into the index - confirm which
# version of nucmat is expected here.
x = nucmat.set_index('pos')[sample]
# lookup table shipped with the snp_typing module
snptable = snp_typing.clade_snps
lookup_sample(snptable, x)

{6}

In [636]:
def type_samples(nucmat, snptable=None):
    """
    Type multiple samples.
    Args:
        nucmat: a dataframe with the following format-
        pos       687  937  1303 ..
        sample1    C    A    G
        sample2    C    A    G
        ...
        snptable: clade SNP lookup table with clade, pos and allele
        columns, defaults to snp_typing.clade_snps
    Returns:
        types for each sample: a dict that maps the sample name to the
        set of clades returned by lookup_sample (None if nothing matched).
        Each result is also printed.
    """
    if snptable is None:
        snptable = snp_typing.clade_snps
    types = {}
    for name, r in nucmat.iterrows():
        cl = lookup_sample(snptable, r)
        print(name, cl)
        types[name] = cl
    return types
        
# Transpose so that samples are rows and keep only the first three of them,
# then print the clades found for each.
snps = nucmat.set_index('pos').T[:3]
type_samples(snps)

ref {2}
31-12952 {5}
48-MBovis {5}


## test on isolated subset of samples - must use uninformative snps from subsamples

In [642]:
# SNP matrix for the test subset, read from its filtered VCF.
vcf_file='../test_results/filtered.vcf.gz'
# presumably uninformative=True keeps sites that do not vary within this
# subset, as the heading above requires - TODO confirm in snipgenie.tools
snprecs, testmat = tools.fasta_alignment_from_vcf(vcf_file, uninformative=True)
testmat.T

found 191 sites
0 sites with at least one missing sample
0 uninformative sites


pos,33788,41437,69913,130237,160535,166696,173274,229412,232188,246207,...,4130927,4133879,4162554,4180986,4191866,4216874,4217177,4227256,4298265,4311425
ref,A,T,T,T,T,A,G,C,G,C,...,A,C,T,A,G,T,C,G,T,C
17-11662,G,C,C,C,C,G,T,C,C,T,...,C,T,C,G,A,C,C,G,C,T
15-11643,G,C,C,C,C,G,T,C,C,T,...,C,T,C,G,A,C,T,A,C,T
19-11957,G,C,C,C,C,G,T,C,C,T,...,C,T,C,G,A,C,T,A,C,T
13-11594,G,C,C,C,C,G,T,G,C,T,...,C,T,C,G,A,C,C,G,C,T


In [643]:
type_samples(testmat.T)

ref {2}
17-11662 {5}
15-11643 {5}
19-11957 {5}
13-11594 {5}


## store snp data to database

## encode snps to store in a single field in DB?

e.g. string encoding


In [602]:
x=snps.iloc[0]

def encode_snps(x):
    """Pack a series of SNPs into one string for storage.

    Each entry is written as its index label followed directly by its
    value and the entries are separated by ';', e.g. '687C;937A'.
    """

    labels = x.index.astype(str)
    return ';'.join(map(''.join, zip(labels, x.values)))

s = encode_snps(x)

import re
def decode_snps(s):
    """Decode a string written by encode_snps back into a series.

    Args:
        s: string of ';' separated entries, each one a position followed
           by its allele, e.g. '687C;937A'
    Returns:
        series of alleles indexed by integer position, index name 'pos'.
        An empty string gives an empty series.
    Raises:
        ValueError: if an entry does not contain exactly one run of digits
    """

    pos = []
    alleles = []
    # ''.split(';') would give one empty entry that cannot be unpacked
    entries = s.split(';') if s else []
    for entry in entries:
        # splitting on the captured digits gives [prefix, position, allele]
        _, p, a = re.split(r'(\d+)', entry)
        # restore integer positions so the index matches the encoded series
        pos.append(int(p))
        alleles.append(a)
    x = pd.Series(alleles, index=pos, dtype=object)
    x.index.name = 'pos'
    return x

x = decode_snps(s)
x

pos
687        C
937        A
1303       G
1456       G
1584       T
          ..
3832251    G
3833590    G
3835182    G
3839650    T
3842525    G
Length: 1746, dtype: object

In [524]:
d=nucmat.set_index('pos').T[:5]
d

pos,687,937,1303,1456,1584,2532,4480,8048,8150,8741,...,3823420,3825991,3827894,3828728,3830688,3832251,3833590,3835182,3839650,3842525
ref,C,A,G,G,T,C,T,C,C,T,...,T,G,T,G,G,G,G,G,T,G
31-12952,C,A,G,G,T,C,T,C,C,C,...,T,G,T,G,G,G,G,G,C,G
48-MBovis,C,A,G,G,T,C,T,C,C,C,...,T,G,T,G,G,G,G,G,C,G
49-MBovis,C,A,G,G,T,C,T,C,C,C,...,T,G,T,G,G,G,G,G,C,G
28-12935,C,A,G,G,T,C,T,C,C,C,...,T,G,T,G,G,G,G,G,C,G


In [614]:
#assign ID
import uuid
  
id = uuid.uuid1()
id
import hashlib
hash_object = hashlib.md5(b'Hello World')
print(hash_object.hexdigest())


b10a8db164e0754105b7a99be72e3fe5


In [611]:
class isolate:
    """Record for a single isolate and its SNP type."""

    def __init__(self, id, ):
        """Store the isolate id; the clade starts out unset."""

        self.species = 'M.bovis'
        self.clade = None
        self.id = id

    def get_type(self):
        """Placeholder, not implemented yet; returns None."""

        return None

    def get_record(self):
        """Create isolate record from SNP typing.

        Not implemented yet; returns None.
        """

        return None

    def __repr__(self):
        fields = (self.id, self.clade)
        return 'isolate: %s, SNP-type: %s' % fields

iso = isolate('42343')
iso

isolate: 42343, SNP-type: None