## SNP typing and lineage naming from WGS using reference population

References:

* https://www.nature.com/articles/ncomms5812
* https://www.frontiersin.org/articles/10.3389/fmicb.2020.00843/full
* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3502966/

Selection of reference population:

* Global
* European
* UK/Ireland
* All Ireland (republic + NI)

Requirements:

* We need to be able to add new samples to the existing **reference** population phylogeny without having to re-analyse everything together.
* We need a reference matrix that is sampled densely enough to cover most potential new inputs; otherwise new samples will be treated as outgroups.


In [1]:
import sys,os,subprocess
from importlib import reload
import numpy as np
import pandas as pd
import pylab as plt
import scipy.cluster.hierarchy as shc
from sklearn.preprocessing import normalize
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO,AlignIO
from Bio import Phylo
import seaborn as sns
import toytree
import snipgenie
from snipgenie import app, trees, tools, snp_typing

In [2]:
def snps_to_fasta(snpmat, outfile):
    """Save a snp matrix as a fasta file, one record per column.

    Args:
        snpmat: dataframe of alleles with one row per position and one
            column per sequence. Missing values are written as 'N'.
            The first column is not written.
        outfile: output target handed to SeqIO.write
    """

    filled = snpmat.fillna('N')
    # each record is the concatenation of one column's alleles;
    # the slice skips the first column
    records = [
        SeqRecord(Seq(''.join(filled[name])), id=name)
        for name in filled.columns[1:]
    ]
    SeqIO.write(records, outfile, 'fasta')

def tree_from_snps(snpmat):
    """Build and draw a tree from a snp matrix.

    The matrix is written to 'snps.fa' in the working directory, that file
    is passed to trees.run_fasttree and the resulting tree file is loaded
    with toytree and drawn ladderized.

    Returns:
        the tree object loaded by toytree.tree
    """

    fasta_path = 'snps.fa'
    snps_to_fasta(snpmat, fasta_path)
    tree = toytree.tree(trees.run_fasttree(fasta_path))
    tree.ladderize().draw(
        layout='r', node_sizes=1, tip_labels_align=False, width=700
    )
    return tree

In [None]:
#snps_to_fasta(nucmat, 'snps.fa')
# Build and draw the tree for the SNP matrix held in nucmat.
# NOTE(review): nucmat is first assigned in a cell further down this
# listing - that cell has to be run before this one.
tree_from_snps(nucmat)

In [39]:
# Load the space-separated SNP matrix and index it by the 'pos' column
nucmat = pd.read_csv('../snipgenie/data/nuc_snps_ireland.txt',sep=' ')
nucmat = nucmat.set_index('pos')
# keep the 'ref' column as a separate series
ref=nucmat['ref']
# number of columns in the matrix
print (len(nucmat.T))

249


In [37]:
# Load the tab-separated cluster table and display rows 10 to 13
clusts=pd.read_csv('/storage/btbgenie/all_ireland_results/clusters.txt',sep='\t')
clusts[10:14]

Unnamed: 0,SequenceName,ClusterNumber
10,2602,1
11,19-1428,1
12,19-6108,1
13,19-4803,1


In [6]:
def make_ref_snps(nucfile='../snipgenie/data/nuc_snps_ireland.txt',
                  clustfile='/storage/btbgenie/all_ireland_results/clusters.txt'):
    """Build the reference snp matrix with samples labelled by cluster.

    Args:
        nucfile: space-separated snp matrix with a 'pos' column and one
            column per sample
        clustfile: tab-separated table with 'SequenceName' and
            'ClusterNumber' columns
    Returns:
        a dataframe with positions as rows and one column per sample that
        is present in both files. Column labels are the cluster numbers,
        so a label repeats for every member of a cluster.
    """

    nucmat = pd.read_csv(nucfile, sep=' ').set_index('pos')
    clusts = pd.read_csv(clustfile, sep='\t')
    # default (inner) merge: samples without a cluster entry are dropped
    X = nucmat.T.merge(clusts, left_index=True, right_on='SequenceName')
    # NOTE(review): the remaining columns of the cluster table (e.g.
    # SequenceName) are carried through the merge and become extra rows
    # after the transpose - confirm whether they should be dropped.
    return X.set_index(['ClusterNumber']).T

In [7]:
# Build the cluster-labelled reference matrix and show its first four rows
X = make_ref_snps()
X[:4]

ClusterNumber,-1,11,11.1,11.2,11.3,11.4,11.5,11.6,2,11.7,...,20,20.1,20.2,20.3,20.4,20.5,20.6,20.7,20.8,20.9
687,C,C,C,C,C,C,C,C,C,C,...,C,C,C,C,C,C,C,C,C,C
937,A,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
1303,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G
1456,G,G,G,G,G,G,G,G,G,G,...,G,G,G,G,G,G,G,G,G,G


In [None]:
print (X.loc[:,3])

## find snps unique to clusters

In [None]:
def get_clade_snps(refmat):
    """Get unique clade snps from a snp matrix.

    A position is reported for a clade when all called members of the clade
    share one allele and that allele does not occur in any sample outside
    the clade.

    Args:
        refmat: dataframe with positions as rows and one column per sample,
            where each column label is the clade of that sample (labels
            repeat for clades with several members)
    Returns:
        a dataframe with unique positions/allele for each clade
        with this format
              clade      pos allele
           2   490878      G
           2   804997      T
           2   941068      A
           2  1124266      G
    """

    res = []
    labels = refmat.columns
    # build the row series once instead of once per clade
    rows = list(refmat.iterrows())
    for c in labels.unique():
        # A boolean mask always selects a Series, also for clades with a
        # single member (label lookup would return a scalar there).
        members = labels.isin([c])
        for pos, r in rows:
            # value_counts ignores missing calls
            f1 = r[members].value_counts()
            # skip positions with no call in the clade or with mixed alleles
            if len(f1) != 1:
                continue
            alt1 = f1.index[0]
            # the allele must not be present outside the clade
            if r[~members].eq(alt1).any():
                continue
            res.append((c, pos, alt1))

    res = pd.DataFrame(res, columns=['clade', 'pos', 'allele'])
    print (res)
    return res

# Build the reference matrix and extract the clade-specific snps from it
refmat = make_ref_snps()
res = get_clade_snps(refmat)

# save the clade/pos/allele table to the working directory
res.to_csv('clade_snps.csv',index=False)

In [70]:
#check results
X.loc[51709,6]

ClusterNumber
6    C
6    C
6    C
6    C
6    C
    ..
6    C
6    C
6    C
6    C
6    C
Name: 51709, Length: 72, dtype: object

## identify sample from clade-specific SNPs 

In [25]:
sample = '1579'

def lookup_sample(snptable, snps):
    """Find the clades whose specific snps are present in a sample.

    Args:
        snptable: reference lookup table with clade, pos and allele columns
        snps: a series of alleles indexed by position for one sample,
            e.g. a single sample taken from the snp matrix produced by
            snipgenie
    Returns:
        the set of clades with at least one matching snp, or None if
        nothing matched
    """

    # table positions missing from the sample are ignored
    matches = {
        row.clade
        for _, row in snptable.iterrows()
        if row.pos in snps.index and snps[row.pos] == row.allele
    }
    return matches if matches else None
 
# re-import snp_typing so that edits to the module are picked up
reload(snp_typing)

# alleles of the chosen sample: one column of the snp matrix
x = nucmat[sample]
# clade lookup table provided by the snp_typing module
snptable = snp_typing.clade_snps
lookup_sample(snptable, x)

{6}

In [26]:
def type_samples(nucmat, snptable=None):
    """
    Type multiple samples.
    Args:
        nucmat: a dataframe with the following format-
        pos       687  937  1303 ..      
        sample1    C    A    G 
        sample2    C    A    G
        ...
        snptable: clade lookup table with clade, pos and allele columns,
        defaults to snp_typing.clade_snps
    Returns:
        a dict with the set of matching clades for each sample name
        (None for samples without any match)
    """
    if snptable is None:
        snptable = snp_typing.clade_snps
    types = {}
    for name,r in nucmat.iterrows():
        cl = lookup_sample(snptable, r)
        # still printed so that existing notebook output stays the same
        print (name,cl)
        types[name] = cl
    return types
        
# type the first three columns of nucmat, transposed so each row is one sample
snps = nucmat.T[:3]
type_samples(snps)

ref {2}
31-12952 {5}
48-MBovis {5}


## test on isolated subset of samples - must use uninformative snps from subsamples

In [None]:
# Build a snp matrix directly from the filtered VCF of the test run
vcf_file='../test_results/filtered.vcf.gz'
# NOTE(review): uninformative=True presumably keeps sites that do not vary
# within this subset - confirm in tools.fasta_alignment_from_vcf
snprecs, testmat = tools.fasta_alignment_from_vcf(vcf_file, uninformative=True)
testmat.T

In [643]:
type_samples(testmat.T)

ref {2}
17-11662 {5}
15-11643 {5}
19-11957 {5}
13-11594 {5}


## store snp data to database

## encode snps to store in a single field in DB?

e.g. string encoding


In [27]:
x=snps.iloc[0]

def encode_snps(x):
    """Serialise a snp series to one string for storage.

    Every entry becomes its index label followed directly by its value
    (e.g. 687C) and the entries are separated by ';'.
    """

    labels = x.index.astype(str)
    return ';'.join(''.join(pair) for pair in zip(labels, x.values))

s = encode_snps(x)

import re
def decode_snps(s):
    """Decode a string of ';'-separated <position><allele> entries.

    Args:
        s: encoded string such as '687C;937A'
    Returns:
        a series of alleles indexed by integer position (index name 'pos'),
        so the result can be matched against integer positions again.
        Empty entries (empty string, trailing ';') are ignored.
    Raises:
        ValueError: if an entry does not start with a position
    """

    pos = []
    alleles = []
    for token in s.split(';'):
        if not token:
            continue
        m = re.fullmatch(r'(\d+)(.*)', token)
        if m is None:
            raise ValueError('invalid snp entry: %r' % token)
        # positions are stored as text in the string; restore them as integers
        pos.append(int(m.group(1)))
        alleles.append(m.group(2))
    index = pd.Index(pos, dtype='int64', name='pos')
    return pd.Series(alleles, index=index, dtype=object)

x = decode_snps(s)
x

pos
687        C
937        A
1303       G
1456       G
1584       T
          ..
3832251    G
3833590    G
3835182    G
3839650    T
3842525    G
Length: 1746, dtype: object

## snps unique to each snp100 clade to define naming?

In [9]:
# Load the all-Ireland results: sample metadata, the csq table and the
# amino acid snp matrix derived from it
final = pd.read_csv('/storage/btbgenie/all_ireland_results/metadata.csv')
csq = app.read_csq_file('/storage/btbgenie/all_ireland_results/csq.tsv')
aamat = app.get_aa_snp_matrix(csq)
# nucmat is rebound here to the space-separated core.txt matrix, indexed by 'pos'
nucmat = pd.read_csv('/storage/btbgenie/all_ireland_results/core.txt',sep=' ')
nucmat = nucmat.set_index('pos')


In [15]:
nucmat[:3]

Unnamed: 0_level_0,ref,1034,13-11594,14-MBovis,15-11643,17-11662,17-MBovis,182-MBovis,19-11957,19-MBovis,...,ERR125619,ERR125620,ERR125621,ERR125622,ERR125623,ERR125624,ERR125625,ERR125626,ERR125627,ERR125628
pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
950278,A,A,A,A,A,A,A,A,A,A,...,A,A,A,A,A,A,A,A,A,A
1998854,T,T,T,T,T,T,T,T,T,T,...,T,T,T,T,T,T,T,T,T,T
1507338,C,C,C,C,C,C,C,C,C,C,...,C,C,C,C,C,C,C,C,C,C


In [10]:
# re-import tools so that edits to the module are picked up
reload(tools)
# show the result of get_unique_snps for a single sample name
name='cat-003488'
tools.get_unique_snps(name, aamat)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sample,cat-003488
start,gene,aa,snp_type,Unnamed: 4_level_1
1461929,rfe,393R>393C,missense,1
2753251,gdh,528A>528G,missense,1
2967659,BQ2027_MB2708c,151H>151R,missense,1
3057726,BQ2027_MB2810,552V>552E,missense,1
3355243,BQ2027_MB3060c,217S,synonymous,1


In [24]:
# For each listed snp100 clade print the clade, the number of samples in it
# and the number of rows returned by get_unique_snps for those samples
clades = [3,5,6,7,8,10,11,12,18]
for clade in clades:
    # sample names whose snp100 value in the metadata equals this clade
    names = final[final.snp100==clade]['sample']
    u = tools.get_unique_snps(names, aamat)
    print (clade,len(names),len(u))

3 106 392
5 103 479
6 70 234
7 8 130
8 50 213
10 9 84
11 154 487
12 28 294
18 71 689
