## SNP typing and lineage naming from WGS using reference population

References:

* https://www.frontiersin.org/articles/10.3389/fmicb.2020.00843/full
* https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3502966/

Selection of reference population:

* Global
* European
* UK/Ireland
* All Ireland (republic + NI)

required: 

* We need to be able to add new samples to the existing **reference** population phylogeny without having to re-analyse everything together.
* We need to have a sufficiently sampled reference matrix that it will cover most potential new inputs. Otherwise they will be considered outgroups.

Method: 

* add one or more samples to an existing population snp matrix - concatenate two sets of snps into one matrix by keeping only the polymorphic sites in the reference matrix. sites not present in the unknown samples are replaced with Ns
* generate a new phylogeny from the combined snp matrix or just cluster them
* find nearest neighbours and identify the unknown isolates with a known clade

In [137]:
import numpy as np
import pandas as pd
import pylab as plt
import scipy.cluster.hierarchy as shc
from sklearn.preprocessing import normalize
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Phylo
import seaborn as sns
import toytree
from snipgenie import app, trees

## get snp matrix

In [232]:
nucmat = pd.read_csv('../snipgenie/data/nuc_snps_ireland.txt',sep=' ')
csqmat = pd.read_csv('../snipgenie/data/snps_ireland.csv',index_col=[0,1,2,3,4])

In [None]:
X=csqmat
print (X.iloc[:2,:3])
sns.clustermap(X,xticklabels=False,figsize=(15,10),cmap='gray_r',cbar_pos=None)

## combine sub sample of snps with original matrix

In [269]:
m1 = X.sample(30,axis=0,random_state=2)
m2 = X.sample(5,axis=0,random_state=5)

print(m1.iloc[:2,:3])
#print(m2.iloc[:2,:3])

new = m1.join(m2).fillna(0)

print (len(m1),len(m2))
print (len(new))

                                                  (687, 'dnaA', '229F')  \
name    county   SB     ClusterNumber clade                               
4798    Monaghan SB0140 4             Monaghan-1                      0   
19-7209 Monaghan SB0140 3             Monaghan-2                      0   

                                                  (937, 'dnaA', '313T>313A')  \
name    county   SB     ClusterNumber clade                                    
4798    Monaghan SB0140 4             Monaghan-1                           0   
19-7209 Monaghan SB0140 3             Monaghan-2                           1   

                                                  (1057, 'dnaA', '353I>353V')  
name    county   SB     ClusterNumber clade                                    
4798    Monaghan SB0140 4             Monaghan-1                            1  
19-7209 Monaghan SB0140 3             Monaghan-2                            1  


ValueError: columns overlap but no suffix specified: Index(['(687, 'dnaA', '229F')', '(937, 'dnaA', '313T>313A')',
       '(1057, 'dnaA', '353I>353V')', '(1303, 'dnaA', '435G>435R')',
       '(8741, 'gyrA', '480R')', '(9191, 'gyrA', '630G')',
       '(18207, 'pknA', '185A')', '(20333, 'rodA', '436I>436T')',
       '(32965, 'BQ2027_MB0030', '309R')', '(41437, 'BQ2027_MB0039', '50G')',
       ...
       '(4298265, 'espj', '281*>281Q')', '(4298265, 'espj', 'stop_lost')',
       '(4304318, 'mycp1', '406P>406L')', '(4311425, 'eccd2', '126G>126D')',
       '(4312718, 'esxD', '9P>9S')',
       '(4320406, 'BQ2027_MB3926c', '272T>272M')',
       '(4323191, 'BQ2027_MB3929c', '188A>188T')',
       '(4325917, 'LH57_21250', '118A>118T')',
       '(4339036, 'sigMa', '160S>160A')', '(4348990, 'rnpA', '58K>58E')'],
      dtype='object', length=591)

## lookup sample

In [261]:
def get_sample_details(name):
    x=csqmat.reset_index()
    r = x[x.name==name]
    return r

get_sample_details('19-2919').clade


42    Monaghan-2
Name: clade, dtype: object

In [None]:
def get_nearest_neighbours(name, n):
    #get n nearest samples in cluster matrix
    X=csqmat
    
    return


In [None]:
X=snpmat.set_index('pos').sample(250).sort_index()
mapping = {"A": 1, "T": 2, "C": 3, "G": 4}
X=X.replace(mapping)
sns.clustermap(X,row_cluster=False,yticklabels=False,figsize=(15,10))


In [None]:
Y=X.copy()
from sklearn.cluster import KMeans
#km1 = KMeans(n_clusters=4).fit(x)
#x['l1'] = km1.labels_

for level in range(4,9):
    km = KMeans(n_clusters=level).fit(x)
    Y['l'+str(level)] = km.labels_
Y=Y.sort_values(['l4','l5','l6','l7','l8'])   
Y['address'] = Y[['l4','l5','l6','l7','l8']].astype(str).agg('-'.join, axis=1)
Y[30:40]
Y.address.value_counts()

In [None]:
def snps_to_fasta(snpmat, outfile):
    """Write snp matrix to fasta file"""
    
    snpmat = snpmat.fillna('N')
    recs = []
    for col in snpmat.columns[1:]:
        seq = ''.join(snpmat[col])        
        seqrec = SeqRecord(Seq(seq),id=col)
        recs.append(seqrec)
    SeqIO.write(recs, outfile, 'fasta')
    return    

def tree_from_snps(snpmat):
    snps_to_fasta(snpmat, 'snps.fa') 
    treefile = trees.run_fasttree('snps.fa')
    tre = toytree.tree(treefile)
    mystyle = { "layout": 'r','node_sizes':1,'tip_labels_align':False}
    tre.ladderize().draw(**mystyle,width=700);
    return tre

snps_to_fasta(snpmat, 'snps.fa')
tree_from_snps(snpmat)

## combine sub sample of snps with original matrix


In [None]:
x=snpmat.set_index('pos').sample(150).sort_index()
m1 = x.sample(30,axis=1,random_state=2)
m1 = m1[2:]
m2 = x.sample(6,axis=1,random_state=5)
m2 = m2[:-4]
#print (m1)
    
tree_from_snps(m2)

In [None]:
print(m1[:5])
print(m2[:5])

In [None]:
new = m1.join(m2).fillna('N')
new
tree_from_snps(new)

In [None]:
def concat_snp_matrices(ref, other):
    """
    join two snp matrices
    we assume there are no private or uninformative snps in the inputs
    if site is missing in the other matrix we assign an N there
    discard sites that are not in ref?
    """
    
    for i,r in other.iterrows():
        
    return

r = snpmat.set_index('pos').T

concat_snp_matrices(m1, m2)

In [24]:
treefile = trees.run_fasttree('snps.fa')

## simulate genomes

In [None]:
ref = app.mbovis_genome
vcf_file='/'
cmd = 'perl /storage/btbgenie/simuG/simuG.pl \
     -refseq {r} \
     -snp_vcf {v} \
     -prefix output_prefix '.format(r=ref)