In [1]:
#widen the notebook container so that cells span the full width of the browser window
#(display & HTML are imported from the public IPython.display module; IPython.core.display is a deprecated location for display)
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#### This notebook was created to convert the genotypes matrix for all isolates into a pairwise SNP distance matrix

In [2]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC
from Bio import Phylo

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
import itertools
import gzip
from scipy.sparse import csr_matrix

import networkx as nx
import scipy
from collections import Counter
from itertools import groupby
from operator import itemgetter
import numpy.ma as ma
import time

####################################################################################################################################################################################

## [1] Create a script to construct the pairwise similarity matrices (for A, C, G and T) and also the matrix that contains the number of non-NA sites called for each pair of isolates (NA).

####################################################################################################################################################################################

In [4]:
import numpy as np
from scipy.sparse import csr_matrix
import sys
import timeit

In [5]:
#INPUT - selects the condition used to turn the Genotypes Array into a Boolean array
#must be one of five options: 'A' , 'C' , 'G' , 'T' or 'not_NaN'
genotypes_bool_condition = 'G'

### Check runtime of matrix multiplication to see if runtime < 3 days for the *least* sparse binary matrix

In [10]:
#benchmark the sparse matrix multiplication on a 200-isolate subset before running it on every isolate

#code handed to timeit as the setup: loads the Genotypes Matrix, subsets it and builds the sparse binary matrices
code_to_setup = """
import numpy as np
from scipy.sparse import csr_matrix
import sys
import timeit

#load Genotypes Matrix
genotypes_array =  np.load('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/genotypes_matrix.npy')

#subset to 200 isolates for runtime check, as number of isolates scale by 10x - Runtime scales by 100x
genotypes_array = genotypes_array[: , 0:200]

#convert genotypes to it's tranpose so that Rows:isolates , Columns:SNP sites to set up for matrix multiplication of (isolate pair i) <-> (isolate pair j)
genotypes_array = genotypes_array.T

#convert genotypes array into a boolean (then to a binary matrix) based off of condition ('not_NaN') that will result in the least sparse matrix
Geno_bool = genotypes_array != 9

#delete Genotypes Matrix from memory
del genotypes_array
    
#convert boolean matrix into a binary matrix to prep for matrix multiplication
Geno_bool = Geno_bool.astype(int)

#convert binary matrix to a scipy sparse matrix to improve multiplication runtime
Geno_bool = csr_matrix(Geno_bool)

#store the transpose of this matrix as a scipy sparse matrix
Geno_bool_T = csr_matrix(Geno_bool.transpose())
"""

#code handed to timeit as the statement to time: the multiplication itself
code_to_test = """
#compute similarity matrix by multiplying Genotypes binary matrix by it's transpose
Sim_matrix = Geno_bool.dot(Geno_bool_T).toarray()"""

#run the multiplication 3 times, then divide the total by 3 to get the average number of seconds per multiplication
total_time_to_multiply = timeit.timeit(stmt = code_to_test , setup = code_to_setup , number = 3)
avg_time_to_multiply = total_time_to_multiply / 3.0

In [9]:
#display the average time (in seconds) of the 3 timed matrix multiplications on the 200-isolate subset
avg_time_to_multiply

12.32261864344279

### If runtime for this login-node is good, go through with matrix multiplication, otherwise abort

In [None]:
#avg time to multiply is stored in seconds, only go through with the full matrix multiplication
#if the multiplication for 200 isolates took <= 14 seconds on this node
if avg_time_to_multiply <= 14:

    #integer code that each base is compared against in the Genotypes Matrix
    allele_codes = {'A':0 , 'C':1 , 'G':2 , 'T':3}

    #fail fast on an unrecognized condition *before* the Genotypes Matrix is loaded
    #(otherwise Geno_bool would never be assigned and the cell would only fail later with a NameError)
    if (genotypes_bool_condition not in allele_codes) and (genotypes_bool_condition != 'not_NaN'):
        raise ValueError("genotypes_bool_condition must be one of 'A', 'C', 'G', 'T' or 'not_NaN', got: " + repr(genotypes_bool_condition))

    #load Genotypes Matrix
    genotypes_array = np.load('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/genotypes_matrix.npy')

    #transpose genotypes so that Rows:isolates , Columns:SNP sites to set up for matrix multiplication of (isolate i) <-> (isolate j)
    genotypes_array = genotypes_array.T

    #convert genotypes array into a boolean (then to a binary matrix) based off of the chosen condition
    if genotypes_bool_condition == 'not_NaN':
        #True wherever the genotype is anything other than 9
        Geno_bool = genotypes_array != 9
    else:
        #True wherever the genotype equals the code for the chosen base
        Geno_bool = genotypes_array == allele_codes[genotypes_bool_condition]

    #delete Genotypes Matrix from memory
    del genotypes_array

    #convert boolean matrix into a binary matrix to prep for matrix multiplication
    Geno_bool = Geno_bool.astype(int)

    #convert binary matrix to a scipy sparse matrix to improve multiplication runtime
    Geno_bool = csr_matrix(Geno_bool)

    #store the transpose of this matrix as a scipy sparse matrix
    Geno_bool_T = csr_matrix(Geno_bool.transpose())

    #compute similarity matrix by multiplying Genotypes binary matrix by its transpose
    #entry (i , j) counts the SNP sites at which isolate i and isolate j both satisfy the condition
    Sim_matrix = Geno_bool.dot(Geno_bool_T).toarray()

    #save similarity matrix constructed from multiplying binary genotypes matrix with its transpose
    np.save('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/pairwise_distance_matrix/similarity_matrix_' + genotypes_bool_condition , Sim_matrix , allow_pickle = True)

else:

    print('aborted because runtime is too long for this node!')
    print('Average time to multiply = ' + str(avg_time_to_multiply))

####################################################################################################################################################################################

## [2] Use script above to submit the computation of each similarity matrix as a separate job

####################################################################################################################################################################################

In [3]:
from slurmpy import Slurm
import os

#### Submit a job to compute each similarity matrix

Possible Inputs: 'A', 'C', 'G', 'T', & 'not_NaN'

In [5]:
#directory where you want output + error files (the same for every job, so only need to move there once)
os.chdir('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/pairwise_distance_matrix/similarity_matrix_computation_jobs/')

#submit one job per condition; add 'not_NaN' to the list to also submit that job
#NOTE: the loop variable reuses the name genotypes_bool_condition, so any value assigned to it earlier in the notebook is overwritten
for genotypes_bool_condition in ['A' , 'C' , 'G' , 'T']:

    #command that runs the similarity matrix script, with the condition passed as a command line argument
    sim_matrix_mult_job = 'python /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/python_scripts/similarity_matrix_computation_for_inhost_TB_dynamics.py ' + genotypes_bool_condition

    job_name = 'Sim_' + genotypes_bool_condition

    s = Slurm(job_name , {'partition':'medium' , 'N':'1' , 't':'1-6:00:00' , 'mem':'160G' , 'mail-type':'ALL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(sim_matrix_mult_job)

    #print() called with a single argument gives the same output under Python 2 and Python 3
    print(job_name + ' : ' + str(job_id))

submitted: Submitted batch job 4144931
submitted: Submitted batch job 4144933


Sim_A : 4144931
Sim_C : 4144933
Sim_G : 4144936
Sim_T : 4144937


submitted: Submitted batch job 4144936
submitted: Submitted batch job 4144937


#### Runtime for matrix multiplications

*Similarity Matrix for A* - Run time 04:51:44

*Similarity Matrix for T* - Run time 04:24:41

*Similarity Matrix for C* - Run time 14:21:01

*Similarity Matrix for G* - Run time 13:22:03

*Similarity Matrix for non NaN values* - Run time 2-04:21:47

####################################################################################################################################################################################

## [3] Create pairwise SNP distance matrix for the genotypes matrix from the Similarity Matrices

####################################################################################################################################################################################

In [2]:
import numpy as np

#### Load in similarity matrices for A, C, G, T & matrix that contains the number of pairwise *good calls* for each pair of isolates.

In [3]:
#directory that holds every similarity matrix (stated once instead of being repeated in each path)
sim_matrix_dir = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/pairwise_distance_matrix/'

#similarity matrix for each base
Sim_A_array = np.load(sim_matrix_dir + 'similarity_matrix_A.npy')
Sim_C_array = np.load(sim_matrix_dir + 'similarity_matrix_C.npy')
Sim_G_array = np.load(sim_matrix_dir + 'similarity_matrix_G.npy')
Sim_T_array = np.load(sim_matrix_dir + 'similarity_matrix_T.npy')

#similarity matrix for the 'not_NaN' condition
Sim_not_NaN = np.load(sim_matrix_dir + 'similarity_matrix_not_NaN.npy')

#### Use matrices above to compute pairwise distance matrix

In [4]:
#similarity matrix across all bases: start from the A matrix and add the C, G and T matrices to it in turn
#(the built-in sum builds new arrays, so none of the loaded matrices are modified)
Sim_array = sum((Sim_C_array , Sim_G_array , Sim_T_array) , Sim_A_array)

#number of SNPs each pair of isolates differs by = (sites with a high quality base call in both isolates) - (sites where both isolates have the same base)
pairwise_SNP_dist = np.subtract(Sim_not_NaN , Sim_array)

In [5]:
#dimensions of the pairwise SNP distance matrix
pairwise_SNP_dist.shape

(20352, 20352)

In [6]:
#display the pairwise SNP distance matrix
pairwise_SNP_dist

array([[   0, 1708, 1048, ..., 1056,  990, 1092],
       [1708,    0, 1619, ..., 1631, 1650, 1665],
       [1048, 1619,    0, ...,  642, 1014,  687],
       ...,
       [1056, 1631,  642, ...,    0, 1025,  329],
       [ 990, 1650, 1014, ..., 1025,    0, 1051],
       [1092, 1665,  687, ...,  329, 1051,    0]])

#### Save pairwise SNP distance matrix

In [7]:
#write the pairwise SNP distance matrix to disk
np.save(
    file = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/genotypes_matrix_and_tSNE/Genotype_Filtered_2/pairwise_distance_matrix/pairwise_SNP_distance_matrix' ,
    arr = pairwise_SNP_dist ,
    allow_pickle = True ,
)