# Minigraph Mtb Pangenome Analysis - Part 1: Sequence processsing steps & k-mer comparison

### Maximillian Marin (mgmarin@g.harvard.edu)


### Import Statements

In [33]:
import numpy as np
import pandas as pd
import vcf
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.stats

%matplotlib inline

In [34]:
import plotly.express as px

In [35]:
from Bio import SeqIO
import plotly.express as px

# https://github.com/ipython/ipython/issues/10627
import os
os.environ['QT_QPA_PLATFORM']='offscreen'

In [36]:
import ete3 as rec
from ete3 import Tree

In [37]:
import json

Import [Gfapy](https://github.com/ggonnella/gfapy)

In [38]:
import gfapy

#### Pandas Viewing Settings

In [39]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [40]:
pd.set_option('max_colwidth', 400)

In [41]:
import time

In [42]:
import screed

In [43]:
import mmh3

## Define useful Kmer analysis functions

In [44]:
def jaccard_similarity(a, b):
    a = set(a)
    b = set(b)
    
    intersection = len(a.intersection(b))
    union = len(a.union(b))
    
    return intersection / union

In [45]:
def jaccard_containment(a, b):
    a = set(a)
    b = set(b)
    
    intersection = len(a.intersection(b))
    
    return intersection / len(a)

In [46]:
def jaccard_containment_FromUnqHashes_WiNP(a, b):
    
    # Assumes that the input lists HAVE UNIQUE values
    Num_intersection = np.intersect1d(a, b, assume_unique = True).shape[0]
        
    return Num_intersection / a.shape[0]

In [47]:
def jaccard_similarity_FromUnqHashes_WiNP(a, b):
    
    # Assumes that the input lists HAVE UNIQUE values
    Num_intersection = np.intersect1d(a, b, assume_unique = True).shape[0]
    
    Num_union = np.union1d(a, b).shape[0]
    
    return Num_intersection / Num_union

In [48]:
def build_kmers(sequence, ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1
    
    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)
        
    return kmers

In [49]:
#import screed # a library for reading in FASTA/FASTQ

def read_kmers_from_file(filename, ksize):
    all_kmers = []
    for record in screed.open(filename):
        sequence = record.sequence
        
        kmers = build_kmers(sequence, ksize)
        all_kmers += kmers

    return all_kmers

In [50]:
#import mmh3

def hash_kmer(kmer):
    # calculate the reverse complement
    rc_kmer = screed.rc(kmer)
    
    # determine whether original k-mer or reverse complement is lesser
    if kmer < rc_kmer:
        canonical_kmer = kmer
    else:
        canonical_kmer = rc_kmer
        
    # calculate murmurhash using a hash seed of 42
    hash = mmh3.hash64(canonical_kmer, 42)[0]
    if hash < 0: hash += 2**64
        
    # done
    return hash

In [51]:
def hash_kmers(kmers):
    hashes = []
    for kmer in kmers:
        hashes.append(hash_kmer(kmer))
    return hashes

In [52]:
def hash_kmers_ToUnqNP(kmers):
    hashes = []
    for kmer in kmers:
        hashes.append(hash_kmer(kmer))
        
    return np.unique(np.array(hashes))

# Parse sample metadata & preprocessed genome info/results

In [53]:
Repo_DataDir = "../../Data"
InputAsmPath_Dir = f"{Repo_DataDir}/231121.InputAsmTSVs.MtbSetV3.151CI"
!mkdir $InputAsmPath_Dir

MtbSetV3_151CI_InputAsmPATHs_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"

MtbSetV3_151CI_AsmSumm_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"


mkdir: cannot create directory ‘../../Data/231121.InputAsmTSVs.MtbSetV3.151CI’: File exists


## Testing reading back in "WGA158CI_AsmSummary_DF"

In [54]:

WGA158CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep = "\t")

SampleIDs_158CI_SOI = list( WGA158CI_AsmSummary_DF["SampleID"].values )
WGA158CI_SampleIDs = SampleIDs_158CI_SOI

print(','.join(SampleIDs_158CI_SOI) )

ID_To_PrimLineage_Dict = dict(WGA158CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values)
ID_To_SubLineage_Dict = dict( WGA158CI_AsmSummary_DF[["SampleID", "Lineage"]].values)
ID_To_Dataset_Dict = dict(WGA158CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values)


N0072,N0153,TB3113,TB1236,TB2659,TB2780,TB1612,TB2512,TB2981,TB3091,M0003941_3,TB3368,N0145,N0155,TB2995,TB3396,N0004,N1274,N0054,02_R1179,01_R1134,M0017522_5,M0016395_7,M0010874_7,02_R1708,02_R0894,01_R1430,M0014888_3,02_R1896,TB4620,TB3162,MT_0080,TB3054,TB3251,M0016737_0,TB2661,TB3237,TB3169,TB3386,TB3334,M0011368_9,TB2968,N1272,N1176,N1202,N1177,N0091,RW-TB008,9050-05,4549-04,696-05,702-06,706-05,8129-04,3003-06,8651-04,QC-3,QC-9,QC-5,QC-8,QC-10,QC-4,QC-7,QC-6,QC-1,mada_1-10,mada_2-46,mada_1-1,mada_1-36,mada_1-39,mada_1-51,mada_1-44,mada_117,mada_118,mada_122,mada_107,R27252,R23887,R30215,R30078,R29816,R30234,18_0621851,R36431,R28703,mada_115,mada_2-42,R31095,R28012,R37765,R27657,R25048,R24120,R28581,R29598,mada_1-11,R24100,R21408,R20574,R20260,R18043,R22601,R23146,R32929,R21893,R30420,R26778,R26791,R28980,R27725,R18040,R27937,mada_1-30,mada_2-31,mada_1-41,R21770,R21839,mada_1-32,R30396,R21363,R20896,mada_102,mada_129,mada_139,mada_151,mada_105,R15311,mada_103,mada_2-25,mada_112,ma

## PARSE METADATA FOR ALL 158 assemblies processed by this pipeline

In [55]:

WGA151CI_Asm_Path_DF = pd.read_csv(MtbSetV3_151CI_InputAsmPATHs_TSV, sep = "\t")
#WGA158CI_SampleIDs = list( WGA158CI_Asm_Path_DF["SampleID"].values )
WGA151CI_Asm_Path_DF.shape

(151, 4)

In [56]:
WGA151CI_Asm_Path_DF.head(4)

Unnamed: 0,SampleID,Dataset_Tag,Genome_ASM_PATH,ShortRead_Genome_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0072.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0072.SR.Asm.fasta
1,N0153,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0153.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0153.SR.Asm.fasta
2,TB3113,TB_Portals_24CI_R1,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB3113.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB3113.SR.Asm.fasta
3,TB1236,TB_Portals_24CI_R1,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB1236.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB1236.SR.Asm.fasta


In [57]:
SampleID_To_LRAsmFA_PATH_Dict = dict(WGA151CI_Asm_Path_DF[['SampleID', 'Genome_ASM_PATH']].values)

# Import/parse processed H37rv genome annotations

In [58]:
RepoRef_Dir = "../../References"

#ESX_Genes_List_TSV = f"{RepoRef_Dir}/190927_H37rv_ListOf_ESXgenes.tsv"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"
H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"
H37Rv_GenomeAnnotations_IntergenicRegions_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.IntergenicRegions.tsv"
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv"    
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed"

## H37Rv Gene Annotations TSV
H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep = "\t")
H37Rv_GeneInfo_Subset_DF = H37Rv_GenomeAnno_Genes_DF[["H37rv_GeneID", "Symbol", "Feature", "Functional_Category", "Is_Pseudogene", "Product", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]

RvID_To_Symbol_Dict = dict(H37Rv_GeneInfo_Subset_DF[['H37rv_GeneID', 'Symbol']].values)
Symbol_To_FuncCat_Dict = dict(H37Rv_GeneInfo_Subset_DF[['Symbol', 'Functional_Category']].values)
#Esx_Genes_DF = pd.read_csv(ESX_Genes_List_TSV, sep = '\t')



## Parse & Hash H37Rv K-mers

In [59]:
Mtb_RefDir="/n/data1/hms/dbmi/farhat/mm774/References"
H37rv_Ref_GBK_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.gbk"
H37rv_Ref_FA_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.fasta"

IS6110_Example_FA_PATH = f"{Mtb_RefDir}/IS6110_From_Rv0795_Rv0796.DNA.fasta"


In [60]:
#!ls -1 $Mtb_RefDir

In [61]:
H37Rv_kmers = read_kmers_from_file(H37rv_Ref_FA_PATH, 31)
H37Rv_Hashes_UnqNP = hash_kmers_ToUnqNP(H37Rv_kmers)


In [62]:
H37Rv_kmers[:5]

['TTGACCGATGACCCCGGTTCAGGCTTCACCA',
 'TGACCGATGACCCCGGTTCAGGCTTCACCAC',
 'GACCGATGACCCCGGTTCAGGCTTCACCACA',
 'ACCGATGACCCCGGTTCAGGCTTCACCACAG',
 'CCGATGACCCCGGTTCAGGCTTCACCACAGT']

In [63]:
H37Rv_Hashes_UnqNP[:5]

array([2.11865533e+12, 2.60404038e+12, 1.03948083e+13, 1.07936883e+13,
       1.31451542e+13])

In [64]:
type(H37Rv_Hashes_UnqNP)

numpy.ndarray

#### Read in IS6110 example K-mers (From Rv0795 & Rv0796)

In [65]:
IS6110_Ex1_kmers = read_kmers_from_file(IS6110_Example_FA_PATH, 31)
IS6110_Ex1_Hashes_UnqNP = hash_kmers_ToUnqNP(IS6110_Ex1_kmers)


### Process k-mers of representative isolates from different lineages 

In [66]:
# Lin1_N0072_kmers = read_kmers_from_file(SampleTag_ToPaths_Dict['N0072']["LRAsm_Bakta_FNA"], 31)
# Lin1_N0072_kmer_hashes_UnqNP = hash_kmers_ToUnqNP(Lin1_N0072_kmers)
     
# Lin2_DNA028_kmers = read_kmers_from_file(SampleTag_ToPaths_Dict['DNA028']["LRAsm_Bakta_FNA"], 31)
# Lin2_DNA028_kmer_hashes_UnqNP = hash_kmers_ToUnqNP(Lin2_DNA028_kmers)
       
# Lin3_N1274_kmers = read_kmers_from_file(SampleTag_ToPaths_Dict['N1274']["LRAsm_Bakta_FNA"], 31)
# Lin3_N1274_kmer_hashes_UnqNP = hash_kmers_ToUnqNP(Lin3_N1274_kmers)
   
# Lin4_DNA124_kmers = read_kmers_from_file(SampleTag_ToPaths_Dict['DNA124']["LRAsm_Bakta_FNA"], 31)
#Lin4_DNA124_kmer_hashes_UnqNP = hash_kmers_ToUnqNP(Lin4_DNA124_kmers)

#Lin5_N1272_kmers = read_kmers_from_file(SampleTag_ToPaths_Dict['N1272']["LRAsm_Bakta_FNA"], 31)
#Lin5_N1272_kmer_hashes_UnqNP = hash_kmers_ToUnqNP(Lin5_N1272_kmers)

#Lin6_N1177_kmers = read_kmers_from_file(SampleTag_ToPaths_Dict['N1177']["LRAsm_Bakta_FNA"], 31)
#Lin6_N1177_kmer_hashes_UnqNP = hash_kmers_ToUnqNP(Lin6_N1177_kmers)

#Lin8_RW_TB008_kmers = read_kmers_from_file(SampleTag_ToPaths_Dict['RW-TB008']["LRAsm_Bakta_FNA"], 31)
#Lin8_RW_TB008_kmer_hashes_UnqNP = hash_kmers_ToUnqNP(Lin8_RW_TB008_kmers)

In [67]:
#Lin8_RW_TB008_kmer_hashes_UnqNP.shape

In [68]:
#jaccard_similarity(H37Rv_Hashes, N0072_kmer_hashes)

In [69]:
#jaccard_containment(H37Rv_Hashes, N0072_kmer_hashes)

# Parse H37Rv reference gene sequences (MycoBrowser Version)

In [70]:
O2_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"
MycoBrowser_RefFiles_Dir = f"{O2_RefDir}/190619_Mycobrowser_H37rv_ReferenceFiles"

H37Rv_Genes_MycoBro_FA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_genes_v3.fasta"

H37Rv_Proteins_MycoBro_FAA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_proteins_v3_TrimmedHeader.fasta"

H37Rv_Proteins_NCBI_FAA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_proteins_v3_TrimmedHeader.fasta"

H37Rv_FAA_PATH = f"{O2_RefDir}/GCF_000195955.2_ASM19595v2_proteins.faa"

H37Rv_GBK_PATH = f"{O2_RefDir}/GCF_000195955.2_ASM19595v2_genomic.gbk"

In [71]:
!grep ^">" $H37Rv_Genes_MycoBro_FA | grep "dnaA"


>Rv0001|dnaA|CDS|1-1524|+|Chromosomal replication initiator protein DnaA


In [72]:
dictOf_H37Rv_MycoBrow_GeneSeq = {}
dictOf_H37Rv_MycoBrow_GeneSeq_Rv = {}
dictOf_H37Rv_MycoBrow_Gene_KmerHashes = {}


for index, record in tqdm(enumerate(SeqIO.parse(H37Rv_Genes_MycoBro_FA, "fasta"))):
    
    RecordName = record.name
    RvID = RecordName.split("|")[0]

    GeneID = RecordName.split("|")[1]
    S_Seq = str(record.seq).upper()
    
    dictOf_H37Rv_MycoBrow_GeneSeq[GeneID] = S_Seq
    dictOf_H37Rv_MycoBrow_GeneSeq_Rv[RvID] = S_Seq
    
    record_Hashes_UnqNP = hash_kmers_ToUnqNP(build_kmers(S_Seq, 31))
    dictOf_H37Rv_MycoBrow_Gene_KmerHashes[GeneID] = record_Hashes_UnqNP
    

4187it [00:31, 133.99it/s]


In [73]:
dictOf_H37Rv_MycoBrow_GeneSeq["dnaA"]

'TTGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCGGTCGTCTCCGAACTTAACGGCGACCCTAAGGTTGACGACGGACCCAGCAGTGATGCTAATCTCAGCGCTCCGCTGACCCCTCAGCAAAGGGCTTGGCTCAATCTCGTCCAGCCATTGACCATCGTCGAGGGGTTTGCTCTGTTATCCGTGCCGAGCAGCTTTGTCCAAAACGAAATCGAGCGCCATCTGCGGGCCCCGATTACCGACGCTCTCAGCCGCCGACTCGGACATCAGATCCAACTCGGGGTCCGCATCGCTCCGCCGGCGACCGACGAAGCCGACGACACTACCGTGCCGCCTTCCGAAAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGGGGCGATAACCAGCACAGTTGGCCAAGTTACTTCACCGAGCGCCCGCACAATACCGATTCCGCTACCGCTGGCGTAACCAGCCTTAACCGTCGCTACACCTTTGATACGTTCGTTATCGGCGCCTCCAACCGGTTCGCGCACGCCGCCGCCTTGGCGATCGCAGAAGCACCCGCCCGCGCTTACAACCCCCTGTTCATCTGGGGCGAGTCCGGTCTCGGCAAGACACACCTGCTACACGCGGCAGGCAACTATGCCCAACGGTTGTTCCCGGGAATGCGGGTCAAATATGTCTCCACCGAGGAATTCACCAACGACTTCATTAACTCGCTCCGCGATGACCGCAAGGTCGCATTCAAACGCAGCTACCGCGACGTAGACGTGCTGTTGGTCGACGACATCCAATTCATTGAAGGCAAAGAGGGTATTCAAGAGGAGTTCTTCCACACCTTCAACACCTTGCACAATGCCAACAAGCAAATCGTCATCTCATCTGACCGCCCACCCAAGCAGCTCGCCACCCTCGAGGACCGGCTGAGAACCCGCTTTGAGTGGGGGCTGATCACTGACGTACAACCACCC

In [74]:
dictOf_H37Rv_MycoBrow_GeneSeq_Rv["Rv0001"]

'TTGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCGGTCGTCTCCGAACTTAACGGCGACCCTAAGGTTGACGACGGACCCAGCAGTGATGCTAATCTCAGCGCTCCGCTGACCCCTCAGCAAAGGGCTTGGCTCAATCTCGTCCAGCCATTGACCATCGTCGAGGGGTTTGCTCTGTTATCCGTGCCGAGCAGCTTTGTCCAAAACGAAATCGAGCGCCATCTGCGGGCCCCGATTACCGACGCTCTCAGCCGCCGACTCGGACATCAGATCCAACTCGGGGTCCGCATCGCTCCGCCGGCGACCGACGAAGCCGACGACACTACCGTGCCGCCTTCCGAAAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGGGGCGATAACCAGCACAGTTGGCCAAGTTACTTCACCGAGCGCCCGCACAATACCGATTCCGCTACCGCTGGCGTAACCAGCCTTAACCGTCGCTACACCTTTGATACGTTCGTTATCGGCGCCTCCAACCGGTTCGCGCACGCCGCCGCCTTGGCGATCGCAGAAGCACCCGCCCGCGCTTACAACCCCCTGTTCATCTGGGGCGAGTCCGGTCTCGGCAAGACACACCTGCTACACGCGGCAGGCAACTATGCCCAACGGTTGTTCCCGGGAATGCGGGTCAAATATGTCTCCACCGAGGAATTCACCAACGACTTCATTAACTCGCTCCGCGATGACCGCAAGGTCGCATTCAAACGCAGCTACCGCGACGTAGACGTGCTGTTGGTCGACGACATCCAATTCATTGAAGGCAAAGAGGGTATTCAAGAGGAGTTCTTCCACACCTTCAACACCTTGCACAATGCCAACAAGCAAATCGTCATCTCATCTGACCGCCCACCCAAGCAGCTCGCCACCCTCGAGGACCGGCTGAGAACCCGCTTTGAGTGGGGGCTGATCACTGACGTACAACCACCC

# Generate array of all 31-mers in annotated MGEs in H37Rv genome

## a) Create DF for only MGEs (Insertion seqs + Phages)

In [75]:
H37Rv_GenomeAnno_Genes_DF.head(1)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,NotExcluded


In [76]:
H37Rv_Anno_InsSeqAndPhages_DF = H37Rv_GenomeAnno_Genes_DF.query("Functional_Category == 'insertion seqs and phages'")     
H37Rv_Anno_InsSeqAndPhages_DF.shape
                                                               

(147, 12)

In [77]:
H37Rv_Anno_InsSeqAndPhages_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
33,NC_000962.3,33581,33794,+,Rv0031,Rv0031,CDS,insertion seqs and phages,No,Possible remnant of a transposase,,InsertionSeqs_And_Phages
99,NC_000962.3,103709,104663,-,Rv0094c,Rv0094c,CDS,insertion seqs and phages,No,Conserved hypothetical protein,,InsertionSeqs_And_Phages


In [78]:
InsSeqAndPhages_GeneIDs = H37Rv_Anno_InsSeqAndPhages_DF["Symbol"].values
len(InsSeqAndPhages_GeneIDs)

147

In [79]:
InsSeqAndPhages_GeneIDs[:5]

array(['Rv0031', 'Rv0094c', 'Rv0095c', 'Rv0336', 'Rv0393'], dtype=object)

In [80]:
dictOf_H37Rv_MycoBrow_Gene_KmerHashes["Rv0031"].shape

(183,)

## b) Create a NP array of all 31 bp k-mer hashes for MGEs

In [81]:
InsSeqAndPhages_GeneIDs.shape

(147,)

In [82]:
InsSeqAndPhages_GeneIDs

array(['Rv0031', 'Rv0094c', 'Rv0095c', 'Rv0336', 'Rv0393', 'Rv0397',
       'Rv0515', 'Rv0605', 'Rv0606', 'Rv0741', 'Rv0755A', 'Rv0795',
       'Rv0796', 'Rv0797', 'Rv0829', 'Rv0850', 'Rv0920c', 'Rv0921',
       'Rv0922', 'Rv1034c', 'Rv1035c', 'Rv1036c', 'Rv1041c', 'Rv1042c',
       'Rv1047', 'Rv1054', 'Rv1055', 'Rv1128c', 'Rv1148c', 'Rv1149',
       'Rv1150', 'Rv1199c', 'Rv1313c', 'Rv1369c', 'Rv1370c', 'Rv1572c',
       'Rv1573', 'Rv1574', 'Rv1575', 'Rv1576c', 'Rv1577c', 'Rv1578c',
       'Rv1579c', 'Rv1580c', 'Rv1581c', 'Rv1582c', 'Rv1583c', 'Rv1584c',
       'Rv1585c', 'Rv1586c', 'Rv1587c', 'Rv1588c', 'Rv1701', 'Rv1702c',
       'Rv1756c', 'Rv1757c', 'Rv1763', 'Rv1764', 'Rv1765A', 'Rv1945',
       'Rv2013', 'Rv2014', 'Rv2085', 'Rv2086', 'Rv2087', 'Rv2100',
       'Rv2105', 'Rv2106', 'Rv2167c', 'Rv2168c', 'Rv2177c', 'Rv2278',
       'Rv2279', 'Rv2309c', 'Rv2310', 'Rv2354', 'Rv2355', 'Rv2424c',
       'Rv2479c', 'Rv2480c', 'Rv2512c', 'Rv2646', 'Rv2647', 'Rv2648',
       'Rv2649', 'Rv2

In [83]:
Rv_MGEs_Hashes_List = []

ListOfAll_Rv_MGE_Hashes_list = []

for i_GeneID, i_Hashes in dictOf_H37Rv_MycoBrow_Gene_KmerHashes.items():

    if i_GeneID in InsSeqAndPhages_GeneIDs:
        Rv_MGEs_Hashes_List.append(i_Hashes)
        ListOfAll_Rv_MGE_Hashes_list += list(i_Hashes)
        
Rv_MGEs_Hashes_NP = np.unique(np.concatenate(Rv_MGEs_Hashes_List))


In [84]:
len(ListOfAll_Rv_MGE_Hashes_list)

104238

In [85]:
len(Rv_MGEs_Hashes_List)

147

In [86]:
Rv_MGEs_Hashes_NP.shape

(69102,)

### Look at Jaccard Containment between H37Rv and Rv_MGEs

In [87]:
Rv_MGEs_Hashes_NP

array([8.12529937e+13, 1.19999884e+14, 5.08258766e+14, ...,
       1.84461311e+19, 1.84466019e+19, 1.84467278e+19])

In [88]:
H37Rv_Hashes_UnqNP

array([2.11865533e+12, 2.60404038e+12, 1.03948083e+13, ...,
       1.84467278e+19, 1.84467284e+19, 1.84467438e+19])

In [89]:
jaccard_containment_FromUnqHashes_WiNP(Rv_MGEs_Hashes_NP, H37Rv_Hashes_UnqNP)

1.0

In [90]:
jaccard_containment_FromUnqHashes_WiNP(H37Rv_Hashes_UnqNP, Rv_MGEs_Hashes_NP)

0.01589562466616704

In [91]:
jaccard_similarity_FromUnqHashes_WiNP(H37Rv_Hashes_UnqNP, Rv_MGEs_Hashes_NP)

0.01589562466616704

In [92]:
jaccard_similarity_FromUnqHashes_WiNP(Rv_MGEs_Hashes_NP, H37Rv_Hashes_UnqNP)

0.01589562466616704

In [93]:
jaccard_containment_FromUnqHashes_WiNP(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["Rv0031"], H37Rv_Hashes_UnqNP)

1.0

In [94]:
jaccard_containment_FromUnqHashes_WiNP(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["dnaA"], H37Rv_Hashes_UnqNP)

1.0

In [95]:
jaccard_containment_FromUnqHashes_WiNP(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["Rv0031"], H37Rv_Hashes_UnqNP)

1.0

In [96]:
jaccard_containment_FromUnqHashes_WiNP(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["dnaA"], dictOf_H37Rv_MycoBrow_Gene_KmerHashes["dnaA"])

1.0

In [97]:
jaccard_containment_FromUnqHashes_WiNP(IS6110_Ex1_Hashes_UnqNP, H37Rv_Hashes_UnqNP)

1.0

In [98]:
# Rv0795 & Rv0796

In [99]:
jaccard_containment_FromUnqHashes_WiNP(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["Rv0795"], IS6110_Ex1_Hashes_UnqNP)

1.0

In [100]:
jaccard_containment_FromUnqHashes_WiNP(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["Rv0796"], IS6110_Ex1_Hashes_UnqNP)

1.0

## c) Extra exploration of MGE gene annotations

In [101]:
H37Rv_Anno_InsSeqAndPhages_DF.query("Symbol == 'Rv0605'")

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
626,NC_000962.3,701405,702014,+,Rv0605,Rv0605,CDS,insertion seqs and phages,No,Possible resolvase,,InsertionSeqs_And_Phages


In [102]:
H37Rv_Anno_InsSeqAndPhages_DF[H37Rv_Anno_InsSeqAndPhages_DF["Product"].str.contains("transposase")].shape

(85, 12)

In [103]:
H37Rv_Anno_InsSeqAndPhages_DF[H37Rv_Anno_InsSeqAndPhages_DF["Product"].str.contains("transposase")].head(5)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
33,NC_000962.3,33581,33794,+,Rv0031,Rv0031,CDS,insertion seqs and phages,No,Possible remnant of a transposase,,InsertionSeqs_And_Phages
627,NC_000962.3,702015,702759,+,Rv0606,Rv0606,CDS,insertion seqs and phages,No,Possible transposase (fragment),,InsertionSeqs_And_Phages
771,NC_000962.3,832533,832848,+,Rv0741,Rv0741,CDS,insertion seqs and phages,No,Probable transposase (fragment),,InsertionSeqs_And_Phages
787,NC_000962.3,850341,850527,-,Rv0755A,Rv0755A,CDS,insertion seqs and phages,No,Putative transposase (fragment),,InsertionSeqs_And_Phages
829,NC_000962.3,889071,889398,+,Rv0795,Rv0795,CDS,insertion seqs and phages,No,Putative transposase for insertion sequence element IS6110 (fragment),,InsertionSeqs_And_Phages


In [104]:
H37Rv_Anno_InsSeqAndPhages_DF[H37Rv_Anno_InsSeqAndPhages_DF["Product"].str.contains("IS6110")].shape

(18, 12)

# Define output dir of the Mtb-WGA-SMK processing pipeline

In [105]:
# Define pipeline output directories

WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

#MtbWGA_SMK_Pipeline_OutputDir = WGA_SMK_Outputs_Dir + "/220427_WGA158CI_V1"
WGA151CI_SMK_OutputDir = WGA_SMK_Outputs_Dir + "/231121_MtbSetV3_151CI"
MtbWGA_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir

In [106]:
#!ls -1 $WGA_SMK_Outputs_Dir

## Define paths to all Assembly FAs + BAKTA Annotations (Short + Long read ASMs)

In [107]:
listOfSample_Tags = WGA158CI_SampleIDs

target_SMK_OutputDir = MtbWGA_SMK_Pipeline_OutputDir

SampleTag_ToPaths_Dict = {}

for SampleID in listOfSample_Tags:
    sample_Asm_OutputDir = target_SMK_OutputDir + "/AsmAnalysis/" + SampleID
    GenomeAnno_Dir = f"{sample_Asm_OutputDir}/GenomeAnnotation"
    
    # LR Bakta Anno Files
    Bakta_LR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_Asm_Bakta"
    
    i_LRAsm_Bakta_GFF = f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.gff3"
    i_LRAsm_Bakta_GBFF = f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.gbff"
    i_LRAsm_Bakta_FAA = f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.faa"
    i_LRAsm_Bakta_FFN = f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.ffn"
    i_LRAsm_Bakta_FNA = f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.fna"
    i_LRAsm_Bakta_TXT = f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.txt"

    
    MM2_AsmToRef_Output_Dir = sample_Asm_OutputDir + "/VariantCallingVersusH37Rv/MM2_AsmToH37rv"
    MM2_AsmToH37Rv_BAM = f"{MM2_AsmToRef_Output_Dir}/{SampleID}.mm2.AsmToH37Rv.bam"

    dictOfPaths_Temp = {}
    dictOfPaths_Temp["LRAsm_Bakta_GFF"] = i_LRAsm_Bakta_GFF  
    dictOfPaths_Temp["LRAsm_Bakta_GBFF"] = i_LRAsm_Bakta_GBFF
    dictOfPaths_Temp["LRAsm_Bakta_FAA"] = i_LRAsm_Bakta_FAA    
    dictOfPaths_Temp["LRAsm_Bakta_FFN"] = i_LRAsm_Bakta_FFN    
    dictOfPaths_Temp["LRAsm_Bakta_FNA"] = i_LRAsm_Bakta_FNA  
    dictOfPaths_Temp["LRAsm_Bakta_TXT"] = i_LRAsm_Bakta_TXT 

    dictOfPaths_Temp["MM2_AsmToH37Rv_BAM"] = MM2_AsmToH37Rv_BAM    
    
    SampleTag_ToPaths_Dict[SampleID] = dictOfPaths_Temp



## Define PATHS relevant to Minigraph analysis

In [108]:
target_OutputDir = MtbWGA_SMK_Pipeline_OutputDir

Minigraph_158CI_OutDir = f"{target_OutputDir}/Minigraph"

MG_WGA158CI_GFA = f"{Minigraph_158CI_OutDir}/Minigraph_H37rv_Vs_158CI.V1.gfa"
MG_WGA158CI_Bubble_SV_BED = f"{Minigraph_158CI_OutDir}/Minigraph_H37rv_Vs_158CI.V1.Bubble.SV.bed"
MG_WGA158CI_Stable_FA = f"{Minigraph_158CI_OutDir}/Minigraph_H37rv_Vs_158CI.V1.Stable.fa"

MG_WGA158CI_MergedSVInfo_TSV = f"{Minigraph_158CI_OutDir}/Minigraph_H37rv_Vs_158CI.MergedSV.Info.tsv"

MG_WGA158CI_MergedSVInfo_SVVCF = f"{Minigraph_158CI_OutDir}/Minigraph_H37rv_Vs_158CI.MergedSV.Info.svvcf"


In [109]:
!ls -alh $MtbWGA_SMK_Pipeline_OutputDir

total 288K
drwxrwsr-x   9 mm774 farhat  203 Nov 22 02:12 .
drwxrwsr-x  11 mm774 farhat  402 Nov 22 01:55 ..
drwxrwsr-x 155 mm774 farhat 3.8K Nov 22 02:06 AsmAnalysis
drwxrwsr-x   3 mm774 farhat   61 Nov 22 02:08 Busco_Download_Tmp
drwxrwsr-x   4 mm774 farhat  102 Nov 22 12:05 FastANI
drwxrwsr-x   2 mm774 farhat  351 Nov 22 02:06 Minigraph
drwxrwsr-x   3 mm774 farhat   25 Nov 22 01:53 O2logs
drwxrwsr-x  14 mm774 farhat  654 Nov 22 12:04 PanGenome_Analysis
drwxrwsr-x   4 mm774 farhat   78 Nov 22 02:12 SourMash


In [110]:
#!gfatools stat $MG_WGA158CI_GFA

## Parse in K-mers of complete Minigraph GFA (158 Mtb assemblies)

#### K-mers parsed for ALL nodes

In [111]:
Mtb_GFA_GP = gfapy.Gfa.from_file(MG_WGA158CI_GFA)

In [112]:
Mtb_GFA_GP.version

'gfa1'

In [113]:

listOfAll_GFA_Nodes = []

MG_dictOf_NodeInfo = {}

for line in tqdm(Mtb_GFA_GP.lines):
    #print(type(line))
    
    line_Str = str(line)
    
    if line_Str.startswith("S"):
        
        line_SplitByTab = line_Str.split("\t")
        
        S_Name = line_SplitByTab[1]
        
        MG_dictOf_NodeInfo[S_Name] = {}
        
        S_Seq = line_SplitByTab[2]
        
        Len_Seq = len(S_Seq)
        
        #record_Kmers = build_kmers(S_Seq, 31)
        #record_Hashes = hash_kmers(record_Kmers)
        
        record_Hashes_UnqNP = hash_kmers_ToUnqNP(build_kmers(S_Seq, 31))
        
        MG_dictOf_NodeInfo[S_Name]["Len"] = Len_Seq
        
        #MG_dictOf_NodeInfo[S_Name]["Kmers"] = record_Kmers

        MG_dictOf_NodeInfo[S_Name]["Kmer_Hashes"] = record_Hashes_UnqNP
        
        listOfAll_GFA_Nodes.append(S_Name)



100%|██████████| 7843/7843 [00:43<00:00, 180.61it/s] 


In [114]:
#record_Kmers

In [115]:
MG_dictOf_NodeInfo["s1"]["Len"]

1533

In [116]:
def getAllHash_ExceptTarget_NParray(dictOfHashes, targetToRemove):
    
    all_Hashes_ExceptTarget_List = []

    for i_SeqID, i_SeqInfoDict in dictOfHashes.items():
        
        i_Hashes = i_SeqInfoDict["Kmer_Hashes"]
        
        if i_SeqID != targetToRemove:
            all_Hashes_ExceptTarget_List.append(i_Hashes) 

    all_Hashes_ExceptTarget = np.unique(np.concatenate(all_Hashes_ExceptTarget_List))   
    
    return all_Hashes_ExceptTarget


In [117]:
Z = getAllHash_ExceptTarget_NParray(MG_dictOf_NodeInfo, "s1")
Z.shape

(4393516,)

## Parse the Bubble SV Summary BED from Minigraph

In [118]:
MG_SV_BED_DF = pd.read_csv(MG_WGA158CI_Bubble_SV_BED, sep = "\t", header=None)

MG_SV_BED_DF.columns = ["Chr", "Start", "End", "Unk1", "Unk2",
                        "Unk3", "Len_Ref", "Len_Alt", 
                        "Unk4", "Unk5", "Unk6", "NodePath", "Ref_Seq", "Alt_Seq"]


Target_Col = ["Chr", "Start", "End",
              "Len_Ref", "Len_Alt", 
              "NodePath", "Ref_Seq", "Alt_Seq"]


MG_SV_BED_DF = MG_SV_BED_DF[Target_Col]

# Remove the first and last nodes, these are not included in the SV of interest
MG_SV_BED_DF["NodePath_Trimmed"] = MG_SV_BED_DF["NodePath"].str.split(",").str[1:-1]

MG_SV_BED_DF["Start_Node"] = MG_SV_BED_DF["NodePath"].str.split(",").str[0]
MG_SV_BED_DF["End_Node"] = MG_SV_BED_DF["NodePath"].str.split(",").str[-1]

MG_SV_BED_DF.shape

(535, 11)

In [119]:
MG_SV_BED_DF.head(3)

Unnamed: 0,Chr,Start,End,Len_Ref,Len_Alt,NodePath,Ref_Seq,Alt_Seq,NodePath_Trimmed,Start_Node,End_Node
0,NC_000962.3,1533,1533,0,0,"s1,s2",*,*,[],s1,s2
1,NC_000962.3,1591,1652,61,1480,"s2,s2034,s2959,s2035,s3,s2036,s2036,s3,s2035,s2959,s2034,s4,s5",ATTGGCTGTGAGTGTCGCTGTGCACAAACCGCGCACAGACTCATACAGTCCCGGCGGTTCC,ATTGGCTGTGAGTGTCGCTGTGCACAAACCGCGCACAGACTCATACAGTCCCGGCGGTTCCGTTCGCCGGGACTGTATGAGTCTGTGCGCGGTTTGTGCACAGCGACACTCACAGCCAATTGAACCGCCCCGGTGAGTCCGGAGACTCTCTGATCTGAGACCTCAGCCGGCGGCTGGTCTCTGGCGTTGAGCGTAGTAGGCAGCCTCGAGTTCGACCGGCGGGACGTCGCCGCAGTACTGGTAGAGGCGGCGATGGTTGAACCAGTCGACCCAGCGCGCGGTGGCCAACTCGACATCCTCGATGGACCGCCAGGGCTTGCCGGGTTTGATCAGCTCGGTCTTGTATAGGCCGTTGATCGTCTCGGCTAGTGCATTGTCATAGGAGCTTCCGACCGC...,"[s2034, s2959, s2035, s3, s2036, s2036, s3, s2035, s2959, s2034, s4]",s2,s5
2,NC_000962.3,13622,13622,0,1358,"s5,s2744,s6",*,TGAACCGCCCCGGTGAGTCCGGAGACTCTCTGATCTGAGACCTCAGCCGGCGGCTGGTCTCTGGCGTTGAGCGTAGTAGGCAGCCTCGAGTTCGACCGGCGGGACGTCGCCGCAGTACTGGTAGAGGCGGCGATGGTTGAACCAGTCGACCCAGCGCGCGGTGGCCAACTCGACATCCTCGATGGACCGCCAGGGCTTGCCGGGTTTGATCAGCTCGGTCTTGTATAGGCCGTTGATCGTCTCGGCTAGTGCATTGTCATAGGAGCTTCCGACCGCTCCGACCGACGGTTGGATGCCTGCCTCGGCGAGCCGCTCGCTGAACCGGATCGATGTGTACTGAGATCCCCTATCCGTATGGTGGATAACGTCTTTCAGGTCGAGTACGCCTTCTTGTTG...,[s2744],s5,s6


### Create Dict of NodeID to BubbleID

In [120]:
NodeID_ToBubbleID_Dict = {}

for i, row in MG_SV_BED_DF.iterrows():
    
    Bubble_Num = i + 1
    BubbleID = "BubbleRegion_" + str(Bubble_Num)
    i_NodePath_Trimmed = row["NodePath_Trimmed"]
    
    for NodeID in i_NodePath_Trimmed: NodeID_ToBubbleID_Dict[NodeID] = BubbleID
        

In [121]:
NodeID_ToBubbleID_Dict["s3"]

'BubbleRegion_2'

## Classify all nodes by wheither they are a CORE NODE or a SV NODE within the graph

In [122]:
SV_NodeIDs_All = []
Non_SV_NodeIDs = []

for i, row in MG_SV_BED_DF.iterrows():

    i_NodePath_Trimmed = row["NodePath_Trimmed"]
    
    SV_NodeIDs_All += (i_NodePath_Trimmed)
    
    Non_SV_NodeIDs.append( row["Start_Node"] )
    Non_SV_NodeIDs.append( row["End_Node"] )

    
SV_NodeIDs_All = list(set(SV_NodeIDs_All))
Non_SV_NodeIDs = list(set(Non_SV_NodeIDs))
    

In [123]:
len(list(set(SV_NodeIDs_All)) )

2602

In [124]:
len(Non_SV_NodeIDs)

536

In [125]:
MG_dictOf_NodeInfo["s1"]["Len"]

1533

In [126]:
np.unique(np.array([])).shape

(0,)

# 1) Compare k-mer content of all SV nodes vs all SV nodes

In [155]:
len(MG_dictOf_NodeInfo.keys())

3138

In [156]:
len(list(MG_dictOf_NodeInfo.keys()))

3138

In [157]:
All_Nodes_List = list(MG_dictOf_NodeInfo.keys())
len(All_Nodes_List)

3138

In [158]:
3137 ** 2

9840769

In [159]:
start = time.time()

listOfTuples = []

for record_Name_1 in tqdm( All_Nodes_List ) :
    for record_Name_2 in All_Nodes_List:
        record_1_Hashes_UnqNP = MG_dictOf_NodeInfo[record_Name_1]["Kmer_Hashes"]
        record_2_Hashes_UnqNP = MG_dictOf_NodeInfo[record_Name_2]["Kmer_Hashes"]

        if (record_1_Hashes_UnqNP.shape[0] != 0) & (record_2_Hashes_UnqNP.shape[0] != 0):
            
            #record_1and2_JC = np.nan
            record_1and2_JC = jaccard_containment_FromUnqHashes_WiNP(record_1_Hashes_UnqNP, record_2_Hashes_UnqNP)
            record_1and2_JS = jaccard_similarity_FromUnqHashes_WiNP(record_1_Hashes_UnqNP, record_2_Hashes_UnqNP)
            
            
            record_1_SeqLen = MG_dictOf_NodeInfo[record_Name_1]["Len"]
            record_2_SeqLen = MG_dictOf_NodeInfo[record_Name_2]["Len"]

            #print(f"Jaccard Sim between Record 1 and 2: {record_1and2_JS}")
            
            i_Tuple = (record_Name_1, record_Name_2, 
                       record_1_SeqLen, record_2_SeqLen,
                       record_1and2_JS, record_1and2_JC)
            
            listOfTuples.append(i_Tuple)

            
end = time.time()
print(end - start)

AvA_Nodes_DF = pd.DataFrame(listOfTuples)
AvA_Nodes_DF.columns = ["RecordID_1", "RecordID_2", "Record1_Len", "Record2_Len", "JaccardSim", "JaccardContain"]

#AvA_Nodes_DF = AvA_Nodes_DF.query("RecordID_1 != RecordID_2")



  0%|          | 0/3138 [00:00<?, ?it/s][A
  0%|          | 1/3138 [00:00<44:01,  1.19it/s][A
  0%|          | 2/3138 [00:01<38:03,  1.37it/s][A
  0%|          | 3/3138 [00:01<33:45,  1.55it/s][A
  0%|          | 5/3138 [00:04<46:53,  1.11it/s][A
  0%|          | 6/3138 [00:07<1:21:25,  1.56s/it][A
  0%|          | 7/3138 [00:08<1:11:20,  1.37s/it][A
  0%|          | 8/3138 [00:09<59:12,  1.13s/it]  [A
  0%|          | 9/3138 [00:11<1:08:55,  1.32s/it][A
  0%|          | 10/3138 [00:12<1:09:58,  1.34s/it][A
  0%|          | 13/3138 [00:17<1:16:40,  1.47s/it][A
  0%|          | 14/3138 [00:19<1:18:39,  1.51s/it][A
  0%|          | 15/3138 [00:20<1:08:02,  1.31s/it][A
  1%|          | 16/3138 [00:20<58:25,  1.12s/it]  [A
  1%|          | 17/3138 [00:23<1:17:44,  1.49s/it][A
  1%|          | 18/3138 [00:23<1:04:45,  1.25s/it][A
  1%|          | 19/3138 [00:24<55:06,  1.06s/it]  [A
  1%|          | 20/3138 [00:25<46:20,  1.12it/s][A
  1%|          | 21/3138 [00:26<49:00,

2250.1082696914673


In [160]:
AvA_Nodes_NoZeroJS_DF = AvA_Nodes_DF.query("JaccardSim != 0")   
AvA_Nodes_NoZeroJS_DF.shape


(125615, 6)

In [161]:
AvA_Nodes_Trim_DF = AvA_Nodes_DF.query("RecordID_1 != RecordID_2").query("JaccardSim != 0")   
AvA_Nodes_Trim_DF.shape


(123104, 6)

## Look at All vs All Nodes Stats

In [162]:
AvA_Nodes_DF.head(3)

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
0,s1,s1,1533,1533,1.0,1.0
1,s1,s2,1533,58,0.0,0.0
2,s1,s3,1533,56,0.0,0.0


In [163]:
AvA_Nodes_DF.shape

(6305121, 6)

In [164]:
AvA_Nodes_DF.query("RecordID_1 != RecordID_2").shape

(6302610, 6)

In [165]:
AvA_Nodes_DF.query("JaccardSim >= 0.05").shape

(115801, 6)

In [166]:
AvA_Nodes_DF.query("JaccardSim >= 0.05")["RecordID_1"].nunique()

2511

In [167]:
AvA_Nodes_DF.query("JaccardSim >= 0.5")["RecordID_1"].nunique()

2511

In [168]:
AvA_Nodes_DF.query("JaccardSim >= 0.8")["RecordID_1"].nunique()

2511

In [169]:
AvA_Nodes_DF.query("JaccardSim >= 1")["RecordID_1"].nunique()

2511

## Output All vs All Node k-mer comparison to TSV

In [170]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

!mkdir $PangenomeAnalysis_Dir

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz"     

AvA_Nodes_Trim_DF.to_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t", index = False)


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV3’: File exists


In [171]:
!ls -1 $PangenomeAnalysis_Dir

Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz
Minigraph.NodeAnalysis.151CI.KmerAnalysis.VsAllOtherNodes.V4.tsv


In [None]:
#!rm $MG_AvA_Node_KmerAnalysis_TSV

In [172]:
!zcat $MG_AvA_Node_KmerAnalysis_TSV | wc -l

gzip: compressed data not read from a terminal. Use -f to force decompression.
For help, type: gzip -h
0


### Parse in processed All vs All Kmer analysis

In [179]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz"     


In [180]:
!ls -lah $MG_AvA_Node_KmerAnalysis_TSV_GZ

-rw-r--r-- 1 mm774 farhat 1.2M Nov 22 15:20 ../../Data/MtbPangenomeAnalysis_SetV3/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz


In [181]:
#!ls -lah $MG_AvA_Node_KmerAnalysis_TSV

In [182]:
AvA_Nodes_Trim_DF = pd.read_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t" )
AvA_Nodes_Trim_DF.shape

(123104, 6)

In [183]:
AvA_Nodes_Trim_DF.head()

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
0,s3,s2959,56,61,0.78125,0.961538
1,s7,s2247,1876,42,0.006501,0.006501
2,s8,s2823,542,87,0.096339,0.097656
3,s13,s2037,24063,49,0.000208,0.000208
4,s13,s2039,24063,114,0.000208,0.000208


In [184]:
!ls -lah $PangenomeAnalysis_Dir

total 1.6M
drwxr-sr-x  2 mm774 farhat  160 Nov 22 15:20 .
drwxr-sr-x 56 mm774 farhat 2.9K Nov 22 14:06 ..
-rw-r--r--  1 mm774 farhat 1.2M Nov 22 15:20 Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz
-rw-r--r--  1 mm774 farhat 168K Nov 22 14:43 Minigraph.NodeAnalysis.151CI.KmerAnalysis.VsAllOtherNodes.V4.tsv


# 2) Compare k-mer content of SV nodes to known references (Rv, all other nodes, etc)

In [178]:
NodeAnalysis_listOfRows = []


for NodeID, NodeInfo in tqdm( MG_dictOf_NodeInfo.items() ) :
    
    record_Hashes_UnqNP = NodeInfo["Kmer_Hashes"] 
               
    Len_Seq = NodeInfo["Len"]
        
    if record_Hashes_UnqNP.shape[0] != 0:
        Record_Hash_JC_WiH37Rv = jaccard_containment_FromUnqHashes_WiNP( record_Hashes_UnqNP, H37Rv_Hashes_UnqNP)
        Record_Hash_JC_WiIS6110 = jaccard_containment_FromUnqHashes_WiNP( record_Hashes_UnqNP, IS6110_Ex1_Hashes_UnqNP)

        allHashes_UnqNP_ExceptRecord = getAllHash_ExceptTarget_NParray(MG_dictOf_NodeInfo, NodeID)

        Record_Hash_JC_WiAllOtherNodes = jaccard_containment_FromUnqHashes_WiNP( record_Hashes_UnqNP, allHashes_UnqNP_ExceptRecord)
        Record_Hash_JC_WiRv_InsSeqAndPhages = jaccard_containment_FromUnqHashes_WiNP( record_Hashes_UnqNP, Rv_MGEs_Hashes_NP)

    else:
        Record_Hash_JC_WiH37Rv = 0
        Record_Hash_JC_WiIS6110 = 0
        Record_Hash_JC_WiAllOtherNodes = 0
        Record_Hash_JC_WiRv_InsSeqAndPhages = 0 
             
        if Len_Seq >= 31:
            print(f"No kmers were produced for segment: {NodeID}")
                
    Status_SVNode = (NodeID in SV_NodeIDs_All)
    
    i_Row = (NodeID,
             Len_Seq,
             Record_Hash_JC_WiH37Rv,
             Record_Hash_JC_WiIS6110,
             Record_Hash_JC_WiAllOtherNodes,
             Record_Hash_JC_WiRv_InsSeqAndPhages,
             Status_SVNode)
    
    NodeAnalysis_listOfRows.append(i_Row)

    
MG_NodeAnalysis_DF = pd.DataFrame(NodeAnalysis_listOfRows)

MG_NodeAnalysis_DF.columns = ["NodeID",
                              "SeqLength",
                              "Jaccard_Cont_WiRv",
                              "Jaccard_Cont_WiIS6110",
                              "Jaccard_Cont_WiAllOtherNodes",
                              "Jaccard_Cont_WiRv_InsSeqAndPhages",
                              "IsSVNode"]



  0%|          | 0/3138 [00:00<?, ?it/s][A
  0%|          | 1/3138 [00:00<41:56,  1.25it/s][A
  0%|          | 2/3138 [00:01<42:13,  1.24it/s][A
  0%|          | 3/3138 [00:02<42:03,  1.24it/s][A
  0%|          | 5/3138 [00:03<35:48,  1.46it/s][A
  0%|          | 6/3138 [00:04<37:44,  1.38it/s][A
  0%|          | 7/3138 [00:04<38:48,  1.34it/s][A
  0%|          | 8/3138 [00:05<39:34,  1.32it/s][A
  0%|          | 9/3138 [00:06<39:57,  1.31it/s][A
  0%|          | 10/3138 [00:07<40:05,  1.30it/s][A
  0%|          | 13/3138 [00:08<32:21,  1.61it/s][A
  0%|          | 14/3138 [00:08<34:50,  1.49it/s][A
  0%|          | 15/3138 [00:09<36:35,  1.42it/s][A
  1%|          | 16/3138 [00:10<37:55,  1.37it/s][A
  1%|          | 17/3138 [00:11<38:54,  1.34it/s][A
  1%|          | 18/3138 [00:11<39:57,  1.30it/s][A
  1%|          | 19/3138 [00:12<40:23,  1.29it/s][A
  1%|          | 20/3138 [00:13<40:44,  1.28it/s][A
  1%|          | 21/3138 [00:14<40:40,  1.28it/s][A
  1%|    

KeyboardInterrupt: 

### Add a "BubbleID" column to the node-level analysis

In [145]:
MG_NodeAnalysis_DF["BubbleID"] = MG_NodeAnalysis_DF["NodeID"].map(NodeID_ToBubbleID_Dict)
MG_NodeAnalysis_DF["BubbleID"] = MG_NodeAnalysis_DF["BubbleID"].fillna("None")
MG_NodeAnalysis_DF.shape

(3138, 8)

In [146]:
MG_NodeAnalysis_DF.head(10)

Unnamed: 0,NodeID,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiAllOtherNodes,Jaccard_Cont_WiRv_InsSeqAndPhages,IsSVNode,BubbleID
0,s1,1533,1.0,0.0,0.0,0.0,False,
1,s2,58,1.0,0.0,0.0,0.0,False,
2,s3,56,1.0,0.0,0.961538,0.0,True,BubbleRegion_2
3,s4,5,0.0,0.0,0.0,0.0,True,BubbleRegion_2
4,s5,11970,1.0,0.0,0.0,0.0,False,
5,s6,12847,1.0,0.0,0.0,0.0,False,
6,s7,1876,1.0,0.0,0.006501,0.0,True,BubbleRegion_4
7,s8,542,1.0,0.0,0.097656,0.0,False,
8,s9,5979,1.0,0.0,0.0,0.030761,False,
9,s10,4133,1.0,0.0,0.0,0.0,False,


In [135]:
MG_NodeAnalysis_DF.shape

(3138, 8)

## Output Minigraph NODE Kmer Analysis to TSV

In [147]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

!mkdir $PangenomeAnalysis_Dir

MG_Node_KmerAnalysis_TSV = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.KmerAnalysis.VsAllOtherNodes.V4.tsv"     

MG_NodeAnalysis_DF.to_csv(MG_Node_KmerAnalysis_TSV, sep = "\t", index = False)


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV3’: File exists


In [148]:
!ls -1 $PangenomeAnalysis_Dir

Minigraph.NodeAnalysis.151CI.KmerAnalysis.VsAllOtherNodes.V4.tsv


In [149]:
!ls -alh $MG_Node_KmerAnalysis_TSV

-rw-r--r-- 1 mm774 farhat 168K Nov 22 14:43 ../../Data/MtbPangenomeAnalysis_SetV3/Minigraph.NodeAnalysis.151CI.KmerAnalysis.VsAllOtherNodes.V4.tsv


In [150]:
!wc -l $MG_Node_KmerAnalysis_TSV

3139 ../../Data/MtbPangenomeAnalysis_SetV3/Minigraph.NodeAnalysis.151CI.KmerAnalysis.VsAllOtherNodes.V4.tsv


### Test parsing of NODE Kmer Analysis TSV

In [185]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

MG_Node_KmerAnalysis_TSV = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.KmerAnalysis.VsAllOtherNodes.V4.tsv"


In [186]:
MG_NodeAnalysis_DF = pd.read_csv(MG_Node_KmerAnalysis_TSV, sep = "\t" )
MG_NodeAnalysis_DF.shape  

(3138, 8)

In [187]:
MG_NodeAnalysis_DF.head()

Unnamed: 0,NodeID,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiAllOtherNodes,Jaccard_Cont_WiRv_InsSeqAndPhages,IsSVNode,BubbleID
0,s1,1533,1.0,0.0,0.0,0.0,False,
1,s2,58,1.0,0.0,0.0,0.0,False,
2,s3,56,1.0,0.0,0.961538,0.0,True,BubbleRegion_2
3,s4,5,0.0,0.0,0.0,0.0,True,BubbleRegion_2
4,s5,11970,1.0,0.0,0.0,0.0,False,


In [188]:
MG_NodeAnalysis_DF["Jaccard_Cont_WiRv"].describe()

count    3138.000000
mean        0.721741
std         0.435274
min         0.000000
25%         0.036899
50%         1.000000
75%         1.000000
max         1.000000
Name: Jaccard_Cont_WiRv, dtype: float64

In [None]:
#!ls -1 $PangenomeAnalysis_Dir/

In [189]:
!head $MG_Node_KmerAnalysis_TSV

NodeID	SeqLength	Jaccard_Cont_WiRv	Jaccard_Cont_WiIS6110	Jaccard_Cont_WiAllOtherNodes	Jaccard_Cont_WiRv_InsSeqAndPhages	IsSVNode	BubbleID
s1	1533	1.0	0.0	0.0	0.0	False	None
s2	58	1.0	0.0	0.0	0.0	False	None
s3	56	1.0	0.0	0.9615384615384616	0.0	True	BubbleRegion_2
s4	5	0.0	0.0	0.0	0.0	True	BubbleRegion_2
s5	11970	1.0	0.0	0.0	0.0	False	None
s6	12847	1.0	0.0	0.0	0.0	False	None
s7	1876	1.0	0.0	0.0065005417118093	0.0	True	BubbleRegion_4
s8	542	1.0	0.0	0.09765625	0.0	False	None
s9	5979	1.0	0.0	0.0	0.0307614725163893	False	None


# Reparse 2 DFs together

In [190]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz"     

MG_Node_KmerAnalysis_TSV = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.KmerAnalysis.VsAllOtherNodes.V4.tsv"



In [192]:
AvA_Nodes_Trim_DF = pd.read_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t" )
AvA_Nodes_Trim_DF.shape

(123104, 6)

In [193]:
AvA_Nodes_Trim_DF.head()

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
0,s3,s2959,56,61,0.78125,0.961538
1,s7,s2247,1876,42,0.006501,0.006501
2,s8,s2823,542,87,0.096339,0.097656
3,s13,s2037,24063,49,0.000208,0.000208
4,s13,s2039,24063,114,0.000208,0.000208


In [194]:
MG_NodeAnalysis_DF = pd.read_csv(MG_Node_KmerAnalysis_TSV, sep = "\t" )
MG_NodeAnalysis_DF.shape

(3138, 8)

In [195]:
MG_NodeAnalysis_DF.head()

Unnamed: 0,NodeID,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiAllOtherNodes,Jaccard_Cont_WiRv_InsSeqAndPhages,IsSVNode,BubbleID
0,s1,1533,1.0,0.0,0.0,0.0,False,
1,s2,58,1.0,0.0,0.0,0.0,False,
2,s3,56,1.0,0.0,0.961538,0.0,True,BubbleRegion_2
3,s4,5,0.0,0.0,0.0,0.0,True,BubbleRegion_2
4,s5,11970,1.0,0.0,0.0,0.0,False,
