# Minigraph Mtb SV Pan-Genome Evaluation
## Part 1: Sequence & graph processing

### Maximillian Marin (mgmarin@g.harvard.edu)


### Import Statements

In [301]:
import numpy as np
import pandas as pd
import vcf
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.stats

%matplotlib inline

In [302]:
import plotly.express as px 

In [303]:
from Bio import SeqIO
import plotly.express as px

# https://github.com/ipython/ipython/issues/10627
import os
os.environ['QT_QPA_PLATFORM']='offscreen'

In [304]:
import ete3 as rec
from ete3 import Tree

In [305]:
import json

Import [Gfapy](https://github.com/ggonnella/gfapy)

In [306]:
import gfapy

#### Pandas Viewing Settings

In [307]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [308]:
pd.set_option('max_colwidth', 400)

In [309]:
import time

## Define useful Kmer analysis functions

In [310]:
import screed

In [311]:
import mmh3

In [312]:
def build_kmers(sequence, ksize):
    """Return every overlapping window of length ``ksize`` in ``sequence``.

    Windows are listed left to right, one per start position. The result is
    an empty list when ``sequence`` is shorter than ``ksize``.
    """
    last_start = len(sequence) - ksize
    return [sequence[pos:pos + ksize] for pos in range(last_start + 1)]

In [313]:
#import screed a library for reading in FASTA/FASTQ

def read_kmers_from_file(filename, ksize):
    """Collect the k-mers of every record in a sequence file.

    The file is opened with ``screed.open``; each record's ``sequence`` is
    split into k-mers by ``build_kmers`` and the per-record lists are
    concatenated in file order (duplicates are kept).
    """
    kmers_from_all_records = []
    for seq_record in screed.open(filename):
        kmers_from_all_records.extend(build_kmers(seq_record.sequence, ksize))

    return kmers_from_all_records

In [314]:
def hash_kmer(kmer):
    """Hash the canonical form of ``kmer`` to a non-negative integer.

    The canonical form is whichever of the k-mer and its reverse complement
    compares smaller, so a k-mer and its reverse complement hash identically.
    """
    revcomp = screed.rc(kmer)

    # keep the smaller of the two orientations as the canonical k-mer
    canonical_kmer = kmer if kmer < revcomp else revcomp

    # first element of the murmurhash result, using a hash seed of 42
    kmer_hash = mmh3.hash64(canonical_kmer, 42)[0]

    # move negative values into the non-negative range
    if kmer_hash < 0:
        kmer_hash += 2**64

    return kmer_hash

In [315]:
# def hash_kmers(kmers):
#     hashes = []
#     for kmer in kmers:
#         hashes.append(hash_kmer(kmer))
#     return hashes

# def hash_kmers_ToUnqNP(kmers):
#     hashes = []
#     for kmer in kmers:
#         hashes.append(hash_kmer(kmer))
        
#     return np.unique(np.array(hashes))

def hash_kmers_ToSet(kmers):
    """Return the set of ``hash_kmer`` values for an iterable of k-mers."""
    return {hash_kmer(one_kmer) for one_kmer in kmers}

In [316]:

def jaccard_containment_FromSets(a, b):
    '''
    Return the containment of set a in set b: |a & b| / |a|.

    This is the fraction of a's elements that also occur in b, so the
    measure is asymmetric: swapping a and b generally changes the result.

    An empty a has no elements to be contained; 0.0 is returned for it
    rather than raising ZeroDivisionError.
    '''
    if not a:
        return 0.0

    return len(a.intersection(b)) / len(a)

def jaccard_similarity_FromSets(a, b):
    '''
    Return the Jaccard similarity of sets a and b:
    (number of shared elements) / (number of elements in either set).
    '''
    return len(a.intersection(b)) / len(a.union(b))


In [317]:
def getAllHash_ExceptTargets_Set_V2(dictOfHashes, targetsToRemove):
    """Pool the k-mer hashes of every sequence NOT listed in ``targetsToRemove``.

    ``dictOfHashes`` maps a sequence ID to an info dict whose
    ``"Kmer_Hashes_Set"`` entry holds that sequence's hashes. Returns a new
    set with the union of those entries over all non-excluded IDs.
    """
    # set membership keeps the per-sequence exclusion check cheap
    excluded_ids = set(targetsToRemove)
    pooled_hashes = set()

    for seq_id, seq_info in dictOfHashes.items():
        if seq_id in excluded_ids:
            continue
        pooled_hashes.update(seq_info["Kmer_Hashes_Set"])

    return pooled_hashes

In [318]:
def getAllHash_InTargetSeqs_Set(dictOfHashes, targetsToKeep):
    """Pool the k-mer hashes of the sequences listed in ``targetsToKeep``.

    ``dictOfHashes`` maps a sequence ID to an info dict whose
    ``"Kmer_Hashes_Set"`` entry holds that sequence's hashes. Returns a new
    set with the union of those entries over the IDs in ``targetsToKeep``;
    IDs absent from ``dictOfHashes`` are ignored.
    """
    all_Hashes_InTarget = set()

    for i_SeqID, i_SeqInfoDict in dictOfHashes.items():

        # Keep only the requested targets. The previous `not in` test
        # collected every sequence EXCEPT the targets.
        if i_SeqID in targetsToKeep:
            all_Hashes_InTarget.update(i_SeqInfoDict["Kmer_Hashes_Set"])

    return all_Hashes_InTarget

In [319]:
#%reload_ext autoreload
#%autoreload 2

### import panqc toolkit functions
#from panqc.kmerlib import hash_kmers_ToSet, jaccard_similarity_FromSets, jaccard_containment_FromSets


In [320]:
# def jaccard_similarity(a, b):
#     a = set(a)
#     b = set(b)
    
#     intersection = len(a.intersection(b))
#     union = len(a.union(b))
    
#     return intersection / union

In [321]:
# def jaccard_containment(a, b):
#     a = set(a)
#     b = set(b)
    
#     intersection = len(a.intersection(b))
    
#     return intersection / len(a)

In [322]:
# def jaccard_containment_FromUnqHashes_WiNP(a, b):
    
#     # Assumes that the input lists HAVE UNIQUE values
#     Num_intersection = np.intersect1d(a, b, assume_unique = True).shape[0]
        
#     return Num_intersection / a.shape[0]

In [323]:
# def jaccard_similarity_FromUnqHashes_WiNP(a, b):
    
#     # Assumes that the input lists HAVE UNIQUE values
#     Num_intersection = np.intersect1d(a, b, assume_unique = True).shape[0]
    
#     Num_union = np.union1d(a, b).shape[0]
    
#     return Num_intersection / Num_union

In [324]:
# def build_kmers(sequence, ksize):
#     kmers = []
#     n_kmers = len(sequence) - ksize + 1
    
#     for i in range(n_kmers):
#         kmer = sequence[i:i + ksize]
#         kmers.append(kmer)
        
#     return kmers

In [325]:
# #import screed # a library for reading in FASTA/FASTQ

# def read_kmers_from_file(filename, ksize):
#     all_kmers = []
#     for record in screed.open(filename):
#         sequence = record.sequence
        
#         kmers = build_kmers(sequence, ksize)
#         all_kmers += kmers

#     return all_kmers

In [326]:
# #import mmh3

# def hash_kmer(kmer):
#     # calculate the reverse complement
#     rc_kmer = screed.rc(kmer)
    
#     # determine whether original k-mer or reverse complement is lesser
#     if kmer < rc_kmer:
#         canonical_kmer = kmer
#     else:
#         canonical_kmer = rc_kmer
        
#     # calculate murmurhash using a hash seed of 42
#     hash = mmh3.hash64(canonical_kmer, 42)[0]
#     if hash < 0: hash += 2**64
        
#     # done
#     return hash

In [327]:
# def hash_kmers(kmers):
#     hashes = []
#     for kmer in kmers:
#         hashes.append(hash_kmer(kmer))
#     return hashes

In [328]:
# def hash_kmers_ToUnqNP(kmers):
#     hashes = []
#     for kmer in kmers:
#         hashes.append(hash_kmer(kmer))
        
#     return np.unique(np.array(hashes))

# Parse sample metadata & preprocessed genome info/results

In [329]:
Repo_DataDir = "../../Data"
InputAsmPath_Dir = f"{Repo_DataDir}/231121.InputAsmTSVs.MtbSetV3.151CI"
!mkdir $InputAsmPath_Dir

MtbSetV3_151CI_InputAsmPATHs_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"

MtbSetV3_151CI_AsmSumm_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"


mkdir: cannot create directory ‘../../Data/231121.InputAsmTSVs.MtbSetV3.151CI’: File exists


## Read table of Mtb isolate info

In [330]:

# Load the per-isolate assembly summary table (tab-separated)
WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep = "\t")

# Sample IDs in table order; both names below refer to the same list object
SampleIDs_151CI_SOI = list( WGA151CI_AsmSummary_DF["SampleID"].values )
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

# SampleID -> column value lookups built from two-column slices of the table
ID_To_PrimLineage_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values)
ID_To_SubLineage_Dict = dict( WGA151CI_AsmSummary_DF[["SampleID", "Lineage"]].values)
ID_To_Dataset_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values)


## PARSE METADATA FOR ALL 151 assemblies processed by this pipeline

In [331]:

WGA151CI_Asm_Path_DF = pd.read_csv(MtbSetV3_151CI_InputAsmPATHs_TSV, sep = "\t")
WGA151CI_Asm_Path_DF.shape

(151, 4)

In [332]:
WGA151CI_Asm_Path_DF.head(4)

Unnamed: 0,SampleID,Dataset_Tag,Genome_ASM_PATH,ShortRead_Genome_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0072.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0072.SR.Asm.fasta
1,N0153,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0153.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0153.SR.Asm.fasta
2,TB3113,TB_Portals_24CI_R1,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB3113.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB3113.SR.Asm.fasta
3,TB1236,TB_Portals_24CI_R1,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB1236.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB1236.SR.Asm.fasta


In [333]:
SampleID_To_LRAsmFA_PATH_Dict = dict(WGA151CI_Asm_Path_DF[['SampleID', 'Genome_ASM_PATH']].values)

# Import/parse processed H37rv genome annotations

In [334]:
!ls -1 ../../References

201027_H37rv_AnnotatedGenes_And_IntergenicRegions
README.md


In [335]:
# Paths to the H37Rv annotation tables shipped in the repository's References dir
RepoRef_Dir = "../../References"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"
H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"
H37Rv_GenomeAnnotations_IntergenicRegions_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.IntergenicRegions.tsv"
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv"    
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed"

## H37Rv Gene Annotations TSV
# Only the genes table is loaded in this cell; the other paths are defined for later use
H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep = "\t")
# Descriptive columns only (coordinate columns are left out)
H37Rv_GeneInfo_Subset_DF = H37Rv_GenomeAnno_Genes_DF[["H37rv_GeneID", "Symbol", "Feature", "Functional_Category", "Is_Pseudogene", "Product", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]

# Lookup dicts: Rv locus ID -> gene symbol, and gene symbol -> functional category
RvID_To_Symbol_Dict = dict(H37Rv_GeneInfo_Subset_DF[['H37rv_GeneID', 'Symbol']].values)
Symbol_To_FuncCat_Dict = dict(H37Rv_GeneInfo_Subset_DF[['Symbol', 'Functional_Category']].values)



## Parse & Hash H37Rv K-mers

In [336]:
Mtb_RefDir="/n/data1/hms/dbmi/farhat/mm774/References"
H37rv_Ref_GBK_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.gbk"
H37rv_Ref_FA_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.fasta"

IS6110_Example_FA_PATH = f"{Mtb_RefDir}/IS6110_From_Rv0795_Rv0796.DNA.fasta"


In [337]:
#!ls -1 $Mtb_RefDir

In [338]:
H37Rv_kmers = read_kmers_from_file(H37rv_Ref_FA_PATH, 31)

#H37Rv_Hashes = hash_kmers(H37Rv_kmers)
#H37Rv_Hashes_UnqNP = hash_kmers_ToUnqNP(H37Rv_kmers)

H37Rv_Hashes_Set = hash_kmers_ToSet(H37Rv_kmers)

In [339]:
H37Rv_kmers[:5]

['TTGACCGATGACCCCGGTTCAGGCTTCACCA',
 'TGACCGATGACCCCGGTTCAGGCTTCACCAC',
 'GACCGATGACCCCGGTTCAGGCTTCACCACA',
 'ACCGATGACCCCGGTTCAGGCTTCACCACAG',
 'CCGATGACCCCGGTTCAGGCTTCACCACAGT']

#### Read in IS6110 example K-mers (From Rv0795 & Rv0796)

In [340]:
IS6110_Ex1_kmers = read_kmers_from_file(IS6110_Example_FA_PATH, 31)

IS6110_Ex1_Hashes_Set = hash_kmers_ToSet(IS6110_Ex1_kmers)


# Parse H37Rv reference gene sequences (MycoBrowser Version)

In [341]:
# Reference file locations on the cluster filesystem
O2_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"
MycoBrowser_RefFiles_Dir = f"{O2_RefDir}/190619_Mycobrowser_H37rv_ReferenceFiles"

# Gene nucleotide sequences; headers look like ">Rv0001|dnaA|CDS|1-1524|+|..."
H37Rv_Genes_MycoBro_FA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_genes_v3.fasta"

H37Rv_Proteins_MycoBro_FAA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_proteins_v3_TrimmedHeader.fasta"

# NOTE(review): this "NCBI" variable points at the same MycoBrowser file as
# H37Rv_Proteins_MycoBro_FAA above; looks like a copy-paste, confirm the intended path.
H37Rv_Proteins_NCBI_FAA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_proteins_v3_TrimmedHeader.fasta"

H37Rv_FAA_PATH = f"{O2_RefDir}/GCF_000195955.2_ASM19595v2_proteins.faa"

# Same file as H37rv_Ref_GBK_PATH defined in the earlier "Parse & Hash H37Rv K-mers" cell
H37Rv_GBK_PATH = f"{O2_RefDir}/GCF_000195955.2_ASM19595v2_genomic.gbk"

In [342]:
!grep ^">" $H37Rv_Genes_MycoBro_FA | grep "dnaA"


>Rv0001|dnaA|CDS|1-1524|+|Chromosomal replication initiator protein DnaA


In [343]:
# Parse every gene record of the MycoBrowser FASTA and store, per gene symbol,
# its upper-cased sequence and the set of its 31-mer hashes.
dictOf_H37Rv_MycoBrow_GeneSeq = {}
dictOf_H37Rv_MycoBrow_GeneSeq_Rv = {}    # stays empty: its only assignment is commented out below
dictOf_H37Rv_MycoBrow_Gene_KmerHashes = {}


for index, record in tqdm(enumerate(SeqIO.parse(H37Rv_Genes_MycoBro_FA, "fasta"))):
    
    # Header fields are "|"-separated: field 0 is the Rv locus ID, field 1 the gene symbol
    RecordName = record.name
    RvID = RecordName.split("|")[0]

    GeneID = RecordName.split("|")[1]
    S_Seq = str(record.seq).upper()
    
    # NOTE(review): both dicts are keyed by gene symbol, so records sharing a
    # symbol overwrite one another (the run shown parsed 4187 records, while a
    # later cell iterates 4173 dict entries).
    dictOf_H37Rv_MycoBrow_GeneSeq[GeneID] = S_Seq
    #dictOf_H37Rv_MycoBrow_GeneSeq_Rv[RvID] = S_Seq
    
    #record_Hashes_UnqNP = hash_kmers_ToUnqNP(build_kmers(S_Seq, 31))
    
    # record_Hashes_Set stays defined after the loop and is read by a later cell
    record_Hashes_Set = hash_kmers_ToSet(build_kmers(S_Seq, 31))

    dictOf_H37Rv_MycoBrow_Gene_KmerHashes[GeneID] = record_Hashes_Set
    

4187it [00:28, 146.29it/s]


In [344]:
dictOf_H37Rv_MycoBrow_GeneSeq["dnaA"]

'TTGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCGGTCGTCTCCGAACTTAACGGCGACCCTAAGGTTGACGACGGACCCAGCAGTGATGCTAATCTCAGCGCTCCGCTGACCCCTCAGCAAAGGGCTTGGCTCAATCTCGTCCAGCCATTGACCATCGTCGAGGGGTTTGCTCTGTTATCCGTGCCGAGCAGCTTTGTCCAAAACGAAATCGAGCGCCATCTGCGGGCCCCGATTACCGACGCTCTCAGCCGCCGACTCGGACATCAGATCCAACTCGGGGTCCGCATCGCTCCGCCGGCGACCGACGAAGCCGACGACACTACCGTGCCGCCTTCCGAAAATCCTGCTACCACATCGCCAGACACCACAACCGACAACGACGAGATTGATGACAGCGCTGCGGCACGGGGCGATAACCAGCACAGTTGGCCAAGTTACTTCACCGAGCGCCCGCACAATACCGATTCCGCTACCGCTGGCGTAACCAGCCTTAACCGTCGCTACACCTTTGATACGTTCGTTATCGGCGCCTCCAACCGGTTCGCGCACGCCGCCGCCTTGGCGATCGCAGAAGCACCCGCCCGCGCTTACAACCCCCTGTTCATCTGGGGCGAGTCCGGTCTCGGCAAGACACACCTGCTACACGCGGCAGGCAACTATGCCCAACGGTTGTTCCCGGGAATGCGGGTCAAATATGTCTCCACCGAGGAATTCACCAACGACTTCATTAACTCGCTCCGCGATGACCGCAAGGTCGCATTCAAACGCAGCTACCGCGACGTAGACGTGCTGTTGGTCGACGACATCCAATTCATTGAAGGCAAAGAGGGTATTCAAGAGGAGTTCTTCCACACCTTCAACACCTTGCACAATGCCAACAAGCAAATCGTCATCTCATCTGACCGCCCACCCAAGCAGCTCGCCACCCTCGAGGACCGGCTGAGAACCCGCTTTGAGTGGGGGCTGATCACTGACGTACAACCACCC

In [345]:
list(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["dnaA"])[:10]

[13580233940393664509,
 5138456728421695490,
 3618210997640110108,
 11878855257731645464,
 15411791937939046426,
 18314959098918223898,
 17544408214687465506,
 16638420273290518564,
 5978695591246659638,
 4065706600454856771]

# Generate array of all 31-mers in annotated MGEs in H37Rv genome

## a) Create DF for only MGEs (Insertion seqs + Phages)

In [346]:
H37Rv_GenomeAnno_Genes_DF.head(1)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,NotExcluded


In [347]:
H37Rv_Anno_InsSeqAndPhages_DF = H37Rv_GenomeAnno_Genes_DF.query("Functional_Category == 'insertion seqs and phages'")     
H37Rv_Anno_InsSeqAndPhages_DF.shape
                                                               

(147, 12)

In [348]:
H37Rv_Anno_InsSeqAndPhages_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
33,NC_000962.3,33581,33794,+,Rv0031,Rv0031,CDS,insertion seqs and phages,No,Possible remnant of a transposase,,InsertionSeqs_And_Phages
99,NC_000962.3,103709,104663,-,Rv0094c,Rv0094c,CDS,insertion seqs and phages,No,Conserved hypothetical protein,,InsertionSeqs_And_Phages


In [349]:
InsSeqAndPhages_GeneIDs = H37Rv_Anno_InsSeqAndPhages_DF["Symbol"].values
len(InsSeqAndPhages_GeneIDs)

147

In [350]:
InsSeqAndPhages_GeneIDs[:5]

array(['Rv0031', 'Rv0094c', 'Rv0095c', 'Rv0336', 'Rv0393'], dtype=object)

In [351]:
len(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["Rv0031"] )

183

## b) Create a set of all 31 bp k-mer hashes for annotated MGEs (Phage sequences & Insertion Sequences)

In [352]:
InsSeqAndPhages_GeneIDs.shape

(147,)

In [353]:
InsSeqAndPhages_GeneIDs

array(['Rv0031', 'Rv0094c', 'Rv0095c', 'Rv0336', 'Rv0393', 'Rv0397',
       'Rv0515', 'Rv0605', 'Rv0606', 'Rv0741', 'Rv0755A', 'Rv0795',
       'Rv0796', 'Rv0797', 'Rv0829', 'Rv0850', 'Rv0920c', 'Rv0921',
       'Rv0922', 'Rv1034c', 'Rv1035c', 'Rv1036c', 'Rv1041c', 'Rv1042c',
       'Rv1047', 'Rv1054', 'Rv1055', 'Rv1128c', 'Rv1148c', 'Rv1149',
       'Rv1150', 'Rv1199c', 'Rv1313c', 'Rv1369c', 'Rv1370c', 'Rv1572c',
       'Rv1573', 'Rv1574', 'Rv1575', 'Rv1576c', 'Rv1577c', 'Rv1578c',
       'Rv1579c', 'Rv1580c', 'Rv1581c', 'Rv1582c', 'Rv1583c', 'Rv1584c',
       'Rv1585c', 'Rv1586c', 'Rv1587c', 'Rv1588c', 'Rv1701', 'Rv1702c',
       'Rv1756c', 'Rv1757c', 'Rv1763', 'Rv1764', 'Rv1765A', 'Rv1945',
       'Rv2013', 'Rv2014', 'Rv2085', 'Rv2086', 'Rv2087', 'Rv2100',
       'Rv2105', 'Rv2106', 'Rv2167c', 'Rv2168c', 'Rv2177c', 'Rv2278',
       'Rv2279', 'Rv2309c', 'Rv2310', 'Rv2354', 'Rv2355', 'Rv2424c',
       'Rv2479c', 'Rv2480c', 'Rv2512c', 'Rv2646', 'Rv2647', 'Rv2648',
       'Rv2649', 'Rv2

In [354]:

# Union of the 31-mer hashes of every gene annotated as an insertion
# sequence or phage (gene symbols listed in InsSeqAndPhages_GeneIDs).
Rv_MGEs_Hashes_Set = set()

for i_GeneID, i_Hashes in tqdm(dictOf_H37Rv_MycoBrow_Gene_KmerHashes.items()):

    if i_GeneID in InsSeqAndPhages_GeneIDs:
        Rv_MGEs_Hashes_Set.update(i_Hashes)

# The set built by the loop is the result. A former line here overwrote it
# from `ListOfAll_Rv_MGE_Hashes_list`, a name this notebook never defines
# (NameError on a fresh kernel), so that line was removed.

Rv_MGEs_Hashes_List = list(Rv_MGEs_Hashes_Set)

100%|██████████| 4173/4173 [00:00<00:00, 54514.40it/s]


In [355]:
len(Rv_MGEs_Hashes_Set) 

69102

In [356]:
def getAllHashes_InTargetSeqs(dictOfHashes, targetsToKeep):
    """Pool the hashes of the sequences whose ID is in ``targetsToKeep``.

    ``dictOfHashes`` maps a sequence ID directly to an iterable of hashes.
    Returns a new set holding the union of the entries for the kept IDs.
    """
    kept_hash_groups = (
        hashes
        for seq_id, hashes in dictOfHashes.items()
        if seq_id in targetsToKeep
    )
    # union() of zero or more groups always yields a fresh set
    return set().union(*kept_hash_groups)

In [357]:
Rv_MGEs_Hashes_Set_Try2 = getAllHashes_InTargetSeqs(dictOf_H37Rv_MycoBrow_Gene_KmerHashes,
                                                    InsSeqAndPhages_GeneIDs)   


In [358]:
Rv_MGEs_Hashes_Set_Try2 == Rv_MGEs_Hashes_Set

True

In [359]:
len(Rv_MGEs_Hashes_Set_Try2)

69102

### Look at Jaccard Containment between H37Rv and Rv_MGEs

In [360]:
jaccard_similarity_FromSets(H37Rv_Hashes_Set, Rv_MGEs_Hashes_Set)

0.01589562466616704

In [361]:
jaccard_containment_FromSets(H37Rv_Hashes_Set, Rv_MGEs_Hashes_Set)

0.01589562466616704

In [362]:
jaccard_containment_FromSets(Rv_MGEs_Hashes_Set, H37Rv_Hashes_Set)

1.0

# Define output dir of the Mtb-WGA-SMK processing pipeline

In [363]:
# Define pipeline output directories

WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

#MtbWGA_SMK_Pipeline_OutputDir = WGA_SMK_Outputs_Dir + "/220427_WGA158CI_V1"
WGA151CI_SMK_OutputDir = WGA_SMK_Outputs_Dir + "/231121_MtbSetV3_151CI"
Mtb_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir

In [364]:
#!ls -1 $WGA_SMK_Outputs_Dir

## Define PATHS relevant to Minigraph analysis

In [365]:
# Minigraph outputs live in the "Minigraph" subdirectory of the pipeline output dir
target_OutputDir = Mtb_SMK_Pipeline_OutputDir

Minigraph_151CI_OutDir = f"{target_OutputDir}/Minigraph"

# NOTE(review): the file names below say "158CI" while the variables and the
# directory refer to the 151CI set; presumably the pipeline kept the older
# file prefix, so confirm against the files on disk before renaming anything.
MG_WGA151CI_GFA = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_158CI.V1.gfa"
MG_WGA151CI_Bubble_SV_BED = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_158CI.V1.Bubble.SV.bed"
MG_WGA151CI_Stable_FA = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_158CI.V1.Stable.fa"

MG_WGA151CI_MergedSVInfo_TSV = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_158CI.MergedSV.Info.tsv"

MG_WGA151CI_MergedSVInfo_SVVCF = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_158CI.MergedSV.Info.svvcf"


In [366]:
!ls -alh $Mtb_SMK_Pipeline_OutputDir

total 480K
drwxrwsr-x  15 mm774 farhat  405 Dec 29 01:01 .
drwxrwsr-x  11 mm774 farhat  402 Nov 22 01:55 ..
drwxrwsr-x 155 mm774 farhat 3.8K Nov 22 02:06 AsmAnalysis
drwxrwsr-x   2 mm774 farhat 3.0K Jan  1 17:21 Asm_MergeSNPs_mpileup
drwxrwsr-x   2 mm774 farhat 1.3K Dec  2 18:38 Asm_MergeVar_mpileup
drwxrwsr-x   3 mm774 farhat   61 Nov 22 02:08 Busco_Download_Tmp
drwxrwsr-x   4 mm774 farhat  102 Nov 22 12:05 FastANI
drwxrwsr-x   5 mm774 farhat  170 Nov 24 17:21 HomologyMapping
drwxrwsr-x   2 mm774 farhat  351 Nov 22 02:06 Minigraph
drwxrwsr-x   3 mm774 farhat   37 Nov 30 22:52 NucDiversity
drwxrwsr-x   3 mm774 farhat   25 Nov 22 01:53 O2logs
drwxrwsr-x  60 mm774 farhat 3.5K Feb  7 18:35 PanGenome_Analysis
drwxrwsr-x   7 mm774 farhat  237 Jan  1 16:51 Phylogenies
drwxrwsr-x   7 mm774 farhat  316 Nov 30 22:11 RecombDetection
drwxrwsr-x   4 mm774 farhat   78 Nov 22 02:12 SourMash


In [367]:
#!gfatools stat $MG_WGA158CI_GFA

## Parse in K-mers of complete Minigraph GFA (151 Mtb assemblies)

#### K-mers parsed for ALL nodes

In [368]:
Mtb_GFA_GP = gfapy.Gfa.from_file(MG_WGA151CI_GFA)

In [369]:
Mtb_GFA_GP.version

'gfa1'

In [370]:

# Walk every line of the parsed GFA and, for each segment ("S") line, record
# the node's sequence length, its 31-mers and the set of their hashes.
listOfAll_GFA_Nodes = []

# node name -> {"Len": int, "Kmers": list of k-mer strings, "Kmer_Hashes_Set": set of hashes}
MG_dictOf_NodeInfo = {}

for line in tqdm(Mtb_GFA_GP.lines):
    #print(type(line))
    
    # Work on the textual form of the line rather than the gfapy object
    line_Str = str(line)
    
    # NOTE(review): matches any line whose text begins with "S"; presumably only
    # segment records do in this file — confirm if other record types can occur.
    if line_Str.startswith("S"):
        
        line_SplitByTab = line_Str.split("\t")
        
        # Tab-separated fields: index 1 is the segment name, index 2 its sequence
        S_Name = line_SplitByTab[1]
        
        MG_dictOf_NodeInfo[S_Name] = {}
        
        S_Seq = line_SplitByTab[2]
        
        Len_Seq = len(S_Seq)
        
        # Sequences shorter than 31 bp yield no k-mers and therefore an empty hash set
        record_Kmers = build_kmers(S_Seq, 31)
        
        record_Hashes = hash_kmers_ToSet(record_Kmers)
                
        MG_dictOf_NodeInfo[S_Name]["Len"] = Len_Seq
        
        # The full k-mer strings are kept alongside their hashes
        MG_dictOf_NodeInfo[S_Name]["Kmers"] = record_Kmers

        MG_dictOf_NodeInfo[S_Name]["Kmer_Hashes_Set"] = record_Hashes

        listOfAll_GFA_Nodes.append(S_Name)


100%|██████████| 7843/7843 [00:38<00:00, 202.38it/s] 


In [371]:
#record_Kmers

In [372]:
MG_dictOf_NodeInfo["s1"]["Len"]

1533

## Parse the Bubble SV Summary BED from Minigraph

In [373]:
# Parse the headerless bubble/SV BED written by Minigraph.
MG_SV_BED_DF = pd.read_csv(MG_WGA151CI_Bubble_SV_BED, sep = "\t", header=None)

# Column names are assigned by position; the "Unk*" columns are not used here
MG_SV_BED_DF.columns = ["Chr", "Start", "End", "Unk1", "Unk2",
                        "Unk3", "Len_Ref", "Len_Alt", 
                        "Unk4", "Unk5", "Unk6", "NodePath", "Ref_Seq", "Alt_Seq"]


Target_Col = ["Chr", "Start", "End",
              "Len_Ref", "Len_Alt", 
              "NodePath", "Ref_Seq", "Alt_Seq"]


# .copy() makes the column subset an independent frame, so the column
# assignments below cannot raise pandas' SettingWithCopyWarning.
MG_SV_BED_DF = MG_SV_BED_DF[Target_Col].copy()

# Split the comma-separated node path once and reuse it for all three columns
NodePath_Split = MG_SV_BED_DF["NodePath"].str.split(",")

# Remove the first and last nodes, these are not included in the SV of interest
MG_SV_BED_DF["NodePath_Trimmed"] = NodePath_Split.str[1:-1]

# The first and last nodes of the path flank the bubble
MG_SV_BED_DF["Start_Node"] = NodePath_Split.str[0]
MG_SV_BED_DF["End_Node"] = NodePath_Split.str[-1]

MG_SV_BED_DF.shape

(535, 11)

In [374]:
MG_SV_BED_DF.head(3)

Unnamed: 0,Chr,Start,End,Len_Ref,Len_Alt,NodePath,Ref_Seq,Alt_Seq,NodePath_Trimmed,Start_Node,End_Node
0,NC_000962.3,1533,1533,0,0,"s1,s2",*,*,[],s1,s2
1,NC_000962.3,1591,1652,61,1480,"s2,s2034,s2959,s2035,s3,s2036,s2036,s3,s2035,s2959,s2034,s4,s5",ATTGGCTGTGAGTGTCGCTGTGCACAAACCGCGCACAGACTCATACAGTCCCGGCGGTTCC,ATTGGCTGTGAGTGTCGCTGTGCACAAACCGCGCACAGACTCATACAGTCCCGGCGGTTCCGTTCGCCGGGACTGTATGAGTCTGTGCGCGGTTTGTGCACAGCGACACTCACAGCCAATTGAACCGCCCCGGTGAGTCCGGAGACTCTCTGATCTGAGACCTCAGCCGGCGGCTGGTCTCTGGCGTTGAGCGTAGTAGGCAGCCTCGAGTTCGACCGGCGGGACGTCGCCGCAGTACTGGTAGAGGCGGCGATGGTTGAACCAGTCGACCCAGCGCGCGGTGGCCAACTCGACATCCTCGATGGACCGCCAGGGCTTGCCGGGTTTGATCAGCTCGGTCTTGTATAGGCCGTTGATCGTCTCGGCTAGTGCATTGTCATAGGAGCTTCCGACCGC...,"[s2034, s2959, s2035, s3, s2036, s2036, s3, s2035, s2959, s2034, s4]",s2,s5
2,NC_000962.3,13622,13622,0,1358,"s5,s2744,s6",*,TGAACCGCCCCGGTGAGTCCGGAGACTCTCTGATCTGAGACCTCAGCCGGCGGCTGGTCTCTGGCGTTGAGCGTAGTAGGCAGCCTCGAGTTCGACCGGCGGGACGTCGCCGCAGTACTGGTAGAGGCGGCGATGGTTGAACCAGTCGACCCAGCGCGCGGTGGCCAACTCGACATCCTCGATGGACCGCCAGGGCTTGCCGGGTTTGATCAGCTCGGTCTTGTATAGGCCGTTGATCGTCTCGGCTAGTGCATTGTCATAGGAGCTTCCGACCGCTCCGACCGACGGTTGGATGCCTGCCTCGGCGAGCCGCTCGCTGAACCGGATCGATGTGTACTGAGATCCCCTATCCGTATGGTGGATAACGTCTTTCAGGTCGAGTACGCCTTCTTGTTG...,[s2744],s5,s6


### Create Dict of NodeID to BubbleID

In [375]:
# Map each node inside a bubble (flanking nodes excluded) to that bubble's ID.
NodeID_ToBubbleID_Dict = {}

for i, row in MG_SV_BED_DF.iterrows():
    
    # Bubble IDs are 1-based row positions; this relies on the frame's default 0..N-1 index
    Bubble_Num = i + 1
    BubbleID = "BubbleRegion_" + str(Bubble_Num)
    i_NodePath_Trimmed = row["NodePath_Trimmed"]
    
    # A node listed in several bubbles keeps the ID of the last bubble visited
    for NodeID in i_NodePath_Trimmed: NodeID_ToBubbleID_Dict[NodeID] = BubbleID
        

In [376]:
NodeID_ToBubbleID_Dict["s3"]

'BubbleRegion_2'

## Classify all nodes by whether they are a CORE NODE or an SV NODE within the graph

In [377]:
# Split node IDs into those inside a bubble (SV nodes) and those flanking a
# bubble (the first/last node of each bubble path).
SV_NodeIDs_All = []
Non_SV_NodeIDs = []

for i, row in MG_SV_BED_DF.iterrows():

    i_NodePath_Trimmed = row["NodePath_Trimmed"]
    
    # Interior nodes of the bubble path
    SV_NodeIDs_All += (i_NodePath_Trimmed)
    
    # Flanking nodes of the bubble
    Non_SV_NodeIDs.append( row["Start_Node"] )
    Non_SV_NodeIDs.append( row["End_Node"] )

    
# De-duplicate; the resulting list order is arbitrary (set iteration order)
SV_NodeIDs_All = list(set(SV_NodeIDs_All))
Non_SV_NodeIDs = list(set(Non_SV_NodeIDs))
    

In [378]:
len(list(set(SV_NodeIDs_All)) )

2602

In [379]:
len(Non_SV_NodeIDs)

536

In [380]:
MG_dictOf_NodeInfo["s1"]["Len"]

1533

In [381]:
np.unique(np.array([])).shape

(0,)

# 1) Compare k-mer content of all nodes vs all nodes

In [382]:
len(MG_dictOf_NodeInfo.keys())

3138

In [383]:
len(list(MG_dictOf_NodeInfo.keys()))

3138

In [384]:
All_Nodes_List = list(MG_dictOf_NodeInfo.keys())
len(All_Nodes_List)

3138

In [385]:
3137 ** 2 

9840769

In [386]:
# All-vs-all comparison of node k-mer hash sets. One row is emitted for every
# ordered pair (node 1, node 2) that shares at least one k-mer hash, including
# each node paired with itself. JaccardContain is the containment of node 1
# in node 2.
start = time.time()

listOfTuples = []

for record_Name_1 in tqdm( All_Nodes_List ) :

    # Node-1 lookups do not change in the inner loop, so do them once here
    record_1_Hashes = MG_dictOf_NodeInfo[record_Name_1]["Kmer_Hashes_Set"]
    record_1_SeqLen = MG_dictOf_NodeInfo[record_Name_1]["Len"]

    # A node without k-mers cannot share one with any node
    if not record_1_Hashes:
        continue

    for record_Name_2 in All_Nodes_List:
        record_2_Hashes = MG_dictOf_NodeInfo[record_Name_2]["Kmer_Hashes_Set"]

        # isdisjoint() stops at the first shared hash instead of building the
        # full intersection just to test whether it is empty
        if not record_1_Hashes.isdisjoint(record_2_Hashes):

            # A shared hash guarantees both measures are > 0, so no zero check is needed
            record_1and2_JS = jaccard_similarity_FromSets(record_1_Hashes, record_2_Hashes)
            record_1and2_JC = jaccard_containment_FromSets(record_1_Hashes, record_2_Hashes)

            record_2_SeqLen = MG_dictOf_NodeInfo[record_Name_2]["Len"]

            i_Tuple = (record_Name_1, record_Name_2, 
                       record_1_SeqLen, record_2_SeqLen,
                       record_1and2_JS, record_1and2_JC)

            listOfTuples.append(i_Tuple)

end = time.time()
print(end - start)

# Passing columns= to the constructor also works when no pair was found
# (assigning six names to an empty, zero-column frame would raise).
AvA_Nodes_DF = pd.DataFrame(listOfTuples,
                            columns = ["RecordID_1", "RecordID_2", "Record1_Len", "Record2_Len", "JaccardSim", "JaccardContain"])

#AvA_Nodes_DF = AvA_Nodes_DF.query("RecordID_1 != RecordID_2")


100%|██████████| 3138/3138 [02:21<00:00, 22.24it/s] 

141.15950679779053





In [387]:
AvA_Nodes_DF.shape

(125615, 6)

In [388]:
AvA_Nodes_NoZeroJS_DF = AvA_Nodes_DF.query("JaccardSim != 0")   
AvA_Nodes_NoZeroJS_DF.shape


(125615, 6)

In [389]:
AvA_Nodes_Trim_DF = AvA_Nodes_DF.query("RecordID_1 != RecordID_2").query("JaccardSim != 0")   
AvA_Nodes_Trim_DF.shape


(123104, 6)

## Look at All vs All Nodes Stats

In [390]:
AvA_Nodes_DF.head(3)

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
0,s1,s1,1533,1533,1.0,1.0
1,s2,s2,58,58,1.0,1.0
2,s3,s3,56,56,1.0,1.0


In [391]:
AvA_Nodes_DF.shape

(125615, 6)

In [392]:
AvA_Nodes_DF.query("RecordID_1 != RecordID_2").shape

(123104, 6)

In [393]:
AvA_Nodes_DF.query("JaccardSim >= 0.05").shape

(115801, 6)

In [394]:
AvA_Nodes_DF.query("JaccardSim >= 0.05")["RecordID_1"].nunique()

2511

In [395]:
AvA_Nodes_DF.query("JaccardSim >= 0.5")["RecordID_1"].nunique()

2511

In [396]:
AvA_Nodes_DF.query("JaccardSim >= 0.8")["RecordID_1"].nunique()

2511

In [397]:
AvA_Nodes_DF.query("JaccardSim >= 1")["RecordID_1"].nunique()

2511

## Output All vs All Node k-mer comparison to TSV

In [398]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

!mkdir $PangenomeAnalysis_Dir

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz"     

AvA_Nodes_Trim_DF.to_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t", index = False)


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV3’: File exists


In [399]:
!du -sh $MG_AvA_Node_KmerAnalysis_TSV_GZ

1.3M	../../Data/MtbPangenomeAnalysis_SetV3/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz


In [400]:
!ls -1 $PangenomeAnalysis_Dir

Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz
Minigraph.NodeAnalysis.151CI.NodeVsSetsOfInterest.V1.tsv


In [401]:
#!rm $MG_AvA_Node_KmerAnalysis_TSV

In [402]:
#!zcat $MG_AvA_Node_KmerAnalysis_TSV | wc -l

### Parse in processed All vs All Kmer analysis

In [403]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz"     


In [404]:
!ls -lah $MG_AvA_Node_KmerAnalysis_TSV_GZ

-rw-r--r-- 1 mm774 farhat 1.2M Mar  5 16:00 ../../Data/MtbPangenomeAnalysis_SetV3/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz


In [405]:
#!ls -lah $MG_AvA_Node_KmerAnalysis_TSV

In [406]:
AvA_Nodes_Trim_DF = pd.read_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t" )
AvA_Nodes_Trim_DF.shape

(123104, 6)

In [407]:
AvA_Nodes_Trim_DF.head()

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
0,s3,s2959,56,61,0.78125,0.961538
1,s7,s2247,1876,42,0.006501,0.006501
2,s8,s2823,542,87,0.096339,0.097656
3,s13,s2037,24063,49,0.000208,0.000208
4,s13,s2039,24063,114,0.000208,0.000208


In [408]:
!ls -lah $PangenomeAnalysis_Dir

total 1.5M
drwxr-sr-x 2 mm774 farhat  152 Mar  5 15:51 .
drwxrwsr-x 7 mm774 farhat  287 Mar  5 15:44 ..
-rw-r--r-- 1 mm774 farhat 1.2M Mar  5 16:00 Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz
-rw-r--r-- 1 mm774 farhat 144K Mar  5 15:54 Minigraph.NodeAnalysis.151CI.NodeVsSetsOfInterest.V1.tsv


In [409]:
MG_dictOf_NodeInfo['s1'].keys()

dict_keys(['Len', 'Kmers', 'Kmer_Hashes_Set'])

In [410]:
#MG_dictOf_NodeInfo['s1']['Kmer_Hashes_Set']

In [411]:
len(Rv_MGEs_Hashes_Set)

69102

In [412]:
len(record_Hashes_Set)

660

In [413]:
len({})

0

In [414]:
len({"t"})

1

# 2) Compare k-mer content of SV nodes to known references (Rv, all other nodes, etc)

In [415]:
# For every graph node, compute the containment of its k-mer hashes in three
# reference hash sets (H37Rv genome, the IS6110 example, H37Rv insertion
# sequences & phages) and flag whether the node lies inside a bubble.
NodeAnalysis_listOfRows = []

# Set copy of the SV node IDs so the per-node membership test is a hash
# lookup instead of a scan of the whole list
SV_NodeIDs_Set = set(SV_NodeIDs_All)


for NodeID, NodeInfo in tqdm( MG_dictOf_NodeInfo.items() ) :
    
    record_Hashes_Set = NodeInfo["Kmer_Hashes_Set"] 
               
    Len_Seq = NodeInfo["Len"]
        
    if len(record_Hashes_Set) != 0:
        Record_Hash_JC_WiH37Rv = jaccard_containment_FromSets( record_Hashes_Set, H37Rv_Hashes_Set)
        Record_Hash_JC_WiIS6110 = jaccard_containment_FromSets( record_Hashes_Set, IS6110_Ex1_Hashes_Set)

        Record_Hash_JC_WiRv_InsSeqAndPhages = jaccard_containment_FromSets( record_Hashes_Set, Rv_MGEs_Hashes_Set)

    else:
        # Nodes without any k-mer are reported with containment 0
        Record_Hash_JC_WiH37Rv = 0
        Record_Hash_JC_WiIS6110 = 0
        Record_Hash_JC_WiRv_InsSeqAndPhages = 0 
             
        # An empty hash set is only expected for nodes shorter than the k-mer size
        if Len_Seq >= 31:
            print(f"No kmers were produced for segment: {NodeID}")
                
    Status_SVNode = (NodeID in SV_NodeIDs_Set)
    
    i_Row = (NodeID,
             Len_Seq,
             Record_Hash_JC_WiH37Rv,
             Record_Hash_JC_WiIS6110,
             Record_Hash_JC_WiRv_InsSeqAndPhages,
             Status_SVNode)
    
    NodeAnalysis_listOfRows.append(i_Row)

    
# Passing columns= to the constructor also works when the row list is empty
MG_Nodes_KmerVsRefSets_DF = pd.DataFrame(NodeAnalysis_listOfRows,
                                         columns = ["NodeID",
                                                    "SeqLength",
                                                    "Jaccard_Cont_WiRv",
                                                    "Jaccard_Cont_WiIS6110",
                                                    "Jaccard_Cont_WiRv_InsSeqAndPhages",
                                                    "IsSVNode"])


100%|██████████| 3138/3138 [00:01<00:00, 1784.47it/s]


### Add a "BubbleID" column to the node-level analysis

In [416]:
# Nodes that are not inside any bubble have no entry in the mapping and get
# the placeholder string "None".
# NOTE(review): after the TSV round trip shown later in this notebook the
# placeholder displays as missing; presumably pd.read_csv treats "None" as
# an NA marker by default — confirm, and pass keep_default_na=False when
# re-reading if the literal string must survive.
MG_Nodes_KmerVsRefSets_DF["BubbleID"] = MG_Nodes_KmerVsRefSets_DF["NodeID"].map(NodeID_ToBubbleID_Dict)
MG_Nodes_KmerVsRefSets_DF["BubbleID"] = MG_Nodes_KmerVsRefSets_DF["BubbleID"].fillna("None")
MG_Nodes_KmerVsRefSets_DF.shape

(3138, 7)

In [417]:
MG_Nodes_KmerVsRefSets_DF.head(10)

Unnamed: 0,NodeID,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,IsSVNode,BubbleID
0,s1,1533,1.0,0.0,0.0,False,
1,s2,58,1.0,0.0,0.0,False,
2,s3,56,1.0,0.0,0.0,True,BubbleRegion_2
3,s4,5,0.0,0.0,0.0,True,BubbleRegion_2
4,s5,11970,1.0,0.0,0.0,False,
5,s6,12847,1.0,0.0,0.0,False,
6,s7,1876,1.0,0.0,0.0,True,BubbleRegion_4
7,s8,542,1.0,0.0,0.0,False,
8,s9,5979,1.0,0.0,0.030761,False,
9,s10,4133,1.0,0.0,0.0,False,


In [418]:
MG_Nodes_KmerVsRefSets_DF.shape

(3138, 7)

## Output Minigraph NODE Kmer Analysis to TSV

In [419]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

!mkdir $PangenomeAnalysis_Dir

MG_Node_KmerVsRefSets_TSV = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.NodeVsSetsOfInterest.V1.tsv"     

MG_Nodes_KmerVsRefSets_DF.to_csv(MG_Node_KmerVsRefSets_TSV, sep = "\t", index = False)


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV3’: File exists


In [420]:
!ls -1 $PangenomeAnalysis_Dir

Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz
Minigraph.NodeAnalysis.151CI.NodeVsSetsOfInterest.V1.tsv


In [421]:
!ls -alh $MG_Node_KmerAnalysis_TSV

do_ypcall: clnt_call: RPC: Timed out
-rw-r--r-- 1 mm774 farhat 144K Mar  5 16:00 ../../Data/MtbPangenomeAnalysis_SetV3/Minigraph.NodeAnalysis.151CI.NodeVsSetsOfInterest.V1.tsv


In [422]:
!wc -l $MG_Node_KmerAnalysis_TSV

3139 ../../Data/MtbPangenomeAnalysis_SetV3/Minigraph.NodeAnalysis.151CI.NodeVsSetsOfInterest.V1.tsv


### Test parsing of NODE Kmer Analysis TSV

In [423]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

MG_Node_KmerVsRefSets_TSV = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.NodeVsSetsOfInterest.V1.tsv"     


In [424]:
MG_Nodes_KmerVsRefSets_DF = pd.read_csv(MG_Node_KmerVsRefSets_TSV, sep = "\t" )
MG_Nodes_KmerVsRefSets_DF.shape  

(3138, 7)

In [425]:
MG_Nodes_KmerVsRefSets_DF.head()

Unnamed: 0,NodeID,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,IsSVNode,BubbleID
0,s1,1533,1.0,0.0,0.0,False,
1,s2,58,1.0,0.0,0.0,False,
2,s3,56,1.0,0.0,0.0,True,BubbleRegion_2
3,s4,5,0.0,0.0,0.0,True,BubbleRegion_2
4,s5,11970,1.0,0.0,0.0,False,


In [426]:
MG_Nodes_KmerVsRefSets_DF["Jaccard_Cont_WiRv"].describe()

count    3138.000000
mean        0.721741
std         0.435274
min         0.000000
25%         0.036899
50%         1.000000
75%         1.000000
max         1.000000
Name: Jaccard_Cont_WiRv, dtype: float64

In [427]:
#!ls -1 $PangenomeAnalysis_Dir/

In [428]:
!head $MG_Node_KmerAnalysis_TSV

NodeID	SeqLength	Jaccard_Cont_WiRv	Jaccard_Cont_WiIS6110	Jaccard_Cont_WiRv_InsSeqAndPhages	IsSVNode	BubbleID
s1	1533	1.0	0.0	0.0	False	None
s2	58	1.0	0.0	0.0	False	None
s3	56	1.0	0.0	0.0	True	BubbleRegion_2
s4	5	0.0	0.0	0.0	True	BubbleRegion_2
s5	11970	1.0	0.0	0.0	False	None
s6	12847	1.0	0.0	0.0	False	None
s7	1876	1.0	0.0	0.0	True	BubbleRegion_4
s8	542	1.0	0.0	0.0	False	None
s9	5979	1.0	0.0	0.030761472516389308	False	None


# Reparse 2 DFs together

In [429]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV3"

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.AllVsAll.KmerAnalysis.V1.tsv.gz"     

MG_Node_KmerVsRefSets_TSV = f"{PangenomeAnalysis_Dir}/Minigraph.NodeAnalysis.151CI.NodeVsSetsOfInterest.V1.tsv"     



In [430]:
AvA_Nodes_Trim_DF = pd.read_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t" )
AvA_Nodes_Trim_DF.shape

(123104, 6)

In [431]:
AvA_Nodes_Trim_DF.head()

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
0,s3,s2959,56,61,0.78125,0.961538
1,s7,s2247,1876,42,0.006501,0.006501
2,s8,s2823,542,87,0.096339,0.097656
3,s13,s2037,24063,49,0.000208,0.000208
4,s13,s2039,24063,114,0.000208,0.000208


In [432]:
# Re-read the node-level table from the path defined in the cell above
# (MG_Node_KmerVsRefSets_TSV). The name previously used here,
# MG_Node_KmerAnalysis_TSV, is never assigned in this notebook and raises
# NameError on a fresh kernel.
MG_Nodes_KmerVsRefSets_DF = pd.read_csv(MG_Node_KmerVsRefSets_TSV, sep = "\t" )
MG_Nodes_KmerVsRefSets_DF.shape

(3138, 7)

In [433]:
MG_Nodes_KmerVsRefSets_DF.head()

Unnamed: 0,NodeID,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,IsSVNode,BubbleID
0,s1,1533,1.0,0.0,0.0,False,
1,s2,58,1.0,0.0,0.0,False,
2,s3,56,1.0,0.0,0.0,True,BubbleRegion_2
3,s4,5,0.0,0.0,0.0,True,BubbleRegion_2
4,s5,11970,1.0,0.0,0.0,False,
