# Minigraph Mtb SV Pan-Genome Evaluation
## Part 1: graph processsing & k-mer content comparison

### Maximillian Marin (mgmarin@g.harvard.edu)


## Goals: 
- Classify ALL nodes into the different H37Rv gene categories
- Quantify relative abundance of SV nodes belonging to the different gene categories

### Import Statements

In [203]:
import numpy as np
import pandas as pd
import scipy.stats

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [204]:
import ast

In [205]:
# https://bioframe.readthedocs.io/en/latest/guide-intervalops.html
import bioframe as bf


In [206]:
from Bio import SeqIO


In [207]:
import json

In [208]:
import gfapy

#### Pandas Viewing Settings

In [209]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [210]:
pd.set_option('max_colwidth', 400)

## Define useful Kmer analysis functions

In [211]:
import screed

In [212]:
import mmh3

In [213]:
def build_kmers(sequence, ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1
    
    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)
        
    return kmers

In [214]:
#import screed a library for reading in FASTA/FASTQ

def read_kmers_from_file(filename, ksize):
    all_kmers = []
    for record in screed.open(filename):
        sequence = record.sequence
        
        kmers = build_kmers(sequence, ksize)
        all_kmers += kmers

    return all_kmers

In [215]:
def hash_kmer(kmer):
    # calculate the reverse complement
    rc_kmer = screed.rc(kmer)
    
    # determine whether original k-mer or reverse complement is lesser
    if kmer < rc_kmer:
        canonical_kmer = kmer
    else:
        canonical_kmer = rc_kmer
        
    # calculate murmurhash using a hash seed of 42
    hash = mmh3.hash64(canonical_kmer, 42)[0]
    if hash < 0: hash += 2**64

    return hash

In [216]:
# def hash_kmers(kmers):
#     hashes = []
#     for kmer in kmers:
#         hashes.append(hash_kmer(kmer))
#     return hashes

def hash_kmers_ToSet(kmers):
    hashes = set()
    for kmer in kmers:
        hashes.add(hash_kmer(kmer))
    return hashes

In [217]:

def jaccard_containment_FromSets(a, b):
    '''
    This function returns the Jaccard Containment between sets a and b.
    '''
    
    intersection = len(a.intersection(b))
    
    return intersection / len(a)

def jaccard_similarity_FromSets(a, b):
    '''
    This function returns the Jaccard Similarity between sets a and b.
    '''
    intersection = len(a.intersection(b))
    union = len(a.union(b))
    
    return intersection / union


In [218]:
def getAllHash_ExceptTargets_Set_V2(dictOfHashes, targetsToRemove):
    # Convert targetsToRemove to a set for faster lookup
    targetsToRemoveSet = set(targetsToRemove)

    # Use set comprehension for more efficient construction of the result set
    return {hash for seqID, seqInfoDict in dictOfHashes.items() if seqID not in targetsToRemoveSet
            for hash in seqInfoDict["Kmer_Hashes_Set"]}

In [219]:
def getAllHash_InTargetSeqs_Set(dictOfHashes, targetsToKeep):
    
    all_Hashes_InTarget = set()

    for i_SeqID, i_SeqInfoDict in dictOfHashes.items():
        
        i_Hashes = i_SeqInfoDict["Kmer_Hashes_Set"]
        
        if i_SeqID not in targetsToKeep:
            all_Hashes_InTarget.update(i_Hashes) 
    
    return all_Hashes_InTarget

In [220]:
#%reload_ext autoreload
#%autoreload 2

### import panqc toolkit functions
#from panqc.kmerlib import hash_kmers_ToSet, jaccard_similarity_FromSets, jaccard_containment_FromSets


# Part 1: Parse sample metadata & preprocessed genome info/results

In [221]:
Repo_DataDir = "../../Data"
InputAsmPath_Dir = f"{Repo_DataDir}/231121.InputAsmTSVs.MtbSetV3.151CI"

MtbSetV3_151CI_InputAsmPATHs_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"
MtbSetV3_151CI_AsmSumm_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"   


## Parse sample Metadata (N = 151)

In [222]:
WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep = "\t")

SampleIDs_151CI_SOI = list( WGA151CI_AsmSummary_DF["SampleID"].values )
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

ID_To_PrimLineage_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values)
ID_To_SubLineage_Dict = dict( WGA151CI_AsmSummary_DF[["SampleID", "Lineage"]].values)
ID_To_Dataset_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values) 


## Parse FASTA paths for 151 Mtb assemblies

In [223]:
WGA151CI_Asm_Path_DF = pd.read_csv(MtbSetV3_151CI_InputAsmPATHs_TSV, sep = "\t")
WGA151CI_Asm_Path_DF.shape

(151, 4)

In [224]:
WGA151CI_Asm_Path_DF.head(4)

Unnamed: 0,SampleID,Dataset_Tag,Genome_ASM_PATH,ShortRead_Genome_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0072.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0072.SR.Asm.fasta
1,N0153,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0153.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0153.SR.Asm.fasta
2,TB3113,TB_Portals_24CI_R1,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB3113.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB3113.SR.Asm.fasta
3,TB1236,TB_Portals_24CI_R1,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB1236.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB1236.SR.Asm.fasta


In [225]:
SampleID_To_LRAsmFA_PATH_Dict = dict(WGA151CI_Asm_Path_DF[['SampleID', 'Genome_ASM_PATH']].values)

## Import/parse processed H37rv genome annotations

In [226]:
RepoRef_Dir = "../../References"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"
H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"

## H37Rv Gene Annotations TSV
H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep = "\t")
H37Rv_GeneInfo_Subset_DF = H37Rv_GenomeAnno_Genes_DF[["H37rv_GeneID", "Symbol", "Feature", "Functional_Category", "Is_Pseudogene", "Product", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]

RvID_To_Symbol_Dict = dict(H37Rv_GeneInfo_Subset_DF[['H37rv_GeneID', 'Symbol']].values)
Symbol_To_FuncCat_Dict = dict(H37Rv_GeneInfo_Subset_DF[['Symbol', 'Functional_Category']].values)


In [227]:
H37Rv_GenomeAnno_Genes_DF.head(3)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,NotExcluded
1,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,information pathways,No,DNA polymerase III (beta chain) DnaN (DNA nucleotidyltransferase),,NotExcluded
2,NC_000962.3,3279,4437,+,Rv0003,recF,CDS,information pathways,No,DNA replication and repair protein RecF (single-strand DNA binding protein),,NotExcluded


In [228]:
H37Rv_GenomeAnno_Genes_DF["Functional_Category"].unique()

array(['information pathways', 'conserved hypotheticals',
       'cell wall and cell processes', 'stable RNAs',
       'intermediary metabolism and respiration', 'regulatory proteins',
       'virulence, detoxification, adaptation',
       'insertion seqs and phages', 'lipid metabolism', 'PE/PPE',
       'unknown'], dtype=object)

In [229]:
H37Rv_GenomeAnno_Genes_DF["Functional_Category"].value_counts()

conserved hypotheticals                    1042
intermediary metabolism and respiration     936
cell wall and cell processes                772
lipid metabolism                            272
information pathways                        242
virulence, detoxification, adaptation       239
regulatory proteins                         198
PE/PPE                                      168
insertion seqs and phages                   147
stable RNAs                                  48
unknown                                      15
Name: Functional_Category, dtype: int64

# Part 2: Generate reference k-mer sets (ie H37Rv, IS6110, Phages + ISs)  

## Generate k-mer info for H37Rv and a representative IS6110 sequence 

In [230]:
Mtb_RefDir="/n/data1/hms/dbmi/farhat/mm774/References"

H37rv_Ref_FA_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.fasta"
IS6110_Example_FA_PATH = f"{Mtb_RefDir}/IS6110_From_Rv0795_Rv0796.DNA.fasta"

#### H37Rv - k-mer generation & hashing

In [231]:
H37Rv_kmers = read_kmers_from_file(H37rv_Ref_FA_PATH, 31)

H37Rv_Hashes_Set = hash_kmers_ToSet(H37Rv_kmers)

print(len(H37Rv_kmers))

4411502


#### IS6110 (Rv0795 & Rv0796) - k-mer generation & hashing

In [232]:
IS6110_Ex1_kmers = read_kmers_from_file(IS6110_Example_FA_PATH, 31)

IS6110_Ex1_Hashes_Set = hash_kmers_ToSet(IS6110_Ex1_kmers)

print(len(IS6110_Ex1_kmers))

1254


## Generate k-mer info for all H37Rv gene DNA sequences (Mycobrowser)

In [233]:
O2_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"
MycoBrowser_RefFiles_Dir = f"{O2_RefDir}/190619_Mycobrowser_H37rv_ReferenceFiles"

H37Rv_Genes_MycoBro_FA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_genes_v3.fasta"


In [234]:
!grep ^">" $H37Rv_Genes_MycoBro_FA | grep "dnaA"

>Rv0001|dnaA|CDS|1-1524|+|Chromosomal replication initiator protein DnaA


### Get 31-mer hashes for all annotated gene DNA sequences

In [235]:
dictOf_H37Rv_MycoBrow_GeneSeq = {}
dictOf_H37Rv_MycoBrow_Gene_KmerHashes = {}

for index, record in tqdm(enumerate(SeqIO.parse(H37Rv_Genes_MycoBro_FA, "fasta"))):
    
    RecordName = record.name
    RvID = RecordName.split("|")[0]
    GeneID = RecordName.split("|")[1]
    S_Seq = str(record.seq).upper()
    
    dictOf_H37Rv_MycoBrow_GeneSeq[GeneID] = S_Seq

    record_Hashes_Set = hash_kmers_ToSet(build_kmers(S_Seq, 31))

    dictOf_H37Rv_MycoBrow_Gene_KmerHashes[GeneID] = record_Hashes_Set
    

4187it [00:24, 168.82it/s]


In [236]:
len(dictOf_H37Rv_MycoBrow_GeneSeq["dnaA"])

1524

In [237]:
list(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["dnaA"])[:2]

[13580233940393664509, 5138456728421695490]

## Generate k-mer info for each gene category annotation in H37Rv

In [238]:
def getAllHashes_InTargetSeqs(dictOfHashes, targetsToKeep):
    
    all_Hashes_InTarget = set()

    for i_SeqID, i_Hashes in dictOfHashes.items():
                
        if i_SeqID in targetsToKeep:
            all_Hashes_InTarget.update(i_Hashes) 
    
    return all_Hashes_InTarget

In [239]:
Rv_Gene_Category_List = list(H37Rv_GenomeAnno_Genes_DF["Functional_Category"].unique())

RvGeneCat_To_Symbol_Dict = {}
RvGeneCat_To_RvID_Dict = {}
RvGeneCat_To_KmerHashes_Dict = {}

for i_GeneCat in Rv_Gene_Category_List:
    
    Genes_Subset_DF = H37Rv_GenomeAnno_Genes_DF.query(f"Functional_Category == '{i_GeneCat}'")     
    
    Subset_GeneSymbols = Genes_Subset_DF["Symbol"].values
    Subset_RvIDs = Genes_Subset_DF["H37rv_GeneID"].values

    N_Genes = len(Subset_GeneIDs)
    

    RvGeneCat_To_Symbol_Dict[i_GeneCat] = Subset_GeneIDs
    RvGeneCat_To_RvID_Dict[i_GeneCat] = Subset_RvIDs


    i_GeneCat_Hashes_Set = getAllHashes_InTargetSeqs(dictOf_H37Rv_MycoBrow_Gene_KmerHashes,
                                                     Subset_GeneSymbols)   

    RvGeneCat_To_KmerHashes_Dict[i_GeneCat] = i_GeneCat_Hashes_Set
    
    print(i_GeneCat, N_Genes, len(list(i_GeneCat_Hashes_Set)) )


information pathways 15 265284
conserved hypotheticals 15 723221
cell wall and cell processes 15 783261
stable RNAs 15 6735
intermediary metabolism and respiration 15 1011332
regulatory proteins 15 161826
virulence, detoxification, adaptation 15 150509
insertion seqs and phages 15 69102
lipid metabolism 15 407894
PE/PPE 15 265028
unknown 15 7938


In [240]:
Rv_PEPPEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['PE/PPE']

In [241]:
Rv_PEPPEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['PE/PPE']
Rv_MGEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['insertion seqs and phages']

In [242]:
print(len(Rv_MGEs_Hashes_Set))

69102


In [243]:
jaccard_containment_FromSets(H37Rv_Hashes_Set, Rv_MGEs_Hashes_Set)

0.01589562466616704

In [244]:
jaccard_containment_FromSets(H37Rv_Hashes_Set, Rv_PEPPEs_Hashes_Set)

0.06096474217858988

In [245]:
jaccard_containment_FromSets(Rv_MGEs_Hashes_Set, Rv_PEPPEs_Hashes_Set)

0.0004341408352869671

In [246]:
jaccard_containment_FromSets(Rv_PEPPEs_Hashes_Set, Rv_MGEs_Hashes_Set)

0.00011319558688138612

In [247]:
jaccard_similarity_FromSets(Rv_PEPPEs_Hashes_Set, Rv_MGEs_Hashes_Set)

8.97934750074828e-05

In [248]:
jaccard_similarity_FromSets(Rv_PEPPEs_Hashes_Set, H37Rv_Hashes_Set)

0.06096474217858988

In [249]:
jaccard_similarity_FromSets(Rv_PEPPEs_Hashes_Set, Rv_MGEs_Hashes_Set)

8.97934750074828e-05

#### Look at Jaccard Containment between 31-mers of H37Rv and H37Rv's MGEs (ISs + Phages)

In [250]:
jaccard_containment_FromSets(H37Rv_Hashes_Set, Rv_MGEs_Hashes_Set)

0.01589562466616704

# Part 3: Generate k-mer info per node from Minigraph GFA 

## Define output dir of the Mtb-WGA-SMK processing pipeline

In [251]:
WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

WGA151CI_SMK_OutputDir = WGA_SMK_Outputs_Dir + "/231121_MtbSetV3_151CI"
MtbWGA_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir

## Define PATHS relevant to Minigraph analysis

In [252]:
target_OutputDir = MtbWGA_SMK_Pipeline_OutputDir
Minigraph_151CI_OutDir = f"{target_OutputDir}/Minigraph"

MG_WGA151CI_GFA = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.V1.gfa"
MG_WGA151CI_Bubble_SV_BED = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.V1.Bubble.SV.bed"
MG_WGA151CI_Stable_FA = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.V1.Stable.fa"

MG_WGA151CI_MergedSVInfo_TSV = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.MergedSV.Info.tsv"
MG_WGA151CI_MergedSVInfo_SVVCF = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.MergedSV.Info.svvcf"


#### use `gfatools` to print general stats of the SV graph (rGFA format)

In [253]:
!gfatools stat $MG_WGA151CI_GFA

Number of segments: 3138
Number of links: 4705
Number of arcs: 9410
Max rank: 129
Total segment length: 5196363
Average segment length: 1655.947
Sum of rank-0 segment lengths: 4411532
Max degree: 8
Average degree: 1.499
[M::main] Version: 0.5-r292-dirty
[M::main] CMD: gfatools stat /n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/Minigraph/Minigraph_H37rv_Vs_151CI.V1.gfa
[M::main] Real time: 0.019 sec; CPU: 0.090 sec


## Parse k-mer info for all nodes of `minigraph` GFA (151 Mtb assemblies)


In [254]:
def parse_rGFA_To_KmerInfo(i_Minigraph_GFA):

    GFA_GP = gfapy.Gfa.from_file(i_Minigraph_GFA)

    i_dictOf_NodeInfo = {}

    for line in tqdm(GFA_GP.lines):
        line_Str = str(line)
        
        if line_Str.startswith("S"):
            
            line_SplitByTab = line_Str.split("\t")
            
            S_Name = line_SplitByTab[1]
            
            S_Seq = line_SplitByTab[2]
            
            Len_Seq = len(S_Seq)
            
            record_Kmers = build_kmers(S_Seq, 31)
            
            record_Hashes = hash_kmers_ToSet(record_Kmers)

            i_dictOf_NodeInfo[S_Name] = {}
            i_dictOf_NodeInfo[S_Name]["Len"] = Len_Seq
            i_dictOf_NodeInfo[S_Name]["Kmers"] = record_Kmers
            i_dictOf_NodeInfo[S_Name]["Kmer_Hashes_Set"] = record_Hashes

    return i_dictOf_NodeInfo

## Generate dict of SV PG graph node info (length, k-mers, k-mer hashes)

#### For each node of the graph we have:
- sequence length
- all unique 31-mers
- all hashes of all unique canonical 31-mers 

In [255]:
MG_dictOf_NodeInfo = parse_rGFA_To_KmerInfo(MG_WGA151CI_GFA)

100%|██████████| 7843/7843 [00:32<00:00, 240.78it/s] 


#### Inspect resulting dict of node info (length, k-mers, k-mer hashes)

In [256]:
len(list(MG_dictOf_NodeInfo.keys()))

3138

In [257]:
# For each node of the graph we have its sequence length, unique 31-mers, unique hashes of the canonical 31-mer 

MG_dictOf_NodeInfo["s1"].keys()

dict_keys(['Len', 'Kmers', 'Kmer_Hashes_Set'])

In [258]:
MG_dictOf_NodeInfo["s1"]["Len"]

1533

In [259]:
MG_dictOf_NodeInfo["s1"]["Kmers"][:3]

['TTGACCGATGACCCCGGTTCAGGCTTCACCA',
 'TGACCGATGACCCCGGTTCAGGCTTCACCAC',
 'GACCGATGACCCCGGTTCAGGCTTCACCACA']

In [260]:
list(MG_dictOf_NodeInfo["s1"]["Kmer_Hashes_Set"])[:2]

[13580233940393664509, 5138456728421695490]

# Part 4: Process bubble info from summary files generated by minigraph (BED, SVVCF, SV.TSV)

## Parse the Bubble SV Summary BED from Minigraph

In [305]:
MG_SV_BED_DF = pd.read_csv(MG_WGA151CI_Bubble_SV_BED, sep = "\t", header=None)

MG_SV_BED_DF.columns = ["Chr", "Start", "End", "Unk1", "Unk2", "Unk3", "Len_Ref", "Len_Alt", 
                        "Unk4", "Unk5", "Unk6", "NodePath", "Ref_Seq", "Alt_Seq"]

Target_Col = ["Chr", "Start", "End",
              "Len_Ref", "Len_Alt", 
              "NodePath", "Ref_Seq", "Alt_Seq"]

MG_SV_BED_DF = MG_SV_BED_DF[Target_Col]

# Remove the first and last nodes, these are not included in the SV of interest
MG_SV_BED_DF["NodePath_Trimmed"] = MG_SV_BED_DF["NodePath"].str.split(",").str[1:-1]

MG_SV_BED_DF["Start_Node"] = MG_SV_BED_DF["NodePath"].str.split(",").str[0]
MG_SV_BED_DF["End_Node"] = MG_SV_BED_DF["NodePath"].str.split(",").str[-1]
MG_SV_BED_DF["BubbleNum"] = (MG_SV_BED_DF.index.values + 1)

MG_SV_BED_DF["BubbleID"] = "BubbleRegion_" + MG_SV_BED_DF["BubbleNum"].astype(str)
MG_SV_BED_DF["NumSVNodes"] = MG_SV_BED_DF["NodePath_Trimmed"].str.len()

MG_SV_BED_DF.shape

(535, 14)

In [306]:
MG_SV_BED_DF.head(1)

Unnamed: 0,Chr,Start,End,Len_Ref,Len_Alt,NodePath,Ref_Seq,Alt_Seq,NodePath_Trimmed,Start_Node,End_Node,BubbleNum,BubbleID,NumSVNodes
0,NC_000962.3,1533,1533,0,0,"s1,s2",*,*,[],s1,s2,1,BubbleRegion_1,0


## Label Bubbles BED DF by overlapping Rv genes

In [307]:
# https://stackoverflow.com/questions/61109186/python-pandas-to-match-rows-with-overlapping-coordinates

listOf_Overlap_Genes = []
listOf_Overlap_Genes_RvIDs = []

for i, row in MG_SV_BED_DF.iterrows():
    
    # a) Target overlapping genes to event
    event_Start = int(row["Start"])
    event_End = int(row["End"])
    event_Range = f"NC_000962.3:{event_Start}-{event_End}"
    
    sub_DF_Overlap_Event_Genes = bf.select(H37Rv_GenomeAnno_Genes_DF, event_Range, cols = ("Chrom", "Start", "End"))    
    
    listOf_Overlap_Genes.append( ",".join(list(sub_DF_Overlap_Event_Genes["Symbol"].values)) )
    listOf_Overlap_Genes_RvIDs.append( ",".join(list(sub_DF_Overlap_Event_Genes["H37rv_GeneID"].values)) )

MG_SV_BED_DF["Overlap_Genes"] = listOf_Overlap_Genes
MG_SV_BED_DF["Overlap_Gene_RvIDs"] = listOf_Overlap_Genes_RvIDs
MG_SV_BED_DF["Overlap_Genes"] = MG_SV_BED_DF["Overlap_Genes"].fillna("None")

MG_SV_BED_DF = MG_SV_BED_DF.drop(["Ref_Seq", "Alt_Seq"], axis=1)

In [308]:
MG_SV_BED_DF.query("BubbleID == 'BubbleRegion_193'") 

Unnamed: 0,Chr,Start,End,Len_Ref,Len_Alt,NodePath,NodePath_Trimmed,Start_Node,End_Node,BubbleNum,BubbleID,NumSVNodes,Overlap_Genes,Overlap_Gene_RvIDs
192,NC_000962.3,1761789,1761789,0,3511,"s575,s1800,s2706,s1801,s576","[s1800, s2706, s1801]",s575,s576,193,BubbleRegion_193,3,mmpL6,Rv1557


In [309]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV5"

!mkdir $PangenomeAnalysis_Dir

MG_BubbleSumm_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleSummary.BED.tsv.gz"     


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV5’: File exists


In [310]:
MG_SV_BED_DF.to_csv(MG_BubbleSumm_TSV_GZ,
                         sep = "\t", index=False)

In [311]:
MG_SV_BED_DF.head(2)

Unnamed: 0,Chr,Start,End,Len_Ref,Len_Alt,NodePath,NodePath_Trimmed,Start_Node,End_Node,BubbleNum,BubbleID,NumSVNodes,Overlap_Genes,Overlap_Gene_RvIDs
0,NC_000962.3,1533,1533,0,0,"s1,s2",[],s1,s2,1,BubbleRegion_1,0,,
1,NC_000962.3,1591,1652,61,1480,"s2,s2034,s2959,s2035,s3,s2036,s2036,s3,s2035,s2959,s2034,s4,s5","[s2034, s2959, s2035, s3, s2036, s2036, s3, s2035, s2959, s2034, s4]",s2,s5,2,BubbleRegion_2,11,,


### Create Dict of NodeID to BubbleID

In [312]:
NodeID_ToBubbleID_Dict = {}

for i, row in MG_SV_BED_DF.iterrows():
    
    Bubble_Num = i + 1
    BubbleID = "BubbleRegion_" + str(Bubble_Num)
    i_NodePath_Trimmed = row["NodePath_Trimmed"]

    # Map all SV nodes to their Bubble Region ID
    for NodeID in i_NodePath_Trimmed:
        NodeID_ToBubbleID_Dict[NodeID] = BubbleID
        

In [313]:
list(NodeID_ToBubbleID_Dict.items())[:10]

[('s2034', 'BubbleRegion_2'),
 ('s2959', 'BubbleRegion_2'),
 ('s2035', 'BubbleRegion_2'),
 ('s3', 'BubbleRegion_2'),
 ('s2036', 'BubbleRegion_2'),
 ('s4', 'BubbleRegion_2'),
 ('s2744', 'BubbleRegion_3'),
 ('s2247', 'BubbleRegion_4'),
 ('s7', 'BubbleRegion_4'),
 ('s2823', 'BubbleRegion_5')]

## Classify all nodes by wheither they are a CORE NODE or a SV NODE within the graph

In [314]:
SV_NodeIDs_All = []
Core_NodeIDs = []

for i, row in MG_SV_BED_DF.iterrows():

    i_NodePath_Trimmed = row["NodePath_Trimmed"]
    
    SV_NodeIDs_All += (i_NodePath_Trimmed)
    
    Core_NodeIDs.append( row["Start_Node"] )
    Core_NodeIDs.append( row["End_Node"] )

SV_NodeIDs_All = list(set(SV_NodeIDs_All))
Core_NodeIDs = list(set(Core_NodeIDs))

print("# of SV Nodes:", len(SV_NodeIDs_All))
print("# of Core Nodes:", len(Core_NodeIDs))

# of SV Nodes: 2602
# of Core Nodes: 536


In [315]:
len(list(set(SV_NodeIDs_All)) )

2602

In [316]:
len(Core_NodeIDs)

536

# Part 5: Pairwise comparison of k-mer content between all SV Nodes

In this section we calculate the **jaccard similarity** and **jaccard containment** between all pairwise combinations of SV nodes within the SV pan-genome graph.

This will allow us to identify pairs of nodes that have very similar sequence content (measured through overlap of k-mers)

## Run all vs all k-mer comparison

In [317]:
All_Nodes_List = list(MG_dictOf_NodeInfo.keys())
len(All_Nodes_List)

3138

In [318]:
listOfTuples = []

for record_Name_1 in tqdm( All_Nodes_List ) :
    for record_Name_2 in All_Nodes_List:
        record_1_Hashes = MG_dictOf_NodeInfo[record_Name_1]["Kmer_Hashes_Set"]
        record_2_Hashes = MG_dictOf_NodeInfo[record_Name_2]["Kmer_Hashes_Set"]
        
        if (record_1_Hashes) & (record_2_Hashes): # Verify that 31-mer hashes are available for both nodes ("records")
            
            record_1and2_JS = jaccard_similarity_FromSets(record_1_Hashes, record_2_Hashes)

            if record_1and2_JS != 0: # If jaccard similarity is 0, don't bother to calculate JC or save info
                record_1and2_JC = jaccard_containment_FromSets(record_1_Hashes, record_2_Hashes)
                
                record_1_SeqLen = MG_dictOf_NodeInfo[record_Name_1]["Len"]
                record_2_SeqLen = MG_dictOf_NodeInfo[record_Name_2]["Len"]
                    
                i_Tuple = (record_Name_1, record_Name_2, 
                           record_1_SeqLen, record_2_SeqLen,
                           record_1and2_JS, record_1and2_JC)
                
                listOfTuples.append(i_Tuple)


AvA_Nodes_DF = pd.DataFrame(listOfTuples)
AvA_Nodes_DF.columns = ["RecordID_1", "RecordID_2", "Record1_Len", "Record2_Len", "JaccardSim", "JaccardContain"]

# Exclude comparisons between the same node sequence
AvA_Nodes_DF = AvA_Nodes_DF.query("RecordID_1 != RecordID_2")


100%|██████████| 3138/3138 [03:33<00:00, 14.68it/s]


In [319]:
AvA_Nodes_DF.shape

(123104, 6)

#### Peak at All vs All DF

In [320]:
AvA_Nodes_DF.head(3)

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
3,s3,s2959,56,61,0.78125,0.961538
7,s7,s2247,1876,42,0.006501,0.006501
9,s8,s2823,542,87,0.096339,0.097656


### Output All vs All Node k-mer comparison to TSV

In [321]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV5"

!mkdir $PangenomeAnalysis_Dir

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.NodeKmerComp.AllVsAll.V1.tsv.gz"     

AvA_Nodes_DF.to_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t", index = False)


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV5’: File exists


In [322]:
!du -sh $MG_AvA_Node_KmerAnalysis_TSV_GZ

1.2M	../../Data/MtbPangenomeAnalysis_SetV5/MtbSVPG.Minigraph.NodeKmerComp.AllVsAll.V1.tsv.gz


In [323]:
!ls -1 $PangenomeAnalysis_Dir

MtbSVPG.Minigraph.BubbleAlleleInfo.SVInfo.tsv.gz
MtbSVPG.Minigraph.BubbleAlleleInfo.SVVCF.tsv.gz
MtbSVPG.Minigraph.BubbleSummary.BED.tsv.gz
MtbSVPG.Minigraph.NodeKmerComp.AllVsAll.V1.tsv.gz
MtbSVPG.Minigraph.NodeKmerComp.Summary.V1.tsv.gz
MtbSVPG.Pangraph.NodeKmerComp.AllVsAll.V1.tsv.gz
MtbSVPG.Pangraph.NodeKmerComp.Summary.V1.tsv.gz


# Part 6: Comparison of k-mer content of SV nodes to k-mer profiles of interest


information pathways 15 265284
conserved hypotheticals 15 723221
cell wall and cell processes 15 783261
stable RNAs 15 6735
intermediary metabolism and respiration 15 1011332
regulatory proteins 15 161826
virulence, detoxification, adaptation 15 150509
insertion seqs and phages 15 69102
lipid metabolism 15 407894
PE/PPE 15 265028
unknown 15 7938


## 0) Prep sets of unique K-mer for each Rv gene category

In [324]:
RvGeneCat_To_KmerHashes_Dict.keys()

dict_keys(['information pathways', 'conserved hypotheticals', 'cell wall and cell processes', 'stable RNAs', 'intermediary metabolism and respiration', 'regulatory proteins', 'virulence, detoxification, adaptation', 'insertion seqs and phages', 'lipid metabolism', 'PE/PPE', 'unknown'])

In [325]:
Rv_InfoPathways_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['information pathways']
Rv_ConservedHypo_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['conserved hypotheticals']
Rv_CellWallCellProc_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['cell wall and cell processes']
Rv_StableRNAs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['stable RNAs']
Rv_InterMetabolism_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['intermediary metabolism and respiration']
Rv_RegProteins_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['regulatory proteins']
Rv_VirulenceDetoxAdaptation_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['virulence, detoxification, adaptation']
Rv_LipidMetabolism_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['lipid metabolism']
Rv_PEPPEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['PE/PPE']
Rv_MGEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['insertion seqs and phages']
Rv_Unknown_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['unknown']

### A) Loop over all nodes and evaluate overlap w/ H37Rv, IS6110, annotated phages + insertion sequences, etc

In [326]:
NodeAnalysis_listOfRows = []

for NodeID, NodeInfo in tqdm( MG_dictOf_NodeInfo.items() ) :
    
    record_Hashes_Set = NodeInfo["Kmer_Hashes_Set"] 
               
    Len_Seq = NodeInfo["Len"]
        
    if len(record_Hashes_Set) != 0:
        Record_Hash_JC_WiH37Rv = jaccard_containment_FromSets( record_Hashes_Set, H37Rv_Hashes_Set)
        Record_Hash_JC_WiIS6110 = jaccard_containment_FromSets( record_Hashes_Set, IS6110_Ex1_Hashes_Set)

        Record_Hash_JC_WiRv_InsSeqAndPhages = jaccard_containment_FromSets( record_Hashes_Set, Rv_MGEs_Hashes_Set)
        Record_Hash_JC_WiRv_PEPPEs = jaccard_containment_FromSets( record_Hashes_Set, Rv_PEPPEs_Hashes_Set)
        Record_Hash_JC_WiRv_InfoPathways = jaccard_containment_FromSets( record_Hashes_Set, Rv_InfoPathways_Hashes_Set)
        Record_Hash_JC_WiRv_ConservedHypo = jaccard_containment_FromSets( record_Hashes_Set, Rv_ConservedHypo_Hashes_Set)
        Record_Hash_JC_WiRv_CellWallCellProc = jaccard_containment_FromSets( record_Hashes_Set, Rv_CellWallCellProc_Hashes_Set)
        Record_Hash_JC_WiRv_StableRNAs = jaccard_containment_FromSets( record_Hashes_Set, Rv_StableRNAs_Hashes_Set)
        Record_Hash_JC_WiRv_InterMetabolism = jaccard_containment_FromSets( record_Hashes_Set, Rv_InterMetabolism_Hashes_Set)
        Record_Hash_JC_WiRv_RegProteins = jaccard_containment_FromSets( record_Hashes_Set, Rv_RegProteins_Hashes_Set)
        Record_Hash_JC_WiRv_VirulenceDetoxAdaptation = jaccard_containment_FromSets( record_Hashes_Set, Rv_VirulenceDetoxAdaptation_Hashes_Set)
        Record_Hash_JC_WiRv_LipidMetabolism = jaccard_containment_FromSets( record_Hashes_Set, Rv_LipidMetabolism_Hashes_Set)
        Record_Hash_JC_WiRv_Unknown = jaccard_containment_FromSets( record_Hashes_Set, Rv_Unknown_Hashes_Set)


    else:
        Record_Hash_JC_WiH37Rv = 0
        Record_Hash_JC_WiIS6110 = 0
        Record_Hash_JC_WiRv_InsSeqAndPhages = 0 
        Record_Hash_JC_WiRv_PEPPEs = 0 
        Record_Hash_JC_WiRv_InfoPathways = 0 
        Record_Hash_JC_WiRv_ConservedHypo = 0 
        Record_Hash_JC_WiRv_CellWallCellProc = 0 
        Record_Hash_JC_WiRv_StableRNAs = 0 
        Record_Hash_JC_WiRv_InterMetabolism = 0 
        Record_Hash_JC_WiRv_RegProteins = 0 
        Record_Hash_JC_WiRv_VirulenceDetoxAdaptation = 0 
        Record_Hash_JC_WiRv_LipidMetabolism = 0 
        Record_Hash_JC_WiRv_Unknown = 0 
        
        if Len_Seq < 31:
            print(f"No kmers were produced for segment: {NodeID}")
                
    Status_SVNode = (NodeID in SV_NodeIDs_All)
    
    i_Row = (NodeID,
             Status_SVNode,
             Len_Seq,
             Record_Hash_JC_WiH37Rv,
             Record_Hash_JC_WiIS6110,
             Record_Hash_JC_WiRv_InsSeqAndPhages,
             Record_Hash_JC_WiRv_PEPPEs, 
             Record_Hash_JC_WiRv_InfoPathways, 
             Record_Hash_JC_WiRv_ConservedHypo, 
             Record_Hash_JC_WiRv_CellWallCellProc, 
             Record_Hash_JC_WiRv_StableRNAs, 
             Record_Hash_JC_WiRv_InterMetabolism, 
             Record_Hash_JC_WiRv_RegProteins, 
             Record_Hash_JC_WiRv_VirulenceDetoxAdaptation, 
             Record_Hash_JC_WiRv_LipidMetabolism, 
             Record_Hash_JC_WiRv_Unknown )
    
    NodeAnalysis_listOfRows.append(i_Row)

    
MG_Nodes_KmerComp_DF = pd.DataFrame(NodeAnalysis_listOfRows)

MG_Nodes_KmerComp_DF.columns = ["NodeID",
                                "IsSVNode",
                                "SeqLength",
                                "Jaccard_Cont_WiRv",
                                "Jaccard_Cont_WiIS6110",
                                "Jaccard_Cont_WiRv_InsSeqAndPhages",
                                "Jaccard_Cont_WiRv_PEPPEs",
                                "Jaccard_Cont_WiRv_InfoPathways",
                                "Jaccard_Cont_WiRv_ConservedHypo", 
                                "Jaccard_Cont_WiRv_CellWallCellProc",
                                "Jaccard_Cont_WiRv_StableRNAs",
                                "Jaccard_Cont_WiRv_InterMetabolism",
                                "Jaccard_Cont_WiRv_RegProteins",
                                "Jaccard_Cont_WiRv_VirulenceDetoxAdaptation",
                                "Jaccard_Cont_WiRv_LipidMetabolism",
                                "Jaccard_Cont_WiRv_Unknown" ]


100%|██████████| 3138/3138 [00:06<00:00, 512.46it/s] 


### B) Add a "BubbleID" column to the node-level analysis

In [327]:
MG_Nodes_KmerComp_DF["BubbleID"] = MG_Nodes_KmerComp_DF["NodeID"].map(NodeID_ToBubbleID_Dict)
MG_Nodes_KmerComp_DF["BubbleID"] = MG_Nodes_KmerComp_DF["BubbleID"] = MG_Nodes_KmerComp_DF["BubbleID"].fillna("None")
MG_Nodes_KmerComp_DF.shape

(3138, 17)

In [328]:
MG_Nodes_KmerComp_DF.head(10)

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID
0,s1,False,1533,1.0,0.0,0.0,0.0,0.994012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,s2,False,58,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,s3,True,56,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2
3,s4,True,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2
4,s5,False,11970,1.0,0.0,0.0,0.0,0.612312,0.044724,0.141457,0.007286,0.0,0.0,0.0,0.0,0.0,
5,s6,False,12847,1.0,0.0,0.0,0.0,0.0,0.041117,0.299368,0.004135,0.052196,0.516814,0.0,0.0,0.0,
6,s7,True,1876,1.0,0.0,0.0,0.0,0.0,0.206934,0.0,0.0,0.0,0.601842,0.0,0.0,0.0,BubbleRegion_4
7,s8,False,542,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.96875,0.0,0.0,
8,s9,False,5979,1.0,0.0,0.030761,0.0,0.0,0.602118,0.0,0.0,0.091108,0.0,0.048748,0.0,0.0,
9,s10,False,4133,1.0,0.0,0.0,0.0,0.0,0.089203,0.0,0.0,0.417743,0.0,0.0,0.46137,0.0,


In [329]:
MG_Nodes_KmerComp_DF.shape

(3138, 17)

### C) Calculate the highest k-mer jaccard containment relative to another node (for each node)
This step uses the all vs all comparison of k-mer content, to identify nodes that have jaccard containment to at least 1 other node within the graph

In [330]:
# Step 1: Create a dict that maps NodeID to the maximum k-mer jaccard containment to any other another node
Dict_MaxJC_ToOtherNode = AvA_Nodes_DF.groupby("RecordID_1").max()['JaccardContain'].to_dict()

# Step 2: Add the Max Jaccard Containment (MaxJC) as a column, 
MG_Nodes_KmerComp_DF["MaxJC_ToOtherNode"] = MG_Nodes_KmerComp_DF["NodeID"].map(Dict_MaxJC_ToOtherNode)

# Step 3: Fill w/ 0 if not in dictionary
MG_Nodes_KmerComp_DF["MaxJC_ToOtherNode"] = MG_Nodes_KmerComp_DF["MaxJC_ToOtherNode"].fillna(0)

### Output Minigraph NODE Kmer Analysis to TSV

In [331]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV5"

!mkdir $PangenomeAnalysis_Dir

MG_Node_KmerComp_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.NodeKmerComp.Summary.V1.tsv.gz" 

MG_Nodes_KmerComp_DF.to_csv(MG_Node_KmerComp_TSV_GZ, sep = "\t", index = False)


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV5’: File exists


# Part 7: Extra reformating of minigraph summary files (SV-Info-TSV, and SV-VCF)

## 7.A) Augment the "SV Info TSV" (From Minigraph)

In [332]:
Header_WiSamplesIDs_InOrderSV = ["Chr", "Start", "End", "Info", "FORMAT",
       'NC_000962.3','N0072','N0153','TB3113','TB1236','TB2659','TB2780','TB1612','TB2512','TB2981','TB3091',
       'M0003941_3','TB3368','N0145','N0155','TB2995','TB3396','N0004','N1274','N0054','02_R1179','01_R1134',
     'M0017522_5','M0016395_7','M0010874_7','02_R1708','02_R0894','01_R1430','M0014888_3','02_R1896','TB4620',
     'TB3162','MT_0080','TB3054','TB3251','M0016737_0','TB2661','TB3237','TB3169','TB3386','TB3334','M0011368_9',
     'TB2968','N1272','N1176','N1202','N1177','N0091','RW-TB008','9050-05','4549-04','696-05','702-06','706-05',
     '8129-04','3003-06','8651-04','QC-3','QC-9','QC-5','QC-8','QC-10','QC-4','QC-7','QC-6','QC-1','mada_1-10',
     'mada_2-46','mada_1-1','mada_1-36','mada_1-39','mada_1-51','mada_1-44','mada_117','mada_118','mada_122',
     'mada_107','R27252','R23887','R30215','R30078','R29816','R30234','18_0621851','R36431','R28703','mada_115',
     'mada_2-42','R31095','R28012','R37765','R27657','R25048','R24120','R28581','R29598','mada_1-11','R24100',
     'R21408','R20574','R20260','R18043','R22601','R23146','R32929','R21893','R30420','R26778','R26791','R28980',
     'R27725','R18040','R27937','mada_1-30','mada_2-31','mada_1-41','R21770','R21839','mada_1-32','R30396',
     'R21363','R20896','mada_102','mada_129','mada_139','mada_151','mada_105','R15311','mada_103','mada_2-25',
     'mada_112','mada_124','mada_126','mada_120','R23571','mada_128','mada_1-12','mada_1-15','mada_2-1','mada_1-53',
     'mada_1-50','mada_123','mada_2-53','mada_1-38','S0070-08','S0085-01','S0107-01','S0089-01','S0256-08','S0123-01',
     'S0106-01', 'S0262-02']

MG_SVInfo_DF = pd.read_csv(MG_WGA151CI_MergedSVInfo_TSV, sep = "\t", comment = "#",
                           header = None)

MG_SVInfo_DF.columns = Header_WiSamplesIDs_InOrderSV

MG_SVInfo_DF["NA"] = MG_SVInfo_DF["Info"].str.split(";").str[1].str.split("=").str[-1].astype(int)

MG_SVInfo_DF["BubbleNum"] = (MG_SVInfo_DF.index.values + 1)
MG_SVInfo_DF["BubbleID"] = "BubbleRegion_" + MG_SVInfo_DF["BubbleNum"].astype(str)

MG_SVInfo_DF.shape

(535, 160)

In [333]:
MG_SVInfo_DF.head(2)

Unnamed: 0,Chr,Start,End,Info,FORMAT,NC_000962.3,N0072,N0153,TB3113,TB1236,TB2659,TB2780,TB1612,TB2512,TB2981,TB3091,M0003941_3,TB3368,N0145,N0155,TB2995,TB3396,N0004,N1274,N0054,02_R1179,01_R1134,M0017522_5,M0016395_7,M0010874_7,02_R1708,02_R0894,01_R1430,M0014888_3,02_R1896,TB4620,TB3162,MT_0080,TB3054,TB3251,M0016737_0,TB2661,TB3237,TB3169,TB3386,TB3334,M0011368_9,TB2968,N1272,N1176,N1202,N1177,N0091,RW-TB008,9050-05,4549-04,696-05,702-06,706-05,8129-04,3003-06,8651-04,QC-3,QC-9,QC-5,QC-8,QC-10,QC-4,QC-7,QC-6,QC-1,mada_1-10,mada_2-46,mada_1-1,mada_1-36,mada_1-39,mada_1-51,mada_1-44,mada_117,mada_118,mada_122,mada_107,R27252,R23887,R30215,R30078,R29816,R30234,18_0621851,R36431,R28703,mada_115,mada_2-42,R31095,R28012,R37765,R27657,R25048,R24120,R28581,R29598,mada_1-11,R24100,R21408,R20574,R20260,R18043,R22601,R23146,R32929,R21893,R30420,R26778,R26791,R28980,R27725,R18040,R27937,mada_1-30,mada_2-31,mada_1-41,R21770,R21839,mada_1-32,R30396,R21363,R20896,mada_102,mada_129,mada_139,mada_151,mada_105,R15311,mada_103,mada_2-25,mada_112,mada_124,mada_126,mada_120,R23571,mada_128,mada_1-12,mada_1-15,mada_2-1,mada_1-53,mada_1-50,mada_123,mada_2-53,mada_1-38,S0070-08,S0085-01,S0107-01,S0089-01,S0256-08,S0123-01,S0106-01,S0262-02,NA,BubbleNum,BubbleID
0,NC_000962.3,1533,1533,NS=152;NA=1;ALEN=0;AC=152;VS=>s1;VE=>s2;AWALK=*,GT:CSTRAND:CTG:CS:CE,0:+:NC_000962.3:1531:1601,0:+:N0072:1531:1601,0:+:N0153:1531:1601,0:+:TB3113:1531:2959,0:+:TB1236:1531:2959,0:+:TB2659:1531:2959,0:+:TB2780:1531:2959,0:+:TB1612:1531:2959,0:+:TB2512:1531:2959,0:+:TB2981:1531:2959,0:+:TB3091:1531:2959,0:+:M0003941_3:1531:2959,0:+:TB3368:1531:2959,0:+:N0145:1531:2959,0:+:N0155:1531:2959,0:+:TB2995:1531:2959,0:+:TB3396:1531:2959,0:+:N0004:1531:1601,0:+:N1274:1531:1601,0:+:N0054:1531:1601,0:+:02_R1179:1531:1601,0:+:01_R1134:1531:1601,0:+:M0017522_5:1531:1601,0:+:M0016395_7:1531:1601,0:+:M0010874_7:1531:1601,0:+:02_R1708:1531:1601,0:+:02_R0894:1531:1601,0:+:01_R1430:1531:1601,0:+:M0014888_3:1531:1601,0:+:02_R1896:1531:1601,0:+:TB4620:1531:1601,0:+:TB3162:1531:1601,0:+:MT_0080:1531:1601,0:+:TB3054:1531:1601,0:+:TB3251:1531:1601,0:+:M0016737_0:1531:1601,0:+:TB2661:1531:1601,0:+:TB3237:1531:1601,0:+:TB3169:1531:1601,0:+:TB3386:1531:1601,0:+:TB3334:1531:1601,0:+:M0011368_9:1531:1601,0:+:TB2968:1531:1601,0:+:N1272:1531:1601,0:+:N1176:1531:1601,0:+:N1202:1531:1601,0:+:N1177:1531:1601,0:+:N0091:1531:1601,0:+:RW-TB008:1531:1601,0:+:9050-05:1531:2960,0:+:4549-04:1531:2959,0:+:696-05:1531:2961,0:+:702-06:1531:2960,0:+:706-05:1531:2959,0:+:8129-04:1531:2960,0:+:3003-06:1531:2960,0:+:8651-04:1531:2959,0:+:QC-3:1531:1601,0:+:QC-9:1531:1601,0:+:QC-5:1531:1601,0:+:QC-8:1531:1601,0:+:QC-10:1531:1601,0:+:QC-4:1531:1601,0:+:QC-7:1531:1601,0:+:QC-6:1531:1601,0:+:QC-1:1531:1601,0:+:mada_1-10:1531:1601,0:+:mada_2-46:1531:1601,0:+:mada_1-1:1531:1601,0:+:mada_1-36:1531:1601,0:+:mada_1-39:1531:1601,0:+:mada_1-51:1531:1601,0:+:mada_1-44:1531:1601,0:+:mada_117:1531:1601,0:+:mada_118:1531:1601,0:+:mada_122:1531:1601,0:+:mada_107:1531:1601,0:+:R27252:1531:1601,0:+:R23887:1531:1601,0:+:R30215:1531:2959,0:+:R30078:1531:2959,0:+:R29816:1531:2959,0:+:R30234:1531:2959,0:+:18_0621851:1531:2959,0:+:R36431:1531:2959,0:+:R28703:1531:2959,0:+:mada_115:1531:2959,0:+:mada_2-42:1531:2959,0:+:R31095:1531:2959,0:+:R28012:1531:2959,0:+:R37765:1531:2959,0:+:R27657:1531:2959,0:+:R25048:1531:2959,0:+:R24120:1531:2959,0:+:R28581:1531:2959,0:+:R29598:1531:2959,0:+:mada_1-11:1531:2959,0:+:R24100:1531:2959,0:+:R21408:1531:2959,0:+:R20574:1531:2959,0:+:R20260:1531:2959,0:+:R18043:1531:2959,0:+:R22601:1531:2959,0:+:R23146:1531:2959,0:+:R32929:1531:2959,0:+:R21893:1531:2959,0:+:R30420:1531:2959,0:+:R26778:1531:2959,0:+:R26791:1531:2959,0:+:R28980:1531:2959,0:+:R27725:1531:2959,0:+:R18040:1531:2959,0:+:R27937:1531:2959,0:+:mada_1-30:1531:1601,0:+:mada_2-31:1531:1601,0:+:mada_1-41:1531:1601,0:+:R21770:1531:1601,0:+:R21839:1531:1601,0:+:mada_1-32:1531:1601,0:+:R30396:1531:1601,0:+:R21363:1531:1601,0:+:R20896:1531:1601,0:+:mada_102:1531:1601,0:+:mada_129:1531:1601,0:+:mada_139:1531:1601,0:+:mada_151:1531:1601,0:+:mada_105:1531:1601,0:+:R15311:1531:1601,0:+:mada_103:1531:1601,0:+:mada_2-25:1531:1601,0:+:mada_112:1531:1601,0:+:mada_124:1531:1601,0:+:mada_126:1531:1601,0:+:mada_120:1531:1601,0:+:R23571:1531:1600,0:+:mada_128:1531:1600,0:+:mada_1-12:1531:1600,0:+:mada_1-15:1531:1600,0:+:mada_2-1:1531:1600,0:+:mada_1-53:1531:1600,0:+:mada_1-50:1531:1600,0:+:mada_123:1531:2958,0:+:mada_2-53:1531:1600,0:+:mada_1-38:1531:1601,0:+:S0070-08:1531:2959,0:+:S0085-01:1531:2959,0:+:S0107-01:1531:2959,0:+:S0089-01:1531:2959,0:+:S0256-08:1531:2959,0:+:S0123-01:1531:1601,0:+:S0106-01:1531:1601,0:+:S0262-02:1531:1601,1,1,BubbleRegion_1
1,NC_000962.3,1591,1652,NS=88;NA=1;ALEN=61;AC=88;VS=>s2;VE=>s5;AWALK=>s3>s4,GT:CSTRAND:CTG:CS:CE,0:+:NC_000962.3:1531:1654,0:+:N0072:1531:1654,0:+:N0153:1531:1654,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0:+:N0004:1531:1654,0:+:N1274:1531:1654,0:+:N0054:1531:1654,0:+:02_R1179:1531:1654,0:+:01_R1134:1531:1654,0:+:M0017522_5:1531:1654,0:+:M0016395_7:1531:1654,0:+:M0010874_7:1531:1654,0:+:02_R1708:1531:1654,0:+:02_R0894:1531:1654,0:+:01_R1430:1531:1654,0:+:M0014888_3:1531:1654,0:+:02_R1896:1531:1654,0:+:TB4620:1531:1654,0:+:TB3162:1531:1654,0:+:MT_0080:1531:1654,0:+:TB3054:1531:1654,0:+:TB3251:1531:1654,0:+:M0016737_0:1531:1654,0:+:TB2661:1531:1654,0:+:TB3237:1531:1654,0:+:TB3169:1531:1654,0:+:TB3386:1531:1654,0:+:TB3334:1531:1654,0:+:M0011368_9:1531:1654,0:+:TB2968:1531:1654,0:+:N1272:1531:1654,0:+:N1176:1531:1654,0:+:N1202:1531:1654,.,0:+:N0091:1531:1654,0:+:RW-TB008:1531:1654,.,.,.,.,.,.,.,.,0:+:QC-3:1531:1654,0:+:QC-9:1531:1654,0:+:QC-5:1531:1654,0:+:QC-8:1531:1654,0:+:QC-10:1531:1654,0:+:QC-4:1531:1654,0:+:QC-7:1531:1654,0:+:QC-6:1531:1654,0:+:QC-1:1531:1654,0:+:mada_1-10:1531:1654,0:+:mada_2-46:1531:1654,0:+:mada_1-1:1531:1654,0:+:mada_1-36:1531:1654,0:+:mada_1-39:1531:1654,0:+:mada_1-51:1531:1654,0:+:mada_1-44:1531:1654,0:+:mada_117:1531:1654,0:+:mada_118:1531:1654,0:+:mada_122:1531:1654,0:+:mada_107:1531:1654,0:+:R27252:1531:1654,0:+:R23887:1531:1654,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0:+:mada_2-31:1531:1654,0:+:mada_1-41:1531:1654,0:+:R21770:1531:1654,0:+:R21839:1531:1654,0:+:mada_1-32:1531:1654,0:+:R30396:1531:1654,0:+:R21363:1531:1654,0:+:R20896:1531:1654,0:+:mada_102:1531:1654,0:+:mada_129:1531:1654,0:+:mada_139:1531:1654,0:+:mada_151:1531:1654,0:+:mada_105:1531:1654,0:+:R15311:1531:1654,0:+:mada_103:1531:1654,0:+:mada_2-25:1531:1654,0:+:mada_112:1531:1654,0:+:mada_124:1531:1654,0:+:mada_126:1531:1654,0:+:mada_120:1531:1654,0:+:R23571:1531:1653,0:+:mada_128:1531:1653,0:+:mada_1-12:1531:1653,0:+:mada_1-15:1531:1653,0:+:mada_2-1:1531:1653,0:+:mada_1-53:1531:1653,0:+:mada_1-50:1531:1653,.,0:+:mada_2-53:1531:1653,0:+:mada_1-38:1531:1654,.,.,.,.,.,0:+:S0123-01:1531:1654,0:+:S0106-01:1531:1654,0:+:S0262-02:1531:1654,1,2,BubbleRegion_2


### Output processed SV-Info-TSV

In [334]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV5"

!mkdir $PangenomeAnalysis_Dir

MG_SVInfo_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleAlleleInfo.SVInfo.tsv.gz" 

MG_SVInfo_DF.to_csv(MG_SVInfo_TSV_GZ, sep = "\t", index = False)


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV5’: File exists


## 7.B) Augment the "SV VCF" (From Minigraph)

In [335]:
SVVCF_Header_WiSamplesIDs_InOrderSV = ["Chr", "Pos", "ID", "Ref", "Alt", "QUAL", "FILTER", "Info", "FORMAT",
       'NC_000962.3','N0072','N0153','TB3113','TB1236','TB2659','TB2780','TB1612','TB2512','TB2981','TB3091',
       'M0003941_3','TB3368','N0145','N0155','TB2995','TB3396','N0004','N1274','N0054','02_R1179','01_R1134',
     'M0017522_5','M0016395_7','M0010874_7','02_R1708','02_R0894','01_R1430','M0014888_3','02_R1896','TB4620',
     'TB3162','MT_0080','TB3054','TB3251','M0016737_0','TB2661','TB3237','TB3169','TB3386','TB3334','M0011368_9',
     'TB2968','N1272','N1176','N1202','N1177','N0091','RW-TB008','9050-05','4549-04','696-05','702-06','706-05',
     '8129-04','3003-06','8651-04','QC-3','QC-9','QC-5','QC-8','QC-10','QC-4','QC-7','QC-6','QC-1','mada_1-10',
     'mada_2-46','mada_1-1','mada_1-36','mada_1-39','mada_1-51','mada_1-44','mada_117','mada_118','mada_122',
     'mada_107','R27252','R23887','R30215','R30078','R29816','R30234','18_0621851','R36431','R28703','mada_115',
     'mada_2-42','R31095','R28012','R37765','R27657','R25048','R24120','R28581','R29598','mada_1-11','R24100',
     'R21408','R20574','R20260','R18043','R22601','R23146','R32929','R21893','R30420','R26778','R26791','R28980',
     'R27725','R18040','R27937','mada_1-30','mada_2-31','mada_1-41','R21770','R21839','mada_1-32','R30396',
     'R21363','R20896','mada_102','mada_129','mada_139','mada_151','mada_105','R15311','mada_103','mada_2-25',
     'mada_112','mada_124','mada_126','mada_120','R23571','mada_128','mada_1-12','mada_1-15','mada_2-1','mada_1-53',
     'mada_1-50','mada_123','mada_2-53','mada_1-38','S0070-08','S0085-01','S0107-01','S0089-01','S0256-08','S0123-01',
     'S0106-01', 'S0262-02']  

MG_SVVCF_DF = pd.read_csv(MG_WGA151CI_MergedSVInfo_SVVCF, sep = "\t", comment = "#",
                           header = None)
MG_SVVCF_DF.columns = SVVCF_Header_WiSamplesIDs_InOrderSV

MG_SVVCF_DF["NA"] = MG_SVVCF_DF["Info"].str.split(";").str[1].str.split("=").str[-1].astype(int)
MG_SVVCF_DF["AWALK_Full"] = MG_SVVCF_DF["Info"].str.split(";").str[6].str.split("=").str[-1]
MG_SVVCF_DF["AWALK_List"] = MG_SVVCF_DF["Info"].str.split(";").str[6].str.split("=").str[-1].str.split(",")
MG_SVVCF_DF["BubbleID"] = MG_SV_BED_DF["BubbleID"]
MG_SVVCF_DF["Start_Node"] = MG_SV_BED_DF["Start_Node"]
MG_SVVCF_DF["End_Node"] = MG_SV_BED_DF["End_Node"]
MG_SVVCF_DF["NodePath_Trimmed"] = MG_SV_BED_DF["NodePath_Trimmed"]

MG_SVVCF_DF["NumSVNodes"] = MG_SVVCF_DF["NodePath_Trimmed"].str.len()

MG_SVVCF_DF.shape

(535, 169)

### Output processed SV-VCF as TSV

In [336]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV5"

!mkdir $PangenomeAnalysis_Dir

MG_SVVCF_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleAlleleInfo.SVVCF.tsv.gz" 

MG_SVVCF_DF.to_csv(MG_SVVCF_TSV_GZ, sep = "\t", index = False)


mkdir: cannot create directory ‘../../Data/MtbPangenomeAnalysis_SetV5’: File exists


In [337]:
MG_SVVCF_DF.head(3)

Unnamed: 0,Chr,Pos,ID,Ref,Alt,QUAL,FILTER,Info,FORMAT,NC_000962.3,N0072,N0153,TB3113,TB1236,TB2659,TB2780,TB1612,TB2512,TB2981,TB3091,M0003941_3,TB3368,N0145,N0155,TB2995,TB3396,N0004,N1274,N0054,02_R1179,01_R1134,M0017522_5,M0016395_7,M0010874_7,02_R1708,02_R0894,01_R1430,M0014888_3,02_R1896,TB4620,TB3162,MT_0080,TB3054,TB3251,M0016737_0,TB2661,TB3237,TB3169,TB3386,TB3334,M0011368_9,TB2968,N1272,N1176,N1202,N1177,N0091,RW-TB008,9050-05,4549-04,696-05,702-06,706-05,8129-04,3003-06,8651-04,QC-3,QC-9,QC-5,QC-8,QC-10,QC-4,QC-7,QC-6,QC-1,mada_1-10,mada_2-46,mada_1-1,mada_1-36,mada_1-39,mada_1-51,mada_1-44,mada_117,mada_118,mada_122,mada_107,R27252,R23887,R30215,R30078,R29816,R30234,18_0621851,R36431,R28703,mada_115,mada_2-42,R31095,R28012,R37765,R27657,R25048,R24120,R28581,R29598,mada_1-11,R24100,R21408,R20574,R20260,R18043,R22601,R23146,R32929,R21893,R30420,R26778,R26791,R28980,R27725,R18040,R27937,mada_1-30,mada_2-31,mada_1-41,R21770,R21839,mada_1-32,R30396,R21363,R20896,mada_102,mada_129,mada_139,mada_151,mada_105,R15311,mada_103,mada_2-25,mada_112,mada_124,mada_126,mada_120,R23571,mada_128,mada_1-12,mada_1-15,mada_2-1,mada_1-53,mada_1-50,mada_123,mada_2-53,mada_1-38,S0070-08,S0085-01,S0107-01,S0089-01,S0256-08,S0123-01,S0106-01,S0262-02,NA,AWALK_Full,AWALK_List,BubbleID,Start_Node,End_Node,NodePath_Trimmed,NumSVNodes
0,NC_000962.3,1533,.,N,<CNV>,30,PASS,NS=152;NA=1;ALEN=0;AC=152;VS=>s1;VE=>s2;AWALK=*;END=1533,GT:GT0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,1,*,[*],BubbleRegion_1,s1,s2,[],0
1,NC_000962.3,1591,.,N,<CNV>,30,PASS,NS=88;NA=1;ALEN=61;AC=88;VS=>s2;VE=>s5;AWALK=>s3>s4;END=1652,GT:GT0,0:0,0:0,0:0,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,.,0:0,0:0,.,.,.,.,.,.,.,.,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,.,0:0,0:0,.,.,.,.,.,0:0,0:0,0:0,1,>s3>s4,[>s3>s4],BubbleRegion_2,s2,s5,"[s2034, s2959, s2035, s3, s2036, s2036, s3, s2035, s2959, s2034, s4]",11
2,NC_000962.3,13622,.,N,<CNV>,30,PASS,"NS=152;NA=2;ALEN=0,1358;AC=151,1;VS=>s5;VE=>s6;AWALK=*,>s2744;END=13622",GT:GT0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,1:1,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,2,"*,>s2744","[*, >s2744]",BubbleRegion_3,s5,s6,[s2744],1


In [338]:
MG_SVVCF_DF.query(" BubbleID == 'BubbleRegion_87' ")

Unnamed: 0,Chr,Pos,ID,Ref,Alt,QUAL,FILTER,Info,FORMAT,NC_000962.3,N0072,N0153,TB3113,TB1236,TB2659,TB2780,TB1612,TB2512,TB2981,TB3091,M0003941_3,TB3368,N0145,N0155,TB2995,TB3396,N0004,N1274,N0054,02_R1179,01_R1134,M0017522_5,M0016395_7,M0010874_7,02_R1708,02_R0894,01_R1430,M0014888_3,02_R1896,TB4620,TB3162,MT_0080,TB3054,TB3251,M0016737_0,TB2661,TB3237,TB3169,TB3386,TB3334,M0011368_9,TB2968,N1272,N1176,N1202,N1177,N0091,RW-TB008,9050-05,4549-04,696-05,702-06,706-05,8129-04,3003-06,8651-04,QC-3,QC-9,QC-5,QC-8,QC-10,QC-4,QC-7,QC-6,QC-1,mada_1-10,mada_2-46,mada_1-1,mada_1-36,mada_1-39,mada_1-51,mada_1-44,mada_117,mada_118,mada_122,mada_107,R27252,R23887,R30215,R30078,R29816,R30234,18_0621851,R36431,R28703,mada_115,mada_2-42,R31095,R28012,R37765,R27657,R25048,R24120,R28581,R29598,mada_1-11,R24100,R21408,R20574,R20260,R18043,R22601,R23146,R32929,R21893,R30420,R26778,R26791,R28980,R27725,R18040,R27937,mada_1-30,mada_2-31,mada_1-41,R21770,R21839,mada_1-32,R30396,R21363,R20896,mada_102,mada_129,mada_139,mada_151,mada_105,R15311,mada_103,mada_2-25,mada_112,mada_124,mada_126,mada_120,R23571,mada_128,mada_1-12,mada_1-15,mada_2-1,mada_1-53,mada_1-50,mada_123,mada_2-53,mada_1-38,S0070-08,S0085-01,S0107-01,S0089-01,S0256-08,S0123-01,S0106-01,S0262-02,NA,AWALK_Full,AWALK_List,BubbleID,Start_Node,End_Node,NodePath_Trimmed,NumSVNodes
86,NC_000962.3,767589,.,N,<CNV>,30,PASS,"NS=152;NA=2;ALEN=0,174;AC=151,1;VS=>s229;VE=>s230;AWALK=*,>s2805;END=767589",GT:GT0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,1:1,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,2,"*,>s2805","[*, >s2805]",BubbleRegion_87,s229,s230,[s2805],1


# Part 8: Code to reparse all of the generated files from this notebook

### Define paths

In [339]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV5"

MG_Node_KmerComp_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.NodeKmerComp.Summary.V1.tsv.gz" 

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.NodeKmerComp.AllVsAll.V1.tsv.gz"     

MG_BubbleSumm_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleSummary.BED.tsv.gz"     

MG_SVVCF_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleAlleleInfo.SVVCF.tsv.gz" 

MG_SVInfo_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleAlleleInfo.SVInfo.tsv.gz" 


In [340]:
!ls -1 $PangenomeAnalysis_Dir

MtbSVPG.Minigraph.BubbleAlleleInfo.SVInfo.tsv.gz
MtbSVPG.Minigraph.BubbleAlleleInfo.SVVCF.tsv.gz
MtbSVPG.Minigraph.BubbleSummary.BED.tsv.gz
MtbSVPG.Minigraph.NodeKmerComp.AllVsAll.V1.tsv.gz
MtbSVPG.Minigraph.NodeKmerComp.Summary.V1.tsv.gz
MtbSVPG.Pangraph.NodeKmerComp.AllVsAll.V1.tsv.gz
MtbSVPG.Pangraph.NodeKmerComp.Summary.V1.tsv.gz


In [341]:
!du -sh $PangenomeAnalysis_Dir/

2.0M	../../Data/MtbPangenomeAnalysis_SetV5/


In [342]:
!du -sh $PangenomeAnalysis_Dir/*

631K	../../Data/MtbPangenomeAnalysis_SetV5/MtbSVPG.Minigraph.BubbleAlleleInfo.SVInfo.tsv.gz
61K	../../Data/MtbPangenomeAnalysis_SetV5/MtbSVPG.Minigraph.BubbleAlleleInfo.SVVCF.tsv.gz
32K	../../Data/MtbPangenomeAnalysis_SetV5/MtbSVPG.Minigraph.BubbleSummary.BED.tsv.gz
1.2M	../../Data/MtbPangenomeAnalysis_SetV5/MtbSVPG.Minigraph.NodeKmerComp.AllVsAll.V1.tsv.gz
57K	../../Data/MtbPangenomeAnalysis_SetV5/MtbSVPG.Minigraph.NodeKmerComp.Summary.V1.tsv.gz
50K	../../Data/MtbPangenomeAnalysis_SetV5/MtbSVPG.Pangraph.NodeKmerComp.AllVsAll.V1.tsv.gz
22K	../../Data/MtbPangenomeAnalysis_SetV5/MtbSVPG.Pangraph.NodeKmerComp.Summary.V1.tsv.gz


### Parse in `AvA_Nodes_DF`

In [343]:
AvA_Nodes_DF = pd.read_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t" )
AvA_Nodes_DF.shape

(123104, 6)

In [344]:
AvA_Nodes_DF.head()

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
0,s3,s2959,56,61,0.78125,0.961538
1,s7,s2247,1876,42,0.006501,0.006501
2,s8,s2823,542,87,0.096339,0.097656
3,s13,s2037,24063,49,0.000208,0.000208
4,s13,s2039,24063,114,0.000208,0.000208


### Parse in `MG_Nodes_KmerVsRefSets_DF`

In [345]:
MG_Nodes_KmerComp_DF = pd.read_csv(MG_Node_KmerComp_TSV_GZ, sep = "\t" )
MG_Nodes_KmerComp_DF.shape

(3138, 18)

In [346]:
MG_Nodes_KmerComp_DF.head()

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID,MaxJC_ToOtherNode
0,s1,False,1533,1.0,0.0,0.0,0.0,0.994012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,s2,False,58,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,s3,True,56,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2,0.961538
3,s4,True,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2,0.0
4,s5,False,11970,1.0,0.0,0.0,0.0,0.612312,0.044724,0.141457,0.007286,0.0,0.0,0.0,0.0,0.0,,0.0


### Parse in `MG_SV_BED_DF`

In [347]:
MG_SV_BED_DF = pd.read_csv(MG_BubbleSumm_TSV_GZ, sep = "\t" )

# Convert the string representation of the list of nodes to python list of nodes
MG_SV_BED_DF["NodePath_Trimmed"] = MG_SV_BED_DF["NodePath_Trimmed"].apply(ast.literal_eval)

MG_SV_BED_DF.shape

(535, 14)

In [348]:
MG_SV_BED_DF.head(3)

Unnamed: 0,Chr,Start,End,Len_Ref,Len_Alt,NodePath,NodePath_Trimmed,Start_Node,End_Node,BubbleNum,BubbleID,NumSVNodes,Overlap_Genes,Overlap_Gene_RvIDs
0,NC_000962.3,1533,1533,0,0,"s1,s2",[],s1,s2,1,BubbleRegion_1,0,,
1,NC_000962.3,1591,1652,61,1480,"s2,s2034,s2959,s2035,s3,s2036,s2036,s3,s2035,s2959,s2034,s4,s5","[s2034, s2959, s2035, s3, s2036, s2036, s3, s2035, s2959, s2034, s4]",s2,s5,2,BubbleRegion_2,11,,
2,NC_000962.3,13622,13622,0,1358,"s5,s2744,s6",[s2744],s5,s6,3,BubbleRegion_3,1,,


### Parse in `MG_SVVCF_DF`

In [349]:
MG_SVVCF_DF = pd.read_csv(MG_SVVCF_TSV_GZ, sep = "\t" )
MG_SVVCF_DF.shape

(535, 169)

In [350]:
MG_SVVCF_DF.head(3)

Unnamed: 0,Chr,Pos,ID,Ref,Alt,QUAL,FILTER,Info,FORMAT,NC_000962.3,N0072,N0153,TB3113,TB1236,TB2659,TB2780,TB1612,TB2512,TB2981,TB3091,M0003941_3,TB3368,N0145,N0155,TB2995,TB3396,N0004,N1274,N0054,02_R1179,01_R1134,M0017522_5,M0016395_7,M0010874_7,02_R1708,02_R0894,01_R1430,M0014888_3,02_R1896,TB4620,TB3162,MT_0080,TB3054,TB3251,M0016737_0,TB2661,TB3237,TB3169,TB3386,TB3334,M0011368_9,TB2968,N1272,N1176,N1202,N1177,N0091,RW-TB008,9050-05,4549-04,696-05,702-06,706-05,8129-04,3003-06,8651-04,QC-3,QC-9,QC-5,QC-8,QC-10,QC-4,QC-7,QC-6,QC-1,mada_1-10,mada_2-46,mada_1-1,mada_1-36,mada_1-39,mada_1-51,mada_1-44,mada_117,mada_118,mada_122,mada_107,R27252,R23887,R30215,R30078,R29816,R30234,18_0621851,R36431,R28703,mada_115,mada_2-42,R31095,R28012,R37765,R27657,R25048,R24120,R28581,R29598,mada_1-11,R24100,R21408,R20574,R20260,R18043,R22601,R23146,R32929,R21893,R30420,R26778,R26791,R28980,R27725,R18040,R27937,mada_1-30,mada_2-31,mada_1-41,R21770,R21839,mada_1-32,R30396,R21363,R20896,mada_102,mada_129,mada_139,mada_151,mada_105,R15311,mada_103,mada_2-25,mada_112,mada_124,mada_126,mada_120,R23571,mada_128,mada_1-12,mada_1-15,mada_2-1,mada_1-53,mada_1-50,mada_123,mada_2-53,mada_1-38,S0070-08,S0085-01,S0107-01,S0089-01,S0256-08,S0123-01,S0106-01,S0262-02,NA,AWALK_Full,AWALK_List,BubbleID,Start_Node,End_Node,NodePath_Trimmed,NumSVNodes
0,NC_000962.3,1533,.,N,<CNV>,30,PASS,NS=152;NA=1;ALEN=0;AC=152;VS=>s1;VE=>s2;AWALK=*;END=1533,GT:GT0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,1,*,['*'],BubbleRegion_1,s1,s2,[],0
1,NC_000962.3,1591,.,N,<CNV>,30,PASS,NS=88;NA=1;ALEN=61;AC=88;VS=>s2;VE=>s5;AWALK=>s3>s4;END=1652,GT:GT0,0:0,0:0,0:0,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,.,0:0,0:0,.,.,.,.,.,.,.,.,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,.,0:0,0:0,.,.,.,.,.,0:0,0:0,0:0,1,>s3>s4,['>s3>s4'],BubbleRegion_2,s2,s5,"['s2034', 's2959', 's2035', 's3', 's2036', 's2036', 's3', 's2035', 's2959', 's2034', 's4']",11
2,NC_000962.3,13622,.,N,<CNV>,30,PASS,"NS=152;NA=2;ALEN=0,1358;AC=151,1;VS=>s5;VE=>s6;AWALK=*,>s2744;END=13622",GT:GT0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,1:1,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,0:0,2,"*,>s2744","['*', '>s2744']",BubbleRegion_3,s5,s6,['s2744'],1


### Parse in `MG_SVInfo_DF`

In [351]:
MG_SVInfo_DF = pd.read_csv(MG_SVInfo_TSV_GZ, sep = "\t" )
MG_SVInfo_DF.shape

(535, 160)

In [352]:
MG_SVInfo_DF.head(3)

Unnamed: 0,Chr,Start,End,Info,FORMAT,NC_000962.3,N0072,N0153,TB3113,TB1236,TB2659,TB2780,TB1612,TB2512,TB2981,TB3091,M0003941_3,TB3368,N0145,N0155,TB2995,TB3396,N0004,N1274,N0054,02_R1179,01_R1134,M0017522_5,M0016395_7,M0010874_7,02_R1708,02_R0894,01_R1430,M0014888_3,02_R1896,TB4620,TB3162,MT_0080,TB3054,TB3251,M0016737_0,TB2661,TB3237,TB3169,TB3386,TB3334,M0011368_9,TB2968,N1272,N1176,N1202,N1177,N0091,RW-TB008,9050-05,4549-04,696-05,702-06,706-05,8129-04,3003-06,8651-04,QC-3,QC-9,QC-5,QC-8,QC-10,QC-4,QC-7,QC-6,QC-1,mada_1-10,mada_2-46,mada_1-1,mada_1-36,mada_1-39,mada_1-51,mada_1-44,mada_117,mada_118,mada_122,mada_107,R27252,R23887,R30215,R30078,R29816,R30234,18_0621851,R36431,R28703,mada_115,mada_2-42,R31095,R28012,R37765,R27657,R25048,R24120,R28581,R29598,mada_1-11,R24100,R21408,R20574,R20260,R18043,R22601,R23146,R32929,R21893,R30420,R26778,R26791,R28980,R27725,R18040,R27937,mada_1-30,mada_2-31,mada_1-41,R21770,R21839,mada_1-32,R30396,R21363,R20896,mada_102,mada_129,mada_139,mada_151,mada_105,R15311,mada_103,mada_2-25,mada_112,mada_124,mada_126,mada_120,R23571,mada_128,mada_1-12,mada_1-15,mada_2-1,mada_1-53,mada_1-50,mada_123,mada_2-53,mada_1-38,S0070-08,S0085-01,S0107-01,S0089-01,S0256-08,S0123-01,S0106-01,S0262-02,NA,BubbleNum,BubbleID
0,NC_000962.3,1533,1533,NS=152;NA=1;ALEN=0;AC=152;VS=>s1;VE=>s2;AWALK=*,GT:CSTRAND:CTG:CS:CE,0:+:NC_000962.3:1531:1601,0:+:N0072:1531:1601,0:+:N0153:1531:1601,0:+:TB3113:1531:2959,0:+:TB1236:1531:2959,0:+:TB2659:1531:2959,0:+:TB2780:1531:2959,0:+:TB1612:1531:2959,0:+:TB2512:1531:2959,0:+:TB2981:1531:2959,0:+:TB3091:1531:2959,0:+:M0003941_3:1531:2959,0:+:TB3368:1531:2959,0:+:N0145:1531:2959,0:+:N0155:1531:2959,0:+:TB2995:1531:2959,0:+:TB3396:1531:2959,0:+:N0004:1531:1601,0:+:N1274:1531:1601,0:+:N0054:1531:1601,0:+:02_R1179:1531:1601,0:+:01_R1134:1531:1601,0:+:M0017522_5:1531:1601,0:+:M0016395_7:1531:1601,0:+:M0010874_7:1531:1601,0:+:02_R1708:1531:1601,0:+:02_R0894:1531:1601,0:+:01_R1430:1531:1601,0:+:M0014888_3:1531:1601,0:+:02_R1896:1531:1601,0:+:TB4620:1531:1601,0:+:TB3162:1531:1601,0:+:MT_0080:1531:1601,0:+:TB3054:1531:1601,0:+:TB3251:1531:1601,0:+:M0016737_0:1531:1601,0:+:TB2661:1531:1601,0:+:TB3237:1531:1601,0:+:TB3169:1531:1601,0:+:TB3386:1531:1601,0:+:TB3334:1531:1601,0:+:M0011368_9:1531:1601,0:+:TB2968:1531:1601,0:+:N1272:1531:1601,0:+:N1176:1531:1601,0:+:N1202:1531:1601,0:+:N1177:1531:1601,0:+:N0091:1531:1601,0:+:RW-TB008:1531:1601,0:+:9050-05:1531:2960,0:+:4549-04:1531:2959,0:+:696-05:1531:2961,0:+:702-06:1531:2960,0:+:706-05:1531:2959,0:+:8129-04:1531:2960,0:+:3003-06:1531:2960,0:+:8651-04:1531:2959,0:+:QC-3:1531:1601,0:+:QC-9:1531:1601,0:+:QC-5:1531:1601,0:+:QC-8:1531:1601,0:+:QC-10:1531:1601,0:+:QC-4:1531:1601,0:+:QC-7:1531:1601,0:+:QC-6:1531:1601,0:+:QC-1:1531:1601,0:+:mada_1-10:1531:1601,0:+:mada_2-46:1531:1601,0:+:mada_1-1:1531:1601,0:+:mada_1-36:1531:1601,0:+:mada_1-39:1531:1601,0:+:mada_1-51:1531:1601,0:+:mada_1-44:1531:1601,0:+:mada_117:1531:1601,0:+:mada_118:1531:1601,0:+:mada_122:1531:1601,0:+:mada_107:1531:1601,0:+:R27252:1531:1601,0:+:R23887:1531:1601,0:+:R30215:1531:2959,0:+:R30078:1531:2959,0:+:R29816:1531:2959,0:+:R30234:1531:2959,0:+:18_0621851:1531:2959,0:+:R36431:1531:2959,0:+:R28703:1531:2959,0:+:mada_115:1531:2959,0:+:mada_2-42:1531:2959,0:+:R31095:1531:2959,0:+:R28012:1531:2959,0:+:R37765:1531:2959,0:+:R27657:1531:2959,0:+:R25048:1531:2959,0:+:R24120:1531:2959,0:+:R28581:1531:2959,0:+:R29598:1531:2959,0:+:mada_1-11:1531:2959,0:+:R24100:1531:2959,0:+:R21408:1531:2959,0:+:R20574:1531:2959,0:+:R20260:1531:2959,0:+:R18043:1531:2959,0:+:R22601:1531:2959,0:+:R23146:1531:2959,0:+:R32929:1531:2959,0:+:R21893:1531:2959,0:+:R30420:1531:2959,0:+:R26778:1531:2959,0:+:R26791:1531:2959,0:+:R28980:1531:2959,0:+:R27725:1531:2959,0:+:R18040:1531:2959,0:+:R27937:1531:2959,0:+:mada_1-30:1531:1601,0:+:mada_2-31:1531:1601,0:+:mada_1-41:1531:1601,0:+:R21770:1531:1601,0:+:R21839:1531:1601,0:+:mada_1-32:1531:1601,0:+:R30396:1531:1601,0:+:R21363:1531:1601,0:+:R20896:1531:1601,0:+:mada_102:1531:1601,0:+:mada_129:1531:1601,0:+:mada_139:1531:1601,0:+:mada_151:1531:1601,0:+:mada_105:1531:1601,0:+:R15311:1531:1601,0:+:mada_103:1531:1601,0:+:mada_2-25:1531:1601,0:+:mada_112:1531:1601,0:+:mada_124:1531:1601,0:+:mada_126:1531:1601,0:+:mada_120:1531:1601,0:+:R23571:1531:1600,0:+:mada_128:1531:1600,0:+:mada_1-12:1531:1600,0:+:mada_1-15:1531:1600,0:+:mada_2-1:1531:1600,0:+:mada_1-53:1531:1600,0:+:mada_1-50:1531:1600,0:+:mada_123:1531:2958,0:+:mada_2-53:1531:1600,0:+:mada_1-38:1531:1601,0:+:S0070-08:1531:2959,0:+:S0085-01:1531:2959,0:+:S0107-01:1531:2959,0:+:S0089-01:1531:2959,0:+:S0256-08:1531:2959,0:+:S0123-01:1531:1601,0:+:S0106-01:1531:1601,0:+:S0262-02:1531:1601,1,1,BubbleRegion_1
1,NC_000962.3,1591,1652,NS=88;NA=1;ALEN=61;AC=88;VS=>s2;VE=>s5;AWALK=>s3>s4,GT:CSTRAND:CTG:CS:CE,0:+:NC_000962.3:1531:1654,0:+:N0072:1531:1654,0:+:N0153:1531:1654,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0:+:N0004:1531:1654,0:+:N1274:1531:1654,0:+:N0054:1531:1654,0:+:02_R1179:1531:1654,0:+:01_R1134:1531:1654,0:+:M0017522_5:1531:1654,0:+:M0016395_7:1531:1654,0:+:M0010874_7:1531:1654,0:+:02_R1708:1531:1654,0:+:02_R0894:1531:1654,0:+:01_R1430:1531:1654,0:+:M0014888_3:1531:1654,0:+:02_R1896:1531:1654,0:+:TB4620:1531:1654,0:+:TB3162:1531:1654,0:+:MT_0080:1531:1654,0:+:TB3054:1531:1654,0:+:TB3251:1531:1654,0:+:M0016737_0:1531:1654,0:+:TB2661:1531:1654,0:+:TB3237:1531:1654,0:+:TB3169:1531:1654,0:+:TB3386:1531:1654,0:+:TB3334:1531:1654,0:+:M0011368_9:1531:1654,0:+:TB2968:1531:1654,0:+:N1272:1531:1654,0:+:N1176:1531:1654,0:+:N1202:1531:1654,.,0:+:N0091:1531:1654,0:+:RW-TB008:1531:1654,.,.,.,.,.,.,.,.,0:+:QC-3:1531:1654,0:+:QC-9:1531:1654,0:+:QC-5:1531:1654,0:+:QC-8:1531:1654,0:+:QC-10:1531:1654,0:+:QC-4:1531:1654,0:+:QC-7:1531:1654,0:+:QC-6:1531:1654,0:+:QC-1:1531:1654,0:+:mada_1-10:1531:1654,0:+:mada_2-46:1531:1654,0:+:mada_1-1:1531:1654,0:+:mada_1-36:1531:1654,0:+:mada_1-39:1531:1654,0:+:mada_1-51:1531:1654,0:+:mada_1-44:1531:1654,0:+:mada_117:1531:1654,0:+:mada_118:1531:1654,0:+:mada_122:1531:1654,0:+:mada_107:1531:1654,0:+:R27252:1531:1654,0:+:R23887:1531:1654,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0:+:mada_2-31:1531:1654,0:+:mada_1-41:1531:1654,0:+:R21770:1531:1654,0:+:R21839:1531:1654,0:+:mada_1-32:1531:1654,0:+:R30396:1531:1654,0:+:R21363:1531:1654,0:+:R20896:1531:1654,0:+:mada_102:1531:1654,0:+:mada_129:1531:1654,0:+:mada_139:1531:1654,0:+:mada_151:1531:1654,0:+:mada_105:1531:1654,0:+:R15311:1531:1654,0:+:mada_103:1531:1654,0:+:mada_2-25:1531:1654,0:+:mada_112:1531:1654,0:+:mada_124:1531:1654,0:+:mada_126:1531:1654,0:+:mada_120:1531:1654,0:+:R23571:1531:1653,0:+:mada_128:1531:1653,0:+:mada_1-12:1531:1653,0:+:mada_1-15:1531:1653,0:+:mada_2-1:1531:1653,0:+:mada_1-53:1531:1653,0:+:mada_1-50:1531:1653,.,0:+:mada_2-53:1531:1653,0:+:mada_1-38:1531:1654,.,.,.,.,.,0:+:S0123-01:1531:1654,0:+:S0106-01:1531:1654,0:+:S0262-02:1531:1654,1,2,BubbleRegion_2
2,NC_000962.3,13622,13622,"NS=152;NA=2;ALEN=0,1358;AC=151,1;VS=>s5;VE=>s6;AWALK=*,>s2744",GT:CSTRAND:CTG:CS:CE,0:+:NC_000962.3:13614:13631,0:+:N0072:13614:13631,0:+:N0153:13615:13632,0:+:TB3113:14972:14989,0:+:TB1236:14972:14989,0:+:TB2659:14972:14989,0:+:TB2780:14972:14989,0:+:TB1612:14972:14989,0:+:TB2512:14972:14989,0:+:TB2981:14972:14989,0:+:TB3091:14972:14989,0:+:M0003941_3:14972:14989,0:+:TB3368:14972:14988,0:+:N0145:14972:14989,0:+:N0155:14972:14989,0:+:TB2995:14972:14989,0:+:TB3396:14972:14989,0:+:N0004:13614:13631,0:+:N1274:13614:13631,0:+:N0054:13614:13631,0:+:02_R1179:13614:13631,0:+:01_R1134:13614:13631,0:+:M0017522_5:13614:13631,0:+:M0016395_7:13614:13631,0:+:M0010874_7:13614:13631,0:+:02_R1708:13614:13631,0:+:02_R0894:13614:13631,0:+:01_R1430:13614:13642,0:+:M0014888_3:13614:13631,0:+:02_R1896:13614:13631,0:+:TB4620:13614:13631,0:+:TB3162:13614:13631,0:+:MT_0080:13614:13631,0:+:TB3054:13614:13631,0:+:TB3251:13614:13631,0:+:M0016737_0:13614:13631,0:+:TB2661:13614:13631,0:+:TB3237:13614:13631,0:+:TB3169:13614:13631,0:+:TB3386:13614:13631,0:+:TB3334:13614:13631,0:+:M0011368_9:13614:13631,0:+:TB2968:13614:13631,0:+:N1272:13614:13631,0:+:N1176:13614:13631,0:+:N1202:13614:13631,0:+:N1177:14971:14988,0:+:N0091:13614:13631,0:+:RW-TB008:13614:13631,0:+:9050-05:14973:15001,1:+:4549-04:14972:16347,0:+:696-05:14974:14991,0:+:702-06:14973:14990,0:+:706-05:14972:14989,0:+:8129-04:14973:14990,0:+:3003-06:14973:14990,0:+:8651-04:14972:14989,0:+:QC-3:13614:13631,0:+:QC-9:13614:13631,0:+:QC-5:13614:13631,0:+:QC-8:13614:13631,0:+:QC-10:13614:13631,0:+:QC-4:13614:13631,0:+:QC-7:13614:13631,0:+:QC-6:13614:13631,0:+:QC-1:13614:13631,0:+:mada_1-10:13614:13631,0:+:mada_2-46:13614:13631,0:+:mada_1-1:13614:13631,0:+:mada_1-36:13614:13631,0:+:mada_1-39:13614:13631,0:+:mada_1-51:13614:13631,0:+:mada_1-44:13614:13631,0:+:mada_117:13614:13631,0:+:mada_118:13614:13631,0:+:mada_122:13614:13631,0:+:mada_107:13614:13631,0:+:R27252:13614:13631,0:+:R23887:13614:13631,0:+:R30215:14972:14989,0:+:R30078:14972:14989,0:+:R29816:14972:14989,0:+:R30234:14972:14989,0:+:18_0621851:14972:14989,0:+:R36431:14972:14989,0:+:R28703:14972:14989,0:+:mada_115:14972:14989,0:+:mada_2-42:14972:14989,0:+:R31095:14972:14989,0:+:R28012:14972:14989,0:+:R37765:14971:14988,0:+:R27657:14972:14989,0:+:R25048:14972:14989,0:+:R24120:14972:14989,0:+:R28581:14972:14989,0:+:R29598:14972:14989,0:+:mada_1-11:14972:14989,0:+:R24100:14972:14989,0:+:R21408:14972:14989,0:+:R20574:14972:14989,0:+:R20260:14972:14989,0:+:R18043:14972:14989,0:+:R22601:14972:14989,0:+:R23146:14972:14989,0:+:R32929:14972:14989,0:+:R21893:14972:14989,0:+:R30420:14972:14989,0:+:R26778:14972:14989,0:+:R26791:14972:14989,0:+:R28980:14972:14989,0:+:R27725:14972:14989,0:+:R18040:14972:14989,0:+:R27937:14972:14989,0:+:mada_1-30:14972:14989,0:+:mada_2-31:13614:13631,0:+:mada_1-41:13614:13631,0:+:R21770:13614:13631,0:+:R21839:13614:13631,0:+:mada_1-32:13614:13631,0:+:R30396:13614:13631,0:+:R21363:13614:13631,0:+:R20896:13614:13631,0:+:mada_102:13614:13631,0:+:mada_129:13614:13631,0:+:mada_139:13614:13631,0:+:mada_151:13614:13631,0:+:mada_105:13614:13631,0:+:R15311:13614:13631,0:+:mada_103:13614:13631,0:+:mada_2-25:13614:13631,0:+:mada_112:13614:13631,0:+:mada_124:13614:13631,0:+:mada_126:13614:13631,0:+:mada_120:13614:13631,0:+:R23571:13613:13630,0:+:mada_128:13612:13629,0:+:mada_1-12:13613:13630,0:+:mada_1-15:13613:13630,0:+:mada_2-1:13613:13630,0:+:mada_1-53:13613:13630,0:+:mada_1-50:13613:13630,0:+:mada_123:14971:14988,0:+:mada_2-53:13613:13630,0:+:mada_1-38:13614:13631,0:+:S0070-08:14972:14989,0:+:S0085-01:14972:14989,0:+:S0107-01:14972:14989,0:+:S0089-01:14972:14989,0:+:S0256-08:14972:14989,0:+:S0123-01:13614:13631,0:+:S0106-01:13614:13631,0:+:S0262-02:13614:13631,2,3,BubbleRegion_3


In [353]:
MG_Nodes_KmerComp_DF["Jaccard_Cont_WiRv"].describe()

count    3138.000000
mean        0.721741
std         0.435274
min         0.000000
25%         0.036899
50%         1.000000
75%         1.000000
max         1.000000
Name: Jaccard_Cont_WiRv, dtype: float64

In [354]:
MG_Nodes_KmerComp_DF["Jaccard_Cont_WiRv_InsSeqAndPhages"].describe()

count    3138.000000
mean        0.102290
std         0.282077
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Jaccard_Cont_WiRv_InsSeqAndPhages, dtype: float64

In [355]:
MG_Nodes_KmerComp_DF["Jaccard_Cont_WiIS6110"].describe()

count    3138.000000
mean        0.092373
std         0.273029
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Jaccard_Cont_WiIS6110, dtype: float64