# Analysis & comparison of SV pan-genome graph construction results (Mtb-151CI)

### Import Statements

In [1]:
import numpy as np
import pandas as pd
import scipy.stats

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# https://bioframe.readthedocs.io/en/latest/guide-intervalops.html
import bioframe as bf


In [3]:
import time

In [4]:
import ast

#### Pandas Viewing Settings

In [5]:
# Raise pandas' display limits (up to 500 rows / 500 columns, 1000-character
# line width) so the wide node-summary tables below are rendered in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Import/parse processed H37Rv genome annotations

In [6]:
# Repo-relative directory holding reference files (relative to this notebook's location)
RepoRef_Dir = "../../References"

# Path to the processed H37Rv gene annotation table
AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"
H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"

## H37Rv Gene Annotations TSV
# Full annotation table, plus a subset that drops the coordinate columns
# (Chrom, Start, End, Strand) and keeps only the descriptive gene columns.
H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep = "\t")
H37Rv_GeneInfo_Subset_DF = H37Rv_GenomeAnno_Genes_DF[["H37rv_GeneID", "Symbol", "Feature", "Functional_Category", "Is_Pseudogene", "Product", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]

# Lookup dicts built from column pairs: gene ID -> symbol, symbol -> functional category.
# NOTE(review): if a key value occurs in more than one row, the last row wins.
RvID_To_Symbol_Dict = dict(H37Rv_GeneInfo_Subset_DF[['H37rv_GeneID', 'Symbol']].values)
Symbol_To_FuncCat_Dict = dict(H37Rv_GeneInfo_Subset_DF[['Symbol', 'Functional_Category']].values)


In [7]:
H37Rv_GenomeAnno_Genes_DF.head(3)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,NotExcluded
1,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,information pathways,No,DNA polymerase III (beta chain) DnaN (DNA nucl...,,NotExcluded
2,NC_000962.3,3279,4437,+,Rv0003,recF,CDS,information pathways,No,DNA replication and repair protein RecF (singl...,,NotExcluded


# Parse sample metadata & preprocessed genome info/results

In [8]:
# Repo-relative data directory and the sub-directory with the input-assembly TSVs
Repo_DataDir = "../../Data"
InputAsmPath_Dir = f"{Repo_DataDir}/231121.InputAsmTSVs.MtbSetV3.151CI"

# TSV of assembly FASTA paths, and TSV with the per-assembly summary
# (the latter is parsed as the sample metadata table in the next cell).
MtbSetV3_151CI_InputAsmPATHs_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"
MtbSetV3_151CI_AsmSumm_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"

## Parse sample Metadata (N = 151)

In [9]:
# Per-sample assembly summary table (one row per sample)
WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep = "\t")

# Sample IDs in table order; both names refer to the same list object
SampleIDs_151CI_SOI = list( WGA151CI_AsmSummary_DF["SampleID"].values )
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

# SampleID -> metadata lookups (primary lineage, sub-lineage, source dataset tag)
ID_To_PrimLineage_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values)
ID_To_SubLineage_Dict = dict( WGA151CI_AsmSummary_DF[["SampleID", "Lineage"]].values)
ID_To_Dataset_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values)
WGA151CI_AsmSummary_DF.shape

(151, 7)

# Define output dir of the Mtb-WGA-SMK processing pipeline

In [10]:
# Root output directory of the Mtb-WGA-SMK pipeline.
# NOTE(review): hard-coded absolute path on a specific filesystem; the notebook
# will not find these outputs elsewhere unless this is edited.
WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

# Output directory of the 151CI run; the second name is an alias for the same path
WGA151CI_SMK_OutputDir = WGA_SMK_Outputs_Dir + "/231121_MtbSetV3_151CI"
MtbWGA_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir

### Define paths to processed analysis of SV Pan-genome graph (minigraph)

In [11]:
# Directory with the pre-computed minigraph SV pan-genome graph analysis tables.
# NOTE(review): repeats the "../../Data" prefix already stored in Repo_DataDir.
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV5"

# Per-node k-mer comparison summary (parsed below into MG_Nodes_KmerComp_DF)
MG_Node_KmerComp_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.NodeKmerComp.Summary.V1.tsv.gz" 

# All-vs-all node k-mer comparison table
MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.NodeKmerComp.AllVsAll.V1.tsv.gz"     

# Bubble summary table
MG_BubbleSumm_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleSummary.BED.tsv.gz"     

# Bubble allele info: SV VCF-derived table
MG_SVVCF_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleAlleleInfo.SVVCF.tsv.gz" 

# Bubble allele info: per-SV info table
MG_SVInfo_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleAlleleInfo.SVInfo.tsv.gz" 


### Parse in `MG_Nodes_KmerComp_DF`

In [12]:
# Minigraph per-node k-mer comparison summary (one row per graph node)
MG_Nodes_KmerComp_DF = pd.read_csv(MG_Node_KmerComp_TSV_GZ, sep = "\t" )
MG_Nodes_KmerComp_DF.shape

(3138, 18)

In [13]:
MG_Nodes_KmerComp_DF.head()

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID,MaxJC_ToOtherNode
0,s1,False,1533,1.0,0.0,0.0,0.0,0.994012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,s2,False,58,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,s3,True,56,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2,0.961538
3,s4,True,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2,0.0
4,s5,False,11970,1.0,0.0,0.0,0.0,0.612312,0.044724,0.141457,0.007286,0.0,0.0,0.0,0.0,0.0,,0.0


# Parse processed Pangraph results

In [14]:
# Pangraph node summary, run "T1" (presumably one of three Pangraph parameter settings — TODO confirm).
# NOTE(review): read relative to the notebook's working directory, unlike the repo-relative paths above.
PG_T1_NodeSummary_DF = pd.read_csv("T1_NodeSummary_DF.csv")
PG_T1_NodeSummary_DF.shape

(1119, 9)

In [15]:
# Pangraph node summary, run "T2" (presumably a second Pangraph parameter setting — TODO confirm).
# NOTE(review): read relative to the notebook's working directory.
PG_T2_NodeSummary_DF = pd.read_csv("T2_NodeSummary_DF.csv")
PG_T2_NodeSummary_DF.shape

(1341, 9)

In [16]:
# Pangraph node summary, run "T3" (presumably a third Pangraph parameter setting — TODO confirm).
# NOTE(review): read relative to the notebook's working directory.
PG_T3_NodeSummary_DF = pd.read_csv("T3_NodeSummary_DF.csv")
PG_T3_NodeSummary_DF.shape

(1481, 9)

# Begin analysis & comparison (Pangraph vs Minigraph)

In [17]:
PG_T1_NodeSummary_DF.head(4)

Unnamed: 0,NodeID,SeqLength,N_Asms,N_Occurrences,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,IsSVNode,MaxJC_ToOtherNode
0,JISCTWOKQZ,9501,152,152,0.993454,0.0,0.0,False,0.0
1,IKRXJRHUBC,644,17,17,0.0,0.0,0.0,True,0.084691
2,MHUQRHINBO,1095,1,1,0.0,0.0,0.0,True,0.0
3,MOJKXXEYYU,3213,152,152,1.0,0.0,0.0,False,0.0


### What is the breakdown of SV vs Core Nodes in each graph?

In [18]:
MG_Nodes_KmerComp_DF["IsSVNode"].value_counts()

True     2602
False     536
Name: IsSVNode, dtype: int64

In [19]:
PG_T1_NodeSummary_DF["IsSVNode"].value_counts()

True     649
False    470
Name: IsSVNode, dtype: int64

In [20]:
PG_T2_NodeSummary_DF["IsSVNode"].value_counts()

True     845
False    496
Name: IsSVNode, dtype: int64

In [21]:
PG_T3_NodeSummary_DF["IsSVNode"].value_counts()

True     957
False    524
Name: IsSVNode, dtype: int64

### What is the cumulative sequence LENGTH breakdown of SV vs Core Nodes in each graph?

In [22]:
MG_Nodes_KmerComp_DF.groupby("IsSVNode")["SeqLength"].sum()

IsSVNode
False    3913140
True     1283223
Name: SeqLength, dtype: int64

In [23]:
PG_T1_NodeSummary_DF.groupby("IsSVNode")["SeqLength"].sum()

IsSVNode
False    3886407
True      387800
Name: SeqLength, dtype: int64

In [24]:
PG_T2_NodeSummary_DF.groupby("IsSVNode")["SeqLength"].sum()

IsSVNode
False    3971389
True      465526
Name: SeqLength, dtype: int64

In [25]:
PG_T3_NodeSummary_DF.groupby("IsSVNode")["SeqLength"].sum()

IsSVNode
False    3964632
True      468998
Name: SeqLength, dtype: int64

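### How much SV Node sequence is repeated within the graph versus UNIQUE? (summed length of SV nodes >= 31 bp; repeated = `MaxJC_ToOtherNode` >= 0.05, unique = `MaxJC_ToOtherNode` < 0.05)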

In [26]:
MG_Nodes_KmerComp_DF.query("IsSVNode == True & SeqLength >= 31 & MaxJC_ToOtherNode < 0.05")["SeqLength"].sum()   

301511

In [27]:
MG_Nodes_KmerComp_DF.query("IsSVNode == True & SeqLength >= 31 & MaxJC_ToOtherNode >= 0.05")["SeqLength"].sum()   

975039

In [28]:
PG_T1_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & MaxJC_ToOtherNode < 0.05")["SeqLength"].sum()   

368933

In [29]:
PG_T1_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & MaxJC_ToOtherNode >= 0.05")["SeqLength"].sum()   

18822

In [30]:
PG_T2_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & MaxJC_ToOtherNode < 0.05")["SeqLength"].sum()   

410195

In [31]:
PG_T2_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & MaxJC_ToOtherNode >= 0.05")["SeqLength"].sum()   

55312

In [32]:
PG_T3_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & MaxJC_ToOtherNode < 0.05")["SeqLength"].sum()   

419457

In [33]:
PG_T3_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & MaxJC_ToOtherNode >= 0.05")["SeqLength"].sum()   

49522

### How much SV Node sequence is NOVEL relative to H37Rv? (summed length of SV nodes >= 31 bp; novel = `Jaccard_Cont_WiRv` < 0.05)

In [34]:
MG_Nodes_KmerComp_DF.query("IsSVNode == True & SeqLength >= 31 & Jaccard_Cont_WiRv < 0.05")["SeqLength"].sum()

66621

In [35]:
MG_Nodes_KmerComp_DF.query("IsSVNode == True & SeqLength >= 31 & Jaccard_Cont_WiRv >= 0.05")["SeqLength"].sum()

1209929

In [36]:
PG_T1_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & Jaccard_Cont_WiRv < 0.05")["SeqLength"].sum()   

62305

In [37]:
PG_T1_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & Jaccard_Cont_WiRv >= 0.05")["SeqLength"].sum()   

325450

In [38]:
PG_T2_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & Jaccard_Cont_WiRv < 0.05")["SeqLength"].sum()   

68434

In [39]:
PG_T2_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & Jaccard_Cont_WiRv >= 0.05")["SeqLength"].sum()

397073

In [40]:
PG_T3_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & Jaccard_Cont_WiRv < 0.05")["SeqLength"].sum()   

69058

In [41]:
PG_T3_NodeSummary_DF.query("IsSVNode == True & SeqLength >= 31 & Jaccard_Cont_WiRv >= 0.05")["SeqLength"].sum()   

399921

# Summary Analysis - Round 2

In [61]:
def summarize_Pangenome_Node_Stats(node_summary_df, min_seq_len=31, jc_threshold=0.05):
    """
    Summarize node counts and cumulative sequence lengths of a pan-genome graph.

    Nodes are split into core nodes (IsSVNode == False) and SV nodes
    (IsSVNode == True). SV nodes of at least `min_seq_len` bp are further
    classified with `jc_threshold`:

    * UnqInGraph     : MaxJC_ToOtherNode <  jc_threshold
    * RepeatInGraph  : MaxJC_ToOtherNode >= jc_threshold
    * UnqToH37Rv     : Jaccard_Cont_WiRv <  jc_threshold

    SV nodes shorter than `min_seq_len` are counted in the "SV_All" totals but
    in none of the threshold-based categories, so the category counts need not
    add up to N_SV_All.

    Args:
        node_summary_df (pd.DataFrame): One row per graph node. Must contain the
            columns "IsSVNode", "SeqLength", "MaxJC_ToOtherNode" and
            "Jaccard_Cont_WiRv".
        min_seq_len (int): Minimum SeqLength (inclusive) for an SV node to be
            classified. Defaults to 31 (presumably the k-mer size behind the
            Jaccard columns — TODO confirm).
        jc_threshold (float): Cutoff applied to both "MaxJC_ToOtherNode" and
            "Jaccard_Cont_WiRv". Defaults to 0.05.

    Returns:
        dict: For each category in (CoreNodes, SV_All, SV_UnqInGraph,
        SV_RepeatInGraph, SV_UnqToH37Rv), in that order, the keys
        "N_<category>" (number of nodes) and "SeqLength_<category>"
        (sum of SeqLength over those nodes).
    """
    core_nodes = node_summary_df.query("IsSVNode == False")
    sv_nodes = node_summary_df.query("IsSVNode == True")

    # Only SV nodes of at least min_seq_len bp enter the threshold-based categories
    sv_nodes_pass = sv_nodes.query("SeqLength >= @min_seq_len")

    # (label, node subset) pairs; the order fixes the key order of the result
    node_categories = [
        ("CoreNodes", core_nodes),
        ("SV_All", sv_nodes),
        ("SV_UnqInGraph", sv_nodes_pass.query("MaxJC_ToOtherNode < @jc_threshold")),
        ("SV_RepeatInGraph", sv_nodes_pass.query("MaxJC_ToOtherNode >= @jc_threshold")),
        ("SV_UnqToH37Rv", sv_nodes_pass.query("Jaccard_Cont_WiRv < @jc_threshold")),
    ]

    stats = {}
    for label, nodes in node_categories:
        stats[f"N_{label}"] = nodes.shape[0]
        stats[f"SeqLength_{label}"] = nodes["SeqLength"].sum()

    return stats


In [80]:
# Node statistics for every graph: the Minigraph graph and the three Pangraph runs
summary_stats_Dict = {"Minigraph"   : summarize_Pangenome_Node_Stats(MG_Nodes_KmerComp_DF),
                      "Pangraph_T1" : summarize_Pangenome_Node_Stats(PG_T1_NodeSummary_DF),
                      "Pangraph_T2" : summarize_Pangenome_Node_Stats(PG_T2_NodeSummary_DF),
                      "Pangraph_T3" : summarize_Pangenome_Node_Stats(PG_T3_NodeSummary_DF) }

# One single-row DataFrame per method. "Method" is placed first when the row is
# built, so no column reordering is needed afterwards, and the per-method dicts
# in summary_stats_Dict are left unmodified (re-running the cell is idempotent).
SummStats_DF_List = [
    pd.DataFrame([{"Method": method, **method_stats}])
    for method, method_stats in summary_stats_Dict.items()
]

# ignore_index=True gives the combined table a unique 0..N-1 index instead of
# repeating the label 0 for every row.
All_Stats_DF = pd.concat(SummStats_DF_List, axis = 0, ignore_index = True)

All_Stats_DF.shape

(4, 11)

In [81]:
All_Stats_DF

Unnamed: 0,Method,N_CoreNodes,SeqLength_CoreNodes,N_SV_All,SeqLength_SV_All,N_SV_UnqInGraph,SeqLength_SV_UnqInGraph,N_SV_RepeatInGraph,SeqLength_SV_RepeatInGraph,N_SV_UnqToH37Rv,SeqLength_SV_UnqToH37Rv
0,Minigraph,536,3913140,2602,1283223,463,301511,1562,975039,160,66621
0,Pangraph_T1,470,3886407,649,387800,532,368933,115,18822,83,62305
0,Pangraph_T2,496,3971389,845,465526,579,410195,265,55312,95,68434
0,Pangraph_T3,524,3964632,957,468998,630,419457,326,49522,102,69058


In [71]:
SummStats_DF_List[0]

{'N_CoreNodes': 536,
 'SeqLength_CoreNodes': 3913140,
 'N_SV_All': 2602,
 'SeqLength_SV_All': 1283223,
 'N_SV_UnqInGraph': 463,
 'SeqLength_SV_UnqInGraph': 301511,
 'N_SV_RepeatInGraph': 1562,
 'SeqLength_SV_RepeatInGraph': 975039,
 'N_SV_UnqToH37Rv': 160,
 'SeqLength_SV_UnqToH37Rv': 66621,
 'Method': 'Minigraph'}

In [73]:
SummStats_DF_List

[]

In [62]:
# Node statistics of the Minigraph graph
summary_stats = summarize_Pangenome_Node_Stats(MG_Nodes_KmerComp_DF)

# Print one "name: value" line per statistic
for key in summary_stats:
    value = summary_stats[key]
    print(f"{key}: {value}")

N_CoreNodes: 536
SeqLength_CoreNodes: 3913140
N_SV_All: 2602
SeqLength_SV_All: 1283223
N_SV_UnqInGraph: 463
SeqLength_SV_UnqInGraph: 301511
N_SV_RepeatInGraph: 1562
SeqLength_SV_RepeatInGraph: 975039
N_SV_UnqToH37Rv: 160
SeqLength_SV_UnqToH37Rv: 66621


In [63]:
# Node statistics of the Pangraph T1 graph
summary_stats = summarize_Pangenome_Node_Stats(PG_T1_NodeSummary_DF)

# Print one "name: value" line per statistic
for key in summary_stats:
    value = summary_stats[key]
    print(f"{key}: {value}")

N_CoreNodes: 470
SeqLength_CoreNodes: 3886407
N_SV_All: 649
SeqLength_SV_All: 387800
N_SV_UnqInGraph: 532
SeqLength_SV_UnqInGraph: 368933
N_SV_RepeatInGraph: 115
SeqLength_SV_RepeatInGraph: 18822
N_SV_UnqToH37Rv: 83
SeqLength_SV_UnqToH37Rv: 62305


In [64]:
# Node statistics of the Pangraph T2 graph
summary_stats = summarize_Pangenome_Node_Stats(PG_T2_NodeSummary_DF)

# Print one "name: value" line per statistic
for key in summary_stats:
    value = summary_stats[key]
    print(f"{key}: {value}")

N_CoreNodes: 496
SeqLength_CoreNodes: 3971389
N_SV_All: 845
SeqLength_SV_All: 465526
N_SV_UnqInGraph: 579
SeqLength_SV_UnqInGraph: 410195
N_SV_RepeatInGraph: 265
SeqLength_SV_RepeatInGraph: 55312
N_SV_UnqToH37Rv: 95
SeqLength_SV_UnqToH37Rv: 68434


In [65]:
# Node statistics of the Pangraph T3 graph
summary_stats = summarize_Pangenome_Node_Stats(PG_T3_NodeSummary_DF)

# Print one "name: value" line per statistic
for key in summary_stats:
    value = summary_stats[key]
    print(f"{key}: {value}")


N_CoreNodes: 524
SeqLength_CoreNodes: 3964632
N_SV_All: 957
SeqLength_SV_All: 468998
N_SV_UnqInGraph: 630
SeqLength_SV_UnqInGraph: 419457
N_SV_RepeatInGraph: 326
SeqLength_SV_RepeatInGraph: 49522
N_SV_UnqToH37Rv: 102
SeqLength_SV_UnqToH37Rv: 69058


In [46]:
MG_Nodes_KmerComp_DF.head(1)

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID,MaxJC_ToOtherNode
0,s1,False,1533,1.0,0.0,0.0,0.0,0.994012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


In [42]:
PG_T1_NodeSummary_DF.head(2)

Unnamed: 0,NodeID,SeqLength,N_Asms,N_Occurrences,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,IsSVNode,MaxJC_ToOtherNode
0,JISCTWOKQZ,9501,152,152,0.993454,0.0,0.0,False,0.0
1,IKRXJRHUBC,644,17,17,0.0,0.0,0.0,True,0.084691


In [43]:
PG_T1_NodeSummary_DF["N_Asms"].describe()

count    1119.000000
mean      104.250223
std        68.062977
min         0.000000
25%         1.000000
50%       151.000000
75%       152.000000
max       152.000000
Name: N_Asms, dtype: float64

In [44]:
PG_T1_NodeSummary_DF["N_Occurrences"].describe()

count    1119.000000
mean      104.250223
std        68.062977
min         0.000000
25%         1.000000
50%       151.000000
75%       152.000000
max       152.000000
Name: N_Occurrences, dtype: float64