In [1]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm
%matplotlib inline


In [127]:
%reload_ext autoreload
%autoreload 2

### import panqc (pan-genome quality control) toolkit functions
# from panqc.ava import ava
# from panqc.nscluster import clusterBy_KmerJC, summarize_NSClusters, create_MaxKmerSim_JC_Dict, create_MST_FiltByJC, make_ClusterID_Maps 
# from panqc.nscluster import make_NS_ClusterMerged_Pres_DF

from panqc.kmerlib import read_kmers_from_file_ToHashesDict

from panqc.utils import parse_PresAbs_Rtab, parse_PresAbs_CSV_Roary, parse_PresAbs_CSV_Panaroo, get_PG_Stats_FromPresAbs

from panqc.utils import  parse_PG_Ref_FA, get_PG_Stats_FromDNASeqPresAbs

# from panqc.asm_gene_search import parse_AlnHits_To_DF
# from panqc.asm_gene_search import PresAbsQC_CheckAsmForGeneSeq, SRAsm_PresAbsQC_CheckInLRAsm
# from panqc.asm_gene_search import get_SRAsm_Vs_LRAsm_QCStats


In [128]:
def read_kmers_from_file_ToHashesDict(filename, ksize):

    all_hashes_Set_Dict = {}
    seqLen_Dict = {}
    
    NumParsedRecords = 0
    
    for record in screed.open(filename):
        
        # Check if a space exists in the name before splitting
        if " " in record.name:
            ShortName = record.name.split(" ", 1)[1]
        else:
            ShortName = record.name
            
        NumParsedRecords += 1
        sequence = record.sequence

        kmers = build_kmers(sequence, ksize)
        hashes_Set = hash_kmers_ToSet(kmers)
        
        all_hashes_Set_Dict[ShortName] = hashes_Set
        seqLen_Dict[ShortName] = len(sequence)

    print(NumParsedRecords, " total records were parsed")
    
    return all_hashes_Set_Dict, seqLen_Dict

In [3]:
import gfapy
import ast

In [4]:
import time

In [5]:
import screed

In [6]:
import mappy as mp

In [7]:
# Set max column width to a specific value (e.g., 100 characters)
pd.set_option('display.max_colwidth', 100)
# Set to display a specific number of columns (e.g., 20 columns)
pd.set_option('display.max_columns', 180)

## Define useful Kmer analysis functions

In [8]:
import screed

In [9]:
import mmh3

In [10]:
def build_kmers(sequence, ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1
    
    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)
        
    return kmers

In [11]:
#import screed a library for reading in FASTA/FASTQ

def read_kmers_from_file(filename, ksize):
    all_kmers = []
    for record in screed.open(filename):
        sequence = record.sequence
        
        kmers = build_kmers(sequence, ksize)
        all_kmers += kmers

    return all_kmers

In [12]:
def hash_kmer(kmer):
    # calculate the reverse complement
    rc_kmer = screed.rc(kmer)
    
    # determine whether original k-mer or reverse complement is lesser
    if kmer < rc_kmer:
        canonical_kmer = kmer
    else:
        canonical_kmer = rc_kmer
        
    # calculate murmurhash using a hash seed of 42
    hash = mmh3.hash64(canonical_kmer, 42)[0]
    if hash < 0: hash += 2**64

    return hash

In [13]:
# def hash_kmers(kmers):
#     hashes = []
#     for kmer in kmers:
#         hashes.append(hash_kmer(kmer))
#     return hashes

def hash_kmers_ToSet(kmers):
    hashes = set()
    for kmer in kmers:
        hashes.add(hash_kmer(kmer))
    return hashes

In [14]:

def jaccard_containment_FromSets(a, b):
    '''
    This function returns the Jaccard Containment between sets a and b.
    '''
    
    intersection = len(a.intersection(b))
    
    return intersection / len(a)

def jaccard_similarity_FromSets(a, b):
    '''
    This function returns the Jaccard Similarity between sets a and b.
    '''
    intersection = len(a.intersection(b))
    union = len(a.union(b))
    
    return intersection / union


In [15]:
def getAllHash_ExceptTargets_Set_V2(dictOfHashes, targetsToRemove):
    # Convert targetsToRemove to a set for faster lookup
    targetsToRemoveSet = set(targetsToRemove)

    # Use set comprehension for more efficient construction of the result set
    return {hash for seqID, seqInfoDict in dictOfHashes.items() if seqID not in targetsToRemoveSet
            for hash in seqInfoDict["Kmer_Hashes_Set"]}

In [16]:
def getAllHash_InTargetSeqs_Set(dictOfHashes, targetsToKeep):
    
    all_Hashes_InTarget = set()

    for i_SeqID, i_SeqInfoDict in dictOfHashes.items():
        
        i_Hashes = i_SeqInfoDict["Kmer_Hashes_Set"]
        
        if i_SeqID not in targetsToKeep:
            all_Hashes_InTarget.update(i_Hashes) 
    
    return all_Hashes_InTarget

## Import/parse processed H37rv genome annotations

In [17]:
RepoRef_Dir = "../../References"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"
H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"

## H37Rv Gene Annotations TSV
H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep = "\t")
H37Rv_GeneInfo_Subset_DF = H37Rv_GenomeAnno_Genes_DF[["H37rv_GeneID", "Symbol", "Feature", "Functional_Category", "Is_Pseudogene", "Product", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]

RvID_To_Symbol_Dict = dict(H37Rv_GeneInfo_Subset_DF[['H37rv_GeneID', 'Symbol']].values)
Symbol_To_FuncCat_Dict = dict(H37Rv_GeneInfo_Subset_DF[['Symbol', 'Functional_Category']].values)


# Part 2: Generate reference k-mer sets (ie H37Rv, IS6110, Phages + ISs)  

## Generate k-mer info for H37Rv and a representative IS6110 sequence 

In [18]:
Mtb_RefDir="/n/data1/hms/dbmi/farhat/mm774/References"

H37rv_Ref_FA_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.fasta"
IS6110_Example_FA_PATH = f"{Mtb_RefDir}/IS6110_From_Rv0795_Rv0796.DNA.fasta"

#### H37Rv - k-mer generation & hashing

In [19]:
H37Rv_kmers = read_kmers_from_file(H37rv_Ref_FA_PATH, 31)

H37Rv_Hashes_Set = hash_kmers_ToSet(H37Rv_kmers)

print(len(H37Rv_kmers))

4411502


#### IS6110 (Rv0795 & Rv0796) - k-mer generation & hashing

In [20]:
IS6110_Ex1_kmers = read_kmers_from_file(IS6110_Example_FA_PATH, 31)

IS6110_Ex1_Hashes_Set = hash_kmers_ToSet(IS6110_Ex1_kmers)

print(len(IS6110_Ex1_kmers))

1254


## Generate k-mer info for all H37Rv gene DNA sequences (Mycobrowser)

In [21]:
from Bio import SeqIO


In [22]:
O2_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"
MycoBrowser_RefFiles_Dir = f"{O2_RefDir}/190619_Mycobrowser_H37rv_ReferenceFiles"

H37Rv_Genes_MycoBro_FA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_genes_v3.fasta"


In [23]:
!grep ^">" $H37Rv_Genes_MycoBro_FA | grep "dnaA"

>Rv0001|dnaA|CDS|1-1524|+|Chromosomal replication initiator protein DnaA


### Get 31-mer hashes for all annotated gene DNA sequences

In [24]:
dictOf_H37Rv_MycoBrow_GeneSeq = {}
dictOf_H37Rv_MycoBrow_Gene_KmerHashes = {}

for index, record in tqdm(enumerate(SeqIO.parse(H37Rv_Genes_MycoBro_FA, "fasta"))):
    
    RecordName = record.name
    RvID = RecordName.split("|")[0]
    GeneID = RecordName.split("|")[1]
    S_Seq = str(record.seq).upper()
    
    dictOf_H37Rv_MycoBrow_GeneSeq[GeneID] = S_Seq

    record_Hashes_Set = hash_kmers_ToSet(build_kmers(S_Seq, 31))

    dictOf_H37Rv_MycoBrow_Gene_KmerHashes[GeneID] = record_Hashes_Set
    

4187it [00:30, 135.33it/s]


In [25]:
len(dictOf_H37Rv_MycoBrow_GeneSeq["dnaA"])

1524

In [26]:
list(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["dnaA"])[:2]

[13580233940393664509, 5138456728421695490]

## Generate k-mer info for each gene category annotation in H37Rv

In [27]:
def getAllHashes_InTargetSeqs(dictOfHashes, targetsToKeep):
    
    all_Hashes_InTarget = set()

    for i_SeqID, i_Hashes in dictOfHashes.items():
                
        if i_SeqID in targetsToKeep:
            all_Hashes_InTarget.update(i_Hashes) 
    
    return all_Hashes_InTarget

In [28]:
Rv_Gene_Category_List = list(H37Rv_GenomeAnno_Genes_DF["Functional_Category"].unique())

RvGeneCat_To_Symbol_Dict = {}
RvGeneCat_To_RvID_Dict = {}
RvGeneCat_To_KmerHashes_Dict = {}

for i_GeneCat in Rv_Gene_Category_List:
    
    Genes_Subset_DF = H37Rv_GenomeAnno_Genes_DF.query(f"Functional_Category == '{i_GeneCat}'")     
    
    Subset_GeneSymbols = Genes_Subset_DF["Symbol"].values
    Subset_RvIDs = Genes_Subset_DF["H37rv_GeneID"].values

    N_Genes = len(Subset_GeneSymbols)
    

    RvGeneCat_To_Symbol_Dict[i_GeneCat] = Subset_GeneSymbols
    RvGeneCat_To_RvID_Dict[i_GeneCat] = Subset_RvIDs


    i_GeneCat_Hashes_Set = getAllHashes_InTargetSeqs(dictOf_H37Rv_MycoBrow_Gene_KmerHashes,
                                                     Subset_GeneSymbols)   

    RvGeneCat_To_KmerHashes_Dict[i_GeneCat] = i_GeneCat_Hashes_Set
    
    print(i_GeneCat, N_Genes, len(list(i_GeneCat_Hashes_Set)) )


information pathways 242 265284
conserved hypotheticals 1042 723221
cell wall and cell processes 772 783261
stable RNAs 48 6735
intermediary metabolism and respiration 936 1011332
regulatory proteins 198 161826
virulence, detoxification, adaptation 239 150509
insertion seqs and phages 147 69102
lipid metabolism 272 407894
PE/PPE 168 265028
unknown 15 7938


In [29]:
Rv_PEPPEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['PE/PPE']

In [30]:
Rv_PEPPEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['PE/PPE']
Rv_MGEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['insertion seqs and phages']

In [31]:
print(len(Rv_MGEs_Hashes_Set))

69102


# Parse sample metadata & preprocessed genome info/results

In [32]:
!pwd

/n/data1/hms/dbmi/farhat/mm774/Snakemake_Pipelines/mtb-pg-benchmarking-2024paper/Analysis/PartC_Mtb_PG_Eval


In [33]:
#!ls -1 ../../Data

## Parse sample Metadata (N = 151)

In [34]:
Repo_DataDir = "../../Data"
InputAsmPath_Dir = f"{Repo_DataDir}/231121.InputAsmTSVs.MtbSetV3.151CI"

MtbSetV3_151CI_InputAsmPATHs_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"

MtbSetV3_151CI_AsmSumm_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"

WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep = "\t")

SampleIDs_151CI_SOI = list( WGA151CI_AsmSummary_DF["SampleID"].values )
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

ID_To_PrimLineage_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values)
ID_To_SubLineage_Dict = dict( WGA151CI_AsmSummary_DF[["SampleID", "Lineage"]].values)
ID_To_Dataset_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values)
WGA151CI_AsmSummary_DF.shape

(151, 7)

## PARSE PATHs FOR ALL assemblies processed by this pipeline

In [35]:
WGA151CI_LRandSR_Asm_Path_DF = pd.read_csv(MtbSetV3_151CI_InputAsmPATHs_TSV, sep = "\t")
print(WGA151CI_LRandSR_Asm_Path_DF.columns)
WGA151CI_LRandSR_Asm_Path_DF.columns = ['SampleID', 'Dataset_Tag',
                                        'Genome_LR_ASM_PATH', 'Genome_SR_ASM_PATH']


Index(['SampleID', 'Dataset_Tag', 'Genome_ASM_PATH',
       'ShortRead_Genome_ASM_PATH'],
      dtype='object')


In [36]:
WGA151CI_LRandSR_Asm_Path_DF.head(1)

Unnamed: 0,SampleID,Dataset_Tag,Genome_LR_ASM_PATH,Genome_SR_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/...,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/...


#### Create Dict of Asm FA PATHs

In [37]:

LR_AsmFA_Dict = dict(WGA151CI_LRandSR_Asm_Path_DF[['SampleID', 'Genome_LR_ASM_PATH']].values)
SR_AsmFA_Dict = dict(WGA151CI_LRandSR_Asm_Path_DF[['SampleID', 'Genome_SR_ASM_PATH']].values)


### Define Phylo order of samples:

In [38]:
OrderOfSampleIDs_Phylo = ['N0153', 'N0072', 'mada_2-46', 'mada_1-44', 'mada_107',
                          'mada_1-1', 'mada_1-51', 'mada_1-39', 'mada_1-36',
                          'mada_117', 'mada_122', 'mada_118', 'mada_1-10', 'R27252',
                          'R23887', 'TB3091', '9050-05', '3003-06', '702-06', '696-05',
                          '8651-04', 'TB3396', '4549-04', 'TB1612', 'TB2780', 'TB3368',
                          'TB1236', 'TB2659', '8129-04', 'R30215', 'R25048', 'TB2512',
                          'TB2981', 'TB2995', 'TB3113', '706-05', 'R30078', 'R28012',
                          'R27657', 'R30234', 'R31095', 'R28703', 'R24120', 'R36431',
                          'R29816', 'S0070-08', 'N0155', 'N0145', 'R29598', 'R24100',
                          'S0107-01', 'R28581', 'S0256-08', 'S0085-01', 'S0089-01',
                          'mada_1-11', 'M0003941_3', 'mada_115', 'mada_2-42', 'R37765',
                          '18_0621851', 'R22601', 'R27937', 'R18040', 'R18043', 'R27725',
                          'R26791', 'R20574', 'R20260', 'R21408', 'R23146', 'R28980', 'R32929',
                          'R26778', 'R30420', 'R21893', 'QC-9', 'QC-5', 'QC-3', 'N0004',
                          'mada_1-30', 'N0054', 'N1274', '01_R1134', 'TB2968', 'mada_1-53',
                          'mada_2-53', 'mada_1-50', 'mada_2-1', 'R23571', 'mada_123',
                          'mada_1-12', 'mada_1-15', 'mada_128', 'mada_1-38', 'TB3054',
                          'mada_126', 'mada_120', 'TB4620', 'M0016737_0', 'M0016395_7',
                          'R15311', 'TB2661', 'TB3386', 'TB3162', '02_R1179', 'M0010874_7',
                          'QC-7', 'QC-6', 'QC-1', '01_R1430', 'M0011368_9', '02_R1896',
                          'mada_2-25', 'TB3237', 'mada_103', 'mada_112', 'mada_124',
                          'S0123-01', 'S0262-02', 'TB3251', 'M0017522_5', 'R30396', 'R20896',
                          'mada_1-32', 'S0106-01', 'R21839', 'R21363', 'R21770', 'MT_0080','mada_102',
                          'TB3334', 'M0014888_3', 'mada_151', 'TB3169', 'mada_105', 'QC-8',
                          'QC-10', 'QC-4', 'mada_129', 'mada_139', '02_R1708', '02_R0894',
                          'mada_2-31', 'mada_1-41', 'N1272', 'N1176', 'N1202', 'N0091',
                          'N1177','RW-TB008']


In [39]:
WGA151CI_AsmSummary_DF.head(5)

Unnamed: 0,SampleID,numContigs_Complete,Flye_CircContig_Cov,PrimaryLineage,Lineage,Dataset_Tag,AsmApproach
0,N0072,1,358,lineage1,"lineage1,lineage1.1,lineage1.1.2",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
1,N0153,1,372,lineage1,"lineage1,lineage1.1,lineage1.1.1,lineage1.1.1.1",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
2,TB3113,1,933,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
3,TB1236,1,374,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
4,TB2659,1,421,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon


#### Make sample lineage & color mapping

In [40]:
# Dictionary for lineage to color mapping
LinToColor_Dict = {
    "lineage1": "#DF83AC",
    "lineage2": "#7098CB",
    "lineage3": "#815D9F",
    "lineage4": "#E76956",
    "lineage5": "#B67548",
    "lineage6": "#6AB79E",
    "lineage8": "#E4515B",
    "None": "black",
}

# Extracting the mapping between IsolateID and PrimaryLineage_Ill
lineage_mapping = WGA151CI_AsmSummary_DF.set_index('SampleID')['PrimaryLineage'].to_dict()

# Creating a color mapping for the samples
sample_colors = {sample: LinToColor_Dict.get(lineage, "black") for sample, lineage in lineage_mapping.items()}


# Define output dir of the Mtb-WGA-SMK processing pipeline

In [41]:
# Define varaint calling pipeline output directories

WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

WGA151CI_SMK_OutputDir = WGA_SMK_Outputs_Dir + "/231121_MtbSetV3_151CI"

MtbWGA_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir


# Parse Minigraph info

### Define PATHS relevant to Minigraph analysis

In [42]:
target_OutputDir = MtbWGA_SMK_Pipeline_OutputDir
Minigraph_151CI_OutDir = f"{target_OutputDir}/Minigraph"

MG_WGA151CI_GFA = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.V1.gfa"
MG_WGA151CI_Bubble_SV_BED = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.V1.Bubble.SV.bed"
MG_WGA151CI_Stable_FA = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.V1.Stable.fa"

MG_WGA151CI_MergedSVInfo_TSV = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.MergedSV.Info.tsv"
MG_WGA151CI_MergedSVInfo_SVVCF = f"{Minigraph_151CI_OutDir}/Minigraph_H37rv_Vs_151CI.MergedSV.Info.svvcf"


In [43]:
Minigraph_151CI_OutDir

'/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/Minigraph'

#### use `gfatools` to print general stats of the SV graph (rGFA format)

In [44]:
!gfatools stat $MG_WGA151CI_GFA

Number of segments: 3138
Number of links: 4705
Number of arcs: 9410
Max rank: 129
Total segment length: 5196363
Average segment length: 1655.947
Sum of rank-0 segment lengths: 4411532
Max degree: 8
Average degree: 1.499
[M::main] Version: 0.5-r292-dirty
[M::main] CMD: gfatools stat /n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/Minigraph/Minigraph_H37rv_Vs_151CI.V1.gfa
[M::main] Real time: 0.024 sec; CPU: 0.062 sec


### Define paths to processed analysis of SV Pan-genome graph (minigraph)

In [45]:
PangenomeAnalysis_Dir = "../../Data/MtbPangenomeAnalysis_SetV5"

MG_Node_KmerComp_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.NodeKmerComp.Summary.V1.tsv.gz" 

MG_AvA_Node_KmerAnalysis_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.NodeKmerComp.AllVsAll.V1.tsv.gz"     

MG_BubbleSumm_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleSummary.BED.tsv.gz"     

MG_SVVCF_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleAlleleInfo.SVVCF.tsv.gz" 

MG_SVInfo_TSV_GZ = f"{PangenomeAnalysis_Dir}/MtbSVPG.Minigraph.BubbleAlleleInfo.SVInfo.tsv.gz" 


### Parse in `AvA_Nodes_DF`

In [46]:
AvA_Nodes_DF = pd.read_csv(MG_AvA_Node_KmerAnalysis_TSV_GZ, sep = "\t" )
AvA_Nodes_DF.shape

(123104, 6)

In [47]:
AvA_Nodes_DF.head()

Unnamed: 0,RecordID_1,RecordID_2,Record1_Len,Record2_Len,JaccardSim,JaccardContain
0,s3,s2959,56,61,0.78125,0.961538
1,s7,s2247,1876,42,0.006501,0.006501
2,s8,s2823,542,87,0.096339,0.097656
3,s13,s2037,24063,49,0.000208,0.000208
4,s13,s2039,24063,114,0.000208,0.000208


### Parse in `MG_Nodes_KmerVsRefSets_DF`

In [48]:
MG_Nodes_KmerComp_DF = pd.read_csv(MG_Node_KmerComp_TSV_GZ, sep = "\t" )
MG_Nodes_KmerComp_DF.shape

(3138, 18)

In [49]:
MG_Nodes_KmerComp_DF.head()

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID,MaxJC_ToOtherNode
0,s1,False,1533,1.0,0.0,0.0,0.0,0.994012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,s2,False,58,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,s3,True,56,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2,0.961538
3,s4,True,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2,0.0
4,s5,False,11970,1.0,0.0,0.0,0.0,0.612312,0.044724,0.141457,0.007286,0.0,0.0,0.0,0.0,0.0,,0.0


### Parse in `MG_SV_BED_DF`

In [50]:
import ast

In [51]:
MG_SV_BED_DF = pd.read_csv(MG_BubbleSumm_TSV_GZ, sep = "\t" )

# Convert the string representation of the list of nodes to python list of nodes
MG_SV_BED_DF["NodePath_Trimmed"] = MG_SV_BED_DF["NodePath_Trimmed"].apply(ast.literal_eval)

MG_SV_BED_DF.shape

(535, 14)

In [52]:
MG_SV_BED_DF.head(3)

Unnamed: 0,Chr,Start,End,Len_Ref,Len_Alt,NodePath,NodePath_Trimmed,Start_Node,End_Node,BubbleNum,BubbleID,NumSVNodes,Overlap_Genes,Overlap_Gene_RvIDs
0,NC_000962.3,1533,1533,0,0,"s1,s2",[],s1,s2,1,BubbleRegion_1,0,,
1,NC_000962.3,1591,1652,61,1480,"s2,s2034,s2959,s2035,s3,s2036,s2036,s3,s2035,s2959,s2034,s4,s5","[s2034, s2959, s2035, s3, s2036, s2036, s3, s2035, s2959, s2034, s4]",s2,s5,2,BubbleRegion_2,11,,
2,NC_000962.3,13622,13622,0,1358,"s5,s2744,s6",[s2744],s5,s6,3,BubbleRegion_3,1,,


### Parse in `MG_SVVCF_DF`

In [53]:
MG_SVVCF_DF = pd.read_csv(MG_SVVCF_TSV_GZ, sep = "\t" )
MG_SVVCF_DF.shape

(535, 169)

### Parse in `MG_SVInfo_DF`

In [54]:
MG_SVInfo_DF = pd.read_csv(MG_SVInfo_TSV_GZ, sep = "\t" )
MG_SVInfo_DF.shape

(535, 160)

In [55]:
MG_SV_BED_DF["NodePath_Trimmed"].values[:2]

array([list([]),
       list(['s2034', 's2959', 's2035', 's3', 's2036', 's2036', 's3', 's2035', 's2959', 's2034', 's4'])],
      dtype=object)

In [56]:
MG_SV_BED_DF.head(4)

Unnamed: 0,Chr,Start,End,Len_Ref,Len_Alt,NodePath,NodePath_Trimmed,Start_Node,End_Node,BubbleNum,BubbleID,NumSVNodes,Overlap_Genes,Overlap_Gene_RvIDs
0,NC_000962.3,1533,1533,0,0,"s1,s2",[],s1,s2,1,BubbleRegion_1,0,,
1,NC_000962.3,1591,1652,61,1480,"s2,s2034,s2959,s2035,s3,s2036,s2036,s3,s2035,s2959,s2034,s4,s5","[s2034, s2959, s2035, s3, s2036, s2036, s3, s2035, s2959, s2034, s4]",s2,s5,2,BubbleRegion_2,11,,
2,NC_000962.3,13622,13622,0,1358,"s5,s2744,s6",[s2744],s5,s6,3,BubbleRegion_3,1,,
3,NC_000962.3,26469,28345,42,1876,"s6,s2247,s7,s8","[s2247, s7]",s6,s8,4,BubbleRegion_4,2,"Rv0021c,whiB5,Rv0023","Rv0021c,Rv0022c,Rv0023"


### Create Dict of SV Node ID to BubbleID mappings

In [57]:
NodeID_ToBubbleID_Dict = {}

for i, row in MG_SV_BED_DF.iterrows():
    
    i_BubbleID =  row["BubbleID"]
    i_NodePath_Trimmed = row["NodePath_Trimmed"]

    # Map all SV nodes to their Bubble Region ID
    for NodeID in i_NodePath_Trimmed:
        NodeID_ToBubbleID_Dict[NodeID] = i_BubbleID
        

In [58]:
list(NodeID_ToBubbleID_Dict.items())[:10]

[('s2034', 'BubbleRegion_2'),
 ('s2959', 'BubbleRegion_2'),
 ('s2035', 'BubbleRegion_2'),
 ('s3', 'BubbleRegion_2'),
 ('s2036', 'BubbleRegion_2'),
 ('s4', 'BubbleRegion_2'),
 ('s2744', 'BubbleRegion_3'),
 ('s2247', 'BubbleRegion_4'),
 ('s7', 'BubbleRegion_4'),
 ('s2823', 'BubbleRegion_5')]

## Create dictionary of BubbleID to overlapping H37Rv gene annotations

In [59]:
BubbleID_To_OvrLapGenes_Dict = MG_SV_BED_DF.set_index("BubbleID")["Overlap_Genes"].to_dict()

#### Peak at the genes overlapping the first 10 bubble regions

In [60]:
list(BubbleID_To_OvrLapGenes_Dict.items())[:10]

[('BubbleRegion_1', nan),
 ('BubbleRegion_2', nan),
 ('BubbleRegion_3', nan),
 ('BubbleRegion_4', 'Rv0021c,whiB5,Rv0023'),
 ('BubbleRegion_5', 'Rv0024'),
 ('BubbleRegion_6', 'bioF2'),
 ('BubbleRegion_7', nan),
 ('BubbleRegion_8', nan),
 ('BubbleRegion_9', nan),
 ('BubbleRegion_10', 'Rv0063')]

### Preparation - Breakdown core and SV nodes

In [61]:
MG_CoreNodes_All_DF = MG_Nodes_KmerComp_DF.query("IsSVNode == False")

MG_SVNodes_All_DF = MG_Nodes_KmerComp_DF.query("IsSVNode == True")
MG_SVNodes_PASS_DF = MG_Nodes_KmerComp_DF.query("IsSVNode == True").query("SeqLength >= 31")     
MG_SVNodes_Sub31bp_DF = MG_Nodes_KmerComp_DF.query("IsSVNode == True").query("SeqLength < 31")     

MG_SVNodes_UnqSeq_DF = MG_SVNodes_PASS_DF.query("MaxJC_ToOtherNode < 0.05")

MG_SVNodes_UnqSeq_UnqToRv_DF = MG_SVNodes_UnqSeq_DF.query("Jaccard_Cont_WiRv < 0.05")

SVNodeIDs_UnqSeq = MG_SVNodes_UnqSeq_DF["NodeID"].values

SVNodeIDs_UnqSeq_And_UnqToRv = MG_SVNodes_UnqSeq_UnqToRv_DF["NodeID"].values


In [62]:
MG_SVNodes_UnqSeq_DF.head()

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID,MaxJC_ToOtherNode
6,s7,True,1876,1.0,0.0,0.0,0.0,0.0,0.206934,0.0,0.0,0.0,0.601842,0.0,0.0,0.0,BubbleRegion_4,0.006501
15,s16,True,904,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_11,0.0
17,s18,True,732,1.0,0.0,0.0,0.0,0.040698,0.0,0.0,0.0,0.059593,0.0,0.0,0.0,0.0,BubbleRegion_12,0.0
18,s19,True,687,1.0,0.0,0.0,0.0,0.922374,0.045662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_12,0.045662
20,s21,True,2557,1.0,0.0,0.0,0.0,0.0,0.103285,0.784725,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_12,0.003166


In [63]:
MG_SVNodes_UnqSeq_UnqToRv_DF.head()

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID,MaxJC_ToOtherNode
1746,s1747,True,527,0.024145,0.0,0.0,0.024145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_31,0.006036
1747,s1748,True,72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_31,0.0
1748,s1749,True,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_31,0.0
1749,s1750,True,96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_31,0.0
1769,s1770,True,533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_82,0.0


## Parse k-mer info for all nodes of `minigraph` GFA (151 Mtb assemblies)

In [64]:
def parse_rGFA_To_KmerInfo(i_Minigraph_GFA):

    GFA_GP = gfapy.Gfa.from_file(i_Minigraph_GFA)

    i_dictOf_NodeInfo = {}

    for line in tqdm(GFA_GP.lines):
        line_Str = str(line)
        
        if line_Str.startswith("S"):
            
            line_SplitByTab = line_Str.split("\t")
            
            S_Name = line_SplitByTab[1]
            
            S_Seq = line_SplitByTab[2]
            
            Len_Seq = len(S_Seq)
            
            record_Kmers = build_kmers(S_Seq, 31)
            
            record_Hashes = hash_kmers_ToSet(record_Kmers)

            i_dictOf_NodeInfo[S_Name] = {}
            i_dictOf_NodeInfo[S_Name]["Len"] = Len_Seq
            i_dictOf_NodeInfo[S_Name]["Kmers"] = record_Kmers
            i_dictOf_NodeInfo[S_Name]["Kmer_Hashes_Set"] = record_Hashes

    return i_dictOf_NodeInfo

## Generate dict of SV PG graph node info (length, k-mers, k-mer hashes)

#### For each node of the graph we have:
- sequence length
- all unique 31-mers
- all hashes of all unique canonical 31-mers 

In [65]:
MG_dictOf_NodeInfo = parse_rGFA_To_KmerInfo(MG_WGA151CI_GFA)

100%|██████████| 7843/7843 [00:42<00:00, 185.82it/s] 


#### Inspect resulting dict of node info (length, k-mers, k-mer hashes)

In [66]:
len(list(MG_dictOf_NodeInfo.keys()))

3138

In [67]:
# For each node of the graph we have its sequence length, unique 31-mers, unique hashes of the canonical 31-mer 

MG_dictOf_NodeInfo["s1"].keys()

dict_keys(['Len', 'Kmers', 'Kmer_Hashes_Set'])

In [68]:
MG_dictOf_NodeInfo["s1"]["Len"]

1533

In [69]:
MG_dictOf_NodeInfo["s1"]["Kmers"][:3]

['TTGACCGATGACCCCGGTTCAGGCTTCACCA',
 'TGACCGATGACCCCGGTTCAGGCTTCACCAC',
 'GACCGATGACCCCGGTTCAGGCTTCACCACA']

In [70]:
list(MG_dictOf_NodeInfo["s1"]["Kmer_Hashes_Set"])[:2]

[13580233940393664509, 5138456728421695490]

In [71]:
MG_dictOf_NodeInfo["s1"].keys()

dict_keys(['Len', 'Kmers', 'Kmer_Hashes_Set'])

In [72]:
MG_SVNodes_UnqSeq_HashDict = {}
MG_SVNodes_UnqSeqAndUnqToRv_HashDict = {}

for i_NodeID in MG_dictOf_NodeInfo.keys():
    
    if i_NodeID in SVNodeIDs_UnqSeq:
        MG_SVNodes_UnqSeq_HashDict[i_NodeID] =  MG_dictOf_NodeInfo[i_NodeID]["Kmer_Hashes_Set"]
    
    if i_NodeID in SVNodeIDs_UnqSeq_And_UnqToRv:
        MG_SVNodes_UnqSeqAndUnqToRv_HashDict[i_NodeID] =  MG_dictOf_NodeInfo[i_NodeID]["Kmer_Hashes_Set"]


In [73]:
len(SVNodeIDs_UnqSeq)

463

In [74]:
len(list(MG_SVNodes_UnqSeq_HashDict.keys()))

463

In [75]:
len(SVNodeIDs_UnqSeq_And_UnqToRv)

76

In [76]:
len(list(MG_SVNodes_UnqSeqAndUnqToRv_HashDict.keys()))

76

In [77]:
# MG_SVNodes_UnqSeq_HashDict
# MG_SVNodes_UnqSeqAndUnqToRv_HashDict

# Classify SV Nodes based on match to H37Rv gene categories

### Define functions

In [78]:
# Define the Jaccard index columns corresponding to gene categories
listOf_JC_Cols = [
    "Jaccard_Cont_WiRv_InsSeqAndPhages",
    "Jaccard_Cont_WiRv_PEPPEs",
    "Jaccard_Cont_WiRv_InfoPathways",
    "Jaccard_Cont_WiRv_ConservedHypo",
    "Jaccard_Cont_WiRv_CellWallCellProc",
    "Jaccard_Cont_WiRv_StableRNAs",
    "Jaccard_Cont_WiRv_InterMetabolism",
    "Jaccard_Cont_WiRv_RegProteins",
    "Jaccard_Cont_WiRv_VirulenceDetoxAdaptation",
    "Jaccard_Cont_WiRv_LipidMetabolism",
    "Jaccard_Cont_WiRv_Unknown"
]

listOf_JC_Cols = [
    "Jaccard_Cont_WiRv_InsSeqAndPhages",
    "Jaccard_Cont_WiRv_PEPPEs",
    "Jaccard_Cont_WiRv_InfoPathways",
    "Jaccard_Cont_WiRv_ConservedHypo",
    "Jaccard_Cont_WiRv_CellWallCellProc",
    "Jaccard_Cont_WiRv_StableRNAs",
    "Jaccard_Cont_WiRv_InterMetabolism",
    "Jaccard_Cont_WiRv_RegProteins",
    "Jaccard_Cont_WiRv_VirulenceDetoxAdaptation",
    "Jaccard_Cont_WiRv_LipidMetabolism",
]

JC_Cols_ToCategoryName = {
    "Jaccard_Cont_WiRv_PEPPEs": "PE/PPE",
    "Jaccard_Cont_WiRv_CellWallCellProc": "cell wall and cell processes",
    "Jaccard_Cont_WiRv_ConservedHypo": "conserved hypotheticals",
    "Jaccard_Cont_WiRv_InfoPathways": "information pathways",
    "Jaccard_Cont_WiRv_InsSeqAndPhages": "insertion seqs and phages",
    "Jaccard_Cont_WiRv_StableRNAs": "stable RNAs",
    "Jaccard_Cont_WiRv_InterMetabolism": "intermediary metabolism and respiration",
    "Jaccard_Cont_WiRv_RegProteins": "regulatory proteins",
    "Jaccard_Cont_WiRv_VirulenceDetoxAdaptation": "virulence, detoxification, adaptation",
    "Jaccard_Cont_WiRv_LipidMetabolism": "lipid metabolism",
    "Jaccard_Cont_WiRv_Unknown": "unknown"
}


In [79]:
# Define function to classify a node based on a threshold
def classify_node_ToRvGeneCat(row, threshold=0.25):

    listOf_Rv_JCtoGeneCat_Cols = [
        "Jaccard_Cont_WiRv_InsSeqAndPhages",
        "Jaccard_Cont_WiRv_PEPPEs",
        "Jaccard_Cont_WiRv_InfoPathways",
        "Jaccard_Cont_WiRv_ConservedHypo",
        "Jaccard_Cont_WiRv_CellWallCellProc",
        "Jaccard_Cont_WiRv_StableRNAs",
        "Jaccard_Cont_WiRv_InterMetabolism",
        "Jaccard_Cont_WiRv_RegProteins",
        "Jaccard_Cont_WiRv_VirulenceDetoxAdaptation",
        "Jaccard_Cont_WiRv_LipidMetabolism",
    ]
    
    filtered_matches = {col: row[col] for col in listOf_Rv_JCtoGeneCat_Cols if row[col] >= threshold}
    if filtered_matches:
        max_value = max(filtered_matches.values())
        strongest_categories = [
            col for col, value in filtered_matches.items() if value == max_value
        ]
        return strongest_categories
    return None  # No valid matches



# Define function to summarize node categories
def summarize_sv_categories(sv_nodes_df, listOf_JC_Cols):
    category_summary = {}
    for col in listOf_JC_Cols:
        total_length = sv_nodes_df.loc[
            sv_nodes_df["KmerMatch_RvGeneCat"].apply(
                lambda matches: col in matches if matches else False
            ),
            "SeqLength"
        ].sum()
        node_count = sv_nodes_df["KmerMatch_RvGeneCat"].apply(
            lambda matches: col in matches if matches else False
        ).sum()
        category_summary[col] = {"Total Length": total_length, "Node Count": node_count}

    # Add "NoMatch" to the summary
    no_match_length = sv_nodes_df.loc[
        sv_nodes_df["KmerMatch_RvGeneCat"].isnull(), "SeqLength"
    ].sum()
    no_match_count = sv_nodes_df["KmerMatch_RvGeneCat"].isnull().sum()

    category_summary["NoMatch"] = {
        "Total Length": no_match_length,
        "Node Count": no_match_count,
    }

    # Convert to DataFrame
    summary_df = pd.DataFrame.from_dict(category_summary, orient="index")
    summary_df["Relative Size (%)"] = (
        summary_df["Total Length"] / summary_df["Total Length"].sum() * 100
    ).round(1)

    return summary_df


# Update the summarize_sv_categories function to include "Functional_Category"
def summarize_sv_categories_with_functional_category(sv_nodes_df, listOf_JC_Cols):
    category_summary = {}
    category_mapping = {
        "Jaccard_Cont_WiRv_PEPPEs": "PE/PPE",
        "Jaccard_Cont_WiRv_CellWallCellProc": "cell wall and cell processes",
        "Jaccard_Cont_WiRv_ConservedHypo": "conserved hypotheticals",
        "Jaccard_Cont_WiRv_InfoPathways": "information pathways",
        "Jaccard_Cont_WiRv_InsSeqAndPhages": "insertion seqs and phages",
        "Jaccard_Cont_WiRv_StableRNAs": "stable RNAs",
        "Jaccard_Cont_WiRv_InterMetabolism": "intermediary metabolism and respiration",
        "Jaccard_Cont_WiRv_RegProteins": "regulatory proteins",
        "Jaccard_Cont_WiRv_VirulenceDetoxAdaptation": "virulence, detoxification, adaptation",
        "Jaccard_Cont_WiRv_LipidMetabolism": "lipid metabolism",
        "Jaccard_Cont_WiRv_Unknown": "unknown",
    }

    for col in listOf_JC_Cols:
        total_length = sv_nodes_df.loc[
            sv_nodes_df["KmerMatch_RvGeneCat"].apply(
                lambda matches: col in matches if matches else False
            ),
            "SeqLength"
        ].sum()
        
        node_count = sv_nodes_df["KmerMatch_RvGeneCat"].apply(
            lambda matches: col in matches if matches else False
        ).sum()
        
        category_summary[col] = {
            "Functional_Category": category_mapping.get(col, "unknown"),
            "Total Length": total_length,
            "Node Count": node_count,
        }

    # Add "NoMatch" to the summary
    no_match_length = sv_nodes_df.loc[
        sv_nodes_df["KmerMatch_RvGeneCat"].isnull(), "SeqLength"
    ].sum()
    no_match_count = sv_nodes_df["KmerMatch_RvGeneCat"].isnull().sum()

    category_summary["NoMatch"] = {
        "Functional_Category": "NoMatch",
        "Total Length": no_match_length,
        "Node Count": no_match_count,
    }

    # Convert to DataFrame
    summary_df = pd.DataFrame.from_dict(category_summary, orient="index")
    summary_df["Relative Size (%)"] = (
        summary_df["Total Length"] / summary_df["Total Length"].sum() * 100
    ).round(2)

    return summary_df


###  Step 1: Look at relative size of gene categories in H73R

In [80]:
# Correctly reference "Start" and "End" columns to calculate gene lengths
H37Rv_GeneInfo_Subset_DF = H37Rv_GenomeAnno_Genes_DF[
    ["Functional_Category", "Start", "End"]
].copy()
H37Rv_GeneInfo_Subset_DF["Gene_Length"] = (
    H37Rv_GeneInfo_Subset_DF["End"] - H37Rv_GeneInfo_Subset_DF["Start"] + 1
)

# Group by functional category and calculate total length and relative fractions
gene_cat_lengths = (
    H37Rv_GeneInfo_Subset_DF.groupby("Functional_Category")["Gene_Length"]
    .sum()
    .reset_index()
)

gene_cat_lengths["Relative_Fraction"] = (
    gene_cat_lengths["Gene_Length"] / gene_cat_lengths["Gene_Length"].sum()
)

gene_cat_lengths["Percent_RefGeneLengths"] = (gene_cat_lengths["Relative_Fraction"] * 100).round(2)

Gene_Cat_RefPerc = gene_cat_lengths[["Functional_Category", "Percent_RefGeneLengths"]]

In [81]:
Gene_Cat_RefPerc

Unnamed: 0,Functional_Category,Percent_RefGeneLengths
0,PE/PPE,7.01
1,cell wall and cell processes,20.0
2,conserved hypotheticals,18.77
3,information pathways,6.76
4,insertion seqs and phages,2.69
5,intermediary metabolism and respiration,25.75
6,lipid metabolism,10.53
7,regulatory proteins,4.18
8,stable RNAs,0.2
9,unknown,0.21


In [82]:
MG_Nodes_KmerComp_DF.head(1)

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID,MaxJC_ToOtherNode
0,s1,False,1533,1.0,0.0,0.0,0.0,0.994012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


### Step 2: Classify nodes

In [83]:
# Apply classification to the main DataFrame
threshold = 0.5  # Set the classification threshold
MG_Nodes_KmerComp_DF["KmerMatch_RvGeneCat"] = MG_Nodes_KmerComp_DF.apply(
    lambda row: classify_node_ToRvGeneCat(row, threshold), axis=1
)

In [84]:
# Filter SV Nodes based on the queries provided
MG_SVNodes_PASS_DF = MG_Nodes_KmerComp_DF.query("IsSVNode == True and SeqLength >= 31")

MG_SVNodes_UnqSeq_DF = MG_SVNodes_PASS_DF.query("MaxJC_ToOtherNode < 0.05")
MG_SVNodes_UnqSeq_UnqToRv_DF = MG_SVNodes_UnqSeq_DF.query("Jaccard_Cont_WiRv < 0.05")
MG_SVNodes_NoUnqSeq_DF = MG_SVNodes_PASS_DF.query("MaxJC_ToOtherNode >= 0.05")


In [85]:
Gene_Cat_RefPerc

Unnamed: 0,Functional_Category,Percent_RefGeneLengths
0,PE/PPE,7.01
1,cell wall and cell processes,20.0
2,conserved hypotheticals,18.77
3,information pathways,6.76
4,insertion seqs and phages,2.69
5,intermediary metabolism and respiration,25.75
6,lipid metabolism,10.53
7,regulatory proteins,4.18
8,stable RNAs,0.2
9,unknown,0.21


In [86]:
MG_SVNodes_PASS_DF.head(3)

Unnamed: 0,NodeID,IsSVNode,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,BubbleID,MaxJC_ToOtherNode,KmerMatch_RvGeneCat
2,s3,True,56,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_2,0.961538,
6,s7,True,1876,1.0,0.0,0.0,0.0,0.0,0.206934,0.0,0.0,0.0,0.601842,0.0,0.0,0.0,BubbleRegion_4,0.006501,[Jaccard_Cont_WiRv_RegProteins]
15,s16,True,904,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,BubbleRegion_11,0.0,[Jaccard_Cont_WiRv_CellWallCellProc]


In [87]:
# Summarize UNIQUE SV nodes
All_SV_nodes_summary = summarize_sv_categories_with_functional_category(MG_SVNodes_PASS_DF, listOf_JC_Cols)
All_SV_nodes_summary = pd.merge(All_SV_nodes_summary, Gene_Cat_RefPerc, on = "Functional_Category")

# Summarize UNIQUE SV nodes
unique_nodes_summary = summarize_sv_categories_with_functional_category(MG_SVNodes_UnqSeq_DF, listOf_JC_Cols)
unique_nodes_summary = pd.merge(unique_nodes_summary, Gene_Cat_RefPerc, on = "Functional_Category")

# Summarize NON-UNIQUE SV nodes
nonunique_nodes_summary = summarize_sv_categories_with_functional_category(MG_SVNodes_NoUnqSeq_DF, listOf_JC_Cols)
nonunique_nodes_summary = pd.merge(nonunique_nodes_summary, Gene_Cat_RefPerc, on = "Functional_Category")


In [88]:
All_SV_nodes_summary

Unnamed: 0,Functional_Category,Total Length,Node Count,Relative Size (%),Percent_RefGeneLengths
0,insertion seqs and phages,503748,346,39.46,2.69
1,PE/PPE,167166,647,13.1,7.01
2,information pathways,4363,10,0.34,6.76
3,conserved hypotheticals,66735,111,5.23,18.77
4,cell wall and cell processes,42715,73,3.35,20.0
5,stable RNAs,0,0,0.0,0.2
6,intermediary metabolism and respiration,69088,105,5.41,25.75
7,regulatory proteins,13000,21,1.02,4.18
8,"virulence, detoxification, adaptation",10554,7,0.83,3.91
9,lipid metabolism,51437,39,4.03,10.53


In [89]:
unique_nodes_summary

Unnamed: 0,Functional_Category,Total Length,Node Count,Relative Size (%),Percent_RefGeneLengths
0,insertion seqs and phages,14643,7,4.86,2.69
1,PE/PPE,45845,185,15.21,7.01
2,information pathways,4249,8,1.41,6.76
3,conserved hypotheticals,23659,27,7.85,18.77
4,cell wall and cell processes,28466,37,9.44,20.0
5,stable RNAs,0,0,0.0,0.2
6,intermediary metabolism and respiration,40813,43,13.54,25.75
7,regulatory proteins,6849,7,2.27,4.18
8,"virulence, detoxification, adaptation",9697,5,3.22,3.91
9,lipid metabolism,25307,24,8.39,10.53


In [90]:
nonunique_nodes_summary

Unnamed: 0,Functional_Category,Total Length,Node Count,Relative Size (%),Percent_RefGeneLengths
0,insertion seqs and phages,489105,339,50.16,2.69
1,PE/PPE,121321,462,12.44,7.01
2,information pathways,114,2,0.01,6.76
3,conserved hypotheticals,43076,84,4.42,18.77
4,cell wall and cell processes,14249,36,1.46,20.0
5,stable RNAs,0,0,0.0,0.2
6,intermediary metabolism and respiration,28275,62,2.9,25.75
7,regulatory proteins,6151,14,0.63,4.18
8,"virulence, detoxification, adaptation",857,2,0.09,3.91
9,lipid metabolism,26130,15,2.68,10.53


# Parse PG Tool outputs

## Define output dirs of Pangenome Analysis pipelines

In [91]:
target_OutputDir = MtbWGA_SMK_Pipeline_OutputDir

i_Pangenome_Dir = f"{target_OutputDir}/PanGenome_Analysis"


### Define path to Panaroo output files (151 LR genomes, MergeParalogs Parameters)

In [92]:

PG_OutDir_Dict = {   "Panaroo_Strict_MP": f"{i_Pangenome_Dir}/Panaroo_Strict_MergeParalogs_AllIsolates",
                     "Panaroo_Moderate_MP": f"{i_Pangenome_Dir}/Panaroo_Moderate_MergeParalogs_AllIsolates",
                     "Panaroo_Sens_MP": f"{i_Pangenome_Dir}/Panaroo_Sensitive_MergeParalogs_AllIsolates",
                     # "Panaroo_Strict": f"{i_Pangenome_Dir}/Panaroo_Strict_AllIsolates",
                     # "Panaroo_Moderate": f"{i_Pangenome_Dir}/Panaroo_Moderate_AllIsolates",
                     # "Panaroo_Sens": f"{i_Pangenome_Dir}/Panaroo_Sensitive_AllIsolates",
                     "Roary_Default": f"{i_Pangenome_Dir}/Roary_Default_AllIsolates",
                     "Roary_NoSplitParalogs": f"{i_Pangenome_Dir}/Roary_NoSplitParalogs_AllIsolates",
                     "Roary_NoSplitParalogs_I80": f"{i_Pangenome_Dir}/Roary_NoSplitParalogs_I80_AllIsolates",
                     "Roary_NoSplitParalogs_I90": f"{i_Pangenome_Dir}/Roary_NoSplitParalogs_I90_AllIsolates",
                     "SR_Panaroo_Strict_MP": f"{i_Pangenome_Dir}/SR_Panaroo_Strict_MergeParalogs_AllIsolates",
                     "SR_Panaroo_Strict_MP": f"{i_Pangenome_Dir}/SR_Panaroo_Strict_MergeParalogs_AllIsolates",
                     "SR_Panaroo_Sens_MP": f"{i_Pangenome_Dir}/SR_Panaroo_Sensitive_MergeParalogs_AllIsolates",
                     "SR_Roary_Default": f"{i_Pangenome_Dir}/SR_Roary_Default_AllIsolates",
                     "SR_Roary_NoSplitParalogs": f"{i_Pangenome_Dir}/SR_Roary_NoSplitParalogs_AllIsolates",
                     "SR_Roary_NoSplitParalogs_I80": f"{i_Pangenome_Dir}/SR_Roary_NoSplitParalogs_I80_AllIsolates",
                     "SR_Roary_NoSplitParalogs_I90": f"{i_Pangenome_Dir}/SR_Roary_NoSplitParalogs_I90_AllIsolates",}


PG_PresAbs_CSV_PATH_Dict = {}
for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_PresAbs_CSV_PATH_Dict[i_param] = f"{i_outdir}/gene_presence_absence.csv"  
    
PG_PresAbs_Rtab_PATH_Dict = {}
for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_PresAbs_Rtab_PATH_Dict[i_param] = f"{i_outdir}/gene_presence_absence.Rtab"  

PG_GeneRefFA_PATH_Dict = {}
for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_GeneRefFA_PATH_Dict[i_param] = f"{i_outdir}/pan_genome_reference.fa"  

PG_AvA_PATH_Dict = {}
for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_AvA_PATH_Dict[i_param] = f"{i_outdir}/pan_genome_reference.KmerComparison.AllVsAll.MaxJC.tsv"



# Parse in processed data

### A) Parse in processed All vs All Kmer analysis

In [93]:
# AvA_DF_Dict = {}

# for i_Param, AvA_TSV_PATH in PG_AvA_PATH_Dict.items():

#     PG_AvA_DF = pd.read_csv(AvA_TSV_PATH, sep = "\t" )
#     AvA_DF_Dict[i_Param] = PG_AvA_DF


### B) Parse in Gene PresAbs Info

In [94]:
PresAbs_DF_Dict = {}

for i_Param, PresAbs_CSV_PATH in PG_PresAbs_CSV_PATH_Dict.items():

    if "Roary" in i_Param: 
        i_Gene_PresAbs_DF = parse_PresAbs_CSV_Roary(PresAbs_CSV_PATH)
    else:
        i_Gene_PresAbs_DF = parse_PresAbs_CSV_Panaroo(PresAbs_CSV_PATH)

    ### Relabel Columns for presence/absence tracking
    i_Gene_PresAbs_DF.columns = [ x.split(".Bakta")[0] for x in i_Gene_PresAbs_DF.columns ]

    print(i_Param, i_Gene_PresAbs_DF.shape)
    
    PresAbs_DF_Dict[i_Param] = i_Gene_PresAbs_DF
    

  


Panaroo_Strict_MP (4200, 153)


  


Panaroo_Moderate_MP (4280, 153)
Panaroo_Sens_MP (4281, 153)
Roary_Default (5366, 153)


  


Roary_NoSplitParalogs (4366, 153)


  


Roary_NoSplitParalogs_I80 (4252, 153)


  


Roary_NoSplitParalogs_I90 (4293, 153)


  


SR_Panaroo_Strict_MP (4211, 153)
SR_Panaroo_Sens_MP (4600, 153)
SR_Roary_Default (6006, 153)
SR_Roary_NoSplitParalogs (5025, 153)
SR_Roary_NoSplitParalogs_I80 (4866, 153)
SR_Roary_NoSplitParalogs_I90 (4956, 153)


### C) Parse in PG Gene Reference FASTAs for each PG output

In [95]:

PG_RefSeqs_DF_Dict = {}

for i_Param, i_PG_Ref_FA_PATH in PG_GeneRefFA_PATH_Dict.items():

    PG_RefSeqs_DF_Dict[i_Param] = parse_PG_Ref_FA(i_PG_Ref_FA_PATH)

    LR_PG_Ref_IDs = list( PG_RefSeqs_DF_Dict[i_Param].keys())
    print(i_Param, len(LR_PG_Ref_IDs))
    

Panaroo_Strict_MP 4200
Panaroo_Moderate_MP 4280
Panaroo_Sens_MP 4281
Roary_Default 5366
Roary_NoSplitParalogs 4366
Roary_NoSplitParalogs_I80 4252
Roary_NoSplitParalogs_I90 4293
SR_Panaroo_Strict_MP 4211
SR_Panaroo_Sens_MP 4600
SR_Roary_Default 6006
SR_Roary_NoSplitParalogs 5025
SR_Roary_NoSplitParalogs_I80 4866
SR_Roary_NoSplitParalogs_I90 4956


### Define functions for gene sequence classification

In [96]:
# Define function to classify a node based on a threshold
def classify_node(row, i_listOf_JC_Cols, threshold=0.25):
    filtered_matches = {col: row[col] for col in i_listOf_JC_Cols if row[col] >= threshold}
    if filtered_matches:
        max_value = max(filtered_matches.values())
        strongest_categories = [
            col for col, value in filtered_matches.items() if value == max_value
        ]

        if len(strongest_categories) == 1:
            return strongest_categories[0]
        else:
            return tuple(strongest_categories)
            
    return "None"  # No valid matches

In [97]:
def create_simple_gene_df(Ref_DictOf_Hashes, Ref_DictOf_SeqLen, i_N_AsmWiGene_Dict):
    """
    Computes k-mer match Jaccard containment for genes.
    
    Args:
        Ref_DictOf_Hashes (dict): Dictionary mapping GeneID to sets of k-mers (hashes).
        Ref_DictOf_SeqLen (dict): Dictionary mapping GeneID to their sequence lengths.
        category_hash_sets (dict): Dictionary of category names mapping to hash sets.
        N_AsmWiGene_Dict (dict): Dictionary mapping GeneID to number of assemblies matching the gene.
    
    Returns:
        pd.DataFrame: DataFrame summarizing Jaccard containment results for all genes.
    """
    gene_analysis_rows = []

    for GeneID, Gene_Hashes_Set in tqdm(Ref_DictOf_Hashes.items()):
        Len_Seq = Ref_DictOf_SeqLen.get(GeneID, 0)
        record_hashes_set = Gene_Hashes_Set

        # Prepare row for the DataFrame
        row = [GeneID, Len_Seq]
        gene_analysis_rows.append(row)

    # Create the DataFrame
    columns = ["GeneID", "SeqLength"]
    gene_kmer_match_df = pd.DataFrame(gene_analysis_rows, columns=columns)

    # Add the number of assemblies matching the gene
    gene_kmer_match_df["NumAsm_WiGene"] = gene_kmer_match_df["GeneID"].map(i_N_AsmWiGene_Dict)

    return gene_kmer_match_df

In [98]:
def compute_kmer_match_df(Ref_DictOf_Hashes, Ref_DictOf_SeqLen, category_hash_sets, N_AsmWiGene_Dict):
    """
    Computes k-mer match Jaccard containment for genes.
    
    Args:
        Ref_DictOf_Hashes (dict): Dictionary mapping GeneID to sets of k-mers (hashes).
        Ref_DictOf_SeqLen (dict): Dictionary mapping GeneID to their sequence lengths.
        category_hash_sets (dict): Dictionary of category names mapping to hash sets.
        N_AsmWiGene_Dict (dict): Dictionary mapping GeneID to number of assemblies matching the gene.
    
    Returns:
        pd.DataFrame: DataFrame summarizing Jaccard containment results for all genes.
    """
    gene_analysis_rows = []

    for GeneID, Gene_Hashes_Set in tqdm(Ref_DictOf_Hashes.items()):
        Len_Seq = Ref_DictOf_SeqLen.get(GeneID, 0)
        record_hashes_set = Gene_Hashes_Set

        # Initialize results for Jaccard containment
        jc_results = {}

        if len(record_hashes_set) != 0:
            # Calculate Jaccard containment for each category
            for category, hash_set in category_hash_sets.items():
                jc_results[category] = jaccard_containment_FromSets(record_hashes_set, hash_set)
        else:
            # Set all results to 0 if no hashes exist
            jc_results = {category: 0 for category in category_hash_sets}
            if Len_Seq < 31:
                print(f"No kmers were produced for segment: {GeneID}")

        # Prepare row for the DataFrame
        row = [GeneID, Len_Seq] + list(jc_results.values())
        gene_analysis_rows.append(row)

    # Create the DataFrame
    columns = ["GeneID", "SeqLength"] + list(category_hash_sets.keys())
    gene_kmer_match_df = pd.DataFrame(gene_analysis_rows, columns=columns)

    # Add the number of assemblies matching the gene
    gene_kmer_match_df["NumAsm_WiGene"] = gene_kmer_match_df["GeneID"].map(N_AsmWiGene_Dict)

    return gene_kmer_match_df

In [99]:
ListOf_Rv_GeneCats = ['information pathways', 'conserved hypotheticals', 'cell wall and cell processes', 'stable RNAs', 'intermediary metabolism and respiration', 'regulatory proteins', 'virulence, detoxification, adaptation', 'insertion seqs and phages', 'lipid metabolism', 'PE/PPE', 'unknown']


In [100]:
RvGeneCat_To_KmerHashes_Dict.keys()

dict_keys(['information pathways', 'conserved hypotheticals', 'cell wall and cell processes', 'stable RNAs', 'intermediary metabolism and respiration', 'regulatory proteins', 'virulence, detoxification, adaptation', 'insertion seqs and phages', 'lipid metabolism', 'PE/PPE', 'unknown'])

In [101]:
#PG_GeneRefFA_PATH_Dict

## D) Perform gene-level classification

In [129]:
PG_GeneSeq_KmerCatMatch_DF_Dict = {}
PG_GeneSeq_UnqSeqSVNodeMatch_DF_Dict = {}

PG_GeneSeq_CategorySumm_DF_Dict = {}

PG_Gene_DF_Dict = {}

for i_Param, i_PG_Ref_FA_PATH in tqdm(PG_GeneRefFA_PATH_Dict.items()):

    i_Ref_DictOf_Hashes, i_Ref_DictOf_SeqLen = read_kmers_from_file_ToHashesDict(i_PG_Ref_FA_PATH, 31)  
    
    i_N_AsmWiGene_Dict = PresAbs_DF_Dict[i_Param]["NumAsm_WiGene"].to_dict()    

    print(i_Param, len(list(i_Ref_DictOf_Hashes.keys())), len(list(i_N_AsmWiGene_Dict.keys())) )

    i_Gene_Simp_DF = create_simple_gene_df(i_Ref_DictOf_Hashes,
                                           i_Ref_DictOf_SeqLen,
                                           i_N_AsmWiGene_Dict)

    print(i_Gene_Simp_DF.shape[0])
    
    PG_Gene_DF_Dict[i_Param] = i_Gene_Simp_DF

    i_Gene_KmerCatMatch_DF = compute_kmer_match_df(i_Ref_DictOf_Hashes,
                                                   i_Ref_DictOf_SeqLen,
                                                   RvGeneCat_To_KmerHashes_Dict,
                                                   i_N_AsmWiGene_Dict)

    # Apply classification to the main DataFrame
    threshold = 0.25  # Set the classification threshold
    i_Gene_KmerCatMatch_DF["KmerMatch_RvGeneCat"] = i_Gene_KmerCatMatch_DF.apply(
        lambda row: classify_node(row, ListOf_Rv_GeneCats, threshold), axis=1)

    # Add the number of assemblies matching the gene
    i_Gene_KmerCatMatch_DF["NumAsm_WiGene"] = i_Gene_KmerCatMatch_DF["GeneID"].map(i_N_AsmWiGene_Dict)

    PG_GeneSeq_KmerCatMatch_DF_Dict[i_Param] = i_Gene_KmerCatMatch_DF
    PG_GeneSeq_CategorySumm_DF_Dict[i_Param] = summarize_sv_categories_with_functional_category(i_Gene_KmerCatMatch_DF,
                                                                                                ListOf_Rv_GeneCats)    


    # i_Gene_CompToUnqSeq_SVNodes_DF = compute_kmer_match_df(i_Ref_DictOf_Hashes,
    #                                                        i_Ref_DictOf_SeqLen,
    #                                                        MG_SVNodes_UnqSeq_HashDict,
    #                                                        i_N_AsmWiGene_Dict)

    # # Apply classification to the main DataFrame
    # threshold = 0.25  # Set the classification threshold
    # i_Gene_CompToUnqSeq_SVNodes_DF["Matched_UnqSeq_NodeIDs"] = i_Gene_CompToUnqSeq_SVNodes_DF.apply(
    #     lambda row: classify_node(row, SVNodeIDs_UnqSeq, threshold), axis=1)
    
    # PG_GeneSeq_UnqSeqSVNodeMatch_DF_Dict[i_Param] = i_Gene_CompToUnqSeq_SVNodes_DF

    print("\n\n")

  0%|          | 0/13 [00:00<?, ?it/s]

4200  total records were parsed



  0%|          | 0/4200 [00:00<?, ?it/s][A
100%|██████████| 4200/4200 [00:00<00:00, 56080.19it/s][A
  0%|          | 0/4200 [00:00<?, ?it/s][A

Panaroo_Strict_MP 4200 4200
4200



  1%|▏         | 59/4200 [00:00<00:07, 588.72it/s][A
  3%|▎         | 141/4200 [00:00<00:06, 640.94it/s][A
  6%|▌         | 239/4200 [00:00<00:05, 711.84it/s][A
  8%|▊         | 334/4200 [00:00<00:05, 769.71it/s][A
 11%|█         | 455/4200 [00:00<00:04, 863.30it/s][A
 14%|█▎        | 573/4200 [00:00<00:03, 938.68it/s][A
 16%|█▌        | 667/4200 [00:00<00:03, 936.11it/s][A
 19%|█▊        | 779/4200 [00:00<00:03, 983.33it/s][A
 21%|██        | 878/4200 [00:00<00:03, 948.36it/s][A
 23%|██▎       | 974/4200 [00:01<00:03, 950.91it/s][A
 25%|██▌       | 1070/4200 [00:01<00:03, 950.13it/s][A
 28%|██▊       | 1166/4200 [00:01<00:03, 950.35it/s][A
 30%|███       | 1262/4200 [00:01<00:03, 951.48it/s][A
 33%|███▎      | 1390/4200 [00:01<00:02, 1028.22it/s][A
 36%|███▌      | 1495/4200 [00:01<00:02, 1006.73it/s][A
 38%|███▊      | 1598/4200 [00:01<00:02, 971.93it/s] [A
 40%|████      | 1697/4200 [00:01<00:02, 948.59it/s][A
 43%|████▎     | 1801/4200 [00:01<00:02, 970.29it/s][A







  0%|          | 0/4280 [00:00<?, ?it/s][A

4280  total records were parsed
Panaroo_Moderate_MP 4280 4280



 50%|█████     | 2151/4280 [00:00<00:00, 7090.36it/s][A
100%|██████████| 4280/4280 [00:00<00:00, 13630.12it/s][A
  0%|          | 0/4280 [00:00<?, ?it/s][A
  1%|▏         | 61/4280 [00:00<00:06, 607.97it/s][A

4280



  3%|▎         | 146/4280 [00:00<00:06, 662.57it/s][A
  6%|▌         | 247/4280 [00:00<00:05, 737.99it/s][A
  8%|▊         | 352/4280 [00:00<00:04, 809.36it/s][A
 11%|█         | 473/4280 [00:00<00:04, 898.10it/s][A
 14%|█▍        | 589/4280 [00:00<00:03, 962.90it/s][A
 16%|█▌        | 691/4280 [00:00<00:03, 976.60it/s][A
 19%|█▉        | 805/4280 [00:00<00:03, 1020.11it/s][A
 21%|██        | 907/4280 [00:00<00:03, 985.62it/s] [A
 24%|██▎       | 1010/4280 [00:01<00:03, 996.37it/s][A
 26%|██▌       | 1113/4280 [00:01<00:03, 1001.28it/s][A
 28%|██▊       | 1214/4280 [00:01<00:03, 997.50it/s] [A
 32%|███▏      | 1351/4280 [00:01<00:02, 1081.09it/s][A
 34%|███▍      | 1463/4280 [00:01<00:02, 1089.94it/s][A
 37%|███▋      | 1574/4280 [00:01<00:02, 1045.85it/s][A
 39%|███▉      | 1680/4280 [00:01<00:02, 956.08it/s] [A
 42%|████▏     | 1784/4280 [00:01<00:02, 979.64it/s][A
 44%|████▍     | 1901/4280 [00:01<00:02, 1028.13it/s][A
 47%|████▋     | 2006/4280 [00:01<00:02, 984.7







  0%|          | 0/4281 [00:00<?, ?it/s][A

4281  total records were parsed
Panaroo_Sens_MP 4281 4281



100%|██████████| 4281/4281 [00:00<00:00, 172385.21it/s][A
  0%|          | 0/4281 [00:00<?, ?it/s][A
  2%|▏         | 70/4281 [00:00<00:06, 691.91it/s][A

4281



  4%|▍         | 174/4281 [00:00<00:05, 766.06it/s][A
  7%|▋         | 283/4281 [00:00<00:04, 840.51it/s][A
  9%|▉         | 392/4281 [00:00<00:04, 901.55it/s][A
 12%|█▏        | 531/4281 [00:00<00:03, 1006.71it/s][A
 15%|█▍        | 642/4281 [00:00<00:03, 1031.89it/s][A
 18%|█▊        | 753/4281 [00:00<00:03, 1051.44it/s][A
 20%|██        | 858/4281 [00:00<00:03, 1050.95it/s][A
 22%|██▏       | 962/4281 [00:00<00:03, 1033.25it/s][A
 25%|██▌       | 1074/4281 [00:01<00:03, 1055.04it/s][A
 28%|██▊       | 1182/4281 [00:01<00:02, 1059.71it/s][A
 30%|███       | 1293/4281 [00:01<00:02, 1073.16it/s][A
 33%|███▎      | 1429/4281 [00:01<00:02, 1145.37it/s][A
 36%|███▌      | 1545/4281 [00:01<00:02, 1116.86it/s][A
 39%|███▊      | 1658/4281 [00:01<00:02, 1019.89it/s][A
 41%|████▏     | 1769/4281 [00:01<00:02, 1043.60it/s][A
 44%|████▍     | 1888/4281 [00:01<00:02, 1082.96it/s][A
 47%|████▋     | 1998/4281 [00:01<00:02, 1024.34it/s][A
 49%|████▉     | 2103/4281 [00:02<00:02, 




5366  total records were parsed



  0%|          | 0/5366 [00:00<?, ?it/s][A

Roary_Default 5366 5366



 27%|██▋       | 1448/5366 [00:00<00:00, 4262.07it/s][A
100%|██████████| 5366/5366 [00:00<00:00, 15172.98it/s][A
  0%|          | 0/5366 [00:00<?, ?it/s][A
  1%|          | 56/5366 [00:00<00:09, 558.56it/s][A

5366



  3%|▎         | 141/5366 [00:00<00:08, 620.11it/s][A
  4%|▍         | 228/5366 [00:00<00:07, 675.75it/s][A
  6%|▌         | 304/5366 [00:00<00:07, 695.14it/s][A
  7%|▋         | 388/5366 [00:00<00:06, 733.02it/s][A
  9%|▉         | 486/5366 [00:00<00:06, 791.43it/s][A
 11%|█         | 597/5366 [00:00<00:05, 863.18it/s][A
 13%|█▎        | 705/5366 [00:00<00:05, 917.00it/s][A
 15%|█▌        | 819/5366 [00:00<00:04, 973.12it/s][A
 17%|█▋        | 918/5366 [00:01<00:04, 971.07it/s][A
 19%|█▉        | 1017/5366 [00:01<00:04, 974.02it/s][A
 21%|██        | 1119/5366 [00:01<00:04, 985.26it/s][A
 23%|██▎       | 1224/5366 [00:01<00:04, 1003.58it/s][A
 25%|██▍       | 1325/5366 [00:01<00:04, 948.91it/s] [A
 26%|██▋       | 1421/5366 [00:01<00:04, 936.54it/s][A
 28%|██▊       | 1520/5366 [00:01<00:04, 950.23it/s][A
 30%|███       | 1616/5366 [00:01<00:03, 939.88it/s][A
 32%|███▏      | 1714/5366 [00:01<00:03, 949.57it/s][A
 34%|███▍      | 1812/5366 [00:01<00:03, 956.62it/s][




4366  total records were parsed



  0%|          | 0/4366 [00:00<?, ?it/s][A
100%|██████████| 4366/4366 [00:00<00:00, 282715.50it/s][A
  0%|          | 0/4366 [00:00<?, ?it/s][A
  2%|▏         | 87/4366 [00:00<00:04, 866.73it/s][A

Roary_NoSplitParalogs 4366 4366
4366



  4%|▍         | 171/4366 [00:00<00:04, 857.54it/s][A
  6%|▌         | 257/4366 [00:00<00:04, 856.39it/s][A
  8%|▊         | 339/4366 [00:00<00:04, 844.84it/s][A
 10%|▉         | 416/4366 [00:00<00:04, 820.49it/s][A
 12%|█▏        | 513/4366 [00:00<00:04, 858.78it/s][A
 14%|█▍        | 614/4366 [00:00<00:04, 898.34it/s][A
 16%|█▌        | 698/4366 [00:00<00:06, 551.76it/s][A
 18%|█▊        | 774/4366 [00:01<00:06, 587.45it/s][A
 20%|█▉        | 866/4366 [00:01<00:05, 658.80it/s][A
 22%|██▏       | 967/4366 [00:01<00:04, 735.17it/s][A
 24%|██▍       | 1060/4366 [00:01<00:04, 784.07it/s][A
 27%|██▋       | 1177/4366 [00:01<00:03, 870.13it/s][A
 29%|██▉       | 1273/4366 [00:01<00:03, 883.71it/s][A
 31%|███▏      | 1368/4366 [00:01<00:03, 883.36it/s][A
 34%|███▎      | 1464/4366 [00:01<00:03, 903.43it/s][A
 36%|███▌      | 1562/4366 [00:01<00:03, 922.28it/s][A
 38%|███▊      | 1660/4366 [00:02<00:02, 937.50it/s][A
 40%|████      | 1756/4366 [00:02<00:02, 926.51it/s][A
 







  0%|          | 0/4252 [00:00<?, ?it/s][A

4252  total records were parsed
Roary_NoSplitParalogs_I80 4252 4252



 18%|█▊        | 759/4252 [00:00<00:00, 3866.98it/s][A
100%|██████████| 4252/4252 [00:00<00:00, 20491.94it/s][A
  0%|          | 0/4252 [00:00<?, ?it/s][A
  2%|▏         | 95/4252 [00:00<00:04, 941.20it/s][A

4252



  4%|▍         | 183/4252 [00:00<00:04, 921.94it/s][A
  6%|▋         | 269/4252 [00:00<00:04, 899.47it/s][A
  8%|▊         | 352/4252 [00:00<00:04, 876.83it/s][A
 10%|█         | 436/4252 [00:00<00:04, 864.89it/s][A
 13%|█▎        | 541/4252 [00:00<00:04, 911.79it/s][A
 15%|█▌        | 647/4252 [00:00<00:03, 948.49it/s][A
 18%|█▊        | 767/4252 [00:00<00:03, 1010.24it/s][A
 20%|██        | 866/4252 [00:00<00:03, 1003.03it/s][A
 23%|██▎       | 964/4252 [00:01<00:03, 994.17it/s] [A
 25%|██▌       | 1071/4252 [00:01<00:03, 1014.12it/s][A
 28%|██▊       | 1187/4252 [00:01<00:02, 1052.25it/s][A
 30%|███       | 1292/4252 [00:01<00:02, 989.88it/s] [A
 33%|███▎      | 1392/4252 [00:01<00:02, 977.65it/s][A
 35%|███▌      | 1491/4252 [00:01<00:02, 970.95it/s][A
 37%|███▋      | 1589/4252 [00:01<00:02, 970.93it/s][A
 40%|███▉      | 1687/4252 [00:01<00:02, 939.95it/s][A
 42%|████▏     | 1794/4252 [00:01<00:02, 973.77it/s][A
 44%|████▍     | 1892/4252 [00:01<00:02, 949.36it/




4293  total records were parsed



  0%|          | 0/4293 [00:00<?, ?it/s][A
100%|██████████| 4293/4293 [00:00<00:00, 416028.91it/s][A
  0%|          | 0/4293 [00:00<?, ?it/s][A
  2%|▏         | 76/4293 [00:00<00:05, 747.47it/s][A

Roary_NoSplitParalogs_I90 4293 4293
4293



  4%|▍         | 165/4293 [00:00<00:05, 783.71it/s][A
  6%|▌         | 256/4293 [00:00<00:04, 816.51it/s][A
  8%|▊         | 339/4293 [00:00<00:04, 819.85it/s][A
 10%|▉         | 428/4293 [00:00<00:04, 836.75it/s][A
 13%|█▎        | 539/4293 [00:00<00:04, 901.77it/s][A
 15%|█▌        | 649/4293 [00:00<00:03, 946.69it/s][A
 17%|█▋        | 740/4293 [00:00<00:05, 642.11it/s][A
 20%|█▉        | 852/4293 [00:01<00:04, 735.77it/s][A
 22%|██▏       | 953/4293 [00:01<00:04, 799.00it/s][A
 24%|██▍       | 1043/4293 [00:01<00:03, 821.34it/s][A
 27%|██▋       | 1156/4293 [00:01<00:03, 892.17it/s][A
 29%|██▉       | 1252/4293 [00:01<00:03, 900.11it/s][A
 31%|███▏      | 1347/4293 [00:01<00:03, 892.48it/s][A
 34%|███▎      | 1440/4293 [00:01<00:03, 892.42it/s][A
 36%|███▌      | 1535/4293 [00:01<00:03, 908.07it/s][A
 38%|███▊      | 1637/4293 [00:01<00:02, 938.95it/s][A
 40%|████      | 1733/4293 [00:01<00:02, 907.58it/s][A
 43%|████▎     | 1830/4293 [00:02<00:02, 924.12it/s][A





4211  total records were parsed



  0%|          | 0/4211 [00:00<?, ?it/s][A

SR_Panaroo_Strict_MP 4211 4211



 18%|█▊        | 747/4211 [00:00<00:01, 2921.11it/s][A
100%|██████████| 4211/4211 [00:00<00:00, 15760.86it/s][A
  0%|          | 0/4211 [00:00<?, ?it/s][A
  2%|▏         | 72/4211 [00:00<00:05, 717.30it/s][A

4211



  4%|▍         | 182/4211 [00:00<00:05, 800.83it/s][A
  7%|▋         | 312/4211 [00:00<00:04, 904.41it/s][A
 10%|█         | 423/4211 [00:00<00:03, 956.23it/s][A
 13%|█▎        | 535/4211 [00:00<00:03, 1000.05it/s][A
 15%|█▌        | 643/4211 [00:00<00:03, 1020.43it/s][A
 18%|█▊        | 762/4211 [00:00<00:03, 1063.89it/s][A
 21%|██        | 880/4211 [00:00<00:03, 1092.92it/s][A
 23%|██▎       | 988/4211 [00:00<00:02, 1085.02it/s][A
 27%|██▋       | 1121/4211 [00:01<00:02, 1148.13it/s][A
 29%|██▉       | 1236/4211 [00:01<00:02, 1130.56it/s][A
 32%|███▏      | 1350/4211 [00:01<00:02, 1102.35it/s][A
 35%|███▍      | 1461/4211 [00:01<00:02, 1096.05it/s][A
 38%|███▊      | 1591/4211 [00:01<00:02, 1148.41it/s][A
 41%|████      | 1713/4211 [00:01<00:02, 1167.11it/s][A
 43%|████▎     | 1831/4211 [00:01<00:02, 1119.96it/s][A
 46%|████▋     | 1952/4211 [00:01<00:01, 1143.32it/s][A
 49%|████▉     | 2068/4211 [00:01<00:01, 1134.64it/s][A
 52%|█████▏    | 2190/4211 [00:01<00:01, 




4600  total records were parsed



  0%|          | 0/4600 [00:00<?, ?it/s][A
100%|██████████| 4600/4600 [00:00<00:00, 78028.20it/s][A
  0%|          | 0/4600 [00:00<?, ?it/s][A
  2%|▏         | 106/4600 [00:00<00:04, 1048.44it/s][A

SR_Panaroo_Sens_MP 4600 4600
4600



  5%|▍         | 220/4600 [00:00<00:04, 1073.24it/s][A
  7%|▋         | 337/4600 [00:00<00:03, 1100.12it/s][A
 10%|▉         | 454/4600 [00:00<00:03, 1116.69it/s][A
 12%|█▏        | 559/4600 [00:00<00:03, 1094.41it/s][A
 14%|█▍        | 663/4600 [00:00<00:03, 1076.25it/s][A
 16%|█▋        | 759/4600 [00:00<00:06, 566.98it/s] [A
 19%|█▉        | 880/4600 [00:01<00:05, 673.55it/s][A
 21%|██▏       | 985/4600 [00:01<00:04, 753.96it/s][A
 24%|██▍       | 1114/4600 [00:01<00:04, 860.76it/s][A
 27%|██▋       | 1228/4600 [00:01<00:03, 925.46it/s][A
 29%|██▉       | 1335/4600 [00:01<00:03, 878.77it/s][A
 32%|███▏      | 1450/4600 [00:01<00:03, 945.51it/s][A
 34%|███▍      | 1573/4600 [00:01<00:02, 1015.64it/s][A
 37%|███▋      | 1709/4600 [00:01<00:02, 1095.44it/s][A
 40%|███▉      | 1826/4600 [00:01<00:02, 1087.18it/s][A
 42%|████▏     | 1951/4600 [00:02<00:02, 1130.82it/s][A
 45%|████▍     | 2068/4600 [00:02<00:02, 1135.66it/s][A
 48%|████▊     | 2193/4600 [00:02<00:02, 116




6006  total records were parsed



  0%|          | 0/6006 [00:00<?, ?it/s][A
100%|██████████| 6006/6006 [00:00<00:00, 221535.21it/s][A
  0%|          | 0/6006 [00:00<?, ?it/s][A
  2%|▏         | 101/6006 [00:00<00:05, 1008.79it/s][A

SR_Roary_Default 6006 6006
6006



  3%|▎         | 202/6006 [00:00<00:05, 1007.25it/s][A
  5%|▌         | 327/6006 [00:00<00:05, 1068.34it/s][A
  7%|▋         | 442/6006 [00:00<00:05, 1088.37it/s][A
  9%|▉         | 532/6006 [00:00<00:05, 1023.29it/s][A
 11%|█         | 647/6006 [00:00<00:05, 1057.87it/s][A
 13%|█▎        | 755/6006 [00:00<00:04, 1061.72it/s][A
 14%|█▍        | 857/6006 [00:00<00:04, 1046.87it/s][A
 16%|█▌        | 957/6006 [00:00<00:04, 1027.90it/s][A
 18%|█▊        | 1060/6006 [00:01<00:04, 1027.34it/s][A
 19%|█▉        | 1161/6006 [00:01<00:04, 998.12it/s] [A
 21%|██        | 1263/6006 [00:01<00:04, 1003.80it/s][A
 23%|██▎       | 1363/6006 [00:01<00:04, 997.81it/s] [A
 24%|██▍       | 1464/6006 [00:01<00:04, 999.40it/s][A
 26%|██▌       | 1571/6006 [00:01<00:04, 1017.06it/s][A
 28%|██▊       | 1673/6006 [00:01<00:04, 977.92it/s] [A
 30%|██▉       | 1782/6006 [00:01<00:04, 1008.36it/s][A
 31%|███▏      | 1888/6006 [00:01<00:04, 1020.12it/s][A
 33%|███▎      | 1991/6006 [00:01<00:04




5025  total records were parsed



  0%|          | 0/5025 [00:00<?, ?it/s][A
100%|██████████| 5025/5025 [00:00<00:00, 384205.80it/s][A
  0%|          | 0/5025 [00:00<?, ?it/s][A
  2%|▏         | 88/5025 [00:00<00:05, 878.80it/s][A

SR_Roary_NoSplitParalogs 5025 5025
5025



  4%|▎         | 185/5025 [00:00<00:05, 902.62it/s][A
  6%|▌         | 287/5025 [00:00<00:05, 931.22it/s][A
  8%|▊         | 397/5025 [00:00<00:04, 975.51it/s][A
 10%|▉         | 483/5025 [00:00<00:04, 934.52it/s][A
 11%|█▏        | 568/5025 [00:00<00:04, 907.04it/s][A
 13%|█▎        | 667/5025 [00:00<00:04, 930.25it/s][A
 16%|█▌        | 779/5025 [00:00<00:04, 978.98it/s][A
 17%|█▋        | 874/5025 [00:00<00:04, 964.06it/s][A
 19%|█▉        | 968/5025 [00:01<00:04, 945.38it/s][A
 21%|██▏       | 1070/5025 [00:01<00:04, 965.50it/s][A
 23%|██▎       | 1166/5025 [00:01<00:04, 933.21it/s][A
 25%|██▌       | 1263/5025 [00:01<00:03, 940.78it/s][A
 27%|██▋       | 1360/5025 [00:01<00:03, 946.04it/s][A
 29%|██▉       | 1460/5025 [00:01<00:03, 957.55it/s][A
 31%|███       | 1563/5025 [00:01<00:03, 973.03it/s][A
 33%|███▎      | 1661/5025 [00:01<00:03, 934.74it/s][A
 35%|███▌      | 1775/5025 [00:01<00:03, 986.14it/s][A
 37%|███▋      | 1884/5025 [00:01<00:03, 1014.72it/s][A







  0%|          | 0/4866 [00:00<?, ?it/s][A

4866  total records were parsed
SR_Roary_NoSplitParalogs_I80 4866 4866



 44%|████▍     | 2152/4866 [00:00<00:00, 10241.43it/s][A
 59%|█████▊    | 2849/4866 [00:02<00:02, 948.04it/s]  [A
100%|██████████| 4866/4866 [00:02<00:00, 1936.39it/s][A
  0%|          | 0/4866 [00:00<?, ?it/s][A
  2%|▏         | 110/4866 [00:00<00:04, 1096.53it/s][A

4866



  4%|▍         | 198/4866 [00:00<00:04, 1015.66it/s][A
  6%|▌         | 301/4866 [00:00<00:04, 1016.24it/s][A
  9%|▊         | 415/4866 [00:00<00:04, 1046.44it/s][A
 10%|█         | 503/4866 [00:00<00:04, 989.09it/s] [A
 12%|█▏        | 597/4866 [00:00<00:04, 973.29it/s][A
 14%|█▍        | 690/4866 [00:00<00:04, 955.52it/s][A
 16%|█▋        | 799/4866 [00:00<00:04, 991.58it/s][A
 18%|█▊        | 893/4866 [00:00<00:04, 960.49it/s][A
 20%|██        | 986/4866 [00:01<00:04, 949.19it/s][A
 22%|██▏       | 1087/4866 [00:01<00:03, 963.98it/s][A
 24%|██▍       | 1182/4866 [00:01<00:03, 928.36it/s][A
 26%|██▌       | 1274/4866 [00:01<00:03, 920.93it/s][A
 28%|██▊       | 1366/4866 [00:01<00:03, 917.35it/s][A
 30%|██▉       | 1459/4866 [00:01<00:03, 916.50it/s][A
 32%|███▏      | 1560/4866 [00:01<00:03, 941.33it/s][A
 34%|███▍      | 1655/4866 [00:01<00:03, 902.99it/s][A
 36%|███▋      | 1764/4866 [00:01<00:03, 950.00it/s][A
 38%|███▊      | 1869/4866 [00:01<00:03, 977.60it/s]




4956  total records were parsed



  0%|          | 0/4956 [00:00<?, ?it/s][A
100%|██████████| 4956/4956 [00:00<00:00, 32143.04it/s][A
  0%|          | 0/4956 [00:00<?, ?it/s][A

SR_Roary_NoSplitParalogs_I90 4956 4956
4956



  2%|▏         | 75/4956 [00:00<00:06, 744.55it/s][A
  4%|▎         | 183/4956 [00:00<00:05, 820.90it/s][A
  6%|▌         | 291/4956 [00:00<00:05, 882.49it/s][A
  8%|▊         | 404/4956 [00:00<00:04, 941.61it/s][A
 10%|█         | 502/4956 [00:00<00:04, 947.52it/s][A
 12%|█▏        | 607/4956 [00:00<00:04, 975.30it/s][A
 14%|█▍        | 707/4956 [00:00<00:04, 982.45it/s][A
 17%|█▋        | 822/4956 [00:00<00:04, 1026.10it/s][A
 19%|█▊        | 922/4956 [00:00<00:04, 1003.23it/s][A
 21%|██        | 1026/4956 [00:01<00:03, 1013.74it/s][A
 23%|██▎       | 1127/4956 [00:01<00:03, 978.33it/s] [A
 25%|██▍       | 1234/4956 [00:01<00:03, 1003.72it/s][A
 27%|██▋       | 1335/4956 [00:01<00:03, 997.65it/s] [A
 29%|██▉       | 1436/4956 [00:01<00:03, 998.11it/s][A
 31%|███       | 1544/4956 [00:01<00:03, 1018.99it/s][A
 33%|███▎      | 1646/4956 [00:01<00:03, 995.96it/s] [A
 35%|███▌      | 1756/4956 [00:01<00:03, 1021.17it/s][A
 38%|███▊      | 1872/4956 [00:01<00:02, 1058.66









In [138]:
i_Param

'SR_Roary_NoSplitParalogs_I90'

In [139]:
i_Gene_KmerCatMatch_DF.head(5)

Unnamed: 0,GeneID,SeqLength,information pathways,conserved hypotheticals,cell wall and cell processes,stable RNAs,intermediary metabolism and respiration,regulatory proteins,"virulence, detoxification, adaptation",insertion seqs and phages,lipid metabolism,PE/PPE,unknown,NumAsm_WiGene,KmerMatch_RvGeneCat
0,moaE1,444,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,151,intermediary metabolism and respiration
1,group_4467,603,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151,conserved hypotheticals
2,cyp141,1203,0.0,0.0,0.0,0.0,0.973572,0.0,0.0,0.0,0.0,0.0,0.0,150,intermediary metabolism and respiration
3,group_1592,471,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151,conserved hypotheticals
4,group_4812,270,0.0,0.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151,conserved hypotheticals


In [140]:
len(list(i_Ref_DictOf_Hashes.keys()))

4956

In [144]:
PG_GeneSeq_KmerCatMatch_DF_Dict['Roary_NoSplitParalogs_I80'].shape

(4252, 15)

In [145]:
PG_Gene_DF_Dict['Roary_NoSplitParalogs_I90'].shape

(4293, 3)

In [146]:
PG_Gene_DF_Dict['Roary_NoSplitParalogs_I90'].shape

(4293, 3)

In [135]:
PG_GeneRefFA_PATH_Dict.keys()

dict_keys(['Panaroo_Strict_MP', 'Panaroo_Moderate_MP', 'Panaroo_Sens_MP', 'Roary_Default', 'Roary_NoSplitParalogs', 'Roary_NoSplitParalogs_I80', 'Roary_NoSplitParalogs_I90', 'SR_Panaroo_Strict_MP', 'SR_Panaroo_Sens_MP', 'SR_Roary_Default', 'SR_Roary_NoSplitParalogs', 'SR_Roary_NoSplitParalogs_I80', 'SR_Roary_NoSplitParalogs_I90'])

In [147]:
PG_Gene_DF_Dict.keys()

dict_keys(['Panaroo_Strict_MP', 'Panaroo_Moderate_MP', 'Panaroo_Sens_MP', 'Roary_Default', 'Roary_NoSplitParalogs', 'Roary_NoSplitParalogs_I80', 'Roary_NoSplitParalogs_I90', 'SR_Panaroo_Strict_MP', 'SR_Panaroo_Sens_MP', 'SR_Roary_Default', 'SR_Roary_NoSplitParalogs', 'SR_Roary_NoSplitParalogs_I80', 'SR_Roary_NoSplitParalogs_I90'])

In [148]:
for i_Param, i_PG_Ref_FA_PATH in (PG_GeneRefFA_PATH_Dict.items()):
    print()
    print(i_Param)
    !grep ^">" $i_PG_Ref_FA_PATH | wc -l 
    print(PresAbs_DF_Dict[i_Param].shape)
    print(PG_Gene_DF_Dict[i_Param].shape)
    print()


Panaroo_Strict_MP
4200
(4200, 153)
(4200, 3)


Panaroo_Moderate_MP
4280
(4280, 153)
(4280, 3)


Panaroo_Sens_MP
4281
(4281, 153)
(4281, 3)


Roary_Default
5366
(5366, 153)
(5366, 3)


Roary_NoSplitParalogs
4366
(4366, 153)
(4366, 3)


Roary_NoSplitParalogs_I80
4252
(4252, 153)
(4252, 3)


Roary_NoSplitParalogs_I90
4293
(4293, 153)
(4293, 3)


SR_Panaroo_Strict_MP
4211
(4211, 153)
(4211, 3)


SR_Panaroo_Sens_MP
4600
(4600, 153)
(4600, 3)


SR_Roary_Default
6006
(6006, 153)
(6006, 3)


SR_Roary_NoSplitParalogs
5025
(5025, 153)
(5025, 3)


SR_Roary_NoSplitParalogs_I80
4866
(4866, 153)
(4866, 3)


SR_Roary_NoSplitParalogs_I90
4956
(4956, 153)
(4956, 3)



In [196]:
PG_GeneSeq_CategorySumm_DF_Dict["SR_Roary_NoSplitParalogs_I90"]

Unnamed: 0,Functional_Category,Total Length,Node Count,Relative Size (%)
information pathways,unknown,290325,278,6.93
conserved hypotheticals,unknown,759468,1084,18.14
cell wall and cell processes,unknown,845541,907,20.2
stable RNAs,unknown,0,0,0.0
intermediary metabolism and respiration,unknown,1092735,1086,26.1
regulatory proteins,unknown,166440,209,3.98
"virulence, detoxification, adaptation",unknown,154077,239,3.68
insertion seqs and phages,unknown,72984,110,1.74
lipid metabolism,unknown,429372,354,10.26
PE/PPE,unknown,368139,472,8.79


In [195]:
PG_GeneSeq_CategorySumm_DF_Dict["Roary_NoSplitParalogs_I90"]

Unnamed: 0,Functional_Category,Total Length,Node Count,Relative Size (%)
information pathways,unknown,268203,237,6.78
conserved hypotheticals,unknown,736572,1029,18.62
cell wall and cell processes,unknown,803622,788,20.31
stable RNAs,unknown,0,0,0.0
intermediary metabolism and respiration,unknown,1034316,946,26.14
regulatory proteins,unknown,160791,194,4.06
"virulence, detoxification, adaptation",unknown,151686,230,3.83
insertion seqs and phages,unknown,74781,106,1.89
lipid metabolism,unknown,406257,269,10.27
PE/PPE,unknown,311778,260,7.88


# Analysis of matching analysis of SV Nodes to Accessory Genes

In [208]:
# X_SummCols = ["GeneID", "SeqLength", "NumAsm_WiGene", "Matched_UnqSeq_NodeIDs"]
X_SummCols = ["GeneID", "SeqLength", "NumAsm_WiGene", "KmerMatch_RvGeneCat"]


In [209]:
PG_OutDir_Dict.keys()

dict_keys(['Panaroo_Strict_MP', 'Panaroo_Moderate_MP', 'Panaroo_Sens_MP', 'Roary_Default', 'Roary_NoSplitParalogs', 'Roary_NoSplitParalogs_I80', 'Roary_NoSplitParalogs_I90', 'SR_Panaroo_Strict_MP', 'SR_Panaroo_Sens_MP', 'SR_Roary_Default', 'SR_Roary_NoSplitParalogs', 'SR_Roary_NoSplitParalogs_I80', 'SR_Roary_NoSplitParalogs_I90'])

### Create DFs of interest for each PG analysis output 

In [210]:
PStrict_MP_X = PG_GeneSeq_KmerCatMatch_DF_Dict['Panaroo_Strict_MP'][X_SummCols]
PStrict_MP_Acc = PStrict_MP_X.query("NumAsm_WiGene < 151")
#PStrict_MP_X2 = PStrict_MP_X.query("Matched_UnqSeq_NodeIDs != 'None'")

In [211]:
SR_PStrict_MP_X = PG_GeneSeq_KmerCatMatch_DF_Dict['SR_Panaroo_Strict_MP'][X_SummCols]
SR_PStrict_MP_Acc = SR_PStrict_MP_X.query("NumAsm_WiGene < 151")
#SR_PStrict_MP_X2 = SR_PStrict_MP_X.query("Matched_UnqSeq_NodeIDs != 'None'")

In [212]:
PSens_MP_X = PG_GeneSeq_KmerCatMatch_DF_Dict['Panaroo_Sens_MP'][X_SummCols]
PSens_MP_Acc = PSens_MP_X.query("NumAsm_WiGene < 151")
#PSens_MP_X2 = PSens_MP_X.query("Matched_UnqSeq_NodeIDs != 'None'") 

In [213]:
RDefault_X = PG_GeneSeq_KmerCatMatch_DF_Dict['Roary_Default'][X_SummCols]
RDefault_Acc = RDefault_X.query("NumAsm_WiGene < 151")
#RDefault_X2 = RDefault_X.query("Matched_UnqSeq_NodeIDs != 'None'") 

In [214]:
SR_RDefault_X = PG_GeneSeq_KmerCatMatch_DF_Dict['SR_Roary_Default'][X_SummCols]
SR_RDefault_Acc = SR_RDefault_X.query("NumAsm_WiGene < 151")
#SR_RDefault_X2 = SR_RDefault_X.query("Matched_UnqSeq_NodeIDs != 'None'") 

In [215]:
RDefault_MP_X = PG_GeneSeq_KmerCatMatch_DF_Dict['Roary_NoSplitParalogs'][X_SummCols]
RDefault_MP_Acc = RDefault_MP_X.query("NumAsm_WiGene < 151")
#RDefault_MP_X2 = RDefault_MP_X.query("Matched_UnqSeq_NodeIDs != 'None'") 

In [216]:
RI80_MP_X = PG_GeneSeq_KmerCatMatch_DF_Dict['Roary_NoSplitParalogs_I80'][X_SummCols]
RI80_MP_Acc = RI80_MP_X.query("NumAsm_WiGene < 151")
#RI80_MP_X2 = RI80_MP_X.query("Matched_UnqSeq_NodeIDs != 'None'") 

In [217]:
RI90_MP_X = PG_GeneSeq_KmerCatMatch_DF_Dict['Roary_NoSplitParalogs_I90'][X_SummCols]
RI90_MP_Acc = RI80_MP_X.query("NumAsm_WiGene < 151")
#RI90_MP_X2 = RI80_MP_X.query("Matched_UnqSeq_NodeIDs != 'None'") 

In [218]:
RI80_MP_X.shape

(4252, 4)

In [219]:
RI90_MP_X.shape

(4293, 4)

In [190]:
PStrict_MP_Acc.shape

(537, 3)

In [167]:
RDefault_MP_X.shape

(4366, 3)

In [168]:
SR_RDefault_X.shape

(6006, 3)

In [193]:
PStrict_MP_Acc.shape

(537, 3)

In [194]:
RI90_MP_Acc.shape

(663, 3)

In [220]:
SR_PStrict_MP_X.head(3)

Unnamed: 0,GeneID,SeqLength,NumAsm_WiGene,KmerMatch_RvGeneCat
0,moaE1,444,151,intermediary metabolism and respiration
1,group_1487,603,151,conserved hypotheticals
2,group_69,270,151,conserved hypotheticals


In [221]:
PStrict_MP_X.head(2)

Unnamed: 0,GeneID,SeqLength,NumAsm_WiGene,KmerMatch_RvGeneCat
0,dnaA,1491,151,information pathways
1,dnaN,1209,151,information pathways


In [200]:
SR_PStrict_MP_Acc.head(3)

Unnamed: 0,GeneID,SeqLength,NumAsm_WiGene
37,group_105,249,147
38,group_2184,339,147
40,group_176,210,150


In [225]:
PStrict_MP_Acc["KmerMatch_RvGeneCat"].value_counts()

conserved hypotheticals                                    106
None                                                        84
PE/PPE                                                      79
intermediary metabolism and respiration                     75
cell wall and cell processes                                73
insertion seqs and phages                                   51
lipid metabolism                                            24
regulatory proteins                                         19
virulence, detoxification, adaptation                       15
information pathways                                         5
unknown                                                      5
(conserved hypotheticals, cell wall and cell processes)      1
Name: KmerMatch_RvGeneCat, dtype: int64

In [226]:
RI90_MP_Acc["KmerMatch_RvGeneCat"].value_counts()

PE/PPE                                                     140
None                                                       138
conserved hypotheticals                                    135
cell wall and cell processes                                75
intermediary metabolism and respiration                     56
insertion seqs and phages                                   54
virulence, detoxification, adaptation                       21
lipid metabolism                                            17
regulatory proteins                                         16
information pathways                                         5
unknown                                                      5
(conserved hypotheticals, cell wall and cell processes)      1
Name: KmerMatch_RvGeneCat, dtype: int64

### Look at cumulative length for each PG output

In [171]:
MG_SVNodes_UnqSeq_DF["SeqLength"].sum()

301511

#### `PStrict_MP`

In [172]:
PStrict_MP_Acc["SeqLength"].sum()

510334

In [174]:
#PStrict_MP_X2["SeqLength"].sum() 

In [175]:
510334 / 301511

1.6925883301106759

#### `PSens_MP`

In [177]:
PSens_MP_Acc["SeqLength"].sum()

531574

In [176]:
#PSens_MP_X2["SeqLength"].sum()

In [178]:
531574 / 301511

1.7630335211650654

#### `RDefault`

In [179]:
RDefault_Acc["SeqLength"].sum()

2024058

In [181]:
#RDefault_X2["SeqLength"].sum()

In [182]:
1777722 / 301511

5.896043593766065

#### `RDefault_MP` 

In [183]:
RDefault_MP_X.shape

(4366, 3)

In [184]:
RDefault_MP_Acc.shape

(796, 3)

In [185]:
RDefault_MP_Acc["SeqLength"].sum()

476934

In [187]:
#RDefault_MP_X2["SeqLength"].sum()

In [189]:
476934 / 301511

1.5818129355147905

In [134]:
SR_RDefault_X.shape

(5609, 4)

In [135]:
SR_RDefault_Acc["SeqLength"].sum()

2833326

In [136]:
SR_RDefault_X2["SeqLength"].sum()

401448

In [137]:
RDefault_MP_X.head(3)

Unnamed: 0,GeneID,SeqLength,NumAsm_WiGene,Matched_UnqSeq_NodeIDs
0,dnaA,1491,151.0,
1,dnaN,1209,151.0,
2,recF,1158,151.0,


In [138]:
PSMP_X.shape

NameError: name 'PSMP_X' is not defined

In [None]:
PSMP_X2.shape

In [None]:
PS_X.shape

In [None]:
PS_X2.shape

In [None]:
PSMP_X2.head(3)

In [None]:
PS_X2.head(3)

In [145]:
RI80_MP_Acc.shape

(597, 4)

In [146]:
RI80_MP_Acc.shape

(597, 4)

In [147]:
RI90_MP_Acc.shape

(597, 4)

In [None]:
STOP!!!

### Test processing

In [118]:
i_Param = "Roary_NoSplitParalogs_I80"

i_PG_Ref_FA_PATH = PG_GeneRefFA_PATH_Dict[i_Param]

i_Ref_DictOf_Hashes, i_Ref_DictOf_SeqLen = read_kmers_from_file_ToHashesDict(i_PG_Ref_FA_PATH, 31) 

i_N_AsmWiGene_Dict = PresAbs_DF_Dict[i_Param]["NumAsm_WiGene"].to_dict()   

print(i_Param, len(list(i_Ref_DictOf_Hashes.keys())), len(list(i_N_AsmWiGene_Dict.keys())) )


4252  total records were parsed
Roary_NoSplitParalogs_I80 4252 4252


In [119]:
list(i_Ref_DictOf_Hashes.keys())[-10:] 

['group_1766',
 'group_1762',
 'group_1763',
 'NAD(P)-bd-dom domain-containing protein',
 'wcaG',
 'group_1744',
 'group_1775',
 'group_1769',
 'group_1713',
 'group_1714']

In [120]:
list(i_N_AsmWiGene_Dict.keys())[-10:]

['Cytochrome P450',
 'group_1617',
 'group_1618',
 'group_1620',
 'group_1625',
 'group_1627',
 'group_1630',
 'group_1632',
 'group_1637',
 'group_1778']

In [121]:
!grep ^">" $i_PG_Ref_FA_PATH | head

>N0072_00005 dnaA
>N0072_00010 dnaN
>N0072_00015 recF
>N0072_00020 group_3350
>N0072_00025 gyrB
>N0072_00030 gyrA
>N0072_00035 group_1360
>N0072_00040 putative conserved membrane protein
>N0072_00055 group_1361
>N0072_00060 group_1362
grep: write error


In [114]:
!grep ^">" $i_PG_Ref_FA_PATH | wc -l 

4252


In [108]:
len(list(i_Ref_DictOf_Hashes.keys()))

3979

In [107]:
gene_analysis_rows = []

for GeneID, Gene_Hashes_Set in tqdm(i_Ref_DictOf_Hashes.items()):
    Len_Seq = i_Ref_DictOf_SeqLen.get(GeneID, 0)
    record_hashes_set = Gene_Hashes_Set

    # Prepare row for the DataFrame
    row = [GeneID, Len_Seq]
    gene_analysis_rows.append(row)

print(len(gene_analysis_rows))

# Create the DataFrame
columns = ["GeneID", "SeqLength"]
gene_kmer_match_df = pd.DataFrame(gene_analysis_rows, columns=columns)

# Add the number of assemblies matching the gene
gene_kmer_match_df["NumAsm_WiGene"] = gene_kmer_match_df["GeneID"].map(i_N_AsmWiGene_Dict)

gene_kmer_match_df.shape

100%|██████████| 3979/3979 [00:00<00:00, 20549.17it/s]


3979


(3979, 3)

# Compare Gene K-mers to SV Nodes of Unq Seq Bubbles, etc

In [None]:
i_Gene_CompToUnqSeq_SVNodes_DF = compute_kmer_match_df(i_Ref_DictOf_Hashes,
                                                       i_Ref_DictOf_SeqLen,
                                                       MG_SVNodes_UnqSeq_HashDict,
                                                       i_N_AsmWiGene_Dict)

# Apply classification to the main DataFrame
threshold = 0.25  # Set the classification threshold
i_Gene_CompToUnqSeq_SVNodes_DF["Matched_UnqSeq_NodeIDs"] = i_Gene_CompToUnqSeq_SVNodes_DF.apply(
    lambda row: classify_node(row, list(MG_SVNodes_UnqSeq_HashDict.keys()), threshold), axis=1)

i_Gene_CompToUnqSeq_SVNodes_DF.shape

In [None]:
i_Gene_CompToUnqSeq_SVNodes_DF.head(4)

In [None]:
X = i_Gene_CompToUnqSeq_SVNodes_DF.query("NumAsm_WiGene < 151")
X = X[["GeneID", "SeqLength", "NumAsm_WiGene", "Matched_UnqSeq_NodeIDs"]]
X.shape

In [None]:
X.head(1)

In [None]:
MG_SVNodes_PASS_DF.head(1)

In [None]:
MG_SVNodes_PASS_DF.query("NodeID == 's1040'")

In [None]:
MG_SV_BED_DF.query("BubbleID == 'BubbleRegion_345'")

In [None]:
MG_SVNodes_PASS_DF.query("NodeID == 's7'")

In [None]:
MG_SV_BED_DF.query("BubbleID == 'BubbleRegion_4'")

In [None]:
X["Matched_UnqSeq_NodeIDs"].value_counts()

In [None]:
X["Matched_UnqSeq_NodeIDs"].nunique()

In [None]:
X2 = X.query("Matched_UnqSeq_NodeIDs != 'None'")
X2.shape

In [None]:
X2["SeqLength"].sum()

In [None]:
MG_SVNodes_UnqSeq_DF["SeqLength"].sum()

In [None]:
MG_SVNodes_UnqSeq_DF.head(3)

In [None]:
X2.head(10)

In [None]:
X2.head(10)

In [None]:
X["SeqLength"].sum()

In [None]:
len(SVNodeIDs_UnqSeq)

In [None]:
len(SVNodeIDs_UnqSeq_And_UnqToRv)

In [None]:
X2["SeqLength"].sum()

In [None]:
X2.head(10)

In [None]:
X2["Strongest_Match"].nunique()

In [None]:
X2["Strongest_Match"].value_counts()

In [None]:
X2_NodeIDs = X2["Strongest_Match"].unique()
len(X2_NodeIDs)

In [None]:
X2_SVNodes_DF = MG_SVNodes_PASS_DF[ MG_SVNodes_PASS_DF["NodeID"].isin(X2_NodeIDs) ]
X2_SVNodes_DF.shape


In [None]:
X2_SVNodes_DF["SeqLength"].sum()

In [None]:
i_Gene_CompToUnqSeqUnqRv_SVNodes_DF = compute_kmer_match_df(i_Ref_DictOf_Hashes,
                                                       i_Ref_DictOf_SeqLen,
                                                       MG_SVNodes_UnqSeqAndUnqToRv_HashDict,
                                                       i_N_AsmWiGene_Dict)

threshold = 0.25  # Set the classification threshold
i_Gene_CompToUnqSeqUnqRv_SVNodes_DF["Strongest_Match"] = i_Gene_CompToUnqSeqUnqRv_SVNodes_DF.apply(
    lambda row: classify_node(row, list(MG_SVNodes_UnqSeqAndUnqToRv_HashDict.keys()), threshold), axis=1
)

i_Gene_CompToUnqSeqUnqRv_SVNodes_DF.shape

In [None]:
Z = i_Gene_CompToUnqSeqUnqRv_SVNodes_DF.query("NumAsm_WiGene < 151")
Z.shape

In [None]:
Z.head(3)

In [None]:
Z["Strongest_Match"].value_counts()

In [None]:
Z["Strongest_Match"].unique()

In [None]:
Z["Strongest_Match"].nunique()

In [None]:
Z2 = Z.query("Strongest_Match != 'None'")
Z2.shape

In [None]:
Z2["SeqLength"].sum()

In [None]:
Z2.head(10)

In [None]:
Z2_NodeIDs = Z2["Strongest_Match"].unique()
len(Z2_NodeIDs)

In [None]:
Z2_SVNodes_DF = MG_SVNodes_PASS_DF[ MG_SVNodes_PASS_DF["NodeID"].isin(Z2_NodeIDs) ]
Z2_SVNodes_DF.shape


In [None]:
Z2_SVNodes_DF.head(3)

In [None]:
Z2_SVNodes_DF["SeqLength"].sum()

In [None]:
Z2["Strongest_Match"].nunique()

In [None]:
Z2["Strongest_Match"].value_counts()

In [None]:
MG_SVNodes_PASS_DF.head(4)

In [None]:
# MG_SVNodes_UnqSeq_HashDict
# MG_SVNodes_UnqSeqAndUnqToRv_HashDict

In [None]:
i_Gene_CompToUnqSeq_SVNodes_DF.head(3)

In [None]:
i_Gene_CompToUnqSeq_SVNodes_DF.query("")

In [None]:
i_Gene_CompToUnqSeq_SVNodes_DF

In [None]:
def compute_kmer_match_df_toSVNodeDict(Ref_DictOf_Hashes, Ref_DictOf_SeqLen, category_hash_sets, N_AsmWiGene_Dict):
    """
    Computes k-mer match Jaccard containment for genes to SV Nodes (defined in a dict).
    
    Returns:
        pd.DataFrame: DataFrame summarizing Jaccard containment results for all genes.
    """
    gene_analysis_rows = []

    for GeneID, Gene_Hashes_Set in tqdm(Ref_DictOf_Hashes.items()):
        Len_Seq = Ref_DictOf_SeqLen.get(GeneID, 0)
        record_hashes_set = Gene_Hashes_Set

        # Initialize results for Jaccard containment
        jc_results = {}

        if len(record_hashes_set) != 0:
            # Calculate Jaccard containment for each category
            for category, hash_set in category_hash_sets.items():
                jc_results[category] = jaccard_containment_FromSets(record_hashes_set, hash_set)
        else:
            # Set all results to 0 if no hashes exist
            jc_results = {category: 0 for category in category_hash_sets}
            if Len_Seq < 31:
                print(f"No kmers were produced for segment: {GeneID}")

        # Prepare row for the DataFrame
        row = [GeneID, Len_Seq] + list(jc_results.values())
        gene_analysis_rows.append(row)

    # Create the DataFrame
    columns = ["GeneID", "SeqLength"] + list(category_hash_sets.keys())
    gene_kmer_match_df = pd.DataFrame(gene_analysis_rows, columns=columns)

    # Add the number of assemblies matching the gene
    gene_kmer_match_df["NumAsm_WiGene"] = gene_kmer_match_df["GeneID"].map(N_AsmWiGene_Dict)

    return gene_kmer_match_df

# Extra

In [None]:
summarize_sv_categories_with_functional_category(PG_GeneSeq_KmerCatMatch_DF_Dict["Panaroo_Strict_MP"].query("NumAsm_WiGene < 150"),
                                                 ListOf_Rv_GeneCats)


In [None]:
summarize_sv_categories_with_functional_category(PG_GeneSeq_KmerCatMatch_DF_Dict["Roary_Default"].query("NumAsm_WiGene < 150"),
                                                 ListOf_Rv_GeneCats)


In [None]:
summarize_sv_categories_with_functional_category(PG_GeneSeq_KmerCatMatch_DF_Dict["Panaroo_Strict"].query("NumAsm_WiGene < 150"),
                                                 ListOf_Rv_GeneCats)
