In [33]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm
%matplotlib inline


In [93]:
%reload_ext autoreload
%autoreload 2

### import panqc (pan-genome quality control) toolkit functions
# from panqc.ava import ava
# from panqc.nscluster import clusterBy_KmerJC, summarize_NSClusters, create_MaxKmerSim_JC_Dict, create_MST_FiltByJC, make_ClusterID_Maps 
# from panqc.nscluster import make_NS_ClusterMerged_Pres_DF

from panqc.kmerlib import read_kmers_from_file_ToHashesDict

from panqc.utils import parse_PresAbs_Rtab, parse_PresAbs_CSV_Roary, parse_PresAbs_CSV_Panaroo, get_PG_Stats_FromPresAbs

from panqc.utils import  parse_PG_Ref_FA, get_PG_Stats_FromDNASeqPresAbs

# from panqc.asm_gene_search import parse_AlnHits_To_DF
# from panqc.asm_gene_search import PresAbsQC_CheckAsmForGeneSeq, SRAsm_PresAbsQC_CheckInLRAsm
# from panqc.asm_gene_search import get_SRAsm_Vs_LRAsm_QCStats


In [35]:
import time

In [36]:
import screed

In [37]:
import mappy as mp

In [38]:
# Set max column width to a specific value (e.g., 100 characters)
pd.set_option('display.max_colwidth', 100)
# Set to display a specific number of columns (e.g., 20 columns)
pd.set_option('display.max_columns', 180)

## Define useful Kmer analysis functions

In [39]:
import screed

In [40]:
import mmh3

In [41]:
def build_kmers(sequence, ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1
    
    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)
        
    return kmers

In [42]:
#import screed a library for reading in FASTA/FASTQ

def read_kmers_from_file(filename, ksize):
    all_kmers = []
    for record in screed.open(filename):
        sequence = record.sequence
        
        kmers = build_kmers(sequence, ksize)
        all_kmers += kmers

    return all_kmers

In [43]:
def hash_kmer(kmer):
    # calculate the reverse complement
    rc_kmer = screed.rc(kmer)
    
    # determine whether original k-mer or reverse complement is lesser
    if kmer < rc_kmer:
        canonical_kmer = kmer
    else:
        canonical_kmer = rc_kmer
        
    # calculate murmurhash using a hash seed of 42
    hash = mmh3.hash64(canonical_kmer, 42)[0]
    if hash < 0: hash += 2**64

    return hash

In [44]:
# def hash_kmers(kmers):
#     hashes = []
#     for kmer in kmers:
#         hashes.append(hash_kmer(kmer))
#     return hashes

def hash_kmers_ToSet(kmers):
    hashes = set()
    for kmer in kmers:
        hashes.add(hash_kmer(kmer))
    return hashes

In [45]:

def jaccard_containment_FromSets(a, b):
    '''
    This function returns the Jaccard Containment between sets a and b.
    '''
    
    intersection = len(a.intersection(b))
    
    return intersection / len(a)

def jaccard_similarity_FromSets(a, b):
    '''
    This function returns the Jaccard Similarity between sets a and b.
    '''
    intersection = len(a.intersection(b))
    union = len(a.union(b))
    
    return intersection / union


In [46]:
def getAllHash_ExceptTargets_Set_V2(dictOfHashes, targetsToRemove):
    # Convert targetsToRemove to a set for faster lookup
    targetsToRemoveSet = set(targetsToRemove)

    # Use set comprehension for more efficient construction of the result set
    return {hash for seqID, seqInfoDict in dictOfHashes.items() if seqID not in targetsToRemoveSet
            for hash in seqInfoDict["Kmer_Hashes_Set"]}

In [47]:
def getAllHash_InTargetSeqs_Set(dictOfHashes, targetsToKeep):
    
    all_Hashes_InTarget = set()

    for i_SeqID, i_SeqInfoDict in dictOfHashes.items():
        
        i_Hashes = i_SeqInfoDict["Kmer_Hashes_Set"]
        
        if i_SeqID not in targetsToKeep:
            all_Hashes_InTarget.update(i_Hashes) 
    
    return all_Hashes_InTarget

## Import/parse processed H37rv genome annotations

In [63]:
RepoRef_Dir = "../../References"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"
H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"

## H37Rv Gene Annotations TSV
H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep = "\t")
H37Rv_GeneInfo_Subset_DF = H37Rv_GenomeAnno_Genes_DF[["H37rv_GeneID", "Symbol", "Feature", "Functional_Category", "Is_Pseudogene", "Product", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]

RvID_To_Symbol_Dict = dict(H37Rv_GeneInfo_Subset_DF[['H37rv_GeneID', 'Symbol']].values)
Symbol_To_FuncCat_Dict = dict(H37Rv_GeneInfo_Subset_DF[['Symbol', 'Functional_Category']].values)


# Part 2: Generate reference k-mer sets (ie H37Rv, IS6110, Phages + ISs)  

## Generate k-mer info for H37Rv and a representative IS6110 sequence 

In [48]:
Mtb_RefDir="/n/data1/hms/dbmi/farhat/mm774/References"

H37rv_Ref_FA_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.fasta"
IS6110_Example_FA_PATH = f"{Mtb_RefDir}/IS6110_From_Rv0795_Rv0796.DNA.fasta"

#### H37Rv - k-mer generation & hashing

In [49]:
H37Rv_kmers = read_kmers_from_file(H37rv_Ref_FA_PATH, 31)

H37Rv_Hashes_Set = hash_kmers_ToSet(H37Rv_kmers)

print(len(H37Rv_kmers))

4411502


#### IS6110 (Rv0795 & Rv0796) - k-mer generation & hashing

In [50]:
IS6110_Ex1_kmers = read_kmers_from_file(IS6110_Example_FA_PATH, 31)

IS6110_Ex1_Hashes_Set = hash_kmers_ToSet(IS6110_Ex1_kmers)

print(len(IS6110_Ex1_kmers))

1254


## Generate k-mer info for all H37Rv gene DNA sequences (Mycobrowser)

In [54]:
from Bio import SeqIO


In [55]:
O2_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"
MycoBrowser_RefFiles_Dir = f"{O2_RefDir}/190619_Mycobrowser_H37rv_ReferenceFiles"

H37Rv_Genes_MycoBro_FA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_genes_v3.fasta"


In [56]:
!grep ^">" $H37Rv_Genes_MycoBro_FA | grep "dnaA"

>Rv0001|dnaA|CDS|1-1524|+|Chromosomal replication initiator protein DnaA


### Get 31-mer hashes for all annotated gene DNA sequences

In [57]:
dictOf_H37Rv_MycoBrow_GeneSeq = {}
dictOf_H37Rv_MycoBrow_Gene_KmerHashes = {}

for index, record in tqdm(enumerate(SeqIO.parse(H37Rv_Genes_MycoBro_FA, "fasta"))):
    
    RecordName = record.name
    RvID = RecordName.split("|")[0]
    GeneID = RecordName.split("|")[1]
    S_Seq = str(record.seq).upper()
    
    dictOf_H37Rv_MycoBrow_GeneSeq[GeneID] = S_Seq

    record_Hashes_Set = hash_kmers_ToSet(build_kmers(S_Seq, 31))

    dictOf_H37Rv_MycoBrow_Gene_KmerHashes[GeneID] = record_Hashes_Set
    

4187it [00:31, 131.55it/s]


In [58]:
len(dictOf_H37Rv_MycoBrow_GeneSeq["dnaA"])

1524

In [59]:
list(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["dnaA"])[:2]

[13580233940393664509, 5138456728421695490]

## Generate k-mer info for each gene category annotation in H37Rv

In [None]:
def getAllHashes_InTargetSeqs(dictOfHashes, targetsToKeep):
    
    all_Hashes_InTarget = set()

    for i_SeqID, i_Hashes in dictOfHashes.items():
                
        if i_SeqID in targetsToKeep:
            all_Hashes_InTarget.update(i_Hashes) 
    
    return all_Hashes_InTarget

In [70]:
Rv_Gene_Category_List = list(H37Rv_GenomeAnno_Genes_DF["Functional_Category"].unique())

RvGeneCat_To_Symbol_Dict = {}
RvGeneCat_To_RvID_Dict = {}
RvGeneCat_To_KmerHashes_Dict = {}

for i_GeneCat in Rv_Gene_Category_List:
    
    Genes_Subset_DF = H37Rv_GenomeAnno_Genes_DF.query(f"Functional_Category == '{i_GeneCat}'")     
    
    Subset_GeneSymbols = Genes_Subset_DF["Symbol"].values
    Subset_RvIDs = Genes_Subset_DF["H37rv_GeneID"].values

    N_Genes = len(Subset_GeneSymbols)
    

    RvGeneCat_To_Symbol_Dict[i_GeneCat] = Subset_GeneSymbols
    RvGeneCat_To_RvID_Dict[i_GeneCat] = Subset_RvIDs


    i_GeneCat_Hashes_Set = getAllHashes_InTargetSeqs(dictOf_H37Rv_MycoBrow_Gene_KmerHashes,
                                                     Subset_GeneSymbols)   

    RvGeneCat_To_KmerHashes_Dict[i_GeneCat] = i_GeneCat_Hashes_Set
    
    print(i_GeneCat, N_Genes, len(list(i_GeneCat_Hashes_Set)) )


information pathways 242 265284
conserved hypotheticals 1042 723221
cell wall and cell processes 772 783261
stable RNAs 48 6735
intermediary metabolism and respiration 936 1011332
regulatory proteins 198 161826
virulence, detoxification, adaptation 239 150509
insertion seqs and phages 147 69102
lipid metabolism 272 407894
PE/PPE 168 265028
unknown 15 7938


In [71]:
Rv_PEPPEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['PE/PPE']

In [72]:
Rv_PEPPEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['PE/PPE']
Rv_MGEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['insertion seqs and phages']

In [73]:
print(len(Rv_MGEs_Hashes_Set))

69102


# Parse sample metadata & preprocessed genome info/results

In [74]:
!pwd

/n/data1/hms/dbmi/farhat/mm774/Snakemake_Pipelines/mtb-pg-benchmarking-2024paper/Analysis/Part3_Mtb_PG_Comparison


In [75]:
#!ls -1 ../../Data

## Parse sample Metadata (N = 151)

In [76]:
Repo_DataDir = "../../Data"
InputAsmPath_Dir = f"{Repo_DataDir}/231121.InputAsmTSVs.MtbSetV3.151CI"

MtbSetV3_151CI_InputAsmPATHs_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"

MtbSetV3_151CI_AsmSumm_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"

WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep = "\t")

SampleIDs_151CI_SOI = list( WGA151CI_AsmSummary_DF["SampleID"].values )
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

ID_To_PrimLineage_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values)
ID_To_SubLineage_Dict = dict( WGA151CI_AsmSummary_DF[["SampleID", "Lineage"]].values)
ID_To_Dataset_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values)
WGA151CI_AsmSummary_DF.shape

(151, 7)

## PARSE PATHs FOR ALL assemblies processed by this pipeline

In [77]:
WGA151CI_LRandSR_Asm_Path_DF = pd.read_csv(MtbSetV3_151CI_InputAsmPATHs_TSV, sep = "\t")
print(WGA151CI_LRandSR_Asm_Path_DF.columns)
WGA151CI_LRandSR_Asm_Path_DF.columns = ['SampleID', 'Dataset_Tag',
                                        'Genome_LR_ASM_PATH', 'Genome_SR_ASM_PATH']


Index(['SampleID', 'Dataset_Tag', 'Genome_ASM_PATH',
       'ShortRead_Genome_ASM_PATH'],
      dtype='object')


In [78]:
WGA151CI_LRandSR_Asm_Path_DF.head(1)

Unnamed: 0,SampleID,Dataset_Tag,Genome_LR_ASM_PATH,Genome_SR_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/...,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/...


#### Create Dict of Asm FA PATHs

In [79]:

LR_AsmFA_Dict = dict(WGA151CI_LRandSR_Asm_Path_DF[['SampleID', 'Genome_LR_ASM_PATH']].values)
SR_AsmFA_Dict = dict(WGA151CI_LRandSR_Asm_Path_DF[['SampleID', 'Genome_SR_ASM_PATH']].values)


### Define Phylo order of samples:

In [80]:
OrderOfSampleIDs_Phylo = ['N0153', 'N0072', 'mada_2-46', 'mada_1-44', 'mada_107',
                          'mada_1-1', 'mada_1-51', 'mada_1-39', 'mada_1-36',
                          'mada_117', 'mada_122', 'mada_118', 'mada_1-10', 'R27252',
                          'R23887', 'TB3091', '9050-05', '3003-06', '702-06', '696-05',
                          '8651-04', 'TB3396', '4549-04', 'TB1612', 'TB2780', 'TB3368',
                          'TB1236', 'TB2659', '8129-04', 'R30215', 'R25048', 'TB2512',
                          'TB2981', 'TB2995', 'TB3113', '706-05', 'R30078', 'R28012',
                          'R27657', 'R30234', 'R31095', 'R28703', 'R24120', 'R36431',
                          'R29816', 'S0070-08', 'N0155', 'N0145', 'R29598', 'R24100',
                          'S0107-01', 'R28581', 'S0256-08', 'S0085-01', 'S0089-01',
                          'mada_1-11', 'M0003941_3', 'mada_115', 'mada_2-42', 'R37765',
                          '18_0621851', 'R22601', 'R27937', 'R18040', 'R18043', 'R27725',
                          'R26791', 'R20574', 'R20260', 'R21408', 'R23146', 'R28980', 'R32929',
                          'R26778', 'R30420', 'R21893', 'QC-9', 'QC-5', 'QC-3', 'N0004',
                          'mada_1-30', 'N0054', 'N1274', '01_R1134', 'TB2968', 'mada_1-53',
                          'mada_2-53', 'mada_1-50', 'mada_2-1', 'R23571', 'mada_123',
                          'mada_1-12', 'mada_1-15', 'mada_128', 'mada_1-38', 'TB3054',
                          'mada_126', 'mada_120', 'TB4620', 'M0016737_0', 'M0016395_7',
                          'R15311', 'TB2661', 'TB3386', 'TB3162', '02_R1179', 'M0010874_7',
                          'QC-7', 'QC-6', 'QC-1', '01_R1430', 'M0011368_9', '02_R1896',
                          'mada_2-25', 'TB3237', 'mada_103', 'mada_112', 'mada_124',
                          'S0123-01', 'S0262-02', 'TB3251', 'M0017522_5', 'R30396', 'R20896',
                          'mada_1-32', 'S0106-01', 'R21839', 'R21363', 'R21770', 'MT_0080','mada_102',
                          'TB3334', 'M0014888_3', 'mada_151', 'TB3169', 'mada_105', 'QC-8',
                          'QC-10', 'QC-4', 'mada_129', 'mada_139', '02_R1708', '02_R0894',
                          'mada_2-31', 'mada_1-41', 'N1272', 'N1176', 'N1202', 'N0091',
                          'N1177','RW-TB008']



In [81]:
WGA151CI_AsmSummary_DF.head(5)

Unnamed: 0,SampleID,numContigs_Complete,Flye_CircContig_Cov,PrimaryLineage,Lineage,Dataset_Tag,AsmApproach
0,N0072,1,358,lineage1,"lineage1,lineage1.1,lineage1.1.2",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
1,N0153,1,372,lineage1,"lineage1,lineage1.1,lineage1.1.1,lineage1.1.1.1",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
2,TB3113,1,933,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
3,TB1236,1,374,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
4,TB2659,1,421,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon


#### Make sample lineage & color mapping

In [82]:
# Dictionary for lineage to color mapping
LinToColor_Dict = {
    "lineage1": "#DF83AC",
    "lineage2": "#7098CB",
    "lineage3": "#815D9F",
    "lineage4": "#E76956",
    "lineage5": "#B67548",
    "lineage6": "#6AB79E",
    "lineage8": "#E4515B",
    "None": "black",
}

# Extracting the mapping between IsolateID and PrimaryLineage_Ill
lineage_mapping = WGA151CI_AsmSummary_DF.set_index('SampleID')['PrimaryLineage'].to_dict()

# Creating a color mapping for the samples
sample_colors = {sample: LinToColor_Dict.get(lineage, "black") for sample, lineage in lineage_mapping.items()}


# Define output dir of the Mtb-WGA-SMK processing pipeline

In [83]:
# Define varaint calling pipeline output directories

WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

WGA151CI_SMK_OutputDir = WGA_SMK_Outputs_Dir + "/231121_MtbSetV3_151CI"

MtbWGA_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir


## Define output dirs of Pangenome Analysis pipelines

In [84]:
target_OutputDir = MtbWGA_SMK_Pipeline_OutputDir

i_Pangenome_Dir = f"{target_OutputDir}/PanGenome_Analysis"


### Define path to Panaroo output files (151 LR genomes, MergeParalogs Parameters)

In [192]:

PG_OutDir_Dict = {   "Panaroo_Strict_MP": f"{i_Pangenome_Dir}/Panaroo_Strict_MergeParalogs_AllIsolates",
                     "Panaroo_Moderate_MP": f"{i_Pangenome_Dir}/Panaroo_Moderate_MergeParalogs_AllIsolates",
                     "Panaroo_Sens_MP": f"{i_Pangenome_Dir}/Panaroo_Sensitive_MergeParalogs_AllIsolates",
                     "Panaroo_Strict": f"{i_Pangenome_Dir}/Panaroo_Strict_AllIsolates",
                     "Panaroo_Moderate": f"{i_Pangenome_Dir}/Panaroo_Moderate_AllIsolates",
                     "Panaroo_Sens": f"{i_Pangenome_Dir}/Panaroo_Sensitive_AllIsolates",
                     "Roary_Default": f"{i_Pangenome_Dir}/Roary_Default_AllIsolates",
                     "Roary_NoSplitParalogs": f"{i_Pangenome_Dir}/Roary_NoSplitParalogs_AllIsolates",
                     "Roary_NoSplitParalogs_I80": f"{i_Pangenome_Dir}/Roary_NoSplitParalogs_I80_AllIsolates",
                     "Roary_NoSplitParalogs_I90": f"{i_Pangenome_Dir}/Roary_NoSplitParalogs_I90_AllIsolates" }


PG_PresAbs_CSV_PATH_Dict = {}
for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_PresAbs_CSV_PATH_Dict[i_param] = f"{i_outdir}/gene_presence_absence.csv"  
    
PG_PresAbs_Rtab_PATH_Dict = {}
for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_PresAbs_Rtab_PATH_Dict[i_param] = f"{i_outdir}/gene_presence_absence.Rtab"  

PG_GeneRefFA_PATH_Dict = {}
for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_GeneRefFA_PATH_Dict[i_param] = f"{i_outdir}/pan_genome_reference.fa"  

PG_AvA_PATH_Dict = {}
for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_AvA_PATH_Dict[i_param] = f"{i_outdir}/pan_genome_reference.KmerComparison.AllVsAll.MaxJC.tsv"




# Parse in processed data

### A) Parse in processed All vs All Kmer analysis

In [194]:
# AvA_DF_Dict = {}

# for i_Param, AvA_TSV_PATH in PG_AvA_PATH_Dict.items():

#     PG_AvA_DF = pd.read_csv(AvA_TSV_PATH, sep = "\t" )
#     AvA_DF_Dict[i_Param] = PG_AvA_DF


### B) Parse in Gene PresAbs Info

In [195]:
PresAbs_DF_Dict = {}

for i_Param, PresAbs_CSV_PATH in PG_PresAbs_CSV_PATH_Dict.items():

    if "Roary" in i_Param: 
        i_Gene_PresAbs_DF = parse_PresAbs_CSV_Roary(PresAbs_CSV_PATH)
    else:
        i_Gene_PresAbs_DF = parse_PresAbs_CSV_Panaroo(PresAbs_CSV_PATH)

    ### Relabel Columns for presence/absence tracking
    i_Gene_PresAbs_DF.columns = [ x.split(".Bakta")[0] for x in i_Gene_PresAbs_DF.columns ]

    print(i_Param, i_Gene_PresAbs_DF.shape)
    
    PresAbs_DF_Dict[i_Param] = i_Gene_PresAbs_DF
    

  


Panaroo_Strict_MP (4200, 153)


  


Panaroo_Moderate_MP (4280, 153)
Panaroo_Sens_MP (4281, 153)


  


Panaroo_Strict (4305, 153)
Panaroo_Moderate (4387, 153)
Panaroo_Sens (4388, 153)
Roary_Default (5366, 153)


  


Roary_NoSplitParalogs (4366, 153)


  


Roary_NoSplitParalogs_I80 (4252, 153)


  


Roary_NoSplitParalogs_I90 (4293, 153)


### C) Parse in PG Gene Reference FASTAs for each PG output

In [196]:

PG_RefSeqs_DF_Dict = {}

for i_Param, i_PG_Ref_FA_PATH in PG_GeneRefFA_PATH_Dict.items():

    PG_RefSeqs_DF_Dict[i_Param] = parse_PG_Ref_FA(i_PG_Ref_FA_PATH)

    LR_PG_Ref_IDs = list( PG_RefSeqs_DF_Dict[i_Param].keys())
    print(i_Param, len(LR_PG_Ref_IDs))
    

Panaroo_Strict_MP 4200
Panaroo_Moderate_MP 4280
Panaroo_Sens_MP 4281
Panaroo_Strict 4248
Panaroo_Moderate 4330
Panaroo_Sens 4331
Roary_Default 5366
Roary_NoSplitParalogs 4366
Roary_NoSplitParalogs_I80 4252
Roary_NoSplitParalogs_I90 4293


### Define functions for gene sequence classification

In [197]:
# Define function to classify a node based on a threshold
def classify_node(row, i_listOf_JC_Cols, threshold=0.25):
    filtered_matches = {col: row[col] for col in i_listOf_JC_Cols if row[col] >= threshold}
    if filtered_matches:
        max_value = max(filtered_matches.values())
        strongest_categories = [
            col for col, value in filtered_matches.items() if value == max_value
        ]
        return strongest_categories
    return None  # No valid matches

In [198]:
# Update the summarize_sv_categories function to include "Functional_Category"
def summarize_sv_categories_with_functional_category(sv_nodes_df, listOf_JC_Cols):
    category_summary = {}
    category_mapping = {
        "Jaccard_Cont_WiRv_PEPPEs": "PE/PPE",
        "Jaccard_Cont_WiRv_CellWallCellProc": "cell wall and cell processes",
        "Jaccard_Cont_WiRv_ConservedHypo": "conserved hypotheticals",
        "Jaccard_Cont_WiRv_InfoPathways": "information pathways",
        "Jaccard_Cont_WiRv_InsSeqAndPhages": "insertion seqs and phages",
        "Jaccard_Cont_WiRv_StableRNAs": "stable RNAs",
        "Jaccard_Cont_WiRv_InterMetabolism": "intermediary metabolism and respiration",
        "Jaccard_Cont_WiRv_RegProteins": "regulatory proteins",
        "Jaccard_Cont_WiRv_VirulenceDetoxAdaptation": "virulence, detoxification, adaptation",
        "Jaccard_Cont_WiRv_LipidMetabolism": "lipid metabolism",
        "Jaccard_Cont_WiRv_Unknown": "unknown",
    }

    for col in listOf_JC_Cols:
        total_length = sv_nodes_df.loc[
            sv_nodes_df["Strongest_Match"].apply(
                lambda matches: col in matches if matches else False
            ),
            "SeqLength"
        ].sum()
        node_count = sv_nodes_df["Strongest_Match"].apply(
            lambda matches: col in matches if matches else False
        ).sum()
        category_summary[col] = {
            "Functional_Category": col, #category_mapping.get(col, "unknown"),
            "Total Length": total_length,
            "Node Count": node_count,
        }

    # Add "NoMatch" to the summary
    no_match_length = sv_nodes_df.loc[
        sv_nodes_df["Strongest_Match"].isnull(), "SeqLength"
    ].sum()
    no_match_count = sv_nodes_df["Strongest_Match"].isnull().sum()

    category_summary["NoMatch"] = {
        "Functional_Category": "NoMatch",
        "Total Length": no_match_length,
        "Gene Count": no_match_count,
    }

    # Convert to DataFrame
    summary_df = pd.DataFrame.from_dict(category_summary, orient="index")
    summary_df["Relative Size (%)"] = (
        summary_df["Total Length"] / summary_df["Total Length"].sum() * 100
    ).round(2)

    return summary_df


In [199]:
def compute_kmer_match_df(Ref_DictOf_Hashes, Ref_DictOf_SeqLen, category_hash_sets, N_AsmWiGene_Dict):
    """
    Computes k-mer match Jaccard containment for genes.
    
    Args:
        Ref_DictOf_Hashes (dict): Dictionary mapping GeneID to sets of k-mers (hashes).
        Ref_DictOf_SeqLen (dict): Dictionary mapping GeneID to their sequence lengths.
        category_hash_sets (dict): Dictionary of category names mapping to hash sets.
        N_AsmWiGene_Dict (dict): Dictionary mapping GeneID to number of assemblies matching the gene.
    
    Returns:
        pd.DataFrame: DataFrame summarizing Jaccard containment results for all genes.
    """
    gene_analysis_rows = []

    for GeneID, Gene_Hashes_Set in tqdm(Ref_DictOf_Hashes.items()):
        Len_Seq = Ref_DictOf_SeqLen.get(GeneID, 0)
        record_hashes_set = Gene_Hashes_Set

        # Initialize results for Jaccard containment
        jc_results = {}

        if len(record_hashes_set) != 0:
            # Calculate Jaccard containment for each category
            for category, hash_set in category_hash_sets.items():
                jc_results[category] = jaccard_containment_FromSets(record_hashes_set, hash_set)
        else:
            # Set all results to 0 if no hashes exist
            jc_results = {category: 0 for category in category_hash_sets}
            if Len_Seq < 31:
                print(f"No kmers were produced for segment: {GeneID}")

        # Prepare row for the DataFrame
        row = [GeneID, Len_Seq] + list(jc_results.values())
        gene_analysis_rows.append(row)

    # Create the DataFrame
    columns = ["GeneID", "SeqLength"] + list(category_hash_sets.keys())
    gene_kmer_match_df = pd.DataFrame(gene_analysis_rows, columns=columns)

    # Add the number of assemblies matching the gene
    gene_kmer_match_df["NumAsm_WiGene"] = gene_kmer_match_df["GeneID"].map(N_AsmWiGene_Dict)

    return gene_kmer_match_df

In [200]:
ListOf_Rv_GeneCats = ['information pathways', 'conserved hypotheticals', 'cell wall and cell processes', 'stable RNAs', 'intermediary metabolism and respiration', 'regulatory proteins', 'virulence, detoxification, adaptation', 'insertion seqs and phages', 'lipid metabolism', 'PE/PPE', 'unknown']


In [201]:
RvGeneCat_To_KmerHashes_Dict.keys()

dict_keys(['information pathways', 'conserved hypotheticals', 'cell wall and cell processes', 'stable RNAs', 'intermediary metabolism and respiration', 'regulatory proteins', 'virulence, detoxification, adaptation', 'insertion seqs and phages', 'lipid metabolism', 'PE/PPE', 'unknown'])

## D) Perform gene-level classification

In [202]:
PG_GeneSeq_KmerCatMatch_DF_Dict = {}

PG_GeneSeq_CategorySumm_DF_Dict = {}

for i_Param, i_PG_Ref_FA_PATH in tqdm(PG_GeneRefFA_PATH_Dict.items()):
    print(i_Param)
    i_Ref_DictOf_Hashes, i_Ref_DictOf_SeqLen = read_kmers_from_file_ToHashesDict(i_PG_Ref_FA_PATH, 31)  
        
    i_N_AsmWiGene_Dict = PresAbs_DF_Dict[i_Param]["NumAsm_WiGene"].to_dict()    

    i_Gene_KmerCatMatch_DF = compute_kmer_match_df(i_Ref_DictOf_Hashes,
                                                   i_Ref_DictOf_SeqLen,
                                                   RvGeneCat_To_KmerHashes_Dict,
                                                   i_N_AsmWiGene_Dict)

    # Apply classification to the main DataFrame
    threshold = 0.25  # Set the classification threshold
    i_Gene_KmerCatMatch_DF["Strongest_Match"] = i_Gene_KmerCatMatch_DF.apply(
        lambda row: classify_node(row, ListOf_Rv_GeneCats, threshold), axis=1
    )

    
    PG_GeneSeq_KmerCatMatch_DF_Dict[i_Param] = i_Gene_KmerCatMatch_DF
    
    PG_GeneSeq_CategorySumm_DF_Dict[i_Param] = summarize_sv_categories_with_functional_category(i_Gene_KmerCatMatch_DF,
                                                                                                ListOf_Rv_GeneCats)    

    




  0%|          | 0/10 [00:00<?, ?it/s][A[A[A

Panaroo_Strict_MP
4200  total records were parsed






  0%|          | 0/4200 [00:00<?, ?it/s][A[A[A[A



  1%|▏         | 59/4200 [00:00<00:07, 585.26it/s][A[A[A[A



  4%|▎         | 149/4200 [00:00<00:06, 651.83it/s][A[A[A[A



  6%|▌         | 247/4200 [00:00<00:05, 723.94it/s][A[A[A[A



  8%|▊         | 337/4200 [00:00<00:05, 767.08it/s][A[A[A[A



 11%|█         | 451/4200 [00:00<00:04, 845.42it/s][A[A[A[A



 14%|█▎        | 573/4200 [00:00<00:03, 929.64it/s][A[A[A[A



 16%|█▌        | 666/4200 [00:00<00:03, 914.18it/s][A[A[A[A



 19%|█▊        | 783/4200 [00:00<00:03, 976.64it/s][A[A[A[A



 21%|██        | 882/4200 [00:00<00:03, 938.89it/s][A[A[A[A



 23%|██▎       | 977/4200 [00:01<00:03, 926.27it/s][A[A[A[A



 26%|██▌       | 1085/4200 [00:01<00:03, 966.52it/s][A[A[A[A



 28%|██▊       | 1186/4200 [00:01<00:03, 978.03it/s][A[A[A[A



 31%|███       | 1292/4200 [00:01<00:02, 1001.10it/s][A[A[A[A



 34%|███▎      | 1417/4200 [00:01<00:02, 1061.84it/s][A[A[A[A


Panaroo_Moderate_MP






  0%|          | 0/4280 [00:00<?, ?it/s][A[A[A[A

4280  total records were parsed






  2%|▏         | 85/4280 [00:00<00:04, 847.64it/s][A[A[A[A



  4%|▍         | 184/4280 [00:00<00:04, 885.65it/s][A[A[A[A



  7%|▋         | 290/4280 [00:00<00:04, 931.25it/s][A[A[A[A



 10%|▉         | 407/4280 [00:00<00:03, 989.84it/s][A[A[A[A



 13%|█▎        | 538/4280 [00:00<00:03, 1067.99it/s][A[A[A[A



 15%|█▍        | 640/4280 [00:00<00:03, 1036.08it/s][A[A[A[A



 17%|█▋        | 743/4280 [00:00<00:03, 1033.90it/s][A[A[A[A



 20%|█▉        | 841/4280 [00:00<00:03, 1015.31it/s][A[A[A[A



 22%|██▏       | 939/4280 [00:00<00:03, 1002.17it/s][A[A[A[A



 24%|██▍       | 1038/4280 [00:01<00:03, 994.69it/s][A[A[A[A



 27%|██▋       | 1136/4280 [00:01<00:03, 986.88it/s][A[A[A[A



 29%|██▉       | 1237/4280 [00:01<00:03, 993.41it/s][A[A[A[A



 31%|███       | 1336/4280 [00:01<00:04, 612.80it/s][A[A[A[A



 34%|███▍      | 1446/4280 [00:01<00:04, 704.61it/s][A[A[A[A



 36%|███▌      | 1548/4280 [00:01<00:03, 773.61it

Panaroo_Sens_MP
4281  total records were parsed






  0%|          | 0/4281 [00:00<?, ?it/s][A[A[A[A



  2%|▏         | 70/4281 [00:00<00:06, 696.78it/s][A[A[A[A



  4%|▍         | 171/4281 [00:00<00:05, 766.95it/s][A[A[A[A



  6%|▋         | 274/4281 [00:00<00:04, 829.64it/s][A[A[A[A



  9%|▉         | 391/4281 [00:00<00:04, 908.04it/s][A[A[A[A



 12%|█▏        | 528/4281 [00:00<00:03, 1009.79it/s][A[A[A[A



 15%|█▍        | 641/4281 [00:00<00:03, 1041.68it/s][A[A[A[A



 17%|█▋        | 743/4281 [00:00<00:03, 1033.34it/s][A[A[A[A



 20%|█▉        | 849/4281 [00:00<00:03, 1040.58it/s][A[A[A[A



 22%|██▏       | 952/4281 [00:00<00:03, 1022.22it/s][A[A[A[A



 25%|██▍       | 1054/4281 [00:01<00:03, 1014.16it/s][A[A[A[A



 27%|██▋       | 1156/4281 [00:01<00:03, 1015.90it/s][A[A[A[A



 29%|██▉       | 1258/4281 [00:01<00:02, 1008.21it/s][A[A[A[A



 33%|███▎      | 1399/4281 [00:01<00:02, 1101.73it/s][A[A[A[A



 35%|███▌      | 1512/4281 [00:01<00:02, 1071.53it/s][A

Panaroo_Strict
4248  total records were parsed






  0%|          | 0/4248 [00:00<?, ?it/s][A[A[A[A



  2%|▏         | 85/4248 [00:00<00:04, 841.57it/s][A[A[A[A



  4%|▍         | 183/4248 [00:00<00:04, 872.50it/s][A[A[A[A



  7%|▋         | 285/4248 [00:00<00:04, 911.43it/s][A[A[A[A



 10%|▉         | 405/4248 [00:00<00:03, 981.85it/s][A[A[A[A



 12%|█▏        | 530/4248 [00:00<00:03, 1047.71it/s][A[A[A[A



 15%|█▍        | 637/4248 [00:00<00:03, 1053.13it/s][A[A[A[A



 17%|█▋        | 735/4248 [00:00<00:03, 884.34it/s] [A[A[A[A



 20%|█▉        | 840/4248 [00:00<00:03, 924.64it/s][A[A[A[A



 22%|██▏       | 939/4248 [00:00<00:03, 942.70it/s][A[A[A[A



 25%|██▍       | 1043/4248 [00:01<00:03, 969.19it/s][A[A[A[A



 27%|██▋       | 1142/4248 [00:01<00:03, 971.35it/s][A[A[A[A



 29%|██▉       | 1242/4248 [00:01<00:03, 977.77it/s][A[A[A[A



 33%|███▎      | 1386/4248 [00:01<00:02, 1081.74it/s][A[A[A[A



 35%|███▌      | 1498/4248 [00:01<00:02, 1058.63it/s][A[A[A

Panaroo_Moderate
4330  total records were parsed






  0%|          | 0/4330 [00:00<?, ?it/s][A[A[A[A



  2%|▏         | 70/4330 [00:00<00:06, 691.54it/s][A[A[A[A



  3%|▎         | 151/4330 [00:00<00:05, 720.50it/s][A[A[A[A



  6%|▌         | 255/4330 [00:00<00:05, 793.40it/s][A[A[A[A



  8%|▊         | 359/4330 [00:00<00:04, 852.43it/s][A[A[A[A



 11%|█▏        | 491/4330 [00:00<00:04, 953.25it/s][A[A[A[A



 14%|█▍        | 596/4330 [00:00<00:03, 979.45it/s][A[A[A[A



 16%|█▌        | 691/4330 [00:00<00:03, 943.62it/s][A[A[A[A



 19%|█▊        | 804/4330 [00:00<00:03, 990.80it/s][A[A[A[A



 21%|██        | 903/4330 [00:00<00:03, 928.08it/s][A[A[A[A



 23%|██▎       | 996/4330 [00:01<00:03, 927.62it/s][A[A[A[A



 25%|██▌       | 1096/4330 [00:01<00:03, 946.03it/s][A[A[A[A



 28%|██▊       | 1197/4330 [00:01<00:03, 959.67it/s][A[A[A[A



 31%|███       | 1326/4330 [00:01<00:02, 1038.23it/s][A[A[A[A



 33%|███▎      | 1443/4330 [00:01<00:02, 1073.00it/s][A[A[A[A


Panaroo_Sens
4331  total records were parsed






  0%|          | 0/4331 [00:00<?, ?it/s][A[A[A[A



  1%|▏         | 59/4331 [00:00<00:07, 585.96it/s][A[A[A[A



  3%|▎         | 151/4331 [00:00<00:06, 656.01it/s][A[A[A[A



  6%|▌         | 257/4331 [00:00<00:05, 739.30it/s][A[A[A[A



  8%|▊         | 363/4331 [00:00<00:04, 811.53it/s][A[A[A[A



 11%|█▏        | 497/4331 [00:00<00:04, 917.41it/s][A[A[A[A



 14%|█▍        | 606/4331 [00:00<00:03, 960.53it/s][A[A[A[A



 16%|█▌        | 702/4331 [00:00<00:03, 914.35it/s][A[A[A[A



 19%|█▊        | 810/4331 [00:00<00:03, 957.79it/s][A[A[A[A



 21%|██        | 907/4331 [00:00<00:03, 956.78it/s][A[A[A[A



 23%|██▎       | 1011/4331 [00:01<00:03, 978.32it/s][A[A[A[A



 26%|██▌       | 1114/4331 [00:01<00:03, 992.58it/s][A[A[A[A



 28%|██▊       | 1219/4331 [00:01<00:03, 1003.77it/s][A[A[A[A



 31%|███▏      | 1354/4331 [00:01<00:02, 1085.79it/s][A[A[A[A



 34%|███▍      | 1468/4331 [00:01<00:02, 1100.12it/s][A[A[A[

Roary_Default
5366  total records were parsed






  0%|          | 0/4951 [00:00<?, ?it/s][A[A[A[A



  2%|▏         | 75/4951 [00:00<00:06, 745.84it/s][A[A[A[A



  3%|▎         | 168/4951 [00:00<00:06, 792.51it/s][A[A[A[A



  5%|▌         | 257/4951 [00:00<00:05, 817.23it/s][A[A[A[A



  7%|▋         | 344/4951 [00:00<00:05, 832.35it/s][A[A[A[A



  9%|▊         | 422/4951 [00:00<00:05, 814.71it/s][A[A[A[A



 11%|█         | 527/4951 [00:00<00:05, 871.00it/s][A[A[A[A



 13%|█▎        | 638/4951 [00:00<00:04, 929.05it/s][A[A[A[A



 15%|█▌        | 746/4951 [00:00<00:04, 968.66it/s][A[A[A[A



 17%|█▋        | 850/4951 [00:00<00:04, 985.90it/s][A[A[A[A



 19%|█▉        | 947/4951 [00:01<00:04, 972.52it/s][A[A[A[A



 21%|██        | 1044/4951 [00:01<00:04, 954.75it/s][A[A[A[A



 23%|██▎       | 1150/4951 [00:01<00:03, 981.47it/s][A[A[A[A



 25%|██▌       | 1248/4951 [00:01<00:03, 957.39it/s][A[A[A[A



 27%|██▋       | 1344/4951 [00:01<00:03, 952.41it/s][A[A[A[A




Roary_NoSplitParalogs
4366  total records were parsed






  0%|          | 0/4074 [00:00<?, ?it/s][A[A[A[A



  2%|▏         | 95/4074 [00:00<00:04, 948.92it/s][A[A[A[A



  4%|▍         | 159/4074 [00:00<00:04, 826.93it/s][A[A[A[A



  6%|▌         | 242/4074 [00:00<00:04, 826.95it/s][A[A[A[A



  8%|▊         | 329/4074 [00:00<00:04, 839.22it/s][A[A[A[A



 10%|▉         | 392/4074 [00:00<00:04, 740.94it/s][A[A[A[A



 12%|█▏        | 480/4074 [00:00<00:04, 777.22it/s][A[A[A[A



 14%|█▍        | 581/4074 [00:00<00:04, 832.67it/s][A[A[A[A



 16%|█▌        | 660/4074 [00:01<00:06, 515.63it/s][A[A[A[A



 19%|█▉        | 773/4074 [00:01<00:05, 615.76it/s][A[A[A[A



 22%|██▏       | 880/4074 [00:01<00:04, 704.36it/s][A[A[A[A



 24%|██▍       | 976/4074 [00:01<00:04, 764.94it/s][A[A[A[A



 27%|██▋       | 1083/4074 [00:01<00:03, 836.39it/s][A[A[A[A



 29%|██▉       | 1181/4074 [00:01<00:03, 873.94it/s][A[A[A[A



 31%|███▏      | 1277/4074 [00:01<00:03, 895.61it/s][A[A[A[A





Roary_NoSplitParalogs_I80






  0%|          | 0/3979 [00:00<?, ?it/s][A[A[A[A

4252  total records were parsed






  2%|▏         | 83/3979 [00:00<00:04, 828.64it/s][A[A[A[A



  4%|▍         | 159/3979 [00:00<00:04, 805.25it/s][A[A[A[A



  6%|▌         | 235/3979 [00:00<00:04, 787.11it/s][A[A[A[A



  8%|▊         | 322/3979 [00:00<00:04, 809.51it/s][A[A[A[A



 10%|█         | 398/3979 [00:00<00:04, 792.52it/s][A[A[A[A



 13%|█▎        | 498/3979 [00:00<00:04, 844.17it/s][A[A[A[A



 15%|█▍        | 592/3979 [00:00<00:03, 869.59it/s][A[A[A[A



 18%|█▊        | 698/3979 [00:00<00:03, 918.59it/s][A[A[A[A



 20%|██        | 805/3979 [00:00<00:03, 958.40it/s][A[A[A[A



 23%|██▎       | 901/3979 [00:01<00:03, 954.79it/s][A[A[A[A



 25%|██▌       | 997/3979 [00:01<00:03, 954.26it/s][A[A[A[A



 28%|██▊       | 1106/3979 [00:01<00:02, 989.89it/s][A[A[A[A



 30%|███       | 1205/3979 [00:01<00:02, 970.90it/s][A[A[A[A



 33%|███▎      | 1302/3979 [00:01<00:02, 961.32it/s][A[A[A[A



 35%|███▌      | 1399/3979 [00:01<00:02, 946.35it/s][A

Roary_NoSplitParalogs_I90






  0%|          | 0/4015 [00:00<?, ?it/s][A[A[A[A

4293  total records were parsed






  2%|▏         | 73/4015 [00:00<00:05, 729.08it/s][A[A[A[A



  4%|▍         | 163/4015 [00:00<00:04, 772.90it/s][A[A[A[A



  6%|▌         | 249/4015 [00:00<00:04, 796.59it/s][A[A[A[A



  9%|▊         | 345/4015 [00:00<00:04, 836.89it/s][A[A[A[A



 11%|█         | 426/4015 [00:00<00:04, 828.03it/s][A[A[A[A



 13%|█▎        | 532/4015 [00:00<00:03, 885.23it/s][A[A[A[A



 16%|█▌        | 639/4015 [00:00<00:03, 932.10it/s][A[A[A[A



 19%|█▊        | 745/4015 [00:00<00:03, 967.05it/s][A[A[A[A



 21%|██        | 843/4015 [00:00<00:03, 970.15it/s][A[A[A[A



 23%|██▎       | 939/4015 [00:01<00:03, 963.71it/s][A[A[A[A



 26%|██▌       | 1045/4015 [00:01<00:03, 989.86it/s][A[A[A[A



 29%|██▊       | 1150/4015 [00:01<00:02, 1007.07it/s][A[A[A[A



 31%|███       | 1251/4015 [00:01<00:02, 969.47it/s] [A[A[A[A



 34%|███▎      | 1348/4015 [00:01<00:02, 963.50it/s][A[A[A[A



 36%|███▌      | 1445/4015 [00:01<00:02, 961.42it/s]

In [224]:
summarize_sv_categories_with_functional_category(PG_GeneSeq_KmerCatMatch_DF_Dict["Panaroo_Strict_MP"].query("NumAsm_WiGene < 150"),
                                                 ListOf_Rv_GeneCats)


Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,4323,3.0,,1.08
conserved hypotheticals,conserved hypotheticals,44001,73.0,,10.98
cell wall and cell processes,cell wall and cell processes,31262,52.0,,7.8
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,45753,55.0,,11.42
regulatory proteins,regulatory proteins,12735,12.0,,3.18
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",9381,11.0,,2.34
insertion seqs and phages,insertion seqs and phages,28132,44.0,,7.02
lipid metabolism,lipid metabolism,33939,15.0,,8.47
PE/PPE,PE/PPE,151566,72.0,,37.83


In [225]:
summarize_sv_categories_with_functional_category(PG_GeneSeq_KmerCatMatch_DF_Dict["Roary_Default"].query("NumAsm_WiGene < 150"),
                                                 ListOf_Rv_GeneCats)


Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,33747,30.0,,2.08
conserved hypotheticals,conserved hypotheticals,162666,239.0,,10.05
cell wall and cell processes,cell wall and cell processes,193143,220.0,,11.93
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,225819,212.0,,13.95
regulatory proteins,regulatory proteins,54828,45.0,,3.39
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",21564,32.0,,1.33
insertion seqs and phages,insertion seqs and phages,129801,195.0,,8.02
lipid metabolism,lipid metabolism,140880,77.0,,8.7
PE/PPE,PE/PPE,591798,388.0,,36.55


In [223]:
summarize_sv_categories_with_functional_category(PG_GeneSeq_KmerCatMatch_DF_Dict["Panaroo_Strict"].query("NumAsm_WiGene < 150"),
                                                 ListOf_Rv_GeneCats)


Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,4323,3.0,,0.96
conserved hypotheticals,conserved hypotheticals,53397,80.0,,11.89
cell wall and cell processes,cell wall and cell processes,31946,55.0,,7.11
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,48690,58.0,,10.84
regulatory proteins,regulatory proteins,12735,12.0,,2.84
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",9381,11.0,,2.09
insertion seqs and phages,insertion seqs and phages,47592,67.0,,10.6
lipid metabolism,lipid metabolism,35346,16.0,,7.87
PE/PPE,PE/PPE,165254,80.0,,36.8


In [None]:
PG_GeneSeq_KmerCatMatch_DF_Dict[i_Param].query("NumAsm_WiGene < 150")

In [None]:
i_Gene_KmerMatch_DF.query("NumAsm_WiGene < 150")

In [203]:
PG_OutDir_Dict.keys()

dict_keys(['Panaroo_Strict_MP', 'Panaroo_Moderate_MP', 'Panaroo_Sens_MP', 'Panaroo_Strict', 'Panaroo_Moderate', 'Panaroo_Sens', 'Roary_Default', 'Roary_NoSplitParalogs', 'Roary_NoSplitParalogs_I80', 'Roary_NoSplitParalogs_I90'])

In [210]:
PG_GeneSeq_CategorySumm_DF_Dict["Panaroo_Strict_MP"]

Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,276069,238.0,,6.48
conserved hypotheticals,conserved hypotheticals,762381,1020.0,,17.9
cell wall and cell processes,cell wall and cell processes,825020,785.0,,19.37
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,1073272,958.0,,25.2
regulatory proteins,regulatory proteins,175465,200.0,,4.12
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",156345,229.0,,3.67
insertion seqs and phages,insertion seqs and phages,79382,102.0,,1.86
lipid metabolism,lipid metabolism,448584,273.0,,10.53
PE/PPE,PE/PPE,391231,200.0,,9.19


In [205]:
PG_GeneSeq_CategorySumm_DF_Dict["Panaroo_Moderate_MP"]

Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,276069,238.0,,6.43
conserved hypotheticals,conserved hypotheticals,767291,1032.0,,17.87
cell wall and cell processes,cell wall and cell processes,825263,786.0,,19.23
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,1074117,960.0,,25.02
regulatory proteins,regulatory proteins,175465,200.0,,4.09
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",157059,232.0,,3.66
insertion seqs and phages,insertion seqs and phages,81260,106.0,,1.89
lipid metabolism,lipid metabolism,449729,274.0,,10.48
PE/PPE,PE/PPE,405937,217.0,,9.46


In [206]:
PG_GeneSeq_CategorySumm_DF_Dict["Panaroo_Sens_MP"]

Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,276069,238.0,,6.43
conserved hypotheticals,conserved hypotheticals,767291,1032.0,,17.87
cell wall and cell processes,cell wall and cell processes,825263,786.0,,19.22
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,1074117,960.0,,25.02
regulatory proteins,regulatory proteins,175465,200.0,,4.09
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",157059,232.0,,3.66
insertion seqs and phages,insertion seqs and phages,81260,106.0,,1.89
lipid metabolism,lipid metabolism,449729,274.0,,10.48
PE/PPE,PE/PPE,406256,218.0,,9.46


In [209]:
PG_GeneSeq_CategorySumm_DF_Dict["Panaroo_Strict"]

Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,276069,238.0,,6.41
conserved hypotheticals,conserved hypotheticals,769230,1025.0,,17.86
cell wall and cell processes,cell wall and cell processes,826868,792.0,,19.2
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,1074106,959.0,,24.94
regulatory proteins,regulatory proteins,175465,200.0,,4.07
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",156345,229.0,,3.63
insertion seqs and phages,insertion seqs and phages,101241,126.0,,2.35
lipid metabolism,lipid metabolism,449991,274.0,,10.45
PE/PPE,PE/PPE,404919,208.0,,9.4


In [207]:
PG_GeneSeq_CategorySumm_DF_Dict["Roary_Default"]

Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,278877,249.0,,5.8
conserved hypotheticals,conserved hypotheticals,780720,1103.0,,16.24
cell wall and cell processes,cell wall and cell processes,848430,868.0,,17.65
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,1095447,1017.0,,22.78
regulatory proteins,regulatory proteins,171756,196.0,,3.57
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",154605,234.0,,3.22
insertion seqs and phages,insertion seqs and phages,160734,232.0,,3.34
lipid metabolism,lipid metabolism,493935,320.0,,10.27
PE/PPE,PE/PPE,724206,478.0,,15.06


In [208]:
PG_GeneSeq_CategorySumm_DF_Dict["Roary_NoSplitParalogs"]

Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,information pathways,265803,234.0,,6.99
conserved hypotheticals,conserved hypotheticals,702522,989.0,,18.46
cell wall and cell processes,cell wall and cell processes,749826,745.0,,19.71
stable RNAs,stable RNAs,0,0.0,,0.0
intermediary metabolism and respiration,intermediary metabolism and respiration,962472,882.0,,25.29
regulatory proteins,regulatory proteins,141864,168.0,,3.73
"virulence, detoxification, adaptation","virulence, detoxification, adaptation",149187,224.0,,3.92
insertion seqs and phages,insertion seqs and phages,61809,87.0,,1.62
lipid metabolism,lipid metabolism,397434,259.0,,10.44
PE/PPE,PE/PPE,303774,270.0,,7.98


In [219]:
PG_GeneSeq_CategorySumm_DF_Dict["Roary_NoSplitParalogs"]["Node Count"] / PG_GeneSeq_CategorySumm_DF_Dict["Roary_NoSplitParalogs"]["Node Count"].sum()   

information pathways                       0.060449
conserved hypotheticals                    0.255490
cell wall and cell processes               0.192457
stable RNAs                                0.000000
intermediary metabolism and respiration    0.227848
regulatory proteins                        0.043400
virulence, detoxification, adaptation      0.057866
insertion seqs and phages                  0.022475
lipid metabolism                           0.066908
PE/PPE                                     0.069749
unknown                                    0.003358
NoMatch                                         NaN
Name: Node Count, dtype: float64

In [221]:
PG_GeneSeq_CategorySumm_DF_Dict["Roary_NoSplitParalogs"]["Total Length"] / PG_GeneSeq_CategorySumm_DF_Dict["Roary_NoSplitParalogs"]["Total Length"].sum()   

information pathways                       0.069852
conserved hypotheticals                    0.184619
cell wall and cell processes               0.197051
stable RNAs                                0.000000
intermediary metabolism and respiration    0.252933
regulatory proteins                        0.037281
virulence, detoxification, adaptation      0.039206
insertion seqs and phages                  0.016243
lipid metabolism                           0.104444
PE/PPE                                     0.079830
unknown                                    0.002152
NoMatch                                    0.016389
Name: Total Length, dtype: float64

In [176]:
summarize_sv_categories_with_functional_category(PG_GeneSeq_KmerCatMatch_DF_Dict[i_Param], ListOf_Rv_GeneCats)


Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
information pathways,unknown,276069,238.0,,6.48
conserved hypotheticals,unknown,762381,1020.0,,17.9
cell wall and cell processes,unknown,825020,785.0,,19.37
stable RNAs,unknown,0,0.0,,0.0
intermediary metabolism and respiration,unknown,1073272,958.0,,25.2
regulatory proteins,unknown,175465,200.0,,4.12
"virulence, detoxification, adaptation",unknown,156345,229.0,,3.67
insertion seqs and phages,unknown,79382,102.0,,1.86
lipid metabolism,unknown,448584,273.0,,10.53
PE/PPE,unknown,391231,200.0,,9.19


In [None]:
summarize_sv_categories_with_functional_category(PG_GeneSeq_KmerCatMatch_DF_Dict[i_Param], listOf_JC_Cols)


In [None]:
STOP!!!

# Test A - Infer k-mer overlap with all gene categories for each gene DNA sequence (of `Roary Default`)

In [159]:
print(RvGeneCat_To_KmerHashes_Dict.keys())
Rv_InfoPathways_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['information pathways']
Rv_ConservedHypo_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['conserved hypotheticals']
Rv_CellWallCellProc_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['cell wall and cell processes']
Rv_StableRNAs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['stable RNAs']
Rv_InterMetabolism_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['intermediary metabolism and respiration']
Rv_RegProteins_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['regulatory proteins']
Rv_VirulenceDetoxAdaptation_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['virulence, detoxification, adaptation']
Rv_LipidMetabolism_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['lipid metabolism']
Rv_PEPPEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['PE/PPE']
Rv_MGEs_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['insertion seqs and phages']
Rv_Unknown_Hashes_Set = RvGeneCat_To_KmerHashes_Dict['unknown']

dict_keys(['information pathways', 'conserved hypotheticals', 'cell wall and cell processes', 'stable RNAs', 'intermediary metabolism and respiration', 'regulatory proteins', 'virulence, detoxification, adaptation', 'insertion seqs and phages', 'lipid metabolism', 'PE/PPE', 'unknown'])


In [160]:
Roary_Default_Gene_FA = PG_GeneRefFA_PATH_Dict["Roary_Default"]
print(Roary_Default_Gene_FA)

/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/PanGenome_Analysis/Roary_Default_AllIsolates/pan_genome_reference.fa


In [161]:
N_AsmWiGene_Dict = PresAbs_DF_Dict["Roary_Default"]["NumAsm_WiGene"].to_dict()

In [162]:
Ref_DictOf_Hashes, Ref_DictOf_SeqLen = read_kmers_from_file_ToHashesDict(Roary_Default_Gene_FA, 31)             

All_SeqIDs = list(Ref_DictOf_Hashes.keys())

5366  total records were parsed


In [163]:

GeneAnalysis_listOfRows = []

for GeneID, Gene_Hashes_Set in tqdm( Ref_DictOf_Hashes.items() ) :
    
    Len_Seq = Ref_DictOf_SeqLen[GeneID]    
    
    record_Hashes_Set = Gene_Hashes_Set 
                       
    if len(record_Hashes_Set) != 0:
        Record_Hash_JC_WiH37Rv = jaccard_containment_FromSets( record_Hashes_Set, H37Rv_Hashes_Set)
        Record_Hash_JC_WiIS6110 = jaccard_containment_FromSets( record_Hashes_Set, IS6110_Ex1_Hashes_Set)

        Record_Hash_JC_WiRv_InsSeqAndPhages = jaccard_containment_FromSets( record_Hashes_Set, Rv_MGEs_Hashes_Set)
        Record_Hash_JC_WiRv_PEPPEs = jaccard_containment_FromSets( record_Hashes_Set, Rv_PEPPEs_Hashes_Set)
        Record_Hash_JC_WiRv_InfoPathways = jaccard_containment_FromSets( record_Hashes_Set, Rv_InfoPathways_Hashes_Set)
        Record_Hash_JC_WiRv_ConservedHypo = jaccard_containment_FromSets( record_Hashes_Set, Rv_ConservedHypo_Hashes_Set)
        Record_Hash_JC_WiRv_CellWallCellProc = jaccard_containment_FromSets( record_Hashes_Set, Rv_CellWallCellProc_Hashes_Set)
        Record_Hash_JC_WiRv_StableRNAs = jaccard_containment_FromSets( record_Hashes_Set, Rv_StableRNAs_Hashes_Set)
        Record_Hash_JC_WiRv_InterMetabolism = jaccard_containment_FromSets( record_Hashes_Set, Rv_InterMetabolism_Hashes_Set)
        Record_Hash_JC_WiRv_RegProteins = jaccard_containment_FromSets( record_Hashes_Set, Rv_RegProteins_Hashes_Set)
        Record_Hash_JC_WiRv_VirulenceDetoxAdaptation = jaccard_containment_FromSets( record_Hashes_Set, Rv_VirulenceDetoxAdaptation_Hashes_Set)
        Record_Hash_JC_WiRv_LipidMetabolism = jaccard_containment_FromSets( record_Hashes_Set, Rv_LipidMetabolism_Hashes_Set)
        Record_Hash_JC_WiRv_Unknown = jaccard_containment_FromSets( record_Hashes_Set, Rv_Unknown_Hashes_Set)


    else:
        Record_Hash_JC_WiH37Rv = 0
        Record_Hash_JC_WiIS6110 = 0
        Record_Hash_JC_WiRv_InsSeqAndPhages = 0 
        Record_Hash_JC_WiRv_PEPPEs = 0 
        Record_Hash_JC_WiRv_InfoPathways = 0 
        Record_Hash_JC_WiRv_ConservedHypo = 0 
        Record_Hash_JC_WiRv_CellWallCellProc = 0 
        Record_Hash_JC_WiRv_StableRNAs = 0 
        Record_Hash_JC_WiRv_InterMetabolism = 0 
        Record_Hash_JC_WiRv_RegProteins = 0 
        Record_Hash_JC_WiRv_VirulenceDetoxAdaptation = 0 
        Record_Hash_JC_WiRv_LipidMetabolism = 0 
        Record_Hash_JC_WiRv_Unknown = 0 
        
        if Len_Seq < 31:
            print(f"No kmers were produced for segment: {GeneID}")
                    
    i_Row = (GeneID,
             Len_Seq,
             Record_Hash_JC_WiH37Rv,
             Record_Hash_JC_WiIS6110,
             Record_Hash_JC_WiRv_InsSeqAndPhages,
             Record_Hash_JC_WiRv_PEPPEs, 
             Record_Hash_JC_WiRv_InfoPathways, 
             Record_Hash_JC_WiRv_ConservedHypo, 
             Record_Hash_JC_WiRv_CellWallCellProc, 
             Record_Hash_JC_WiRv_StableRNAs, 
             Record_Hash_JC_WiRv_InterMetabolism, 
             Record_Hash_JC_WiRv_RegProteins, 
             Record_Hash_JC_WiRv_VirulenceDetoxAdaptation, 
             Record_Hash_JC_WiRv_LipidMetabolism, 
             Record_Hash_JC_WiRv_Unknown )
    
    GeneAnalysis_listOfRows.append(i_Row)

    
i_Gene_KmerMatch_DF = pd.DataFrame(GeneAnalysis_listOfRows)

i_Gene_KmerMatch_DF.columns = ["GeneID",
                                "SeqLength",
                                "Jaccard_Cont_WiRv",
                                "Jaccard_Cont_WiIS6110",
                                "Jaccard_Cont_WiRv_InsSeqAndPhages",
                                "Jaccard_Cont_WiRv_PEPPEs",
                                "Jaccard_Cont_WiRv_InfoPathways",
                                "Jaccard_Cont_WiRv_ConservedHypo", 
                                "Jaccard_Cont_WiRv_CellWallCellProc",
                                "Jaccard_Cont_WiRv_StableRNAs",
                                "Jaccard_Cont_WiRv_InterMetabolism",
                                "Jaccard_Cont_WiRv_RegProteins",
                                "Jaccard_Cont_WiRv_VirulenceDetoxAdaptation",
                                "Jaccard_Cont_WiRv_LipidMetabolism",
                                "Jaccard_Cont_WiRv_Unknown" ]


i_Gene_KmerMatch_DF["NumAsm_WiGene"] = i_Gene_KmerMatch_DF["GeneID"].map(N_AsmWiGene_Dict)




  0%|          | 0/4951 [00:00<?, ?it/s][A[A

  1%|          | 38/4951 [00:00<00:13, 370.42it/s][A[A

  2%|▏         | 104/4951 [00:00<00:11, 426.43it/s][A[A

  3%|▎         | 164/4951 [00:00<00:10, 466.55it/s][A[A

  5%|▍         | 225/4951 [00:00<00:09, 501.28it/s][A[A

  6%|▌         | 289/4951 [00:00<00:08, 532.71it/s][A[A

  7%|▋         | 351/4951 [00:00<00:08, 554.35it/s][A[A

  8%|▊         | 411/4951 [00:00<00:08, 566.22it/s][A[A

 10%|▉         | 488/4951 [00:00<00:07, 613.48it/s][A[A

 11%|█▏        | 567/4951 [00:00<00:06, 656.91it/s][A[A

 13%|█▎        | 646/4951 [00:01<00:06, 691.43it/s][A[A

 15%|█▍        | 734/4951 [00:01<00:05, 738.65it/s][A[A

 16%|█▋        | 813/4951 [00:01<00:05, 749.32it/s][A[A

 18%|█▊        | 890/4951 [00:01<00:05, 753.87it/s][A[A

 20%|█▉        | 967/4951 [00:01<00:05, 730.13it/s][A[A

 21%|██        | 1041/4951 [00:01<00:05, 732.69it/s][A[A

 23%|██▎       | 1127/4951 [00:01<00:04, 766.42it/s][A[A

 24%|

In [117]:
i_Gene_KmerMatch_DF.shape

(4951, 16)

In [118]:
i_Gene_KmerMatch_DF.head()

Unnamed: 0,GeneID,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,NumAsm_WiGene
0,dnaA,1491,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0
1,dnaN,1209,0.973707,0.0,0.0,0.0,0.973707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0
2,recF,1158,0.972518,0.0,0.0,0.0,0.972518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0
3,group_5031,516,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0
4,gyrB,2145,0.979669,0.0,0.0,0.0,0.92435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0


In [164]:
# Apply classification to the main DataFrame
threshold = 0.25  # Set the classification threshold
i_Gene_KmerMatch_DF["Strongest_Match"] = i_Gene_KmerMatch_DF.apply(
    lambda row: classify_node(row, threshold), axis=1
)

TypeError: 'float' object is not iterable

In [138]:
i_Gene_KmerMatch_DF.head()

Unnamed: 0,GeneID,SeqLength,Jaccard_Cont_WiRv,Jaccard_Cont_WiIS6110,Jaccard_Cont_WiRv_InsSeqAndPhages,Jaccard_Cont_WiRv_PEPPEs,Jaccard_Cont_WiRv_InfoPathways,Jaccard_Cont_WiRv_ConservedHypo,Jaccard_Cont_WiRv_CellWallCellProc,Jaccard_Cont_WiRv_StableRNAs,Jaccard_Cont_WiRv_InterMetabolism,Jaccard_Cont_WiRv_RegProteins,Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,Jaccard_Cont_WiRv_LipidMetabolism,Jaccard_Cont_WiRv_Unknown,NumAsm_WiGene,Strongest_Match
0,dnaA,1491,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0,[Jaccard_Cont_WiRv_InfoPathways]
1,dnaN,1209,0.973707,0.0,0.0,0.0,0.973707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0,[Jaccard_Cont_WiRv_InfoPathways]
2,recF,1158,0.972518,0.0,0.0,0.0,0.972518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0,[Jaccard_Cont_WiRv_InfoPathways]
3,group_5031,516,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0,[Jaccard_Cont_WiRv_ConservedHypo]
4,gyrB,2145,0.979669,0.0,0.0,0.0,0.92435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0,[Jaccard_Cont_WiRv_InfoPathways]


In [145]:
All_Core_DF = i_Gene_KmerMatch_DF.query("NumAsm_WiGene >= 150")

In [146]:
Super_Core_DF = i_Gene_KmerMatch_DF.query("NumAsm_WiGene >= 151")
Soft_Core_DF = i_Gene_KmerMatch_DF.query("NumAsm_WiGene == 150")
Acc_DF = i_Gene_KmerMatch_DF.query("NumAsm_WiGene < 150")

In [147]:
Core_GeneSumm = summarize_sv_categories_with_functional_category(All_Core_DF, listOf_JC_Cols)

Acc_GeneSumm = summarize_sv_categories_with_functional_category(Acc_DF, listOf_JC_Cols)


In [148]:
Core_GeneSumm

Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
Jaccard_Cont_WiRv_InsSeqAndPhages,insertion seqs and phages,28098,34.0,,0.93
Jaccard_Cont_WiRv_PEPPEs,PE/PPE,113853,81.0,,3.78
Jaccard_Cont_WiRv_InfoPathways,information pathways,240771,215.0,,8.0
Jaccard_Cont_WiRv_ConservedHypo,conserved hypotheticals,571695,796.0,,18.99
Jaccard_Cont_WiRv_CellWallCellProc,cell wall and cell processes,631650,618.0,,20.98
Jaccard_Cont_WiRv_StableRNAs,stable RNAs,0,0.0,,0.0
Jaccard_Cont_WiRv_InterMetabolism,intermediary metabolism and respiration,816660,747.0,,27.13
Jaccard_Cont_WiRv_RegProteins,regulatory proteins,110751,140.0,,3.68
Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,"virulence, detoxification, adaptation",131640,200.0,,4.37
Jaccard_Cont_WiRv_LipidMetabolism,lipid metabolism,339420,228.0,,11.27


In [149]:
Acc_GeneSumm

Unnamed: 0,Functional_Category,Total Length,Node Count,Gene Count,Relative Size (%)
Jaccard_Cont_WiRv_InsSeqAndPhages,insertion seqs and phages,129801,195.0,,8.02
Jaccard_Cont_WiRv_PEPPEs,PE/PPE,591798,388.0,,36.55
Jaccard_Cont_WiRv_InfoPathways,information pathways,33747,30.0,,2.08
Jaccard_Cont_WiRv_ConservedHypo,conserved hypotheticals,162666,239.0,,10.05
Jaccard_Cont_WiRv_CellWallCellProc,cell wall and cell processes,193143,220.0,,11.93
Jaccard_Cont_WiRv_StableRNAs,stable RNAs,0,0.0,,0.0
Jaccard_Cont_WiRv_InterMetabolism,intermediary metabolism and respiration,225819,212.0,,13.95
Jaccard_Cont_WiRv_RegProteins,regulatory proteins,54828,45.0,,3.39
Jaccard_Cont_WiRv_VirulenceDetoxAdaptation,"virulence, detoxification, adaptation",21564,32.0,,1.33
Jaccard_Cont_WiRv_LipidMetabolism,lipid metabolism,140880,77.0,,8.7


In [125]:
Super_Core_DF.shape

(2966, 16)

In [126]:
Soft_Core_DF.shape

(175, 16)

In [127]:
Acc_DF.shape

(1590, 16)