# Pangraph - Node k-mer analysis & seq redundancy comparison

### Import Statements

In [95]:
import numpy as np
import pandas as pd
import scipy.stats

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [96]:
import ast

In [97]:
# https://bioframe.readthedocs.io/en/latest/guide-intervalops.html
import bioframe as bf


In [98]:
from Bio import SeqIO


In [99]:
#import json

In [100]:
import gfapy

#### Pandas Viewing Settings

In [101]:
# Raise pandas' display limits (rows, columns, line width) so that large
# DataFrames shown in this notebook are truncated less aggressively.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [102]:
# Allow up to 400 characters per cell so long values (e.g. file paths) are readable.
# The fully-qualified option name is used: abbreviated names only work through
# pattern matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 400)

## Define useful Kmer analysis functions

In [103]:
import screed

In [104]:
import mmh3

In [105]:
def build_kmers(sequence, ksize):
    """Return all overlapping substrings of length `ksize` from `sequence`.

    The k-mers are listed left to right, one per start position, so a sequence
    of length L yields L - ksize + 1 entries (duplicates are kept). A sequence
    shorter than `ksize` yields an empty list.
    """
    last_start = len(sequence) - ksize
    return [sequence[start:start + ksize] for start in range(last_start + 1)]

In [106]:
#import screed a library for reading in FASTA/FASTQ

def read_kmers_from_file(filename, ksize):
    """Return the k-mers of every record in a sequence file as one flat list.

    The file is read with `screed.open`; each record's `.sequence` is split
    into k-mers with `build_kmers`, and the per-record lists are concatenated
    in file order (duplicates are kept).
    """
    collected = []
    for rec in screed.open(filename):
        collected.extend(build_kmers(rec.sequence, ksize))
    return collected

In [107]:
def hash_kmer(kmer):
    """Return a non-negative integer hash of the canonical form of `kmer`.

    The canonical form is whichever of the k-mer and its reverse complement
    (from `screed.rc`) sorts first, so both strands of a sequence hash to the
    same value.
    """
    revcomp = screed.rc(kmer)
    canonical = kmer if kmer < revcomp else revcomp

    # First element of the 64-bit MurmurHash3 result, seeded with 42.
    raw_value = mmh3.hash64(canonical, 42)[0]

    # Shift negative values up by 2**64 so the returned hash is never negative.
    return raw_value + 2**64 if raw_value < 0 else raw_value

In [108]:
# def hash_kmers(kmers):
#     hashes = []
#     for kmer in kmers:
#         hashes.append(hash_kmer(kmer))
#     return hashes

def hash_kmers_ToSet(kmers):
    """Hash every k-mer with `hash_kmer` and return the set of distinct hash values."""
    return {hash_kmer(kmer) for kmer in kmers}

In [109]:

def jaccard_containment_FromSets(a, b):
    '''
    Return the containment of set a within set b: |a & b| / |a|.

    The measure is asymmetric: it is the fraction of a's elements that are
    also present in b, so swapping the arguments generally changes the result.

    An empty `a` returns 0.0 instead of raising ZeroDivisionError; this matches
    how the rest of this notebook scores records that have no k-mers.
    '''
    if not a:
        return 0.0

    return len(a.intersection(b)) / len(a)

def jaccard_similarity_FromSets(a, b):
    '''
    Return the Jaccard similarity of sets a and b: |a & b| / |a | b|.

    The measure is symmetric. When both sets are empty the union is empty and
    0.0 is returned instead of raising ZeroDivisionError.
    '''
    union = len(a.union(b))
    if union == 0:
        return 0.0

    return len(a.intersection(b)) / union


In [110]:
def getAllHash_ExceptTargets_Set_V2(dictOfHashes, targetsToRemove):
    """Return the union of k-mer hashes over all sequences NOT listed in `targetsToRemove`.

    `dictOfHashes` maps a sequence ID to an info dict whose "Kmer_Hashes_Set"
    entry holds that sequence's hashes. Entries whose ID is excluded are
    skipped without being read.
    """
    excluded_ids = set(targetsToRemove)  # set membership is O(1) per lookup

    pooled_hashes = set()
    for seq_id, seq_info in dictOfHashes.items():
        if seq_id in excluded_ids:
            continue
        pooled_hashes.update(seq_info["Kmer_Hashes_Set"])

    return pooled_hashes

In [111]:
def getAllHash_InTargetSeqs_Set(dictOfHashes, targetsToKeep):
    """Return the union of k-mer hashes over the sequences listed in `targetsToKeep`.

    Args:
        dictOfHashes (dict): Sequence ID -> info dict containing a
            "Kmer_Hashes_Set" entry with that sequence's hashes.
        targetsToKeep (iterable): IDs of the sequences whose hashes are pooled.

    Returns:
        set: All hashes belonging to at least one target sequence.

    Note:
        The membership test previously read `not in targetsToKeep`, which
        pooled the hashes of every sequence EXCEPT the targets. It now keeps
        the targets, as the function and parameter names state.
    """
    target_ids = set(targetsToKeep)  # one-time conversion for O(1) lookups

    all_Hashes_InTarget = set()
    for i_SeqID, i_SeqInfoDict in dictOfHashes.items():
        if i_SeqID in target_ids:
            all_Hashes_InTarget.update(i_SeqInfoDict["Kmer_Hashes_Set"])

    return all_Hashes_InTarget

In [112]:
#%reload_ext autoreload
#%autoreload 2

### import panqc toolkit functions
#from panqc.kmerlib import hash_kmers_ToSet, jaccard_similarity_FromSets, jaccard_containment_FromSets


# Define pangraph processing functions

In [113]:
def parse_pangraph_fasta(fasta_path):
    """
    Load the node sequences of a pangraph from its FASTA export.

    Args:
        fasta_path (str): Path to the FASTA file.

    Returns:
        dict: Record name -> upper-cased sequence string. If a record name
        occurs more than once, the last occurrence wins.
    """
    records = tqdm(SeqIO.parse(fasta_path, "fasta"), desc="Parsing FASTA")
    return {rec.name: str(rec.seq).upper() for rec in records}

In [114]:
def parse_pangraph_gfa(gfa_path, node_seqs, k_size, num_assemblies):
    """
    Build a per-node record from a pangraph GFA file.

    The parsed lines are scanned twice: S-lines first create one record per
    segment, then P-lines fill in how often each segment is used by paths.

    Args:
        gfa_path (str): Path to the GFA file (loaded by gfapy as GFA1).
        node_seqs (dict): Node ID -> sequence; must contain every segment name.
        k_size (int): k-mer length used for each node's k-mers and hashes.
        num_assemblies (int): Kept for interface compatibility; not read here.

    Returns:
        dict: Node ID -> dict with keys "Len", "N_Asms" (number of paths that
        contain the node), "N_Occurrences" (total path steps on the node),
        "Kmers", "Kmer_Hashes_Set" and "RC".
    """
    graph = gfapy.Gfa.from_file(gfa_path, version="gfa1", vlevel=0)
    node_info = {}

    # Pass 1: one record per segment (S-line).
    for gfa_line in tqdm(graph.lines, desc="Processing GFA S-lines"):
        text = str(gfa_line)
        if not text.startswith("S"):
            continue

        fields = text.split("\t")
        segment_id = fields[1]
        segment_seq = node_seqs[segment_id]

        # Integer payload of the 5th tab-separated field (the text after ":i:").
        rc_value = int(fields[4].split(":i:")[-1])

        segment_kmers = build_kmers(segment_seq, k_size)

        node_info[segment_id] = {
            "Len": len(segment_seq),
            "N_Asms": 0,
            "N_Occurrences": 0,
            "Kmers": segment_kmers,
            "Kmer_Hashes_Set": hash_kmers_ToSet(segment_kmers),
            "RC": rc_value,
        }

    # Pass 2: count path membership (P-line).
    for gfa_line in tqdm(graph.lines, desc="Processing GFA P-lines"):
        text = str(gfa_line)
        if not text.startswith("P"):
            continue

        steps = text.split("\t")[2].split(",")
        # Drop the last character of each step (the +/- orientation suffix
        # of a GFA path step) to recover the segment name.
        visited_ids = [step[:-1] for step in steps]

        # Each path counts once per distinct node ...
        for segment_id in set(visited_ids):
            node_info[segment_id]["N_Asms"] += 1
        # ... and once per step for the occurrence total.
        for segment_id in visited_ids:
            node_info[segment_id]["N_Occurrences"] += 1

    return node_info

In [115]:
def classify_pangraph_nodes(node_info, num_input_assemblies):
    """
    Label every node as "Core" or "SV" and return the two groups of node IDs.

    A node is "Core" when its "N_Asms" equals `num_input_assemblies`, and "SV"
    otherwise. The label is also written into each node's record under "Type".

    Args:
        node_info (dict): Node ID -> record containing "N_Asms"; updated in place.
        num_input_assemblies (int): Number of assemblies that went into the graph.

    Returns:
        tuple: (core node IDs, SV node IDs), each in `node_info` iteration order.

    Raises:
        ValueError: If a node's "N_Asms" exceeds `num_input_assemblies`.
    """
    core_nodes, sv_nodes = [], []

    for node_id, info in node_info.items():
        n_asms = info["N_Asms"]
        if n_asms > num_input_assemblies:
            raise ValueError(f"Node {node_id} has too many counts: {info['N_Asms']}")

        is_core = n_asms == num_input_assemblies
        info["Type"] = "Core" if is_core else "SV"
        (core_nodes if is_core else sv_nodes).append(node_id)

    return core_nodes, sv_nodes

In [116]:

def pangraph_pairwise_kmerset_comparison(node_info):
    """
    Compare the k-mer hash sets of every ordered pair of nodes.

    Only pairs that share at least one k-mer hash are reported. Because
    containment is asymmetric, (A, B) and (B, A) are separate rows, and
    "JaccardContain" is the containment of node 1 within node 2.

    Args:
        node_info (dict): Node ID -> record with "Kmer_Hashes_Set" and "Len".

    Returns:
        pd.DataFrame: One row per reported pair with columns RecordID_1,
        RecordID_2, Record1_Len, Record2_Len, JaccardSim, JaccardContain.
        Self-comparisons are removed after the frame is built, so the index
        is not reset.
    """
    hash_key = "Kmer_Hashes_Set"
    node_ids = list(node_info.keys())
    records = []

    for id_a in tqdm(node_ids, desc="Running pairwise comparisons of k-mer content"):
        for id_b in node_ids:
            info_a, info_b = node_info[id_a], node_info[id_b]

            # Both nodes need a hash set to be comparable.
            if not (hash_key in info_a and hash_key in info_b):
                continue

            set_a, set_b = info_a[hash_key], info_b[hash_key]

            # Nothing shared: no row for this pair.
            if not (set_a & set_b):
                continue

            similarity = jaccard_similarity_FromSets(set_a, set_b)
            if similarity == 0:
                continue

            containment = jaccard_containment_FromSets(set_a, set_b)

            records.append(
                (id_a, id_b, info_a["Len"], info_b["Len"], similarity, containment)
            )

    all_pairs_df = pd.DataFrame(
        records,
        columns=["RecordID_1", "RecordID_2", "Record1_Len", "Record2_Len", "JaccardSim", "JaccardContain"],
    )

    # Drop the rows where a node was compared against itself.
    return all_pairs_df.query("RecordID_1 != RecordID_2")


In [117]:
def compare_kmers_to_profiles_with_max_jc(node_info, sv_node_ids, ref_kmer_sets, ava_nodes_df, k_size=31):
    """
    Summarise each graph node: containment in reference k-mer profiles and MaxJC.

    Args:
        node_info (dict): Node ID -> record; "Kmer_Hashes_Set", "Len", "N_Asms"
            and "N_Occurrences" are read (missing entries default to an empty
            set or 0).
        sv_node_ids (iterable): IDs of the nodes classified as SV.
        ref_kmer_sets (dict): Profile name -> set of k-mer hashes. Only the
            profiles named "H37Rv", "IS6110" and "Rv_InsSeqAndPhages" end up
            in the output; a missing profile is reported as 0.
        ava_nodes_df (pd.DataFrame): Pairwise node comparisons with columns
            "RecordID_1" and "JaccardContain".
        k_size (int): k-mer length used to build the node hashes. Only used to
            decide whether a node without k-mers is unexpected (its sequence is
            at least `k_size` long). Defaults to 31, the value that was
            previously hard-coded.

    Returns:
        pd.DataFrame: One row per node with columns NodeID, SeqLength, N_Asms,
        N_Occurrences, Jaccard_Cont_WiRv, Jaccard_Cont_WiIS6110,
        Jaccard_Cont_WiRv_InsSeqAndPhages, IsSVNode, MaxJC_ToOtherNode.
    """
    # Highest containment of each node within any other node; nodes absent
    # from the pairwise table get 0 below.
    max_jc_dict = ava_nodes_df.groupby("RecordID_1")["JaccardContain"].max().to_dict()

    # Set membership keeps the per-node SV check O(1) instead of scanning a list.
    sv_node_id_set = set(sv_node_ids)

    results = []

    for node_id, info in tqdm(node_info.items(), desc="Comparing node to reference k-mer profiles"):
        record_hashes = info.get("Kmer_Hashes_Set", set())
        len_seq = info.get("Len", 0)
        n_asms = info.get("N_Asms", 0)
        n_occurrences = info.get("N_Occurrences", 0)

        # Containment stays 0 for every profile when the node has no k-mers.
        jc_results = {key: 0 for key in ref_kmer_sets}
        if record_hashes:
            for key, kmer_set in ref_kmer_sets.items():
                jc_results[key] = jaccard_containment_FromSets(record_hashes, kmer_set)
        elif len_seq >= k_size:
            # Long enough to contain a k-mer, yet none were recorded.
            print(f"No k-mers were produced for segment: {node_id}")

        results.append((
            node_id, len_seq, n_asms, n_occurrences,
            jc_results.get("H37Rv", 0),
            jc_results.get("IS6110", 0),
            jc_results.get("Rv_InsSeqAndPhages", 0),
            node_id in sv_node_id_set,
            max_jc_dict.get(node_id, 0),
        ))

    columns = [
        "NodeID", "SeqLength", "N_Asms", "N_Occurrences",
        "Jaccard_Cont_WiRv", "Jaccard_Cont_WiIS6110",
        "Jaccard_Cont_WiRv_InsSeqAndPhages", "IsSVNode", "MaxJC_ToOtherNode"
    ]
    return pd.DataFrame(results, columns=columns)


# Part 1: Parse sample metadata & preprocessed genome info/results

In [118]:
# Repository-relative locations of the input tables for the 151-assembly set.
Repo_DataDir = "../../Data"
InputAsmPath_Dir = f"{Repo_DataDir}/231121.InputAsmTSVs.MtbSetV3.151CI"

# TSV of per-sample assembly FASTA paths, and TSV of per-sample assembly summary info.
MtbSetV3_151CI_InputAsmPATHs_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"
MtbSetV3_151CI_AsmSumm_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"   


## Parse sample Metadata (N = 151)

In [119]:
# Load the assembly summary table (tab-separated).
WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep = "\t")

# Sample IDs in table order; both names refer to the same list object.
SampleIDs_151CI_SOI = list( WGA151CI_AsmSummary_DF["SampleID"].values )
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

# SampleID -> column value lookups, built from two-column slices of the table.
ID_To_PrimLineage_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values)
ID_To_SubLineage_Dict = dict( WGA151CI_AsmSummary_DF[["SampleID", "Lineage"]].values)
ID_To_Dataset_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values) 


## Parse FASTA paths for 151 Mtb assemblies

In [120]:
# Load the table of assembly FASTA paths and display its (rows, columns) shape.
WGA151CI_Asm_Path_DF = pd.read_csv(MtbSetV3_151CI_InputAsmPATHs_TSV, sep = "\t")
WGA151CI_Asm_Path_DF.shape

(151, 4)

In [121]:
WGA151CI_Asm_Path_DF.head(4)

Unnamed: 0,SampleID,Dataset_Tag,Genome_ASM_PATH,ShortRead_Genome_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0072.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0072.SR.Asm.fasta
1,N0153,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0153.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/N0153.SR.Asm.fasta
2,TB3113,TB_Portals_24CI_R1,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB3113.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB3113.SR.Asm.fasta
3,TB1236,TB_Portals_24CI_R1,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB1236.LR.Asm.fasta,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/TB_Portals_24CI_R1/TB1236.SR.Asm.fasta


In [122]:
# SampleID -> value of the Genome_ASM_PATH column (the *.LR.Asm.fasta paths shown above).
SampleID_To_LRAsmFA_PATH_Dict = dict(WGA151CI_Asm_Path_DF[['SampleID', 'Genome_ASM_PATH']].values)

## Import/parse processed H37rv genome annotations

In [123]:
# Repository-relative location of the processed H37Rv annotation files.
RepoRef_Dir = "../../References"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"
H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"

## H37Rv Gene Annotations TSV
H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep = "\t")
# Column subset used for the lookups below (coordinate columns are left out).
H37Rv_GeneInfo_Subset_DF = H37Rv_GenomeAnno_Genes_DF[["H37rv_GeneID", "Symbol", "Feature", "Functional_Category", "Is_Pseudogene", "Product", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]

# Lookups: gene ID -> symbol, and symbol -> functional category.
RvID_To_Symbol_Dict = dict(H37Rv_GeneInfo_Subset_DF[['H37rv_GeneID', 'Symbol']].values)
Symbol_To_FuncCat_Dict = dict(H37Rv_GeneInfo_Subset_DF[['Symbol', 'Functional_Category']].values)


In [124]:
H37Rv_GenomeAnno_Genes_DF.head(3)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,NotExcluded
1,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,information pathways,No,DNA polymerase III (beta chain) DnaN (DNA nucleotidyltransferase),,NotExcluded
2,NC_000962.3,3279,4437,+,Rv0003,recF,CDS,information pathways,No,DNA replication and repair protein RecF (single-strand DNA binding protein),,NotExcluded


# Part 2: Generate reference k-mer sets (i.e., H37Rv, IS6110, phages + insertion sequences)

### Define k-mer size for entire analysis

In [125]:
# k-mer length (in bases) passed to build_kmers for the reference and gene k-mer sets below.
k_size = 31

## Generate k-mer info for H37Rv and a representative IS6110 sequence 

In [126]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this directory exists.
Mtb_RefDir="/n/data1/hms/dbmi/farhat/mm774/References"

# FASTA inputs for the H37Rv and IS6110 reference k-mer sets.
H37rv_Ref_FA_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.fasta"
IS6110_Example_FA_PATH = f"{Mtb_RefDir}/IS6110_From_Rv0795_Rv0796.DNA.fasta"

#### H37Rv - k-mer generation & hashing

In [127]:
# All k-mers of the H37Rv reference FASTA (a list, duplicates kept).
H37Rv_kmers = read_kmers_from_file(H37rv_Ref_FA_PATH, k_size)

# Distinct canonical-k-mer hashes of the reference.
H37Rv_Hashes_Set = hash_kmers_ToSet(H37Rv_kmers)

# Number of k-mers in the list (not the number of distinct hashes).
print(len(H37Rv_kmers))

4411502


#### IS6110 (Rv0795 & Rv0796) - k-mer generation & hashing

In [128]:
# All k-mers of the example IS6110 FASTA (a list, duplicates kept).
IS6110_Ex1_kmers = read_kmers_from_file(IS6110_Example_FA_PATH, k_size)

# Distinct canonical-k-mer hashes of the IS6110 example.
IS6110_Ex1_Hashes_Set = hash_kmers_ToSet(IS6110_Ex1_kmers)

# Number of k-mers in the list (not the number of distinct hashes).
print(len(IS6110_Ex1_kmers))

1254


## Generate k-mer info for all H37Rv gene DNA sequences (Mycobrowser)

In [129]:
# Same directory string as Mtb_RefDir above, kept under a second name.
# NOTE(review): absolute, machine-specific path.
O2_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"
MycoBrowser_RefFiles_Dir = f"{O2_RefDir}/190619_Mycobrowser_H37rv_ReferenceFiles"

# FASTA of H37Rv gene sequences used for the per-gene k-mer sets.
H37Rv_Genes_MycoBro_FA = f"{MycoBrowser_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_genes_v3.fasta"


In [130]:
!grep ^">" $H37Rv_Genes_MycoBro_FA | grep "dnaA"

>Rv0001|dnaA|CDS|1-1524|+|Chromosomal replication initiator protein DnaA


### Get 31-mer hashes for all annotated gene DNA sequences

In [131]:
# Per-gene lookups keyed by the second "|"-delimited field of the FASTA record
# name (e.g. "dnaA" for a header starting ">Rv0001|dnaA|CDS|...").
# Records that share that field overwrite earlier entries.
dictOf_H37Rv_MycoBrow_GeneSeq = {}
dictOf_H37Rv_MycoBrow_Gene_KmerHashes = {}

# The loop index and the Rv ID were never used, so the enumerate() wrapper
# and the RvID assignment have been dropped.
for record in tqdm(SeqIO.parse(H37Rv_Genes_MycoBro_FA, "fasta")):

    RecordName = record.name
    GeneID = RecordName.split("|")[1]
    S_Seq = str(record.seq).upper()

    dictOf_H37Rv_MycoBrow_GeneSeq[GeneID] = S_Seq

    # Distinct canonical-k-mer hashes of this gene's sequence.
    record_Hashes_Set = hash_kmers_ToSet(build_kmers(S_Seq, k_size))

    dictOf_H37Rv_MycoBrow_Gene_KmerHashes[GeneID] = record_Hashes_Set
    

4187it [00:33, 126.86it/s]


In [132]:
len(dictOf_H37Rv_MycoBrow_GeneSeq["dnaA"])

1524

In [133]:
list(dictOf_H37Rv_MycoBrow_Gene_KmerHashes["dnaA"])[:2]

[13580233940393664509, 5138456728421695490]

## Generate k-mer info for "Phage Sequences & Insertion Sequences" in H37Rv

## a) Create DF for only MGEs (Insertion seqs & phages)

In [134]:
# Keep only the annotation rows whose functional category is 'insertion seqs and phages'.
H37Rv_Anno_InsSeqAndPhages_DF = H37Rv_GenomeAnno_Genes_DF.query("Functional_Category == 'insertion seqs and phages'")     

# Values of the "Symbol" column for those rows (a NumPy array), and how many there are.
InsSeqAndPhages_GeneIDs = H37Rv_Anno_InsSeqAndPhages_DF["Symbol"].values
len(InsSeqAndPhages_GeneIDs)
     

147

In [135]:
#### Peek at the first 5 geneIDs

In [136]:
InsSeqAndPhages_GeneIDs[:5]

array(['Rv0031', 'Rv0094c', 'Rv0095c', 'Rv0336', 'Rv0393'], dtype=object)

In [137]:
H37Rv_Anno_InsSeqAndPhages_DF.head(1)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
33,NC_000962.3,33581,33794,+,Rv0031,Rv0031,CDS,insertion seqs and phages,No,Possible remnant of a transposase,,InsertionSeqs_And_Phages


## b) Generate union of all 31 bp k-mer hashes for "Insertion seqs & phages" genes (N=147)

In [138]:
def getAllHashes_InTargetSeqs(dictOfHashes, targetsToKeep):
    """Return the union of the hash sets of every sequence listed in `targetsToKeep`.

    Args:
        dictOfHashes (dict): Sequence ID -> set of k-mer hashes.
        targetsToKeep (iterable): IDs to keep (e.g. a list or NumPy array of
            strings). IDs that are not keys of `dictOfHashes` are ignored.

    Returns:
        set: All hashes belonging to at least one target sequence.
    """
    # Convert once: testing membership against a list or array rescans it for
    # every dictionary entry, whereas a set lookup is O(1).
    target_ids = set(targetsToKeep)

    all_Hashes_InTarget = set()
    for i_SeqID, i_Hashes in dictOfHashes.items():
        if i_SeqID in target_ids:
            all_Hashes_InTarget.update(i_Hashes)

    return all_Hashes_InTarget

In [139]:
# Union of the per-gene hash sets for the 'insertion seqs and phages' gene symbols.
Rv_MGEs_Hashes_Set = getAllHashes_InTargetSeqs(dictOf_H37Rv_MycoBrow_Gene_KmerHashes,
                                                    InsSeqAndPhages_GeneIDs)   

# Number of distinct hashes in that union.
print(len(Rv_MGEs_Hashes_Set))

69102


#### Look at Jaccard Containment between 31-mers of H37Rv and H37Rv's MGEs (ISs + Phages)

In [140]:
# Argument order matters: this is the share of H37Rv's hashes that also occur in the MGE hash set.
jaccard_containment_FromSets(H37Rv_Hashes_Set, Rv_MGEs_Hashes_Set)

0.01589562466616704

# Part 3: Generate k-mer info per node from Pangraph GFA 

## Define output dir of the Mtb-WGA-SMK processing pipeline

In [141]:
# NOTE(review): absolute, machine-specific path to the pipeline outputs.
WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

# Output directory of the 151-assembly run; the second name is an alias for the same string.
WGA151CI_SMK_OutputDir = WGA_SMK_Outputs_Dir + "/231121_MtbSetV3_151CI"
MtbWGA_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir


In [142]:
#!ls -1 $MtbWGA_SMK_Pipeline_OutputDir

In [143]:
target_OutputDir = MtbWGA_SMK_Pipeline_OutputDir

# Minigraph_151CI_OutDir = f"{target_OutputDir}/Minigraph"

# Parent directory holding one sub-directory per Pangraph run.
Pangraph_MainOutDir = f"{target_OutputDir}/Pangraph_Mtb151CIWiRv_Analysis"

# "Try1": the OLD_Pangraph_V1 run (node FASTA + CigarMod GFA).
Pangraph_V1_Try1_Out = f"{Pangraph_MainOutDir}/OLD_Pangraph_V1"
Pangraph_Try1_FA = f"{Pangraph_V1_Try1_Out}/pangraph.fa"
Pangraph_Try1_GFA = f"{Pangraph_V1_Try1_Out}/pangraph.CigarMod.gfa"

# "Try2": the Pangraph_151CI_V3_WiDup run.
Pangraph_Try2_Out = f"{Pangraph_MainOutDir}/Pangraph_151CI_V3_WiDup"
Pangraph_Try2_FA = f"{Pangraph_Try2_Out}/pangraph.fa"
Pangraph_Try2_GFA = f"{Pangraph_Try2_Out}/pangraph.CigarMod.gfa"

# "Try3": the Pangraph_151CI_V4_WiDup run.
Pangraph_Try3_Out = f"{Pangraph_MainOutDir}/Pangraph_151CI_V4_WiDup"
Pangraph_Try3_FA = f"{Pangraph_Try3_Out}/pangraph.fa"
Pangraph_Try3_GFA = f"{Pangraph_Try3_Out}/pangraph.CigarMod.gfa"



In [144]:
!ls -1 $Pangraph_151CI_OutDir

OLD_Pangraph_V1
Pangraph_151CI_V3_NoDup
Pangraph_151CI_V3_WiDup
Pangraph_151CI_V4_WiDup


In [145]:
!ls -1 $Pangraph_151CI_OutDir/OLD_Pangraph_V1

pangraph.CigarMod.gfa
pangraph.fa
pangraph.gfa


In [146]:
!ls -1 $Pangraph_151CI_OutDir/Pangraph_151CI_V3_NoDup

pangraph.CigarMod.gfa
pangraph.fa
pangraph.gfa


In [147]:
!ls -1 $Pangraph_151CI_OutDir/Pangraph_151CI_V3_WiDup

pangraph.CigarMod.gfa
pangraph.fa


In [148]:
!ls -1 $Pangraph_151CI_OutDir/Pangraph_151CI_V4_WiDup

pangraph.CigarMod.gfa
pangraph.fa
pangraph.gfa


# Pangraph data processing

In [149]:
# Define predefined k-mer sets
# Reference k-mer hash sets, keyed by the profile names that
# compare_kmers_to_profiles_with_max_jc looks up ("H37Rv", "IS6110",
# "Rv_InsSeqAndPhages"); renaming a key here would zero that output column.
Rv_ref_kmer_sets = {
    "H37Rv": H37Rv_Hashes_Set,
    "IS6110": IS6110_Ex1_Hashes_Set,
    "Rv_InsSeqAndPhages": Rv_MGEs_Hashes_Set
}


## Process Try1 output of Pangraph

In [151]:
# Re-states the k-mer size set earlier in the notebook so this cell is self-contained.
k_size = 31
# NOTE(review): presumably the 151 assemblies plus H37Rv (the run directory is
# named "...151CIWiRv...") — confirm. Nodes present in all 152 paths are "Core".
num_assemblies = 152

# Step 1: Parse FASTA
T1_node_seqs = parse_pangraph_fasta(Pangraph_Try1_FA)

# Step 2: Parse GFA
T1_node_info = parse_pangraph_gfa(Pangraph_Try1_GFA, T1_node_seqs, k_size, num_assemblies)

# Step 3: Classify nodes
T1_core_nodes, T1_sv_nodes = classify_pangraph_nodes(T1_node_info, num_assemblies)

print("# of Core Nodes:", len(T1_core_nodes))
print("# of SV Nodes:", len(T1_sv_nodes))
print("# of All Nodes:", len(T1_node_info))

# Step 4: All-vs-all comparison of node k-mer hash sets
T1_AvA_DF = pangraph_pairwise_kmerset_comparison(T1_node_info)

# Step 5: Per-node summary (containment in reference profiles + max containment in another node)
T1_NodeSummary_DF = compare_kmers_to_profiles_with_max_jc(T1_node_info,
                                                          T1_sv_nodes,
                                                          Rv_ref_kmer_sets,
                                                          T1_AvA_DF)


Parsing FASTA: 1715it [00:00, 22939.21it/s]
Processing GFA S-lines: 100%|██████████| 2423/2423 [00:42<00:00, 57.60it/s]
Processing GFA P-lines: 100%|██████████| 2423/2423 [00:12<00:00, 193.09it/s]  
Running pairwise comparisons of k-mer content:   0%|          | 1/1119 [00:00<02:39,  7.01it/s]

# of Core Nodes: 470
# of SV Nodes: 649
# of All Nodes: 1119


Running pairwise comparisons of k-mer content: 100%|██████████| 1119/1119 [00:44<00:00, 25.34it/s]
Comparing node to reference k-mer profiles: 100%|██████████| 1119/1119 [00:01<00:00, 589.05it/s]


## Process Try2 output of Pangraph

In [152]:
# Re-states the k-mer size set earlier in the notebook so this cell is self-contained.
k_size = 31
# NOTE(review): presumably the 151 assemblies plus H37Rv (the run directory is
# named "...151CIWiRv...") — confirm. Nodes present in all 152 paths are "Core".
num_assemblies = 152

# Step 1: Parse FASTA
T2_node_seqs = parse_pangraph_fasta(Pangraph_Try2_FA)

# Step 2: Parse GFA
T2_node_info = parse_pangraph_gfa(Pangraph_Try2_GFA, T2_node_seqs, k_size, num_assemblies)

# Step 3: Classify nodes
T2_core_nodes, T2_sv_nodes = classify_pangraph_nodes(T2_node_info, num_assemblies)

print("# of Core Nodes:", len(T2_core_nodes))
print("# of SV Nodes:", len(T2_sv_nodes))
print("# of All Nodes:", len(T2_node_info))

# Step 4: All-vs-all comparison of node k-mer hash sets
T2_AvA_DF = pangraph_pairwise_kmerset_comparison(T2_node_info)

# Step 5: Per-node summary (containment in reference profiles + max containment in another node)
T2_NodeSummary_DF = compare_kmers_to_profiles_with_max_jc(T2_node_info,
                                                          T2_sv_nodes,
                                                          Rv_ref_kmer_sets,
                                                          T2_AvA_DF)

Parsing FASTA: 1341it [00:00, 24321.48it/s]
Processing GFA S-lines: 100%|██████████| 3371/3371 [00:44<00:00, 75.64it/s] 
Processing GFA P-lines: 100%|██████████| 3371/3371 [00:15<00:00, 223.20it/s]  
Running pairwise comparisons of k-mer content:   0%|          | 4/1341 [00:00<00:34, 38.79it/s]

# of Core Nodes: 496
# of SV Nodes: 845
# of All Nodes: 1341


Running pairwise comparisons of k-mer content: 100%|██████████| 1341/1341 [00:47<00:00, 28.01it/s]
Comparing node to reference k-mer profiles: 100%|██████████| 1341/1341 [00:02<00:00, 642.55it/s]


## Process Try3 output of Pangraph

In [153]:
# Re-states the k-mer size set earlier in the notebook so this cell is self-contained.
k_size = 31
# NOTE(review): presumably the 151 assemblies plus H37Rv (the run directory is
# named "...151CIWiRv...") — confirm. Nodes present in all 152 paths are "Core".
num_assemblies = 152

# Step 1: Parse FASTA
T3_node_seqs = parse_pangraph_fasta(Pangraph_Try3_FA)

# Step 2: Parse GFA
T3_node_info = parse_pangraph_gfa(Pangraph_Try3_GFA, T3_node_seqs, k_size, num_assemblies)

# Step 3: Classify nodes
T3_core_nodes, T3_sv_nodes = classify_pangraph_nodes(T3_node_info, num_assemblies)

print("# of Core Nodes:", len(T3_core_nodes))
print("# of SV Nodes:", len(T3_sv_nodes))
print("# of All Nodes:", len(T3_node_info))

# Step 4: All-vs-all comparison of node k-mer hash sets
T3_AvA_DF = pangraph_pairwise_kmerset_comparison(T3_node_info)

# Step 5: Per-node summary (containment in reference profiles + max containment in another node)
T3_NodeSummary_DF = compare_kmers_to_profiles_with_max_jc(T3_node_info,
                                                          T3_sv_nodes,
                                                          Rv_ref_kmer_sets,
                                                          T3_AvA_DF)

Parsing FASTA: 1481it [00:00, 23347.76it/s]
Processing GFA S-lines: 100%|██████████| 3568/3568 [00:45<00:00, 78.35it/s]
Processing GFA P-lines: 100%|██████████| 3568/3568 [00:15<00:00, 224.13it/s]  
Running pairwise comparisons of k-mer content:   0%|          | 2/1481 [00:00<02:16, 10.85it/s]

# of Core Nodes: 524
# of SV Nodes: 957
# of All Nodes: 1481


Running pairwise comparisons of k-mer content: 100%|██████████| 1481/1481 [00:51<00:00, 28.82it/s]
Comparing node to reference k-mer profiles: 100%|██████████| 1481/1481 [00:01<00:00, 881.52it/s] 


# Output node summaries for exploration

In [154]:
# Write the Try1 node summary as CSV (relative path, row index omitted).
T1_NodeSummary_DF.to_csv("T1_NodeSummary_DF.csv", index=False)

In [155]:
# Write the Try2 node summary as CSV (relative path, row index omitted).
T2_NodeSummary_DF.to_csv("T2_NodeSummary_DF.csv", index=False)

In [156]:
# Write the Try3 node summary as CSV (relative path, row index omitted).
T3_NodeSummary_DF.to_csv("T3_NodeSummary_DF.csv", index=False)