### Import Statements

In [2]:
import numpy as np
import pandas as pd
import vcf
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import seaborn as sns
import pickle
import scipy.stats

%matplotlib inline

In [3]:
from Bio import SeqIO


### Set matplotlib text export settings for Adobe Illustrator

In [4]:
import matplotlib

matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

#### Pandas Viewing Settings

In [5]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Set matplotlib text export settings for Adobe Illustrator

In [6]:
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# Parse sample metadata & preprocessed genome info/results

## Parse sample Metadata (N = 151)

In [7]:
Repo_DataDir = "../../Data"
InputAsmPath_Dir = f"{Repo_DataDir}/231121.InputAsmTSVs.MtbSetV3.151CI"

MtbSetV3_151CI_InputAsmPATHs_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"

MtbSetV3_151CI_AsmSumm_TSV = f"{InputAsmPath_Dir}/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"

WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep = "\t")

SampleIDs_151CI_SOI = list( WGA151CI_AsmSummary_DF["SampleID"].values )
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

ID_To_PrimLineage_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values)
ID_To_SubLineage_Dict = dict( WGA151CI_AsmSummary_DF[["SampleID", "Lineage"]].values)
ID_To_Dataset_Dict = dict(WGA151CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values)
WGA151CI_AsmSummary_DF.shape

(151, 7)

In [8]:
High_SRBusco_SampleIDs_143CI = ['N0072', 'N0153', 'DNA028', 'N0155', 'N0145', 'DNA075',
       'M0003941_3', 'DNA019_Vash', 'DNA091', 'DNA096', 'AZE_02_067',
       'AZE_02_041', 'ZRB10578980', 'DNA089', 'N0004', 'N1274', 'N0054',
       'DNA124', 'DNA044', 'DNA020', 'AZE_02_042', 'DNA182', 'DNA120',
       'DNA188', 'DNA086', 'M0011368_9', 'MT_0080', 'M0016737_0',
       'DNA054', 'M0014888_3', '02_R1179', '01_R1134', 'M0017522_5',
       'M0016395_7', '02_R1708', '02_R0894', '01_R1430', 'M0010874_7',
       'N1272', 'N0091', 'N1202', '9050-05', '4549-04', '696-05',
       '702-06', '706-05', '8129-04', '3003-06', '8651-04', 'QC-3',
       'QC-9', 'QC-5', 'QC-8', 'QC-10', 'QC-4', 'QC-6', 'QC-1', '8644-04',
       'mada_1-10', 'mada_2-46', 'mada_1-1', 'mada_1-36', 'mada_1-39',
       'mada_1-40', 'mada_1-51', 'mada_1-44', 'mada_117', 'mada_118',
       'mada_122', 'R27252', 'R23887', 'R30078', 'R29816', 'R30234',
       '18_0621851', 'R36431', 'R28703', 'mada_115', 'mada_2-42',
       'R31095', 'R28012', 'R37765', 'R27657', 'R24120', 'R28581',
       'R29598', 'mada_1-11', 'R24100', 'R20574', 'R20260', 'R18043',
       'R22601', 'R23146', 'R32929', 'R21893', 'R30420', 'R26778',
       'R26791', 'R28980', 'R27725', 'R18040', 'R27937', 'mada_1-30',
       'mada_2-31', 'mada_1-41', 'R21839', 'mada_1-32', 'R30396',
       'R21363', 'mada_102', 'mada_129', 'mada_139', 'mada_151',
       'mada_105', 'R15311', 'mada_103', 'mada_2-25', 'mada_112',
       'mada_124', 'mada_126', 'mada_120', 'R23571', 'mada_128',
       'mada_1-12', 'mada_1-15', 'mada_1-2', 'mada_2-1', 'mada_1-53',
       'mada_1-50', 'mada_123', 'mada_2-53', 'MFS-3', 'MFS-59', 'MFS-173',
       'MFS-54', 'MFS-58', 'MFS-51', 'MFS-42', 'MFS-52', 'MFS-56',
       'MFS-60', 'MFS-61', 'MFS-181']

In [9]:

def read_PanarooSummaryStats(input_Panaroo_SummaryStats_TXT):
    
    i_Panaroo_Stats_DF = pd.read_csv(input_Panaroo_SummaryStats_TXT, sep = "\t", header=None)
    i_Panaroo_Stats_DF.columns = ["Category", "Range", "GeneCount"]

    Acc_GenesCount = i_Panaroo_Stats_DF["GeneCount"].values[-1] - i_Panaroo_Stats_DF["GeneCount"].values[0]

    i_Panaroo_AccGenesRow = ["Accessory genes","(0% <= strains < 99%)", Acc_GenesCount]

    i_Panaroo_Stats_DF.loc[len(i_Panaroo_Stats_DF.index)] = i_Panaroo_AccGenesRow

    #listOfCat_ToKeep = ["Core genes", "Accessory Genes", "Total genes"]

    i_Panaroo_Stats_DF = i_Panaroo_Stats_DF[ i_Panaroo_Stats_DF["Category"].isin(listOfCat_ToKeep)]   

    # https://www.statology.org/pandas-add-row-to-dataframe/#:~:text=You%20can%20use%20the%20df,loc%5Blen(df. 

    return i_Panaroo_Stats_DF

In [10]:
def read_Ppanggolin_ContextStats(i_Ppanggolin_Content_TXT, verbose = False):
    if verbose:
        print(f"reading Ppangolin stats from {i_Ppanggolin_Content_TXT}")
        
    with open(i_Ppanggolin_Content_TXT, "r") as f:

        #NumCoreGenes = -1
        #NumShellGenes = -1
        #NumCloudGenes = -1
        for line in f:
            #print(line)
            line_split = line.split(": ")

            Key = line_split[0].split(" ")[0]
            Value = line_split[-1].strip()

            if Key == "Persistent": NumCoreGenes = int(Value)
            elif Key == "Shell": NumShellGenes = int(Value)
            elif Key == "Cloud": NumCloudGenes = int(Value)
        print(NumShellGenes, NumCloudGenes)
        NumAccessoryGenes = NumShellGenes + NumCloudGenes
        NumTotalGenes = NumCoreGenes + NumAccessoryGenes

        #print(NumCoreGenes, NumAccessoryGenes, NumTotalGenes)

    listOf_PG_Rows = []
    listOf_PG_Rows.append( ("Total genes", "(0% <= strains <= 100%)", NumTotalGenes) )
    listOf_PG_Rows.append( ("Accessory genes", "(0% <= strains < 99%)", NumAccessoryGenes) )
    listOf_PG_Rows.append( ("Core genes", "(99% <= strains <= 100%)", NumCoreGenes) )

    PPanggolin_Summary_DF = pd.DataFrame(listOf_PG_Rows)
    PPanggolin_Summary_DF.columns = ["Category", "Range", "GeneCount"]
    
    return PPanggolin_Summary_DF

# Define functions for parsing and calculating pan-genome accumulation curves

In [11]:
pd.options.mode.chained_assignment = None  # default='warn'


def parse_Panaroo_GenePresAbs_CSV(input_Gene_PresAbs_CSV_PATH):
    i_Gene_PresAbs_DF = pd.read_csv(input_Gene_PresAbs_CSV_PATH, low_memory=False)

    ### Relabel Columns for presence/absence tracking
    i_Gene_PresAbs_DF.columns = [ x.split(".Bakta")[0] for x in i_Gene_PresAbs_DF.columns ]
    i_Gene_PresAbs_DF.columns = [ x.split(".PGAP.WiDNA")[0] for x in i_Gene_PresAbs_DF.columns ]

    ListOf_SampleID_Cols = list(i_Gene_PresAbs_DF.columns[3:].values)
    len(ListOf_SampleID_Cols)

    # https://stackoverflow.com/questions/12741092/pandas-dataframe-apply-function-to-all-columns
    i_Gene_PresAbs_DF[ListOf_SampleID_Cols] = i_Gene_PresAbs_DF[ListOf_SampleID_Cols].applymap(lambda x: 1 if isinstance(x, str) else 0)                     
    i_Gene_PresAbs_DF["NumAsm_WiGene"] = i_Gene_PresAbs_DF[ListOf_SampleID_Cols].sum(axis = 1)

    i_Gene_PresAbs_DF["FracPres"] = i_Gene_PresAbs_DF["NumAsm_WiGene"] / 158

    return i_Gene_PresAbs_DF

def parse_Panaroo_GenePresAbs_Rtab(input_Gene_PresAbs_Rtab_PATH):
    i_Gene_PresAbs_DF = pd.read_csv(input_Gene_PresAbs_Rtab_PATH, sep = "\t", low_memory=False)

    ### Relabel Columns for presence/absence tracking
    i_Gene_PresAbs_DF.columns = [ x.split(".Bakta")[0] for x in i_Gene_PresAbs_DF.columns ]
    i_Gene_PresAbs_DF.columns = [ x.split(".PGAP.WiDNA")[0] for x in i_Gene_PresAbs_DF.columns ]
    
    ListOf_SampleID_Cols = list(i_Gene_PresAbs_DF.columns[1:].values)
    len(ListOf_SampleID_Cols)

    # https://stackoverflow.com/questions/12741092/pandas-dataframe-apply-function-to-all-columns
    # i_Gene_PresAbs_DF[ListOf_SampleID_Cols] = i_Gene_PresAbs_DF[ListOf_SampleID_Cols].applymap(lambda x: 1 if isinstance(x, str) else 0)                     
    i_Gene_PresAbs_DF["NumAsm_WiGene"] = i_Gene_PresAbs_DF[ListOf_SampleID_Cols].sum(axis = 1)

    i_Gene_PresAbs_DF["FracPres"] = i_Gene_PresAbs_DF["NumAsm_WiGene"] / 158

    return i_Gene_PresAbs_DF



def subsample_Pangenome_PresAbs_DF(i_Gene_PresAbs_DF, sampleIDs_Subset):
    
    i_Gene_PresAbs_DF.index = i_Gene_PresAbs_DF["Gene"]

    PresAbs_Subset_DF = i_Gene_PresAbs_DF[sampleIDs_Subset]

    PresAbs_Subset_DF["NumAsm_WiGene"] = PresAbs_Subset_DF.sum(axis = 1)

    PresAbs_Subset_DF["FracPres"] = PresAbs_Subset_DF["NumAsm_WiGene"] / len(sampleIDs_Subset)

    return PresAbs_Subset_DF


In [12]:
list(np.arange(10, 20 + 1, 5)) #.append(20)

[10, 15, 20]

In [13]:
[10, 15, 20] + [10]

[10, 15, 20, 10]

In [14]:
def generate_Pangenome_AccumCurve(i_Gene_PresAbs_DF, MinNumGenomes, MaxNumGenomes, NumIterationsPerN, input_SampleIDS, ):
    
    PG_AccumCurve_listOf_Rows = []

    #MaxNumberOfGenomes = 140
    #NumIterationsPerN = 60
    
    
    RangeOfSampleSizes = list(np.arange(MinNumGenomes, MaxNumGenomes + 1, 5))
    
    # If the maximum # of samplesizes is not found, then add it to the list of samplesizes
    if MaxNumGenomes not in RangeOfSampleSizes:
        RangeOfSampleSizes += [MaxNumGenomes]


    #for N in tqdm( RangeOfSampleSizes ):
    for N in RangeOfSampleSizes:
        for i in np.arange(1, NumIterationsPerN + 1):
            subset_SampleIDs = np.random.choice(input_SampleIDS, size = N) 

            Y = subsample_Pangenome_PresAbs_DF(i_Gene_PresAbs_DF, subset_SampleIDs)

            i_PanGenomeSize = Y.query("FracPres > 0").shape[0]
            i_CoreGenomeSize = Y.query("FracPres >= 0.99").shape[0]
            i_AccGenomeSize = Y.query("FracPres < 0.99 & FracPres > 0").shape[0]


            i_row = (N, i, i_PanGenomeSize, i_CoreGenomeSize, i_AccGenomeSize)

            PG_AccumCurve_listOf_Rows.append(i_row)


    PG_AccumCurves_DF = pd.DataFrame(PG_AccumCurve_listOf_Rows)
    PG_AccumCurves_DF.columns = ["NumGenomes",  "IterationNum", "PanGenomeSize", "CoreGenomeSize", "AccGenomeSize"]


    PG_AccumCurve_Mean_DF = PG_AccumCurves_DF.groupby(["NumGenomes"]).mean().reset_index() # "DataSet", "Method", "SeqTech", 
    PG_AccumCurve_Mean_DF = PG_AccumCurve_Mean_DF.drop("IterationNum", axis = 1)

    PG_AccumCurve_SEM_DF = PG_AccumCurves_DF.groupby(["NumGenomes"]).sem().reset_index() # "DataSet", "Method", "SeqTech", 
    PG_AccumCurve_SEM_DF = PG_AccumCurve_SEM_DF.drop("IterationNum", axis = 1)

    #ColumnsToMergeOn = ["DataSet", "Method", "SeqTech", "NumGenomes"]
    ColumnsToMergeOn = ["NumGenomes"]

    PG_AccumCurve_Summ_DF = PG_AccumCurve_Mean_DF.merge(PG_AccumCurve_SEM_DF,
                                                         left_on = ColumnsToMergeOn,
                                                         right_on = ColumnsToMergeOn,
                                                         suffixes=('_MEAN', '_SEM') ).reset_index(drop=True)

    return PG_AccumCurve_Summ_DF, PG_AccumCurves_DF


# Define output dir of the Mtb-WGA-SMK processing pipeline

In [15]:
# Define pipeline output directories

WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

#MtbWGA_SMK_Pipeline_OutputDir = WGA_SMK_Outputs_Dir + "/220427_WGA158CI_V1"
WGA151CI_SMK_OutputDir = WGA_SMK_Outputs_Dir + "/231121_MtbSetV3_151CI"
Mtb_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir

## Define output dirs of pipeline

In [16]:

PipeName_WiParam_List = ['Panaroo_Strict_MergeParalogs', 'Panaroo_Moderate_MergeParalogs',
                         'Panaroo_Sensitive_MergeParalogs', 'Panaroo_Strict',
                         'Panaroo_Moderate', 'Panaroo_Sensitive',  'Roary_NoSplitParalogs_I80', 'Roary_NoSplitParalogs_I90',
                         'Roary_NoSplitParalogs',  'Roary_Default', 'Ppanggolin_Default',] #'Ppanggolin_I95', 'Ppanggolin_I90' ]


target_OutputDir = Mtb_SMK_Pipeline_OutputDir

i_Pangenome_Dir = f"{target_OutputDir}/PanGenome_Analysis"

listOfCat_ToKeep = ["Core genes", "Accessory genes", "Total genes"]


dictOf_PG_OutInfo = {}
dictOf_PG_PresAbs_DF = {}
dictOf_PG_OutDir_PATHs = {}

for i_SampleSet in ["All"]:
    
    dictOf_PG_OutInfo[i_SampleSet] = {}
    dictOf_PG_OutDir_PATHs[i_SampleSet] = {}
    dictOf_PG_PresAbs_DF[i_SampleSet] = {}
    
    for AnnoPipeline in ["Bakta", "PGAP"]:
        
        dictOf_PG_OutInfo[i_SampleSet][AnnoPipeline] = {}
        dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline] = {}
        dictOf_PG_PresAbs_DF[i_SampleSet][AnnoPipeline] = {}
        
        for AsmTech in ["LongRead", "ShortRead"]:       
            
            dictOf_PG_OutInfo[i_SampleSet][AnnoPipeline][AsmTech] = {}
            dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline][AsmTech] = {}
            dictOf_PG_PresAbs_DF[i_SampleSet][AnnoPipeline][AsmTech] = {}
            
            
            for i_PipeNameWiParam in tqdm(PipeName_WiParam_List):
                
                dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam] = {}
                                
                if i_SampleSet == "All": DataSetSuffix = "_AllIsolates"  

                if AsmTech == "LongRead": SeqTechPrefix = ""
                elif AsmTech == "ShortRead": SeqTechPrefix = "SR_"

                if AnnoPipeline == "Bakta": AnnoSuffix = ""
                elif AnnoPipeline == "PGAP": AnnoSuffix = "_WiPGAPAnno_V1"


                i_Pipeline_OutDir = f"{i_Pangenome_Dir}/{SeqTechPrefix}{i_PipeNameWiParam}{DataSetSuffix}{AnnoSuffix}"

                if "Panaroo" in i_PipeNameWiParam:
                    i_PG_Summary_Txt = f"{i_Pipeline_OutDir}/summary_statistics.txt"

                    i_PG_Stats_DF = read_PanarooSummaryStats(i_PG_Summary_Txt)
                    i_PG_Stats_DF["Annotation"] = AnnoPipeline
                    i_PG_Stats_DF["Method"] = i_PipeNameWiParam
                    i_PG_Stats_DF["SeqType"] = AsmTech
                    i_PG_Stats_DF["DataSet"] = i_SampleSet
                    
                    i_PG_Stats_DF = i_PG_Stats_DF[ i_PG_Stats_DF["Category"].isin(listOfCat_ToKeep)]   
                    
                    i_Gene_PresAbs_CSV_PATH = f"{i_Pipeline_OutDir}/gene_presence_absence.csv"    
                    i_Pangenome_Ref_FA_PATH = f"{i_Pipeline_OutDir}/pan_genome_reference.fa"
                    
                    #i_Gene_PresAbs_DF = parse_Panaroo_GenePresAbs_CSV(i_Gene_PresAbs_CSV_PATH)  

                    dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam]["PresAbs_CSV"] = i_Gene_PresAbs_CSV_PATH
                    dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam]["Pangenome_Ref_FA"] = i_Pangenome_Ref_FA_PATH


                elif "Roary" in i_PipeNameWiParam:
                    i_PG_Summary_Txt = f"{i_Pipeline_OutDir}/summary_statistics.txt"

                    i_PG_Stats_DF = read_PanarooSummaryStats(i_PG_Summary_Txt)
                    i_PG_Stats_DF["Annotation"] = AnnoPipeline
                    i_PG_Stats_DF["Method"] = i_PipeNameWiParam
                    i_PG_Stats_DF["SeqType"] = AsmTech
                    i_PG_Stats_DF["DataSet"] = i_SampleSet
                    
                    i_PG_Stats_DF = i_PG_Stats_DF[ i_PG_Stats_DF["Category"].isin(listOfCat_ToKeep)]   

                    i_Gene_PresAbs_CSV_PATH = f"{i_Pipeline_OutDir}/gene_presence_absence.csv"    

                    i_Pangenome_Ref_FA_PATH = f"{i_Pipeline_OutDir}/pan_genome_reference.fa"
                    
                    dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam]["PresAbs_CSV"] = i_Gene_PresAbs_CSV_PATH
                    dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam]["Pangenome_Ref_FA"] = i_Pangenome_Ref_FA_PATH

                    
                elif "Ppanggolin" in i_PipeNameWiParam:
                    #print(i_PipeNameWiParam)
                    i_PG_Summary_Txt = f"{i_Pipeline_OutDir}/pangenome.ContentSummary.txt"
                    #print(i_PG_Summary_Txt)
                    #!grep 'Families' $i_PG_Summary_Txt 
                    
                    i_PG_Stats_DF = read_Ppanggolin_ContextStats(i_PG_Summary_Txt, False)
                    i_PG_Stats_DF["Annotation"] = AnnoPipeline
                    i_PG_Stats_DF["Method"] = i_PipeNameWiParam
                    i_PG_Stats_DF["SeqType"] = AsmTech
                    i_PG_Stats_DF["DataSet"] = i_SampleSet
                    #print(i_PG_Stats_DF)
                    
                    i_PG_Stats_DF = i_PG_Stats_DF[ i_PG_Stats_DF["Category"].isin(listOfCat_ToKeep)]   

                    i_Gene_PresAbs_Rtab_PATH = f"{i_Pipeline_OutDir}/gene_presence_absence.Rtab"    
                    
                    #i_Gene_PresAbs_DF = parse_Panaroo_GenePresAbs_Rtab(i_Gene_PresAbs_Rtab_PATH)
                                          
                    dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam]["PresAbs_Rtab"] = i_Gene_PresAbs_Rtab_PATH


                dictOf_PG_OutInfo[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam] = i_PG_Stats_DF
                
                #dictOf_PG_PresAbs_DF[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam] = i_Gene_PresAbs_DF
                
                dictOf_PG_OutDir_PATHs[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam]["Stats_TXT"] = i_PG_Summary_Txt


100%|██████████| 11/11 [00:00<00:00, 95.33it/s]
100%|██████████| 11/11 [00:00<00:00, 89.53it/s]
  0%|          | 0/11 [00:00<?, ?it/s]

54 739
290 1045


100%|██████████| 11/11 [00:00<00:00, 106.89it/s]
100%|██████████| 11/11 [00:00<00:00, 106.05it/s]

11 209
1 314





In [17]:
dictOf_PG_OutInfo["All"]["Bakta"]["LongRead"]["Ppanggolin_Default"]

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet
0,Total genes,(0% <= strains <= 100%),4557,Bakta,Ppanggolin_Default,LongRead,All
1,Accessory genes,(0% <= strains < 99%),793,Bakta,Ppanggolin_Default,LongRead,All
2,Core genes,(99% <= strains <= 100%),3764,Bakta,Ppanggolin_Default,LongRead,All


In [143]:
#dictOf_PG_OutInfo["All"]["Bakta"]["LongRead"]["Ppanggolin_I95"]

In [18]:
!ls -1 /n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/PanGenome_Analysis/Ppanggolin_Default_AllIsolates/

gene_presence_absence.Rtab
matrix.csv
mean_persistent_duplication.tsv
organisms_statistics.tsv
pangenome.ContentSummary.txt
pangenomeGraph.gexf.gz
pangenomeGraph.json
pangenomeGraph_light.gexf.gz
pangenome.h5
partitions
projection
tile_plot.html
Ushaped_plot.html


In [145]:
#!ls -1 /n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/PanGenome_Analysis/


In [146]:
#dictOf_PG_OutInfo["All"]["PGAP"]["LongRead"]["Panaroo_Strict_MergeParalogs"]

In [19]:
dictOf_PG_OutInfo["All"]["Bakta"]["LongRead"]["Panaroo_Strict_MergeParalogs"]

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet
0,Core genes,(99% <= strains <= 100%),3779,Bakta,Panaroo_Strict_MergeParalogs,LongRead,All
4,Total genes,(0% <= strains <= 100%),4200,Bakta,Panaroo_Strict_MergeParalogs,LongRead,All
5,Accessory genes,(0% <= strains < 99%),421,Bakta,Panaroo_Strict_MergeParalogs,LongRead,All


In [20]:
dictOf_PG_OutInfo["All"]["PGAP"]["LongRead"]["Panaroo_Strict_MergeParalogs"]

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet
0,Core genes,(99% <= strains <= 100%),3750,PGAP,Panaroo_Strict_MergeParalogs,LongRead,All
4,Total genes,(0% <= strains <= 100%),4063,PGAP,Panaroo_Strict_MergeParalogs,LongRead,All
5,Accessory genes,(0% <= strains < 99%),313,PGAP,Panaroo_Strict_MergeParalogs,LongRead,All


In [21]:
dictOf_PG_OutInfo["All"]["PGAP"]["ShortRead"]["Panaroo_Strict_MergeParalogs"]

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet
0,Core genes,(99% <= strains <= 100%),3489,PGAP,Panaroo_Strict_MergeParalogs,ShortRead,All
4,Total genes,(0% <= strains <= 100%),4014,PGAP,Panaroo_Strict_MergeParalogs,ShortRead,All
5,Accessory genes,(0% <= strains < 99%),525,PGAP,Panaroo_Strict_MergeParalogs,ShortRead,All


In [22]:
dictOf_PG_OutInfo["All"]["Bakta"]["LongRead"]["Ppanggolin_Default"]

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet
0,Total genes,(0% <= strains <= 100%),4557,Bakta,Ppanggolin_Default,LongRead,All
1,Accessory genes,(0% <= strains < 99%),793,Bakta,Ppanggolin_Default,LongRead,All
2,Core genes,(99% <= strains <= 100%),3764,Bakta,Ppanggolin_Default,LongRead,All


In [23]:
dictOf_PG_OutInfo["All"]["Bakta"]["ShortRead"]["Ppanggolin_Default"]

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet
0,Total genes,(0% <= strains <= 100%),4998,Bakta,Ppanggolin_Default,ShortRead,All
1,Accessory genes,(0% <= strains < 99%),1335,Bakta,Ppanggolin_Default,ShortRead,All
2,Core genes,(99% <= strains <= 100%),3663,Bakta,Ppanggolin_Default,ShortRead,All


In [24]:
dictOf_PG_OutInfo["All"]["PGAP"]["ShortRead"]["Ppanggolin_Default"]

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet
0,Total genes,(0% <= strains <= 100%),3829,PGAP,Ppanggolin_Default,ShortRead,All
1,Accessory genes,(0% <= strains < 99%),315,PGAP,Ppanggolin_Default,ShortRead,All
2,Core genes,(99% <= strains <= 100%),3514,PGAP,Ppanggolin_Default,ShortRead,All


# 1) Merge all Pan-genome counts accross all combinations evaluated

In [26]:

PipeName_WiParam_List = ['Panaroo_Strict_MergeParalogs', 'Panaroo_Moderate_MergeParalogs',
                         'Panaroo_Sensitive_MergeParalogs', 'Panaroo_Strict',
                         'Panaroo_Moderate', 'Panaroo_Sensitive',  'Roary_NoSplitParalogs_I80', 'Roary_NoSplitParalogs_I90',
                         'Roary_NoSplitParalogs',  'Roary_Default', 'Ppanggolin_Default', ] #'Ppanggolin_I95', 'Ppanggolin_I90' ]


listOf_PG_InfoDFs = []
for i_SampleSet in ["All"]:
    for AnnoPipeline in ["Bakta", "PGAP"]:
        for AsmTech in ["LongRead", "ShortRead"]:
            for i_PipeNameWiParam in PipeName_WiParam_List:
                i_PG_Stats_DF = dictOf_PG_OutInfo[i_SampleSet][AnnoPipeline][AsmTech][i_PipeNameWiParam]
                
                listOf_PG_InfoDFs.append(i_PG_Stats_DF)


PG_Merged_Stats_LRandSR_DF = pd.concat(listOf_PG_InfoDFs)

PG_Merged_Stats_LRandSR_DF["MethodAndTech"] = PG_Merged_Stats_LRandSR_DF["Method"] + "_" +PG_Merged_Stats_LRandSR_DF["SeqType"]

PG_Merged_Stats_LRandSR_DF["MethodAndTechAndAnno"] = PG_Merged_Stats_LRandSR_DF["Method"] + "_" +PG_Merged_Stats_LRandSR_DF["SeqType"]  + "_" +PG_Merged_Stats_LRandSR_DF["Annotation"]  

PG_Merged_Stats_LRandSR_DF["AnnoAndTech"] = PG_Merged_Stats_LRandSR_DF["Annotation"] + "-" + PG_Merged_Stats_LRandSR_DF["SeqType"]

PG_Merged_Stats_LRandSR_DF["PangenomeTool"] = PG_Merged_Stats_LRandSR_DF["Method"].str.split("_").str[0]


PG_Merged_Stats_LRandSR_DF.shape


(132, 11)

### Output `PG_Merged_Stats_LRandSR_DF` to TSV

In [27]:
PG_Results_OutDir = "../../Data/240116.PG_Results" 

!mkdir $PG_Results_OutDir

PG_Merged_Stats_LRandSR_DF.to_csv(f"{PG_Results_OutDir}/Mtb151CI.PG_Merged_Stats_LRandSR.tsv", sep="\t", index=False)


In [28]:
PG_Merged_Stats_LRandSR_DF.head(4)

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet,MethodAndTech,MethodAndTechAndAnno,AnnoAndTech,PangenomeTool
0,Core genes,(99% <= strains <= 100%),3779,Bakta,Panaroo_Strict_MergeParalogs,LongRead,All,Panaroo_Strict_MergeParalogs_LongRead,Panaroo_Strict_MergeParalogs_LongRead_Bakta,Bakta-LongRead,Panaroo
4,Total genes,(0% <= strains <= 100%),4200,Bakta,Panaroo_Strict_MergeParalogs,LongRead,All,Panaroo_Strict_MergeParalogs_LongRead,Panaroo_Strict_MergeParalogs_LongRead_Bakta,Bakta-LongRead,Panaroo
5,Accessory genes,(0% <= strains < 99%),421,Bakta,Panaroo_Strict_MergeParalogs,LongRead,All,Panaroo_Strict_MergeParalogs_LongRead,Panaroo_Strict_MergeParalogs_LongRead_Bakta,Bakta-LongRead,Panaroo
0,Core genes,(99% <= strains <= 100%),3825,Bakta,Panaroo_Moderate_MergeParalogs,LongRead,All,Panaroo_Moderate_MergeParalogs_LongRead,Panaroo_Moderate_MergeParalogs_LongRead_Bakta,Bakta-LongRead,Panaroo


In [29]:
PG_Merged_Stats_LRandSR_DF["Range"].value_counts()

(99% <= strains <= 100%)    44
(0% <= strains <= 100%)     44
(0% <= strains < 99%)       44
Name: Range, dtype: int64

In [31]:
PG_Merged_Stats_LRandSR_DF[ PG_Merged_Stats_LRandSR_DF["Method"].str.contains("Ppanggolin") ].query("Category == 'Accessory genes' ") 

Unnamed: 0,Category,Range,GeneCount,Annotation,Method,SeqType,DataSet,MethodAndTech,MethodAndTechAndAnno,AnnoAndTech,PangenomeTool
1,Accessory genes,(0% <= strains < 99%),793,Bakta,Ppanggolin_Default,LongRead,All,Ppanggolin_Default_LongRead,Ppanggolin_Default_LongRead_Bakta,Bakta-LongRead,Ppanggolin
1,Accessory genes,(0% <= strains < 99%),1335,Bakta,Ppanggolin_Default,ShortRead,All,Ppanggolin_Default_ShortRead,Ppanggolin_Default_ShortRead_Bakta,Bakta-ShortRead,Ppanggolin
1,Accessory genes,(0% <= strains < 99%),220,PGAP,Ppanggolin_Default,LongRead,All,Ppanggolin_Default_LongRead,Ppanggolin_Default_LongRead_PGAP,PGAP-LongRead,Ppanggolin
1,Accessory genes,(0% <= strains < 99%),315,PGAP,Ppanggolin_Default,ShortRead,All,Ppanggolin_Default_ShortRead,Ppanggolin_Default_ShortRead_PGAP,PGAP-ShortRead,Ppanggolin
