# Run the SR-Pangenome analysis QC w/ the LRAsms

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm
%matplotlib inline


In [2]:
%reload_ext autoreload
%autoreload 2

### import pgqc (pan-genome quality control) toolkit functions
from pgqc.ava import ava
from pgqc.nscluster import clusterBy_KmerJC, summarize_NSClusters, create_MaxKmerSim_JC_Dict, create_MST_FiltByJC, make_ClusterID_Maps 
from pgqc.nscluster import make_NS_ClusterMerged_Pres_DF


from pgqc.utils import parse_PresAbs_Rtab, parse_PresAbs_CSV_Roary, parse_PresAbs_CSV_Panaroo, get_PG_Stats_FromPresAbs

from pgqc.utils import  parse_PG_Ref_FA, get_PG_Stats_FromDNASeqPresAbs

from pgqc.asm_gene_search import parse_AlnHits_To_DF
from pgqc.asm_gene_search import PresAbsQC_CheckAsmForGeneSeq, SRAsm_PresAbsQC_CheckInLRAsm
from pgqc.asm_gene_search import get_SRAsm_Vs_LRAsm_QCStats


In [3]:
import time

In [4]:
import screed

In [5]:
import mappy as mp

In [6]:
# Notebook display settings: let each cell show up to 100 characters before
# pandas truncates the text, and render up to 180 columns of a wide frame.
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = 180

# Parse sample metadata & preprocessed genome info/results

In [7]:
!pwd

/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-VCI-MGM/JupyterNotebooks/240103_2_MtbPangenome_151CI_V5


In [8]:
#!ls -1 ../../Data

## Parse sample Metadata (N = 151)

In [9]:
# Input tables for the 151-isolate set; paths are relative to this notebook.
Repo_DataDir = "../../Data"
InputAsmPath_Dir = Repo_DataDir + "/231121.InputAsmTSVs.MtbSetV3.151CI"

MtbSetV3_151CI_InputAsmPATHs_TSV = InputAsmPath_Dir + "/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"
MtbSetV3_151CI_AsmSumm_TSV = InputAsmPath_Dir + "/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"

# Assembly summary table (includes the lineage calls and dataset tag per sample).
WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep="\t")

# Sample IDs in file order; both names are bound to the same list object.
SampleIDs_151CI_SOI = WGA151CI_AsmSummary_DF["SampleID"].tolist()
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

# SampleID -> metadata lookups.
ID_To_PrimLineage_Dict = dict(zip(WGA151CI_AsmSummary_DF["SampleID"],
                                  WGA151CI_AsmSummary_DF["PrimaryLineage"]))
ID_To_SubLineage_Dict = dict(zip(WGA151CI_AsmSummary_DF["SampleID"],
                                 WGA151CI_AsmSummary_DF["Lineage"]))
ID_To_Dataset_Dict = dict(zip(WGA151CI_AsmSummary_DF["SampleID"],
                              WGA151CI_AsmSummary_DF["Dataset_Tag"]))

# Displayed as the cell output: (n_samples, n_columns).
WGA151CI_AsmSummary_DF.shape

(151, 7)

## PARSE PATHs FOR ALL assemblies processed by this pipeline

In [10]:
# Table of per-sample assembly FASTA paths: one column for the long-read (LR)
# assembly and one for the short-read (SR) assembly.
WGA151CI_LRandSR_Asm_Path_DF = pd.read_csv(MtbSetV3_151CI_InputAsmPATHs_TSV, sep = "\t")

# Show the column names exactly as they appear in the TSV, before renaming.
print(WGA151CI_LRandSR_Asm_Path_DF.columns)

# Rename by column NAME rather than by position: a positional rename would
# silently swap the LR and SR paths if the TSV's column order ever changed.
# errors="raise" makes a missing/renamed input column fail loudly (KeyError)
# instead of leaving the frame without the expected LR/SR column names.
WGA151CI_LRandSR_Asm_Path_DF = WGA151CI_LRandSR_Asm_Path_DF.rename(
    columns={"Genome_ASM_PATH": "Genome_LR_ASM_PATH",
             "ShortRead_Genome_ASM_PATH": "Genome_SR_ASM_PATH"},
    errors="raise",
)


Index(['SampleID', 'Dataset_Tag', 'Genome_ASM_PATH',
       'ShortRead_Genome_ASM_PATH'],
      dtype='object')


In [11]:
# Preview the first row to check the LR/SR assembly-path columns.
WGA151CI_LRandSR_Asm_Path_DF.head(1)

Unnamed: 0,SampleID,Dataset_Tag,Genome_LR_ASM_PATH,Genome_SR_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/...,/n/data1/hms/dbmi/farhat/mm774/Projects/231121.MtbSetV3.151CI.CompleteAndSR.Asms/ChinerOms_2019/...


#### Create Dict of Asm FA PATHs

In [12]:

# SampleID -> assembly FASTA path, one lookup for the long-read (LR)
# assemblies and one for the short-read (SR) assemblies.
LR_AsmFA_Dict = dict(zip(WGA151CI_LRandSR_Asm_Path_DF["SampleID"],
                         WGA151CI_LRandSR_Asm_Path_DF["Genome_LR_ASM_PATH"]))
SR_AsmFA_Dict = dict(zip(WGA151CI_LRandSR_Asm_Path_DF["SampleID"],
                         WGA151CI_LRandSR_Asm_Path_DF["Genome_SR_ASM_PATH"]))


### Define Phylo order of samples:

In [13]:
# Fixed ordering of the 151 sample IDs, named as the phylogenetic order.
# The tree this ordering was derived from is not shown in this notebook.
OrderOfSampleIDs_Phylo = ['N0153', 'N0072', 'mada_2-46', 'mada_1-44', 'mada_107',
                          'mada_1-1', 'mada_1-51', 'mada_1-39', 'mada_1-36',
                          'mada_117', 'mada_122', 'mada_118', 'mada_1-10', 'R27252',
                          'R23887', 'TB3091', '9050-05', '3003-06', '702-06', '696-05',
                          '8651-04', 'TB3396', '4549-04', 'TB1612', 'TB2780', 'TB3368',
                          'TB1236', 'TB2659', '8129-04', 'R30215', 'R25048', 'TB2512',
                          'TB2981', 'TB2995', 'TB3113', '706-05', 'R30078', 'R28012',
                          'R27657', 'R30234', 'R31095', 'R28703', 'R24120', 'R36431',
                          'R29816', 'S0070-08', 'N0155', 'N0145', 'R29598', 'R24100',
                          'S0107-01', 'R28581', 'S0256-08', 'S0085-01', 'S0089-01',
                          'mada_1-11', 'M0003941_3', 'mada_115', 'mada_2-42', 'R37765',
                          '18_0621851', 'R22601', 'R27937', 'R18040', 'R18043', 'R27725',
                          'R26791', 'R20574', 'R20260', 'R21408', 'R23146', 'R28980', 'R32929',
                          'R26778', 'R30420', 'R21893', 'QC-9', 'QC-5', 'QC-3', 'N0004',
                          'mada_1-30', 'N0054', 'N1274', '01_R1134', 'TB2968', 'mada_1-53',
                          'mada_2-53', 'mada_1-50', 'mada_2-1', 'R23571', 'mada_123',
                          'mada_1-12', 'mada_1-15', 'mada_128', 'mada_1-38', 'TB3054',
                          'mada_126', 'mada_120', 'TB4620', 'M0016737_0', 'M0016395_7',
                          'R15311', 'TB2661', 'TB3386', 'TB3162', '02_R1179', 'M0010874_7',
                          'QC-7', 'QC-6', 'QC-1', '01_R1430', 'M0011368_9', '02_R1896',
                          'mada_2-25', 'TB3237', 'mada_103', 'mada_112', 'mada_124',
                          'S0123-01', 'S0262-02', 'TB3251', 'M0017522_5', 'R30396', 'R20896',
                          'mada_1-32', 'S0106-01', 'R21839', 'R21363', 'R21770', 'MT_0080','mada_102',
                          'TB3334', 'M0014888_3', 'mada_151', 'TB3169', 'mada_105', 'QC-8',
                          'QC-10', 'QC-4', 'mada_129', 'mada_139', '02_R1708', '02_R0894',
                          'mada_2-31', 'mada_1-41', 'N1272', 'N1176', 'N1202', 'N0091',
                          'N1177','RW-TB008']



### Parse Asm QC results

In [14]:
# Assembly QC results comparing the LR and SR assemblies, loaded in two
# layouts: a summary table and a "tidy" table.
# NOTE(review): the file names say "WGA158CI" but the variables (and the
# sample set used throughout this notebook) say 151 - confirm these are the
# intended files.
Repo_DataDir = "../../Data"
AsmSummary_TSVs_Dir = f"{Repo_DataDir}/231130_WGA-Mtb_CompleteVsSR_AsmSummary_TSVs"

WGA151CI_AsmQCSumm_TSV = AsmSummary_TSVs_Dir + "/231130.WGA158CI.LRvsSR.AsmQC.V1.tsv"
WGA151CI_AsmQC_Tidy_TSV = AsmSummary_TSVs_Dir + "/231130.WGA158CI.LRvsSR.AsmQC.V1.Tidy.tsv"

# Summary layout.
WGA151CI_Asm_LRvsSR_DF = pd.read_csv(WGA151CI_AsmQCSumm_TSV, sep="\t")
print(WGA151CI_Asm_LRvsSR_DF.shape)

# Tidy layout.
WGA151CI_Asm_LRvsSR_TidyDF = pd.read_csv(WGA151CI_AsmQC_Tidy_TSV, sep="\t")
print(WGA151CI_Asm_LRvsSR_TidyDF.shape)

(151, 16)
(302, 11)


In [15]:
# SampleID -> SRAsm_Busco_Complete_Score lookup.
SRAsm_BUSCO_Dict = dict(zip(WGA151CI_Asm_LRvsSR_DF["SampleID"],
                            WGA151CI_Asm_LRvsSR_DF["SRAsm_Busco_Complete_Score"]))


### Get SampleIDs associated w/ High & Low BUSCO scores (>= 99 or < 99)

In [16]:
# Shape of the subset of samples whose SRAsm_Busco_Complete_Score is >= 99.
WGA151CI_Asm_LRvsSR_DF.query("SRAsm_Busco_Complete_Score >= 99").shape

(136, 16)

In [17]:
# Shape of the subset of samples whose SRAsm_Busco_Complete_Score is < 99.
WGA151CI_Asm_LRvsSR_DF.query("SRAsm_Busco_Complete_Score < 99").shape

(15, 16)

In [18]:
# Split the sample IDs at an SRAsm_Busco_Complete_Score of 99.
# The two conditions are evaluated separately, so a sample with a missing
# score lands in neither group.
High_SRBusco_SampleIDs_136CI = WGA151CI_Asm_LRvsSR_DF.loc[
    WGA151CI_Asm_LRvsSR_DF["SRAsm_Busco_Complete_Score"] >= 99, "SampleID"
].values

Low_SRBusco_SampleIDs_15CI = WGA151CI_Asm_LRvsSR_DF.loc[
    WGA151CI_Asm_LRvsSR_DF["SRAsm_Busco_Complete_Score"] < 99, "SampleID"
].values


In [19]:
# Display the sample IDs in the low-BUSCO (< 99) group.
Low_SRBusco_SampleIDs_15CI

array(['TB1612', 'TB2981', '02_R1896', 'TB3334', 'N1176', 'N1177',
       'RW-TB008', 'QC-7', 'mada_107', 'R30215', 'R25048', 'R21408',
       'R21770', 'R20896', 'mada_1-38'], dtype=object)

In [20]:
# Number of samples in the low-BUSCO (< 99) group.
len(Low_SRBusco_SampleIDs_15CI)

15

In [21]:
# Number of samples in the high-BUSCO (>= 99) group.
len(High_SRBusco_SampleIDs_136CI)

136

In [22]:
# Preview the first five rows of the assembly summary / lineage table.
WGA151CI_AsmSummary_DF.head(5)

Unnamed: 0,SampleID,numContigs_Complete,Flye_CircContig_Cov,PrimaryLineage,Lineage,Dataset_Tag,AsmApproach
0,N0072,1,358,lineage1,"lineage1,lineage1.1,lineage1.1.2",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
1,N0153,1,372,lineage1,"lineage1,lineage1.1,lineage1.1.1,lineage1.1.1.1",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
2,TB3113,1,933,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
3,TB1236,1,374,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
4,TB2659,1,421,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon


#### Make sample lineage & color mapping

In [23]:
# Dictionary for lineage to color mapping
# Hex color for each primary lineage label; the literal label "None" maps to black.
LinToColor_Dict = dict([
    ("lineage1", "#DF83AC"),
    ("lineage2", "#7098CB"),
    ("lineage3", "#815D9F"),
    ("lineage4", "#E76956"),
    ("lineage5", "#B67548"),
    ("lineage6", "#6AB79E"),
    ("lineage8", "#E4515B"),
    ("None", "black"),
])

# SampleID -> PrimaryLineage, taken from the assembly summary table.
lineage_mapping = dict(zip(WGA151CI_AsmSummary_DF["SampleID"],
                           WGA151CI_AsmSummary_DF["PrimaryLineage"]))

# SampleID -> color; any lineage label without an entry above falls back to black.
sample_colors = {
    sample_id: LinToColor_Dict.get(primary_lineage, "black")
    for sample_id, primary_lineage in lineage_mapping.items()
}


# Define output dir of the Mtb-WGA-SMK processing pipeline

In [24]:
# Output directories of the Mtb-WGA-SMK processing pipeline.
# Root directory holding the pipeline outputs:
WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

# Output directory of the 151-isolate run analysed in this notebook:
WGA151CI_SMK_OutputDir = f"{WGA_SMK_Outputs_Dir}/231121_MtbSetV3_151CI"

# Alias used by the cells below.
MtbWGA_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir


## Define output dirs of Pangenome Analysis pipelines

In [25]:
# Directory under the pipeline output that holds every pangenome run.
target_OutputDir = MtbWGA_SMK_Pipeline_OutputDir
i_Pangenome_Dir = target_OutputDir + "/PanGenome_Analysis"


### Define paths to Panaroo & Roary output files (151 isolates; LR and SR assemblies; Panaroo runs use the MergeParalogs parameters)

In [26]:
# Pipeline/parameter names kept for reference (not used by the code below):
# PipeName_WiParam_List = ['Panaroo_Strict_MergeParalogs', 'Panaroo_Moderate_MergeParalogs',
#                          'Panaroo_Sensitive_MergeParalogs', 'Panaroo_Strict',
#                          'Panaroo_Moderate', 'Panaroo_Sensitive',  'Roary_NoSplitParalogs_I80', 'Roary_NoSplitParalogs_I90',
#                          'Roary_NoSplitParalogs',  'Roary_Default', 'Ppanggolin_Default']

# Run label -> output directory of that pangenome run.
# Labels starting with "SR_" are the runs built from short-read assemblies;
# the remaining labels are the corresponding long-read-assembly runs.
PG_OutDir_Dict = {
    "Panaroo_Strict_MP": i_Pangenome_Dir + "/Panaroo_Strict_MergeParalogs_AllIsolates",
    "Panaroo_Moderate_MP": i_Pangenome_Dir + "/Panaroo_Moderate_MergeParalogs_AllIsolates",
    "Panaroo_Sens_MP": i_Pangenome_Dir + "/Panaroo_Sensitive_MergeParalogs_AllIsolates",
    "Roary_Default": i_Pangenome_Dir + "/Roary_Default_AllIsolates",
    "Roary_NoSplitParalogs": i_Pangenome_Dir + "/Roary_NoSplitParalogs_AllIsolates",
    "Roary_NoSplitParalogs_I90": i_Pangenome_Dir + "/Roary_NoSplitParalogs_I90_AllIsolates",
    "Roary_NoSplitParalogs_I80": i_Pangenome_Dir + "/Roary_NoSplitParalogs_I80_AllIsolates",
    "SR_Panaroo_Strict_MP": i_Pangenome_Dir + "/SR_Panaroo_Strict_MergeParalogs_AllIsolates",
    "SR_Panaroo_Moderate_MP": i_Pangenome_Dir + "/SR_Panaroo_Moderate_MergeParalogs_AllIsolates",
    "SR_Panaroo_Sens_MP": i_Pangenome_Dir + "/SR_Panaroo_Sensitive_MergeParalogs_AllIsolates",
    "SR_Roary_Default": i_Pangenome_Dir + "/SR_Roary_Default_AllIsolates",
    "SR_Roary_NoSplitParalogs": i_Pangenome_Dir + "/SR_Roary_NoSplitParalogs_AllIsolates",
    "SR_Roary_NoSplitParalogs_I90": i_Pangenome_Dir + "/SR_Roary_NoSplitParalogs_I90_AllIsolates",
    "SR_Roary_NoSplitParalogs_I80": i_Pangenome_Dir + "/SR_Roary_NoSplitParalogs_I80_AllIsolates",
}

# Per-run file paths, all keyed by the same run labels as PG_OutDir_Dict.
PG_PresAbs_CSV_PATH_Dict = {}             # gene presence/absence (CSV)
PG_PresAbs_Rtab_PATH_Dict = {}            # gene presence/absence (Rtab)
PG_GeneRefFA_PATH_Dict = {}               # pangenome reference FASTA
PG_AvA_PATH_Dict = {}                     # all-vs-all k-mer comparison table
PG_PresAbs_AsmGeneSeqChk_PATH_Dict = {}   # presence/absence with assembly gene-seq check

for i_param, i_outdir in PG_OutDir_Dict.items():
    PG_PresAbs_CSV_PATH_Dict[i_param] = i_outdir + "/gene_presence_absence.csv"
    PG_PresAbs_Rtab_PATH_Dict[i_param] = i_outdir + "/gene_presence_absence.Rtab"
    PG_GeneRefFA_PATH_Dict[i_param] = i_outdir + "/pan_genome_reference.fa"
    PG_AvA_PATH_Dict[i_param] = i_outdir + "/pan_genome_reference.KmerComparison.AllVsAll.MaxJC.tsv"
    PG_PresAbs_AsmGeneSeqChk_PATH_Dict[i_param] = i_outdir + "/gene_presence_absence.AsmGeneSeqChk.tsv"




In [27]:
#PG_GeneRefFA_PATH_Dict

# Parse in processed data

### A) Parse in processed All vs All Kmer analysis

In [28]:
# Run label -> all-vs-all k-mer comparison table, read from each run's TSV.
AvA_DF_Dict = {
    run_label: pd.read_csv(ava_tsv_path, sep="\t")
    for run_label, ava_tsv_path in PG_AvA_PATH_Dict.items()
}


### B) Parse in Gene PresAbs Info

In [29]:
# Run label -> gene presence/absence table for that pangenome run.
PresAbs_DF_Dict = {}

for run_label, presabs_csv_path in PG_PresAbs_CSV_PATH_Dict.items():

    # Roary and Panaroo outputs each go through their own pgqc parser.
    parse_presabs_csv = parse_PresAbs_CSV_Roary if "Roary" in run_label else parse_PresAbs_CSV_Panaroo
    presabs_df = parse_presabs_csv(presabs_csv_path)

    # Keep only the part of each column label before ".Bakta"
    # (presumably so the sample columns match the SampleIDs - confirm).
    presabs_df.columns = [col.partition(".Bakta")[0] for col in presabs_df.columns]

    PresAbs_DF_Dict[run_label] = presabs_df

    print(run_label, presabs_df.shape)
    

  


Panaroo_Strict_MP (4200, 153)


  


Panaroo_Moderate_MP (4280, 153)
Panaroo_Sens_MP (4281, 153)
Roary_Default (5366, 153)


  


Roary_NoSplitParalogs (4366, 153)


  


Roary_NoSplitParalogs_I90 (4293, 153)


  


Roary_NoSplitParalogs_I80 (4252, 153)


  


SR_Panaroo_Strict_MP (4211, 153)
SR_Panaroo_Moderate_MP (4382, 153)
SR_Panaroo_Sens_MP (4600, 153)
SR_Roary_Default (6006, 153)
SR_Roary_NoSplitParalogs (5025, 153)
SR_Roary_NoSplitParalogs_I90 (4956, 153)
SR_Roary_NoSplitParalogs_I80 (4866, 153)


### C) Parse in PG Gene Reference FASTAs for each PG output

In [30]:

# Run label -> parsed pangenome reference FASTA for that run.
PG_RefSeqs_DF_Dict = {}

for i_Param, i_PG_Ref_FA_PATH in PG_GeneRefFA_PATH_Dict.items():
    i_PG_RefSeqs = parse_PG_Ref_FA(i_PG_Ref_FA_PATH)
    PG_RefSeqs_DF_Dict[i_Param] = i_PG_RefSeqs

    # Despite the "LR_" name, this holds the reference IDs of whichever run
    # is being processed (LR or SR); it is only used for the count printed here.
    LR_PG_Ref_IDs = list(i_PG_RefSeqs.keys())
    print(i_Param, len(LR_PG_Ref_IDs))
    

Panaroo_Strict_MP 4200
Panaroo_Moderate_MP 4280
Panaroo_Sens_MP 4281
Roary_Default 5366
Roary_NoSplitParalogs 4366
Roary_NoSplitParalogs_I90 4293
Roary_NoSplitParalogs_I80 4252
SR_Panaroo_Strict_MP 4211
SR_Panaroo_Moderate_MP 4382
SR_Panaroo_Sens_MP 4600
SR_Roary_Default 6006
SR_Roary_NoSplitParalogs 5025
SR_Roary_NoSplitParalogs_I90 4956
SR_Roary_NoSplitParalogs_I80 4866


## D) PresAbs w/ AsmSeqCheck (For both LR and SR Asms)

In [31]:
# Run label -> presence/absence table that includes the assembly gene-seq check.
PresAbs_DF_AsmSeqChk_Dict = {}

for run_label, asmseqchk_tsv_path in PG_PresAbs_AsmGeneSeqChk_PATH_Dict.items():

    asmseqchk_df = pd.read_csv(asmseqchk_tsv_path, sep="\t")

    # Index the rows by gene while also keeping "Gene" as a regular column.
    asmseqchk_df = asmseqchk_df.set_index("Gene", drop=False)

    PresAbs_DF_AsmSeqChk_Dict[run_label] = asmseqchk_df

    print(run_label, "-", asmseqchk_df.shape)


Panaroo_Strict_MP - (4061, 154)
Panaroo_Moderate_MP - (4280, 154)
Panaroo_Sens_MP - (4281, 154)
Roary_Default - (5366, 154)
Roary_NoSplitParalogs - (4366, 154)
Roary_NoSplitParalogs_I90 - (4293, 154)
Roary_NoSplitParalogs_I80 - (4252, 154)
SR_Panaroo_Strict_MP - (4211, 154)
SR_Panaroo_Moderate_MP - (4382, 154)
SR_Panaroo_Sens_MP - (4600, 154)
SR_Roary_Default - (6006, 154)
SR_Roary_NoSplitParalogs - (5025, 154)
SR_Roary_NoSplitParalogs_I90 - (4956, 154)
SR_Roary_NoSplitParalogs_I80 - (4866, 154)


In [32]:
# List the run labels for which an AsmSeqChk table was loaded.
PresAbs_DF_AsmSeqChk_Dict.keys()

dict_keys(['Panaroo_Strict_MP', 'Panaroo_Moderate_MP', 'Panaroo_Sens_MP', 'Roary_Default', 'Roary_NoSplitParalogs', 'Roary_NoSplitParalogs_I90', 'Roary_NoSplitParalogs_I80', 'SR_Panaroo_Strict_MP', 'SR_Panaroo_Moderate_MP', 'SR_Panaroo_Sens_MP', 'SR_Roary_Default', 'SR_Roary_NoSplitParalogs', 'SR_Roary_NoSplitParalogs_I90', 'SR_Roary_NoSplitParalogs_I80'])

In [33]:
# Sanity check: number of sample IDs passed to the QC step below.
len(SampleIDs_151CI_SOI)

151

In [34]:
# Sanity check: number of samples with a short-read assembly FASTA path.
len(SR_AsmFA_Dict.keys())

151

## Run SR Asm gene missingness QC steps

In [35]:
# SR pangenome runs to QC against the long-read assemblies in this session.
# The check is slow (the recorded run took ~13 min for one run over 151
# samples), so only the listed run(s) are processed. To process every SR run
# instead, use: [p for p in PresAbs_DF_Dict if p.startswith("SR_")]
SRPG_Params_ToQC = ["SR_Roary_NoSplitParalogs_I80"]

# Run label -> SR presence/absence table after the check against the LR assembly.
PresAbs_DF_SRPG_QC_WiLRAsmSeqChk_Dict = {}

for i_Param in SRPG_Params_ToQC:

    # Only short-read ("SR_") pangenome runs are checked against the LR assemblies.
    if not i_Param.startswith("SR_"):
        continue

    print(f'Running Asm DNA seq check for "{i_Param}"')

    # Arguments are passed positionally: presence/absence table, pangenome
    # reference sequences, SR assembly paths, LR assembly paths, sample IDs.
    i_SR_PresAbs_LRAsmSeqQC_DF = SRAsm_PresAbsQC_CheckInLRAsm(PresAbs_DF_Dict[i_Param],
                                                              PG_RefSeqs_DF_Dict[i_Param],
                                                              SR_AsmFA_Dict,
                                                              LR_AsmFA_Dict,
                                                              SampleIDs_151CI_SOI)

    PresAbs_DF_SRPG_QC_WiLRAsmSeqChk_Dict[i_Param] = i_SR_PresAbs_LRAsmSeqQC_DF


  0%|          | 0/151 [00:00<?, ?it/s]

Running Asm DNA seq check for "SR_Roary_NoSplitParalogs_I80"


100%|██████████| 151/151 [13:22<00:00,  6.52s/it]


Across all samples, total missing genes - Not in SR Asm, but in LR Asm: 10240
Across all samples, total missing genes - In SR Asm, Not in LR Asm: 75
Across all samples, total missing genes - In BOTH SR Asm and LR Asm: 89639


### Notes
- 0: CDS AA seq & DNA seq not found in the SR Asm
- 1: CDS AA seq present in the SR Asm
- 3: DNA seq not in the SR Asm, but in the LR Asm ("Incomplete Assembly")
- 4: DNA seq in the SR Asm, but not in the LR Asm
- 5: DNA seq in both the SR Asm and the LR Asm ("Annotation Discrepancy")


In [37]:
# Pangenome summary stats for the SR Panaroo (Moderate, MergeParalogs)
# presence/absence table as parsed above; 151 is passed as the second argument.
# NOTE(review): the returned tuple is presumably (total, core, accessory)
# gene counts - confirm against pgqc.utils.get_PG_Stats_FromPresAbs.
get_PG_Stats_FromPresAbs(PresAbs_DF_Dict["SR_Panaroo_Moderate_MP"], 151)

Accessory Thresh: 149.49


(4382, 3574, 808)

In [41]:
#get_PG_Stats_FromPresAbs(PresAbs_DF_SRPG_QC_WiLRAsmSeqChk_Dict["SR_Panaroo_Moderate_MP"], 151)

In [40]:
#get_PG_Stats_FromPresAbs(PresAbs_DF_SRPG_QC_WiLRAsmSeqChk_Dict["SR_Panaroo_Moderate_MP"], 151)

# Output the SR-Pangenome QC w/ LR-Asms

In [42]:

# Run label -> output path for the LR-assembly-checked presence/absence table.
# Alternative run list kept for reference:
# for i_param in ["SR_Panaroo_Strict_MP", "SR_Panaroo_Moderate_MP", "SR_Panaroo_Sens_MP", #"SR_Roary_Default",  
#                  "SR_Roary_NoSplitParalogs", "SR_Roary_NoSplitParalogs_I90", "SR_Roary_NoSplitParalogs_I80"]:
PG_PresAbs_LRAsmQC_PATH_Dict = {
    run_label: PG_OutDir_Dict[run_label] + "/gene_presence_absence.csv.DNAseqQCWiLRAsm.csv"
    for run_label in ["SR_Roary_NoSplitParalogs_I80"]
}


In [43]:
# Write each QC'd SR presence/absence table to its run's output directory.
# NOTE(review): the file is written tab-separated although its name ends in
# ".csv" - readers must pass sep="\t".
# NOTE(review): index=False drops the frame's index - confirm the gene IDs
# are also present as a regular column of the table.
for qc_run_label, qc_presabs_df in PresAbs_DF_SRPG_QC_WiLRAsmSeqChk_Dict.items():
    qc_out_path = PG_PresAbs_LRAsmQC_PATH_Dict[qc_run_label]
    qc_presabs_df.to_csv(qc_out_path, sep="\t", index=False)

    

#### Look at the checksum, line count, and size of the output files

In [44]:
for i_Param, i_SR_PresAbs_LRAsmSeqQC_DF in PresAbs_DF_SRPG_QC_WiLRAsmSeqChk_Dict.items():

    i_SR_PresAbs_SeqQCWiLRAsm_TSV_PATH = PG_PresAbs_LRAsmQC_PATH_Dict[i_Param]
    print(i_Param)
    print()
    !md5sum $i_SR_PresAbs_SeqQCWiLRAsm_TSV_PATH
    !wc -l $i_SR_PresAbs_SeqQCWiLRAsm_TSV_PATH
    !du -sh $i_SR_PresAbs_SeqQCWiLRAsm_TSV_PATH
    
    print("------\n")

SR_Roary_NoSplitParalogs_I80

00fa800897c38ecf7c00a7cd9af74ade  /n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/PanGenome_Analysis/SR_Roary_NoSplitParalogs_I80_AllIsolates/gene_presence_absence.csv.DNAseqQCWiLRAsm.csv
4867 /n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/PanGenome_Analysis/SR_Roary_NoSplitParalogs_I80_AllIsolates/gene_presence_absence.csv.DNAseqQCWiLRAsm.csv
984K	/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/PanGenome_Analysis/SR_Roary_NoSplitParalogs_I80_AllIsolates/gene_presence_absence.csv.DNAseqQCWiLRAsm.csv
------

