# Organizing annotated genome assemblies (Mtb-151CI)

### Maximillian Marin (maximillian_marin@hms.harvard.edu)




### Import Statements

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import json
%matplotlib inline

### Set matplotlib text export settings for Adobe Illustrator

In [2]:
import matplotlib

# Use font type 42 (TrueType) for both the PDF and PostScript backends, as
# wanted for the Adobe Illustrator text export mentioned in the heading above.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

#### Pandas Viewing Settings

In [3]:
# Raise pandas' display limits (rows, columns, line width) for notebook output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [4]:
# Apply the project's matplotlib style sheet. The path is relative, so it is
# resolved against the kernel's current working directory.
plt.style.use('../nqo.mplstyle')

## Define paths to sample metadata files

In [5]:
!ls -1 ../../Data

230905_Ecoli_50CI_Metadata_Shaw2021
231121_HybridMtbAsm_QCPass_Meta_Set3_AsmPolishStats_OLD
231121.InputAsmTSVs.MtbSetV3.151CI
231130_Mtb_HybridVsSR_AsmSummary_TSVs
240116.PG_Results
Ecoli_50CI_Phylo
Evaluate_SRAsmPGAnalysis_WiHybridAsm
MtbPangenomeAnalysis_SetV3
MtbPangenomeAnalysis_SetV4
README.md
SRAENA_RunMetadata


In [6]:
# Repository data directory (relative path).
Repo_DataDir = "../../Data"

# Folder holding the input TSVs for the 151-isolate Mtb set (SetV3).
InputAsmPath_Dir = Repo_DataDir + "/231121.InputAsmTSVs.MtbSetV3.151CI"

# TSV listing the hybrid and short-read assembly FASTA paths per sample.
MtbSetV3_151CI_InputAsmPATHs_TSV = InputAsmPath_Dir + "/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"

# TSV with the hybrid-assembly summary / sample metadata.
MtbSetV3_151CI_AsmSumm_TSV = InputAsmPath_Dir + "/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"


## PARSE PATHs FOR ALL assemblies processed by this pipeline

In [7]:
# Load the per-sample assembly FASTA paths, then overwrite the header with
# standard column names. The assignment is positional, so it raises if the TSV
# does not have exactly four columns.
WGA151CI_LRandSR_Asm_Path_DF = pd.read_csv(MtbSetV3_151CI_InputAsmPATHs_TSV, sep="\t")
WGA151CI_LRandSR_Asm_Path_DF.columns = [
    'SampleID',
    'Dataset_Tag',
    'Genome_LR_ASM_PATH',
    'Genome_SR_ASM_PATH',
]


In [8]:
WGA151CI_LRandSR_Asm_Path_DF.head(1)

Unnamed: 0,SampleID,Dataset_Tag,Genome_LR_ASM_PATH,Genome_SR_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121...,/n/data1/hms/dbmi/farhat/mm774/Projects/231121...


In [9]:
#WGA158CI_LRandSR_Asm_Path_DF[['SampleID', 'Dataset_Tag', 'Genome_LR_ASM_PATH']]

## Parse sample Metadata (N = 151)

In [10]:

# Load the hybrid-assembly summary table.
WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep="\t")

# Sample IDs in table order; both names are bound to the same list object.
SampleIDs_151CI_SOI = WGA151CI_AsmSummary_DF["SampleID"].tolist()
WGA151CI_SampleIDs = SampleIDs_151CI_SOI

# SampleID -> metadata lookups (primary lineage, full lineage string, source dataset).
ID_To_PrimLineage_Dict = dict(
    zip(WGA151CI_AsmSummary_DF["SampleID"], WGA151CI_AsmSummary_DF["PrimaryLineage"])
)
ID_To_SubLineage_Dict = dict(
    zip(WGA151CI_AsmSummary_DF["SampleID"], WGA151CI_AsmSummary_DF["Lineage"])
)
ID_To_Dataset_Dict = dict(
    zip(WGA151CI_AsmSummary_DF["SampleID"], WGA151CI_AsmSummary_DF["Dataset_Tag"])
)

# Display (n_rows, n_columns) as the cell output.
WGA151CI_AsmSummary_DF.shape

(151, 7)

In [11]:
WGA151CI_AsmSummary_DF.head(3)

Unnamed: 0,SampleID,numContigs_Complete,Flye_CircContig_Cov,PrimaryLineage,Lineage,Dataset_Tag,AsmApproach
0,N0072,1,358,lineage1,"lineage1,lineage1.1,lineage1.1.2",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
1,N0153,1,372,lineage1,"lineage1,lineage1.1,lineage1.1.1,lineage1.1.1.1",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
2,TB3113,1,933,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon


#### What is the lineage breakdown?

In [12]:
WGA151CI_AsmSummary_DF["PrimaryLineage"].value_counts()

lineage4    62
lineage2    61
lineage1    15
lineage3     7
lineage6     3
lineage5     2
lineage8     1
Name: PrimaryLineage, dtype: int64

#### Which datasets contribute to our 151 Mtb isolates?

In [13]:
WGA151CI_AsmSummary_DF["Dataset_Tag"].value_counts()

Hall2022                78
TB_Portals_24CI_R1      21
Peker2021               17
Farhat_Peru_2019        13
ChinerOms_2019          12
TRUST_PB_Set1            8
Lee2020_Elife            1
Ngabonziza_Lin8_2020     1
Name: Dataset_Tag, dtype: int64

#### What is the breakdown of hybrid assembly approaches used?
- 1 - `PBrs2_LR_Flye_I3_SR_Pilon` (N = 48) <br>
   PacBio RSII subreads assembled and polished (Flye), then polished w/ short-reads (Pilon)

- 2 - `PBccs_LR_Flye_I3_SR_Pilon` (N = 8) <br>
   PacBio Sequel II HiFi reads assembled and polished (Flye), then polished w/ short-reads (Pilon)
  
- 3 - `ONT9.4_LR_FlyeI3M_SR_Pilon_PolyPolish` (N = 95) <br>
   Oxford Nanopore (9.4.1) reads assembled and polished (Flye, Medaka), then polished w/ short-reads (Pilon, PolyPolish)


In [14]:
WGA151CI_AsmSummary_DF["AsmApproach"].value_counts() 

ONT9.4_LR_FlyeI3M_SR_Pilon_PolyPolish    95
PBrs2_LR_Flye_I3_SR_Pilon                48
PBccs_LR_Flye_I3_SR_Pilon                 8
Name: AsmApproach, dtype: int64

# Define output dir of the Mtb-WGA-SMK processing pipeline

In [15]:
# Pipeline output directories (absolute paths).

# Parent directory for outputs of the Mtb-WGA-SMK pipeline.
Mtb_WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

# Output directory of the 151-isolate (SetV3) run.
WGA151CI_SMK_OutputDir = f"{Mtb_WGA_SMK_Outputs_Dir}/231121_MtbSetV3_151CI"

# Alias used by the cells below.
Mtb_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir


In [16]:
#!ls -alh $Mtb_SMK_Pipeline_OutputDir

## Define paths to all Bakta & PGAP annotations (for both Hybrid & SR ASMs)

In [17]:
# Build, for every sample, a dictionary of annotation file paths: Bakta and
# PGAP outputs for both the hybrid ("LRAsm") and short-read ("SRAsm") assembly.
# Paths are only constructed here; nothing checks that the files exist.
listOfSample_Tags = WGA151CI_SampleIDs
target_SMK_OutputDir = Mtb_SMK_Pipeline_OutputDir

SampleTag_ToPaths_Dict = {}

for SampleID in listOfSample_Tags:
    sample_Asm_OutputDir = f"{target_SMK_OutputDir}/AsmAnalysis/{SampleID}"
    GenomeAnno_Dir = f"{sample_Asm_OutputDir}/GenomeAnnotation"

    # Annotation directories. These names keep their last-iteration value
    # after the loop and are read by the `!ls` cells that follow.
    Bakta_LR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_Asm_Bakta"
    Bakta_SR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_SR_Asm_Bakta"
    PGAP_LR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_Asm_PGAP_V1"
    PGAP_SR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_SR_Asm_PGAP_V1"

    # Assembly-to-H37Rv alignment paths; defined but not stored in the dictionary.
    MM2_AsmToRef_Output_Dir = f"{sample_Asm_OutputDir}/VariantCallingVersusH37Rv/MM2_AsmToH37rv"
    MM2_AsmToH37Rv_BAM = f"{MM2_AsmToRef_Output_Dir}/{SampleID}.mm2.AsmToH37Rv.bam"

    SampleTag_ToPaths_Dict[SampleID] = {
        # Bakta annotation of the hybrid assembly
        "LRAsm_Bakta_GFF": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.gff3",
        "LRAsm_Bakta_GBFF": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.gbff",
        "LRAsm_Bakta_FAA": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.faa",
        "LRAsm_Bakta_FFN": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.ffn",
        "LRAsm_Bakta_FNA": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.fna",
        "LRAsm_Bakta_TXT": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.txt",
        # Bakta annotation of the short-read assembly
        "SRAsm_Bakta_GFF": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.gff3",
        "SRAsm_Bakta_GBFF": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.gbff",
        "SRAsm_Bakta_FAA": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.faa",
        "SRAsm_Bakta_FFN": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.ffn",
        "SRAsm_Bakta_FNA": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.fna",
        "SRAsm_Bakta_TXT": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.txt",
        # PGAP annotation of the hybrid assembly
        "LRAsm_PGAP_GFF": f"{PGAP_LR_AsmDir}/{SampleID}.PGAP.gff",
        "LRAsm_PGAP_FNA": f"{PGAP_LR_AsmDir}/{SampleID}.PGAP.Genome.fasta",
        "LRAsm_PGAP_WiDNA_GFF": f"{PGAP_LR_AsmDir}/{SampleID}.PGAP.WiDNA.gff",
        # PGAP annotation of the short-read assembly
        "SRAsm_PGAP_GFF": f"{PGAP_SR_AsmDir}/{SampleID}.PGAP.gff",
        "SRAsm_PGAP_FNA": f"{PGAP_SR_AsmDir}/{SampleID}.PGAP.Genome.fasta",
        "SRAsm_PGAP_WiDNA_GFF": f"{PGAP_SR_AsmDir}/{SampleID}.PGAP.WiDNA.gff",
    }



In [18]:
!ls -1 $GenomeAnno_Dir

S0262-02_Asm_Bakta
S0262-02_Asm_PGAP_V1
S0262-02_SR_Asm_Bakta
S0262-02_SR_Asm_PGAP_V1


In [19]:
!ls -1 $GenomeAnno_Dir/S0262-02_Asm_PGAP_V1

S0262-02.PGAP.Genome.fasta
S0262-02.PGAP.gff
S0262-02.PGAP.gff.InputInfo.tsv
S0262-02.PGAP.WiDNA.gff


In [20]:
!ls -1 $Bakta_LR_AsmDir

S0262-02.Bakta.embl
S0262-02.Bakta.faa
S0262-02.Bakta.ffn
S0262-02.Bakta.fna
S0262-02.Bakta.gbff
S0262-02.Bakta.gbff.InputInfo.tsv
S0262-02.Bakta.gff3
S0262-02.Bakta.hypotheticals.faa
S0262-02.Bakta.hypotheticals.tsv
S0262-02.Bakta.json
S0262-02.Bakta.log
S0262-02.Bakta.png
S0262-02.Bakta.svg
S0262-02.Bakta.tsv
S0262-02.Bakta.txt


In [21]:
!ls -1 $Bakta_SR_AsmDir

S0262-02.Bakta.embl
S0262-02.Bakta.faa
S0262-02.Bakta.ffn
S0262-02.Bakta.fna
S0262-02.Bakta.gbff
S0262-02.Bakta.gbff.InputInfo.tsv
S0262-02.Bakta.gff3
S0262-02.Bakta.hypotheticals.faa
S0262-02.Bakta.hypotheticals.tsv
S0262-02.Bakta.json
S0262-02.Bakta.log
S0262-02.Bakta.png
S0262-02.Bakta.svg
S0262-02.Bakta.tsv
S0262-02.Bakta.txt


In [22]:
Proj_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects"

PGBenchmarking_DataOrg_Dir = f"{Proj_Dir}/240308.PGBenchmarking.DataOrg"
!mkdir $PGBenchmarking_DataOrg_Dir

Mtb_151CI_Hybrid_PGAP_AsmDir = f"{PGBenchmarking_DataOrg_Dir}/Mtb.151CI.Hybrid.PGAP.Anno" 
Mtb_151CI_SR_PGAP_AsmDir = f"{PGBenchmarking_DataOrg_Dir}/Mtb.151CI.SR.PGAP.Anno" 

Mtb_151CI_Hybrid_Bakta_AsmDir = f"{PGBenchmarking_DataOrg_Dir}/Mtb.151CI.Hybrid.Bakta.Anno" 
Mtb_151CI_SR_Bakta_AsmDir = f"{PGBenchmarking_DataOrg_Dir}/Mtb.151CI.SR.Bakta.Anno" 


!mkdir $Mtb_151CI_Hybrid_PGAP_AsmDir
!mkdir $Mtb_151CI_SR_PGAP_AsmDir

!mkdir $Mtb_151CI_Hybrid_Bakta_AsmDir
!mkdir $Mtb_151CI_SR_Bakta_AsmDir



mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg’: File exists
mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.Hybrid.PGAP.Anno’: File exists
mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.SR.PGAP.Anno’: File exists
mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.Hybrid.Bakta.Anno’: File exists
mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.SR.Bakta.Anno’: File exists


In [23]:
!ls -1 $PGBenchmarking_DataOrg_Dir

Ecoli.50CI.Hybrid.Bakta.Anno
Ecoli.50CI.SR.Bakta.Anno
Mtb.151CI.Hybrid.Bakta.Anno
Mtb.151CI.Hybrid.PGAP.Anno
Mtb.151CI.SR.Bakta.Anno
Mtb.151CI.SR.PGAP.Anno


In [24]:
SampleTag_ToPaths_Dict['N0153'].keys()

dict_keys(['LRAsm_Bakta_GFF', 'LRAsm_Bakta_GBFF', 'LRAsm_Bakta_FAA', 'LRAsm_Bakta_FFN', 'LRAsm_Bakta_FNA', 'LRAsm_Bakta_TXT', 'SRAsm_Bakta_GFF', 'SRAsm_Bakta_GBFF', 'SRAsm_Bakta_FAA', 'SRAsm_Bakta_FFN', 'SRAsm_Bakta_FNA', 'SRAsm_Bakta_TXT', 'LRAsm_PGAP_GFF', 'LRAsm_PGAP_FNA', 'LRAsm_PGAP_WiDNA_GFF', 'SRAsm_PGAP_GFF', 'SRAsm_PGAP_FNA', 'SRAsm_PGAP_WiDNA_GFF'])

In [25]:
WGA151CI_SampleIDs[:3]

['N0072', 'N0153', 'TB3113']

## Copying assembly (FASTA) and annotations (GBFF & GFF w/ genome seq)

#### Output files generated for each SampleID
- `{SampleID}.HybridAsm.PGAP.Genome.fasta`
- `{SampleID}.HybridAsm.PGAP.WiGenome.gff`
- `{SampleID}.SRAsm.PGAP.Genome.fasta`
- `{SampleID}.SRAsm.PGAP.WiGenome.gff`
- `{SampleID}.HybridAsm.Bakta.Genome.fasta`
- `{SampleID}.HybridAsm.Bakta.WiGenome.gff`
- `{SampleID}.SRAsm.Bakta.Genome.fasta`
- `{SampleID}.SRAsm.Bakta.WiGenome.gff`



In [26]:
for i_SampleID in tqdm(WGA151CI_SampleIDs):

    #print(i_SampleID)

    i_Sample_LR_PGAP_GFF_PATH = SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_PGAP_WiDNA_GFF"]
    i_Sample_LR_PGAP_FA_PATH = SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_PGAP_FNA"]

    i_Sample_SR_PGAP_GFF_PATH = SampleTag_ToPaths_Dict[i_SampleID]["SRAsm_PGAP_WiDNA_GFF"]
    i_Sample_SR_PGAP_FA_PATH = SampleTag_ToPaths_Dict[i_SampleID]["SRAsm_PGAP_FNA"]

    i_Sample_LR_Bakta_GFF_PATH = SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_Bakta_GFF"]
    i_Sample_LR_Bakta_FA_PATH = SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_Bakta_FNA"]
    i_Sample_SR_Bakta_GFF_PATH = SampleTag_ToPaths_Dict[i_SampleID]["SRAsm_Bakta_GFF"]
    i_Sample_SR_Bakta_FA_PATH = SampleTag_ToPaths_Dict[i_SampleID]["SRAsm_Bakta_FNA"]

    
    o_Sample_LR_PGAP_GFF_PATH = f"{Mtb_151CI_Hybrid_PGAP_AsmDir}/{i_SampleID}.HybridAsm.PGAP.WiGenome.gff"
    o_Sample_LR_PGAP_FA_PATH  = f"{Mtb_151CI_Hybrid_PGAP_AsmDir}/{i_SampleID}.HybridAsm.PGAP.Genome.fasta"
    
    o_Sample_SR_PGAP_GFF_PATH = f"{Mtb_151CI_SR_PGAP_AsmDir}/{i_SampleID}.SRAsm.PGAP.WiGenome.gff"
    o_Sample_SR_PGAP_FA_PATH  = f"{Mtb_151CI_SR_PGAP_AsmDir}/{i_SampleID}.SRAsm.PGAP.Genome.fasta"

    o_Sample_LR_Bakta_GFF_PATH = f"{Mtb_151CI_Hybrid_Bakta_AsmDir}/{i_SampleID}.HybridAsm.Bakta.WiGenome.gff"
    o_Sample_LR_Bakta_FA_PATH = f"{Mtb_151CI_Hybrid_Bakta_AsmDir}/{i_SampleID}.HybridAsm.Bakta.Genome.fasta"
    
    o_Sample_SR_Bakta_GFF_PATH = f"{Mtb_151CI_SR_Bakta_AsmDir}/{i_SampleID}.SRAsm.Bakta.WiGenome.gff"
    o_Sample_SR_Bakta_FA_PATH = f"{Mtb_151CI_SR_Bakta_AsmDir}/{i_SampleID}.SRAsm.Bakta.Genome.fasta"

    
    # Copy PGAP Anno for Hybrid Asm
    !cp $i_Sample_LR_PGAP_GFF_PATH $o_Sample_LR_PGAP_GFF_PATH
    !cp $i_Sample_LR_PGAP_FA_PATH $o_Sample_LR_PGAP_FA_PATH

    # Copy PGAP Anno for SR Asm
    !cp $i_Sample_SR_PGAP_GFF_PATH $o_Sample_SR_PGAP_GFF_PATH
    !cp $i_Sample_SR_PGAP_FA_PATH $o_Sample_SR_PGAP_FA_PATH

    # Copy Bakta Anno for Hybrid Asm
    !cp $i_Sample_LR_Bakta_GFF_PATH $o_Sample_LR_Bakta_GFF_PATH
    !cp $i_Sample_LR_Bakta_FA_PATH $o_Sample_LR_Bakta_FA_PATH
 
    # Copy Bakta Anno for SR Asm
    !cp $i_Sample_SR_Bakta_GFF_PATH $o_Sample_SR_Bakta_GFF_PATH
    !cp $i_Sample_SR_Bakta_FA_PATH $o_Sample_SR_Bakta_FA_PATH
        
    #break 


100%|██████████| 151/151 [03:24<00:00,  1.36s/it]


In [27]:
SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_PGAP_FNA"]

'/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/AsmAnalysis/S0262-02/GenomeAnnotation/S0262-02_Asm_PGAP_V1/S0262-02.PGAP.Genome.fasta'

In [28]:
#!ls -1 $Mtb_69CI_SelectedSet_AsmDir/

## Create tar.gz files of all directories containing genome sequences & annotations

In [36]:
# Same path as PGBenchmarking_DataOrg_Dir defined earlier; it contains the four
# annotation folders that the following cells archive with tar.
DataOrg_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg"

In [38]:
!cd $DataOrg_Dir && tar -czf Mtb.151CI.Hybrid.Bakta.Anno.tar.gz Mtb.151CI.Hybrid.Bakta.Anno/      


In [39]:
!cd $DataOrg_Dir && tar -czf Mtb.151CI.Hybrid.PGAP.Anno.tar.gz Mtb.151CI.Hybrid.PGAP.Anno/ 


In [40]:
!cd $DataOrg_Dir && tar -czf Mtb.151CI.SR.Bakta.Anno.tar.gz Mtb.151CI.SR.Bakta.Anno/ 


In [41]:
!cd $DataOrg_Dir && tar -czf Mtb.151CI.SR.PGAP.Anno.tar.gz Mtb.151CI.SR.PGAP.Anno/ 


In [49]:
!du -sh $DataOrg_Dir/Mtb.151CI.Hybrid.Bakta.Anno.tar.gz

498M	/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.Hybrid.Bakta.Anno.tar.gz


In [48]:
!du -sh $DataOrg_Dir/Mtb.151CI.Hybrid.PGAP.Anno.tar.gz

469M	/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.Hybrid.PGAP.Anno.tar.gz


In [47]:
!du -sh $DataOrg_Dir/Mtb.151CI.SR.Bakta.Anno.tar.gz

489M	/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.SR.Bakta.Anno.tar.gz


In [46]:
!du -sh $DataOrg_Dir/Mtb.151CI.SR.PGAP.Anno.tar.gz

459M	/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.SR.PGAP.Anno.tar.gz


In [None]:
# Directory names under DataOrg_Dir that were archived above. They are kept as
# comments because bare names like these are not valid Python and would raise
# a SyntaxError if this cell were executed.
#   Mtb.151CI.Hybrid.Bakta.Anno
#   Mtb.151CI.Hybrid.PGAP.Anno
#   Mtb.151CI.SR.Bakta.Anno
#   Mtb.151CI.SR.PGAP.Anno

In [29]:
!ls -1 $Mtb_151CI_Hybrid_PGAP_AsmDir

01_R1134.HybridAsm.PGAP.Genome.fasta
01_R1134.HybridAsm.PGAP.WiGenome.gff
01_R1430.HybridAsm.PGAP.Genome.fasta
01_R1430.HybridAsm.PGAP.WiGenome.gff
02_R0894.HybridAsm.PGAP.Genome.fasta
02_R0894.HybridAsm.PGAP.WiGenome.gff
02_R1179.HybridAsm.PGAP.Genome.fasta
02_R1179.HybridAsm.PGAP.WiGenome.gff
02_R1708.HybridAsm.PGAP.Genome.fasta
02_R1708.HybridAsm.PGAP.WiGenome.gff
02_R1896.HybridAsm.PGAP.Genome.fasta
02_R1896.HybridAsm.PGAP.WiGenome.gff
18_0621851.HybridAsm.PGAP.Genome.fasta
18_0621851.HybridAsm.PGAP.WiGenome.gff
3003-06.HybridAsm.PGAP.Genome.fasta
3003-06.HybridAsm.PGAP.WiGenome.gff
4549-04.HybridAsm.PGAP.Genome.fasta
4549-04.HybridAsm.PGAP.WiGenome.gff
696-05.HybridAsm.PGAP.Genome.fasta
696-05.HybridAsm.PGAP.WiGenome.gff
702-06.HybridAsm.PGAP.Genome.fasta
702-06.HybridAsm.PGAP.WiGenome.gff
706-05.HybridAsm.PGAP.Genome.fasta
706-05.HybridAsm.PGAP.WiGenome.gff
8129-04.HybridAsm.PGAP.Genome.fasta
8129-04.HybridAsm.PGAP.WiGenome.gff
8651-04.HybridAsm.PGAP.Genome.fasta
8651-04.HybridAs

In [30]:
#!ls -1 $Mtb_151CI_SR_PGAP_AsmDir

In [31]:
#!ls -1 $Mtb_151CI_Hybrid_Bakta_AsmDir

In [32]:
#!ls -1 $Mtb_151CI_SR_Bakta_AsmDir