# Organizing annotated genome assemblies (Mtb-151CI) - Trial Run

### Maximillian Marin (maximillian_marin@hms.harvard.edu)




### Import Statements

In [68]:
import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import json
%matplotlib inline

### Set matplotlib text export settings for Adobe Illustrator

In [69]:
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

#### Pandas Viewing Settings

In [70]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [71]:
plt.style.use('../nqo.mplstyle')

## Define paths to sample metadata files

In [72]:
!ls -1 ../../Data

230905_Ecoli_50CI_Metadata_Shaw2021
231121_HybridMtbAsm_QCPass_Meta_Set3_AsmPolishStats_OLD
231121.InputAsmTSVs.MtbSetV3.151CI
231130_Mtb_HybridVsSR_AsmSummary_TSVs
240116.PG_Results
Ecoli_50CI_Phylo
Evaluate_SRAsmPGAnalysis_WiHybridAsm
MtbPangenomeAnalysis_SetV3
README.md
SRAENA_RunMetadata


In [73]:
# Repository data directory (relative to this notebook) and the sub-directory
# holding the input assembly TSVs for the Mtb SetV3 151-isolate set.
Repo_DataDir = "../../Data"
InputAsmPath_Dir = Repo_DataDir + "/231121.InputAsmTSVs.MtbSetV3.151CI"

# TSV listing the hybrid and short-read assembly FASTA paths per sample
MtbSetV3_151CI_InputAsmPATHs_TSV = (
    InputAsmPath_Dir + "/231121.MtbSetV3.151CI.HybridAndSRAsm.FAPATHs.V1.tsv"
)

# TSV with the per-sample hybrid assembly summary
MtbSetV3_151CI_AsmSumm_TSV = (
    InputAsmPath_Dir + "/231121.MtbSetV3.151CI.HybridAsm.AsmSummary.V2.tsv"
)


## PARSE PATHs FOR ALL assemblies processed by this pipeline

In [74]:
# Load the tab-separated table of assembly FASTA paths.
# NOTE(review): the variable name says "158CI" while the input file is the
# 151CI path table -- confirm which is intended before renaming.
WGA158CI_LRandSR_Asm_Path_DF = pd.read_csv(
    MtbSetV3_151CI_InputAsmPATHs_TSV,
    sep="\t",
)

# Overwrite the header with the column names used in this notebook
# (raises if the file does not have exactly four columns).
WGA158CI_LRandSR_Asm_Path_DF.columns = [
    'SampleID',
    'Dataset_Tag',
    'Genome_LR_ASM_PATH',
    'Genome_SR_ASM_PATH',
]


In [75]:
WGA158CI_LRandSR_Asm_Path_DF.head(1)

Unnamed: 0,SampleID,Dataset_Tag,Genome_LR_ASM_PATH,Genome_SR_ASM_PATH
0,N0072,ChinerOms_2019,/n/data1/hms/dbmi/farhat/mm774/Projects/231121...,/n/data1/hms/dbmi/farhat/mm774/Projects/231121...


In [76]:
#WGA158CI_LRandSR_Asm_Path_DF[['SampleID', 'Dataset_Tag', 'Genome_LR_ASM_PATH']]

## Parse sample Metadata (N = 151)

In [77]:

# Load the hybrid-assembly summary table (tab-separated).
WGA151CI_AsmSummary_DF = pd.read_csv(MtbSetV3_151CI_AsmSumm_TSV, sep="\t")

# Sample IDs in table order; both names refer to the same list object.
WGA151CI_SampleIDs = SampleIDs_151CI_SOI = list(WGA151CI_AsmSummary_DF["SampleID"].values)

#print(','.join(SampleIDs_151CI_SOI) )

# SampleID -> metadata lookups, built row by row from two-column slices.
ID_To_PrimLineage_Dict = {
    sample_id: prim_lineage
    for sample_id, prim_lineage in WGA151CI_AsmSummary_DF[['SampleID', 'PrimaryLineage']].values
}
ID_To_SubLineage_Dict = {
    sample_id: sub_lineage
    for sample_id, sub_lineage in WGA151CI_AsmSummary_DF[["SampleID", "Lineage"]].values
}
ID_To_Dataset_Dict = {
    sample_id: dataset_tag
    for sample_id, dataset_tag in WGA151CI_AsmSummary_DF[['SampleID', 'Dataset_Tag']].values
}

# Display (n_rows, n_columns) as the cell output
WGA151CI_AsmSummary_DF.shape

(151, 7)

In [78]:
WGA151CI_AsmSummary_DF.head(3)

Unnamed: 0,SampleID,numContigs_Complete,Flye_CircContig_Cov,PrimaryLineage,Lineage,Dataset_Tag,AsmApproach
0,N0072,1,358,lineage1,"lineage1,lineage1.1,lineage1.1.2",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
1,N0153,1,372,lineage1,"lineage1,lineage1.1,lineage1.1.1,lineage1.1.1.1",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
2,TB3113,1,933,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon


#### What is the lineage breakdown?

In [79]:
WGA151CI_AsmSummary_DF["PrimaryLineage"].value_counts()

lineage4    62
lineage2    61
lineage1    15
lineage3     7
lineage6     3
lineage5     2
lineage8     1
Name: PrimaryLineage, dtype: int64

#### Which datasets are contributing to our 151 Mtb isolates?

In [80]:
WGA151CI_AsmSummary_DF["Dataset_Tag"].value_counts()

Hall2022                78
TB_Portals_24CI_R1      21
Peker2021               17
Farhat_Peru_2019        13
ChinerOms_2019          12
TRUST_PB_Set1            8
Lee2020_Elife            1
Ngabonziza_Lin8_2020     1
Name: Dataset_Tag, dtype: int64

#### What is the breakdown of hybrid assembly approaches used?
- 1 - `PBrs2_LR_Flye_I3_SR_Pilon` (N = 48) <br>
   PacBio RSII subreads assembled and polished (Flye), then polished w/ short-reads (Pilon)

- 2 - `PBccs_LR_Flye_I3_SR_Pilon` (N = 8) <br>
   PacBio Sequel II HiFi reads assembled and polished (Flye), then polished w/ short-reads (Pilon)
  
- 3 - `ONT9.4_LR_FlyeI3M_SR_Pilon_PolyPolish` (N = 95) <br>
   Oxford Nanopore (9.4.1) reads assembled and polished (Flye, Medaka), then polished w/ short-reads (Pilon, PolyPolish)


In [81]:
WGA151CI_AsmSummary_DF["AsmApproach"].value_counts() 

ONT9.4_LR_FlyeI3M_SR_Pilon_PolyPolish    95
PBrs2_LR_Flye_I3_SR_Pilon                48
PBccs_LR_Flye_I3_SR_Pilon                 8
Name: AsmApproach, dtype: int64

In [82]:
# Keep every isolate whose AsmApproach is NOT the ONT 9.4 approach; per the
# approach breakdown above, what remains are the two PacBio-based approaches.
Asm_PBonly_DF = WGA151CI_AsmSummary_DF.query("AsmApproach != 'ONT9.4_LR_FlyeI3M_SR_Pilon_PolyPolish'  ")
# Display (n_rows, n_columns) of the filtered table
Asm_PBonly_DF.shape

(56, 7)

In [83]:
# Selection rule: keep an isolate if it was NOT assembled with the ONT 9.4
# approach, OR if its primary lineage is lineage1 or lineage3.
Asm_PBonlyOrL1L3_DF = WGA151CI_AsmSummary_DF.query(
    "(AsmApproach != 'ONT9.4_LR_FlyeI3M_SR_Pilon_PolyPolish')"
    " | (PrimaryLineage == 'lineage1')"
    " | (PrimaryLineage == 'lineage3')"
)

# Sample IDs of the selected isolates (NumPy array).
# NOTE(review): the name says "69CI" but the selection yields 73 isolates
# (see the shape output below); the name is kept because later cells use it.
SelectedIDs_69CI = Asm_PBonlyOrL1L3_DF["SampleID"].values

# Display (n_rows, n_columns) of the selected table
Asm_PBonlyOrL1L3_DF.shape

(73, 7)

In [84]:
Asm_PBonlyOrL1L3_DF["PrimaryLineage"].value_counts()

lineage4    26
lineage2    19
lineage1    15
lineage3     7
lineage6     3
lineage5     2
lineage8     1
Name: PrimaryLineage, dtype: int64

In [85]:
Asm_PBonlyOrL1L3_DF["PrimaryLineage"].value_counts()

lineage4    26
lineage2    19
lineage1    15
lineage3     7
lineage6     3
lineage5     2
lineage8     1
Name: PrimaryLineage, dtype: int64

In [86]:
Asm_PBonlyOrL1L3_DF["Dataset_Tag"].value_counts()

TB_Portals_24CI_R1      21
Hall2022                14
Farhat_Peru_2019        13
ChinerOms_2019          12
TRUST_PB_Set1            8
Peker2021                3
Lee2020_Elife            1
Ngabonziza_Lin8_2020     1
Name: Dataset_Tag, dtype: int64

In [87]:
Asm_PBonly_DF["Dataset_Tag"].value_counts()

TB_Portals_24CI_R1      21
Farhat_Peru_2019        13
ChinerOms_2019          12
TRUST_PB_Set1            8
Lee2020_Elife            1
Ngabonziza_Lin8_2020     1
Name: Dataset_Tag, dtype: int64

In [88]:
WGA151CI_AsmSummary_DF["PrimaryLineage"].value_counts()

lineage4    62
lineage2    61
lineage1    15
lineage3     7
lineage6     3
lineage5     2
lineage8     1
Name: PrimaryLineage, dtype: int64

In [89]:
Asm_PBonly_DF["PrimaryLineage"].value_counts()

lineage4    26
lineage2    19
lineage3     3
lineage6     3
lineage1     2
lineage5     2
lineage8     1
Name: PrimaryLineage, dtype: int64

In [90]:
Asm_PBonly_DF["Lineage"].value_counts()

lineage2,lineage2.2,lineage2.2.1                   13
lineage4,lineage4.3,lineage4.3.3                    5
lineage2.2.1.1                                      4
lineage4,lineage4.1,lineage4.1.2,lineage4.1.2.1     4
lineage3                                            3
lineage6,lineageBOV_AFRI                            3
lineage4,lineage4.5                                 2
lineage4.4.1.1                                      2
lineage4,lineage4.4,lineage4.4.1                    2
lineage4,lineage4.4,lineage4.4.1,lineage4.4.1.1     2
lineage4,lineage4.1,lineage4.1.2                    2
lineage5                                            2
lineage4,lineage4.8                                 1
lineage2.2.1                                        1
lineage8                                            1
lineage1,lineage1.1,lineage1.1.2                    1
lineage4,lineage4.2,lineage4.2.1                    1
lineage4,lineage4.6,lineage4.6.2                    1
lineage1,lineage1.1,lineage1

In [91]:
Asm_PBonlyOrL1L3_DF

Unnamed: 0,SampleID,numContigs_Complete,Flye_CircContig_Cov,PrimaryLineage,Lineage,Dataset_Tag,AsmApproach
0,N0072,1,358,lineage1,"lineage1,lineage1.1,lineage1.1.2",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
1,N0153,1,372,lineage1,"lineage1,lineage1.1,lineage1.1.1,lineage1.1.1.1",ChinerOms_2019,PBrs2_LR_Flye_I3_SR_Pilon
2,TB3113,1,933,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
3,TB1236,1,374,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
4,TB2659,1,421,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
5,TB2780,1,877,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
6,TB1612,1,373,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
7,TB2512,1,913,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
8,TB2981,1,961,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon
9,TB3091,1,955,lineage2,"lineage2,lineage2.2,lineage2.2.1",TB_Portals_24CI_R1,PBrs2_LR_Flye_I3_SR_Pilon


# Define output dir of the Mtb-WGA-SMK processing pipeline

In [92]:
# Root directory that holds the outputs of the Mtb-WGA-SMK pipeline runs
Mtb_WGA_SMK_Outputs_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output"

# Output directory of the MtbSetV3 151CI run; both names hold the same path
# (the generic alias is what the path-building cell below reads).
Mtb_SMK_Pipeline_OutputDir = WGA151CI_SMK_OutputDir = f"{Mtb_WGA_SMK_Outputs_Dir}/231121_MtbSetV3_151CI"


In [93]:
#!ls -alh $Mtb_SMK_Pipeline_OutputDir

## Define paths to genome annotation files (Bakta for both Hybrid & SR ASMs, PGAP for Hybrid ASMs) and the Asm-to-H37Rv BAM

In [94]:
listOfSample_Tags = WGA151CI_SampleIDs
target_SMK_OutputDir = Mtb_SMK_Pipeline_OutputDir

# SampleID -> {file key -> path string under the pipeline output directory}.
# Paths are only constructed here; nothing in this cell checks that they exist.
SampleTag_ToPaths_Dict = {}

for SampleID in listOfSample_Tags:

    # Per-sample directories. These names keep the values of the LAST sample
    # once the loop ends; GenomeAnno_Dir is reused by the `!ls` cells below.
    sample_Asm_OutputDir = "/".join([target_SMK_OutputDir, "AsmAnalysis", SampleID])
    GenomeAnno_Dir = f"{sample_Asm_OutputDir}/GenomeAnnotation"

    Bakta_LR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_Asm_Bakta"        # LR (hybrid) Bakta annotation
    PGAP_LR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_Asm_PGAP_V1"       # LR (hybrid) PGAP annotation
    Bakta_SR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_SR_Asm_Bakta"     # SR Bakta annotation

    MM2_AsmToRef_Output_Dir = f"{sample_Asm_OutputDir}/VariantCallingVersusH37Rv/MM2_AsmToH37rv"

    # Key order matches the original insertion order:
    # LR Bakta, SR Bakta, LR PGAP, then the minimap2 BAM.
    SampleTag_ToPaths_Dict[SampleID] = {
        "LRAsm_Bakta_GFF": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.gff3",
        "LRAsm_Bakta_GBFF": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.gbff",
        "LRAsm_Bakta_FAA": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.faa",
        "LRAsm_Bakta_FFN": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.ffn",
        "LRAsm_Bakta_FNA": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.fna",
        "LRAsm_Bakta_TXT": f"{Bakta_LR_AsmDir}/{SampleID}.Bakta.txt",

        "SRAsm_Bakta_GFF": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.gff3",
        "SRAsm_Bakta_GBFF": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.gbff",
        "SRAsm_Bakta_FAA": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.faa",
        "SRAsm_Bakta_FFN": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.ffn",
        "SRAsm_Bakta_FNA": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.fna",
        "SRAsm_Bakta_TXT": f"{Bakta_SR_AsmDir}/{SampleID}.Bakta.txt",

        "LRAsm_PGAP_GFF": f"{PGAP_LR_AsmDir}/{SampleID}.PGAP.gff",
        "LRAsm_PGAP_FNA": f"{PGAP_LR_AsmDir}/{SampleID}.PGAP.Genome.fasta",
        "LRAsm_PGAP_WiDNA_GFF": f"{PGAP_LR_AsmDir}/{SampleID}.PGAP.WiDNA.gff",

        "MM2_AsmToH37Rv_BAM": f"{MM2_AsmToRef_Output_Dir}/{SampleID}.mm2.AsmToH37Rv.bam",
    }



In [95]:
!ls -1 $GenomeAnno_Dir

S0262-02_Asm_Bakta
S0262-02_Asm_PGAP_V1
S0262-02_SR_Asm_Bakta
S0262-02_SR_Asm_PGAP_V1


In [96]:
!ls -1 $GenomeAnno_Dir/S0262-02_Asm_PGAP_V1

S0262-02.PGAP.Genome.fasta
S0262-02.PGAP.gff
S0262-02.PGAP.gff.InputInfo.tsv
S0262-02.PGAP.WiDNA.gff


In [97]:
Proj_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects"

PGBenchmarking_DataOrg_Dir = f"{Proj_Dir}/240308.PGBenchmarking.DataOrg"
!mkdir $PGBenchmarking_DataOrg_Dir

Mtb_151CI_AsmDir = f"{PGBenchmarking_DataOrg_Dir}/Mtb.151CI.Hybrid.PGAP.Anno" 
!mkdir $Mtb_151CI_AsmDir

Mtb_69CI_SelectedSet_AsmDir = f"{PGBenchmarking_DataOrg_Dir}/Mtb.69CI.Hybrid.PGAP.Anno_SelectedForMTBCComp" 
!mkdir $Mtb_69CI_SelectedSet_AsmDir

mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg’: File exists
mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.151CI.Hybrid.PGAP.Anno’: File exists
mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Mtb.69CI.Hybrid.PGAP.Anno_SelectedForMTBCComp’: File exists


In [98]:
!ls -1 $Mtb_69CI_SelectedSet_AsmDir

01_R1134.PGAP.Genome.fasta
01_R1134.PGAP.gff
01_R1430.PGAP.Genome.fasta
01_R1430.PGAP.gff
02_R0894.PGAP.Genome.fasta
02_R0894.PGAP.gff
02_R1179.PGAP.Genome.fasta
02_R1179.PGAP.gff
02_R1708.PGAP.Genome.fasta
02_R1708.PGAP.gff
02_R1896.PGAP.Genome.fasta
02_R1896.PGAP.gff
M0003941_3.PGAP.Genome.fasta
M0003941_3.PGAP.gff
M0010874_7.PGAP.Genome.fasta
M0010874_7.PGAP.gff
M0011368_9.PGAP.Genome.fasta
M0011368_9.PGAP.gff
M0014888_3.PGAP.Genome.fasta
M0014888_3.PGAP.gff
M0016395_7.PGAP.Genome.fasta
M0016395_7.PGAP.gff
M0016737_0.PGAP.Genome.fasta
M0016737_0.PGAP.gff
M0017522_5.PGAP.Genome.fasta
M0017522_5.PGAP.gff
mada_107.PGAP.Genome.fasta
mada_107.PGAP.gff
mada_1-10.PGAP.Genome.fasta
mada_1-10.PGAP.gff
mada_117.PGAP.Genome.fasta
mada_117.PGAP.gff
mada_118.PGAP.Genome.fasta
mada_118.PGAP.gff
mada_1-1.PGAP.Genome.fasta
mada_1-1.PGAP.gff
mada_122.PGAP.Genome.fasta
mada_122.PGAP.gff
mada_1-36.PGAP.Genome.fasta
mada_1-36.PGAP.gff
mada_1-39.PGAP.Genome.fasta
mada_1-39.PGAP.gff
mada_1-44.PGAP.Genome

In [99]:
len(SelectedIDs_69CI)

73

In [100]:
for i_SampleID in SelectedIDs_69CI:
    
    i_Sample_LR_PGAP_GFF_PATH = SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_PGAP_GFF"]
    i_Sample_LR_PGAP_FA_PATH = SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_PGAP_FNA"]

    #print(i_SampleID)

    !cp $i_Sample_LR_PGAP_GFF_PATH $Mtb_69CI_SelectedSet_AsmDir/
    !cp $i_Sample_LR_PGAP_FA_PATH $Mtb_69CI_SelectedSet_AsmDir/

    #break 


In [101]:
SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_PGAP_FNA"]

'/n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/AsmAnalysis/S0262-02/GenomeAnnotation/S0262-02_Asm_PGAP_V1/S0262-02.PGAP.Genome.fasta'

In [102]:
#!ls -1 $Mtb_69CI_SelectedSet_AsmDir/

In [106]:
!ls -1 /n/data1/hms/dbmi/farhat/mm774/Projects/Mtb-WGA-SMK-Output/231121_MtbSetV3_151CI/AsmAnalysis/N0072/GenomeAnnotation/N0072_Asm_PGAP_V1

N0072.PGAP.Genome.fasta
N0072.PGAP.gff
N0072.PGAP.gff.InputInfo.tsv
N0072.PGAP.WiDNA.gff


In [107]:

# Project directory for the MTBC pangenome comparison
MTBC_PG_MainDir = "/n/data1/hms/dbmi/farhat/mm774/Projects/240308.MTBC_PG_Comparison_ForMarwan"

# Write the selected-isolate assembly table as a TSV (the DataFrame index is
# written as the first, unnamed column).
Asm_PBonlyOrL1L3_DF.to_csv(
    MTBC_PG_MainDir + "/MtbIsolates.AsmInfo.73CI.tsv",
    sep="\t",
)

