# Organizing annotated genome assemblies (Ecoli-50I)

### Maximillian Marin (maximillian_marin@hms.harvard.edu)


### Import Statements

In [165]:
import json
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

%matplotlib inline

### Set matplotlib text export settings for Adobe Illustrator

In [166]:
import matplotlib

# Font type 42 makes the PDF and PostScript backends embed TrueType fonts,
# the export setting this section's heading targets for Adobe Illustrator.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

#### Pandas Viewing Settings

In [167]:
# Raise pandas' display limits so long / wide tables are shown without
# truncation: up to 500 rows, 500 columns, and 1000 characters of width.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [168]:
plt.style.use('../nqo.mplstyle')

## Define paths to sample metadata files

In [169]:

# --- Project directories (absolute paths) ---
Proj_MainDir = "/n/data1/hms/dbmi/farhat/mm774/Projects"
Ecoli_PG_MainDir = Proj_MainDir + "/230905_Ecoli_PG_Analysis"
Data_MainDir = Ecoli_PG_MainDir + "/Data"

# --- Shaw 2021 data directory and selected-isolates table ---
Shaw2021_50Asm_Dir = Data_MainDir + "/Shaw2021.50Genomes.Data/data"
Shaw2021_Meta_TSV = Data_MainDir + "/Shaw2021.SelectedIsolates.50I.tsv"

# --- Directory of input path TSVs ---
input_FA_PATH_TSVs_Dir = Ecoli_PG_MainDir + "/input_PATH_SMK_TSVs"

# --- Metadata directory (relative path, resolved against the working directory) ---
Ecoli_Meta_Dir = "../../Data/230905_Ecoli_50CI_Metadata_Shaw2021"
Shaw2021_Ecoli_50Asm_Meta_TSV = Ecoli_Meta_Dir + "/231011.Shaw2021.Ecoli.50I.MetaData.tsv"


## Parse in metadata DFs

In [170]:
# Read the tab-separated isolate metadata table.
Shaw_50A_Info_DF = pd.read_csv(
    Shaw2021_Ecoli_50Asm_Meta_TSV,
    sep="\t",
)

# Sample IDs in table order; later cells iterate over these.
E50I_SampleIDs = Shaw_50A_Info_DF.loc[:, "SampleID"].values

In [171]:
Shaw_50A_Info_DF.shape

(50, 24)

### Create lineage mapping for sampleIDs

In [172]:
# Map each SampleID to its phylogroup label (for a repeated SampleID the last row wins).
ID_To_Lineage_Dict = dict(
    zip(
        Shaw_50A_Info_DF['SampleID'],
        Shaw_50A_Info_DF['Phylogroup (E. coli only)'],
    )
)


In [173]:
Shaw_50A_Info_DF["Phylogroup (E. coli only)"].value_counts().index.sort_values()

Index(['A', 'B1', 'B2', 'C', 'D', 'E', 'F', 'G', 'cladeV'], dtype='object')

# Define output dir of the Mtb-WGA-SMK processing pipeline

In [174]:
!ls -1 $Ecoli_PG_MainDir/

Data
input_PATH_SMK_TSVs
SMK_OutDirs


In [175]:
# Root directory holding the processing pipeline's output directories.
WGA_SMK_Outputs_Dir = Ecoli_PG_MainDir + "/SMK_OutDirs"

# Output directory of the 50-isolate run.
# NOTE: "5OI" (letter O, not zero) is how this path is spelled in the listings
# further down the notebook, so it must stay exactly as written.
PG_Ecoli_50I_OutDir = f"{WGA_SMK_Outputs_Dir}/231011_Ecoli_Shaw2021_5OI_V1"


## Define paths to all Assembly FAs + BAKTA Annotations (Short + Long read ASMs)

In [176]:
# Build SampleTag_ToPaths_Dict: SampleID -> {"<LRAsm|SRAsm>_Bakta_<TYPE>": path}
# for the Bakta annotation outputs of each sample's long-read (LR) and
# short-read (SR) assemblies.
#
# Paths are only constructed here; nothing checks that the files exist.
listOfSample_Tags = E50I_SampleIDs

target_SMK_OutputDir = PG_Ecoli_50I_OutDir

# Dict-key suffix -> Bakta output file extension.
# Insertion order fixes the key order within each sample's dict.
Bakta_KeySuffix_To_Ext = {
    "GFF": "gff3",
    "GBFF": "gbff",
    "FAA": "faa",
    "FFN": "ffn",
    "FNA": "fna",
    "TXT": "txt",
}

SampleTag_ToPaths_Dict = {}

for SampleID in listOfSample_Tags:
    # NOTE: GenomeAnno_Dir, Bakta_LR_AsmDir and Bakta_SR_AsmDir are read by
    # later `!ls` cells after this loop finishes (holding the last sample's
    # values), so these names must stay assigned at notebook scope.

    # LR Bakta annotation directory
    sample_Asm_OutputDir = f"{target_SMK_OutputDir}/AsmAnalysis/{SampleID}"
    GenomeAnno_Dir = f"{sample_Asm_OutputDir}/GenomeAnnotation"
    Bakta_LR_AsmDir = f"{GenomeAnno_Dir}/{SampleID}_Asm_Bakta"

    # SR Bakta annotation directory
    sample_SRAsm_OutputDir = f"{target_SMK_OutputDir}/SR_DataProcessing/{SampleID}"
    SR_GenomeAnno_Dir = f"{sample_SRAsm_OutputDir}/GenomeAnnotation"
    Bakta_SR_AsmDir = f"{SR_GenomeAnno_Dir}/{SampleID}_Asm_Bakta"

    # All LR keys first, then all SR keys (same order as the file-type table).
    dictOfPaths_Temp = {}
    for asm_Tag, bakta_AsmDir in (("LRAsm", Bakta_LR_AsmDir), ("SRAsm", Bakta_SR_AsmDir)):
        for key_Suffix, file_Ext in Bakta_KeySuffix_To_Ext.items():
            dictOfPaths_Temp[f"{asm_Tag}_Bakta_{key_Suffix}"] = (
                f"{bakta_AsmDir}/{SampleID}.Bakta.{file_Ext}"
            )

    SampleTag_ToPaths_Dict[SampleID] = dictOfPaths_Temp



In [177]:
listOfSample_Tags[0]

'GCA_014109125'

In [178]:
#!ls -1 $Bakta_AsmDir

In [179]:
SampleTag_ToPaths_Dict['GCA_014109125']["LRAsm_Bakta_FNA"] 

'/n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/SMK_OutDirs/231011_Ecoli_Shaw2021_5OI_V1/AsmAnalysis/GCA_014109125/GenomeAnnotation/GCA_014109125_Asm_Bakta/GCA_014109125.Bakta.fna'

In [180]:
SampleTag_ToPaths_Dict['GCA_014109125']["SRAsm_Bakta_FNA"]

'/n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/SMK_OutDirs/231011_Ecoli_Shaw2021_5OI_V1/SR_DataProcessing/GCA_014109125/GenomeAnnotation/GCA_014109125_Asm_Bakta/GCA_014109125.Bakta.fna'

In [181]:
SampleTag_ToPaths_Dict['GCA_014109125']["SRAsm_Bakta_GFF"]

'/n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/SMK_OutDirs/231011_Ecoli_Shaw2021_5OI_V1/SR_DataProcessing/GCA_014109125/GenomeAnnotation/GCA_014109125_Asm_Bakta/GCA_014109125.Bakta.gff3'

In [182]:
!ls -alh $PG_Ecoli_50I_OutDir/FastANI

total 128K
drwxrwsr-x  4 mm774 farhat   63 Oct 12 02:14 .
drwxrwsr-x 59 mm774 farhat 1.8K Jan  1 18:03 ..
drwxrwsr-x  2 mm774 farhat   44 Oct 12 02:24 FastANI_LRAsm
drwxrwsr-x  2 mm774 farhat   41 Oct 12 02:13 FastANI_LRAsms


In [183]:
!ls -alh $PG_Ecoli_50I_OutDir/FastANI/FastANI_LRAsm

total 200K
drwxrwsr-x 2 mm774 farhat    44 Oct 12 02:24 .
drwxrwsr-x 4 mm774 farhat    63 Oct 12 02:14 ..
-rw-rw-r-- 1 mm774 farhat 1006K Oct 12 02:25 FastANI.AllVsAll.LRAsm.txt


In [184]:
!ls -alh $PG_Ecoli_50I_OutDir/FastANI/FastANI_LRAsms

total 88K
drwxrwsr-x 2 mm774 farhat   41 Oct 12 02:13 .
drwxrwsr-x 4 mm774 farhat   63 Oct 12 02:14 ..
-rw-rw-r-- 1 mm774 farhat 9.7K Oct 12 02:13 LRAsms.PathToFASTAs.txt


In [185]:
# `GenomeAnno_Dir` is only assigned inside the path-building loop above, so this
# lists the annotation directory of the last sample that loop processed.
!ls -1 $GenomeAnno_Dir

GCA_013602835_Asm_Bakta


In [186]:
# NOTE(review): `S0262-02_Asm_PGAP_V1` does not match the `<SampleID>_Asm_Bakta`
# directory naming used in this notebook (presumably a leftover from another
# analysis; confirm). The directory does not exist, so this listing fails.
!ls -1 $GenomeAnno_Dir/S0262-02_Asm_PGAP_V1

ls: cannot access /n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/SMK_OutDirs/231011_Ecoli_Shaw2021_5OI_V1/AsmAnalysis/GCA_013602835/GenomeAnnotation/S0262-02_Asm_PGAP_V1: No such file or directory


In [187]:
# List the Bakta outputs for the long-read assembly. `Bakta_LR_AsmDir` is only
# assigned inside the path-building loop above, so it holds the last sample's directory.
!ls -1 $Bakta_LR_AsmDir

GCA_013602835.Bakta.embl
GCA_013602835.Bakta.faa
GCA_013602835.Bakta.ffn
GCA_013602835.Bakta.fna
GCA_013602835.Bakta.gbff
GCA_013602835.Bakta.gbff.InputInfo.tsv
GCA_013602835.Bakta.gff3
GCA_013602835.Bakta.hypotheticals.faa
GCA_013602835.Bakta.hypotheticals.tsv
GCA_013602835.Bakta.json
GCA_013602835.Bakta.log
GCA_013602835.Bakta.png
GCA_013602835.Bakta.svg
GCA_013602835.Bakta.tsv
GCA_013602835.Bakta.txt


In [188]:
# List the Bakta outputs for the short-read assembly. `Bakta_SR_AsmDir` is only
# assigned inside the path-building loop above, so it holds the last sample's directory.
!ls -1 $Bakta_SR_AsmDir

GCA_013602835.Bakta.embl
GCA_013602835.Bakta.faa
GCA_013602835.Bakta.ffn
GCA_013602835.Bakta.fna
GCA_013602835.Bakta.gbff
GCA_013602835.Bakta.gff3
GCA_013602835.Bakta.hypotheticals.faa
GCA_013602835.Bakta.hypotheticals.tsv
GCA_013602835.Bakta.json
GCA_013602835.Bakta.log
GCA_013602835.Bakta.png
GCA_013602835.Bakta.svg
GCA_013602835.Bakta.tsv
GCA_013602835.Bakta.txt


In [189]:
Proj_Dir = "/n/data1/hms/dbmi/farhat/mm774/Projects"

PGBenchmarking_DataOrg_Dir = f"{Proj_Dir}/240308.PGBenchmarking.DataOrg"
!mkdir $PGBenchmarking_DataOrg_Dir

Ecoli_50CI_Hybrid_Bakta_AsmDir = f"{PGBenchmarking_DataOrg_Dir}/Ecoli.50CI.Hybrid.Bakta.Anno" 
Ecoli_50CI_SR_Bakta_AsmDir = f"{PGBenchmarking_DataOrg_Dir}/Ecoli.50CI.SR.Bakta.Anno" 

!mkdir $Ecoli_50CI_Hybrid_Bakta_AsmDir
!mkdir $Ecoli_50CI_SR_Bakta_AsmDir


mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg’: File exists
mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Ecoli.50CI.Hybrid.Bakta.Anno’: File exists
mkdir: cannot create directory ‘/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Ecoli.50CI.SR.Bakta.Anno’: File exists


In [190]:
!ls -1 $PGBenchmarking_DataOrg_Dir

Ecoli.50CI.Hybrid.Bakta.Anno
Ecoli.50CI.SR.Bakta.Anno
Mtb.151CI.Hybrid.Bakta.Anno
Mtb.151CI.Hybrid.PGAP.Anno
Mtb.151CI.SR.Bakta.Anno
Mtb.151CI.SR.PGAP.Anno


In [191]:
SampleTag_ToPaths_Dict['GCA_014109125'].keys()

dict_keys(['LRAsm_Bakta_GFF', 'LRAsm_Bakta_GBFF', 'LRAsm_Bakta_FAA', 'LRAsm_Bakta_FFN', 'LRAsm_Bakta_FNA', 'LRAsm_Bakta_TXT', 'SRAsm_Bakta_GFF', 'SRAsm_Bakta_GBFF', 'SRAsm_Bakta_FAA', 'SRAsm_Bakta_FFN', 'SRAsm_Bakta_FNA', 'SRAsm_Bakta_TXT'])

In [192]:
E50I_SampleIDs[:3] 

array(['GCA_014109125', 'GCA_014109065', 'GCA_013923405'], dtype=object)

In [193]:
# Copy each sample's Bakta GFF3 and genome FASTA (.fna) into the organized
# output directories, once for the hybrid (LR) assembly and once for the
# short-read (SR) assembly.
#
# Output names:
#   <outdir>/<SampleID>.<HybridAsm|SRAsm>.Bakta.WiGenome.gff
#   <outdir>/<SampleID>.<HybridAsm|SRAsm>.Bakta.Genome.fasta
#
# shutil.copy raises if a source file is missing, so an incomplete set stops
# the loop here. The earlier `!cp` only printed an error and carried on.
# Existing destination files are overwritten, so the cell is safe to re-run.

# (key prefix in SampleTag_ToPaths_Dict, label used in output file names, output dir)
asm_CopyTargets = (
    ("LRAsm", "HybridAsm", Ecoli_50CI_Hybrid_Bakta_AsmDir),
    ("SRAsm", "SRAsm", Ecoli_50CI_SR_Bakta_AsmDir),
)

for i_SampleID in tqdm(E50I_SampleIDs):

    i_Sample_Paths = SampleTag_ToPaths_Dict[i_SampleID]

    for asm_Key, asm_Label, o_AsmDir in asm_CopyTargets:

        i_Sample_Bakta_GFF_PATH = i_Sample_Paths[f"{asm_Key}_Bakta_GFF"]
        i_Sample_Bakta_FA_PATH = i_Sample_Paths[f"{asm_Key}_Bakta_FNA"]

        o_Sample_Bakta_GFF_PATH = f"{o_AsmDir}/{i_SampleID}.{asm_Label}.Bakta.WiGenome.gff"
        o_Sample_Bakta_FA_PATH = f"{o_AsmDir}/{i_SampleID}.{asm_Label}.Bakta.Genome.fasta"

        shutil.copy(i_Sample_Bakta_GFF_PATH, o_Sample_Bakta_GFF_PATH)
        shutil.copy(i_Sample_Bakta_FA_PATH, o_Sample_Bakta_FA_PATH)


100%|██████████| 50/50 [00:34<00:00,  1.42it/s]


In [194]:
SampleTag_ToPaths_Dict[i_SampleID]["LRAsm_Bakta_FNA"]

'/n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/SMK_OutDirs/231011_Ecoli_Shaw2021_5OI_V1/AsmAnalysis/GCA_013602835/GenomeAnnotation/GCA_013602835_Asm_Bakta/GCA_013602835.Bakta.fna'

In [195]:
#!ls -1 $Mtb_69CI_SelectedSet_AsmDir/

In [196]:
!ls -1 $PGBenchmarking_DataOrg_Dir

Ecoli.50CI.Hybrid.Bakta.Anno
Ecoli.50CI.SR.Bakta.Anno
Mtb.151CI.Hybrid.Bakta.Anno
Mtb.151CI.Hybrid.PGAP.Anno
Mtb.151CI.SR.Bakta.Anno
Mtb.151CI.SR.PGAP.Anno


In [203]:
!cd /n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg && ls -1

Ecoli.50CI.Hybrid.Bakta.Anno
Ecoli.50CI.SR.Bakta.Anno
Mtb.151CI.Hybrid.Bakta.Anno
Mtb.151CI.Hybrid.PGAP.Anno
Mtb.151CI.SR.Bakta.Anno
Mtb.151CI.SR.PGAP.Anno


In [207]:
# Same directory as PGBenchmarking_DataOrg_Dir (defined where the output
# directories are created). Reusing it keeps the absolute path in one place.
DataOrg_Dir = PGBenchmarking_DataOrg_Dir

In [204]:
# Bundle the hybrid-assembly annotation directory into a gzip-compressed
# tarball, written inside DataOrg_Dir (`-v` lists each archived file).
!cd $DataOrg_Dir && tar -czvf Ecoli.50CI.Hybrid.Bakta.Anno.tar.gz Ecoli.50CI.Hybrid.Bakta.Anno/ 


In [205]:
# Bundle the short-read-assembly annotation directory into a gzip-compressed
# tarball, written inside DataOrg_Dir (`-v` lists each archived file).
!cd $DataOrg_Dir && tar -czvf Ecoli.50CI.SR.Bakta.Anno.tar.gz Ecoli.50CI.SR.Bakta.Anno/ 


In [210]:
!du -sh $DataOrg_Dir/Ecoli.50CI.Hybrid.Bakta.Anno.tar.gz

199M	/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Ecoli.50CI.Hybrid.Bakta.Anno.tar.gz


In [211]:
!du -sh $DataOrg_Dir/Ecoli.50CI.SR.Bakta.Anno.tar.gz

195M	/n/data1/hms/dbmi/farhat/mm774/Projects/240308.PGBenchmarking.DataOrg/Ecoli.50CI.SR.Bakta.Anno.tar.gz


In [212]:
#!ls -1 $Ecoli_50CI_Hybrid_Bakta_AsmDir

In [214]:
#!ls -1 $Ecoli_50CI_SR_Bakta_AsmDir

In [199]:
!ls -1 /n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/SMK_OutDirs/231011_Ecoli_Shaw2021_5OI_V1/SR_DataProcessing/GCA_013602835 



GenomeAnnotation
IlluminaWGS
