# Create input TSV for PG analysis - Shaw2021 Ecoli 50 Asm Dataset

In [17]:
import numpy as np
import pandas as pd
from tqdm import tqdm
#import matplotlib.pyplot as plt
#import seaborn as sns
#import pickle

%matplotlib inline

In [18]:
import glob

## 0) Define directories

In [19]:

# Directory layout for the E. coli pan-genome analysis.
# Each path is derived from the one above it, so only Proj_MainDir needs
# editing when the project is relocated.
Proj_MainDir = "/n/data1/hms/dbmi/farhat/mm774/Projects"
Ecoli_PG_MainDir = f"{Proj_MainDir}/230905_Ecoli_PG_Analysis"
Data_MainDir = f"{Ecoli_PG_MainDir}/Data"

# Holds one sub-directory per assembly accession (GCA_*) of the Shaw2021 set.
Shaw2021_50Asm_Dir = f"{Data_MainDir}/Shaw2021.50Genomes.Data/data"


In [20]:
# Show the resolved data directory as a sanity check on the path constants.
print(Data_MainDir)

/n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/Data


In [21]:
!ls -1 $Data_MainDir

NCBI.RefSeq.20.EcoliGenomes.dataset
NCBI.RefSeq.20.EcoliGenomes.dataset.zip
NCBI.RefSeq.20.EcoliGenomes.Metadata.tsv
README.md
Shaw2021.50Genomes.Data
Shaw2021.50Genomes.Marin.zip
Shaw2021.NCBIDownloadNotes.txt
Shaw2021.SelectedIsolates.50I.tsv


In [22]:
!ls -1 $Shaw2021_50Asm_Dir | head -n 5

assembly_data_report.jsonl
dataset_catalog.json
GCA_013602835.1
GCA_013712565.1
GCA_013713325.1


In [23]:
!ls -1 $Shaw2021_50Asm_Dir | wc -l 

52


In [24]:
!ls -1 $Shaw2021_50Asm_Dir | grep "GCA" | wc -l 

50


In [25]:
!ls -1 $Shaw2021_50Asm_Dir/GCA_013602835.1

GCA_013602835.1_ASM1360283v1_genomic.fna
genomic.gff


## 1) Parse Asm INFO for Ecoli dataset (N = 50) 

In [26]:
# Tab-separated table describing the selected Shaw2021 isolates.
Shaw2021_Meta_TSV = f"{Data_MainDir}/Shaw2021.SelectedIsolates.50I.tsv"

In [27]:
# Load the isolate metadata table (tab-separated).
Shaw_50A_Info_DF = pd.read_csv(Shaw2021_Meta_TSV, sep="\t")

# SampleID is the assembly accession up to its first ".",
# e.g. "GCA_014109125.1" -> "GCA_014109125".
Shaw_50A_Info_DF["SampleID"] = (
    Shaw_50A_Info_DF["Assembly Accession"]
    .str.split(".")
    .str.get(0)
)

# Display (n_rows, n_columns) as the cell output.
Shaw_50A_Info_DF.shape

(50, 23)

In [28]:
# Preview the first four rows of the metadata table.
Shaw_50A_Info_DF.iloc[:4]

Unnamed: 0,Isolate,Biosample Accession,Short Reads Accession,Long Reads Accession,Assembly Accession,Animal or WwTW,Isolate source information,Timepoint,Niche,Niche.2,...,ST assignment,Circularised plasmids,Number of contigs,Circularised chromosome (1=yes),Total genome size (bp),Chromosome GC (if known),Phylogroup (E. coli only),Alternative isolate name,Long read sequencing method,SampleID
0,RH02|T1-C02,SAMN15147960,SRR11948863,SRR12299015,GCA_014109125.1,Animal,Pooled pig faecal samples collected from floor...,1,Pig,Faeces,...,10.0,1,2,1,4953790,50.7,A,RHB02-C02,PacBio,GCA_014109125
1,RH02|T1-C06,SAMN15147963,SRR11948664,SRR12298958,GCA_014109065.1,Animal,Pooled pig faecal samples collected from floor...,1,Pig,Faeces,...,10.0,1,2,1,4950024,50.7,A,RHB02-C06,Nanopore,GCA_014109065
2,RH02|T1-C10,SAMN15147965,SRR11949072,SRR12298585,GCA_013923405.1,Animal,Pooled pig faecal samples collected from floor...,1,Pig,Faeces,...,10.0,1,2,1,4951893,50.7,A,RHB02-C10,PacBio,GCA_013923405
3,RH10|T3-C14,SAMN15148305,SRR11949198,SRR12298645,GCA_013821965.1,Animal,Pooled cattle faecal samples collected from fl...,3,Cattle,Faeces,...,34.0,3,4,1,4914446,50.9,A,RHB31-C14,Nanopore,GCA_013821965


## 2) Create a dict of Asm FAs

In [29]:
# Map each assembly accession (e.g. "GCA_014109125.1") to the path of the
# genomic FASTA (*.fna) found in its directory under Shaw2021_50Asm_Dir.
dictOf_AsmFA_PATHs = {}

# Only the accession column is needed, so iterate over it directly
# rather than materialising every row with iterrows().
for i_AsmAcc in Shaw_50A_Info_DF["Assembly Accession"]:

    i_AsmDir = f"{Shaw2021_50Asm_Dir}/{i_AsmAcc}"

    # glob returns matches in arbitrary order; sort so the selected file is
    # deterministic if a directory ever holds more than one .fna.
    fasta_files = sorted(glob.glob(f'{i_AsmDir}/*.fna'))

    # Name the offending directory instead of failing with a bare IndexError.
    if not fasta_files:
        raise FileNotFoundError(f"No .fna file found in assembly directory: {i_AsmDir}")

    dictOf_AsmFA_PATHs[i_AsmAcc] = fasta_files[0]

## 3) Add `Genome_ASM_PATH` to the DF

In [30]:
# Look up each row's FASTA path by its assembly accession and store it
# as a new column.
asm_accessions = Shaw_50A_Info_DF["Assembly Accession"]
Shaw_50A_Info_DF["Genome_ASM_PATH"] = asm_accessions.map(dictOf_AsmFA_PATHs)

In [31]:
# Preview the first four rows again, now including Genome_ASM_PATH.
Shaw_50A_Info_DF.iloc[:4]

Unnamed: 0,Isolate,Biosample Accession,Short Reads Accession,Long Reads Accession,Assembly Accession,Animal or WwTW,Isolate source information,Timepoint,Niche,Niche.2,...,Circularised plasmids,Number of contigs,Circularised chromosome (1=yes),Total genome size (bp),Chromosome GC (if known),Phylogroup (E. coli only),Alternative isolate name,Long read sequencing method,SampleID,Genome_ASM_PATH
0,RH02|T1-C02,SAMN15147960,SRR11948863,SRR12299015,GCA_014109125.1,Animal,Pooled pig faecal samples collected from floor...,1,Pig,Faeces,...,1,2,1,4953790,50.7,A,RHB02-C02,PacBio,GCA_014109125,/n/data1/hms/dbmi/farhat/mm774/Projects/230905...
1,RH02|T1-C06,SAMN15147963,SRR11948664,SRR12298958,GCA_014109065.1,Animal,Pooled pig faecal samples collected from floor...,1,Pig,Faeces,...,1,2,1,4950024,50.7,A,RHB02-C06,Nanopore,GCA_014109065,/n/data1/hms/dbmi/farhat/mm774/Projects/230905...
2,RH02|T1-C10,SAMN15147965,SRR11949072,SRR12298585,GCA_013923405.1,Animal,Pooled pig faecal samples collected from floor...,1,Pig,Faeces,...,1,2,1,4951893,50.7,A,RHB02-C10,PacBio,GCA_013923405,/n/data1/hms/dbmi/farhat/mm774/Projects/230905...
3,RH10|T3-C14,SAMN15148305,SRR11949198,SRR12298645,GCA_013821965.1,Animal,Pooled cattle faecal samples collected from fl...,3,Cattle,Faeces,...,3,4,1,4914446,50.9,A,RHB31-C14,Nanopore,GCA_013821965,/n/data1/hms/dbmi/farhat/mm774/Projects/230905...


In [32]:
# Show the full, untruncated FASTA path of the first row.
Shaw_50A_Info_DF["Genome_ASM_PATH"].iloc[0]

'/n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/Data/Shaw2021.50Genomes.Data/data/GCA_014109125.1/GCA_014109125.1_ASM1410912v1_genomic.fna'

In [33]:
# !ls -lah /n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/Data/Shaw2021.50Genomes.Data/data/GCA_014109125.1/GCA_014109125.1_ASM1410912v1_genomic.fna

In [None]:
# NOTE(review): this cell contained only a bare notebook filename. Python parses
# it as attribute access on the float 231212.7, so running the cell raises
# AttributeError. Kept as a comment so Restart & Run All succeeds:
# 231212.7.A.panqc.NRC.Mtb151CI.Analysis.Part1.Try4.ipynb

## 4) Output TSV w/ sample info and Assembly FASTA PATHs

In [35]:
Ecoli_Meta_Dir = f"../../Data/230905_Ecoli_50CI_Metadata_Shaw2021"  

!mkdir $Ecoli_Meta_Dir


Shaw2021_Ecoli_50Asm_AsmPATH_TSV = f"{Ecoli_Meta_Dir}/231011.Shaw2021.Ecoli.50I.AsmFA.tsv"

Shaw_50A_Info_DF[["SampleID", "Isolate", "Assembly Accession", "Genome_ASM_PATH"]].to_csv(Shaw2021_Ecoli_50Asm_AsmPATH_TSV, sep = "\t", index=False)


Shaw2021_Ecoli_50Asm_Meta_TSV = f"{Ecoli_Meta_Dir}/231011.Shaw2021.Ecoli.50I.MetaData.tsv"

Shaw_50A_Info_DF.to_csv(Shaw2021_Ecoli_50Asm_Meta_TSV, sep = "\t", index=False)



In [36]:
# Show where the slim SampleID / FASTA-path table was written.
print(Shaw2021_Ecoli_50Asm_AsmPATH_TSV)

../../Data/230905_Ecoli_50CI_Metadata_Shaw2021/231011.Shaw2021.Ecoli.50I.AsmFA.tsv


In [37]:
!wc -l $Shaw2021_Ecoli_50Asm_AsmPATH_TSV

51 ../../Data/230905_Ecoli_50CI_Metadata_Shaw2021/231011.Shaw2021.Ecoli.50I.AsmFA.tsv


In [38]:
!head -n 2 $Shaw2021_Ecoli_50Asm_AsmPATH_TSV

SampleID	Isolate	Assembly Accession	Genome_ASM_PATH
GCA_014109125	RH02|T1-C02	GCA_014109125.1	/n/data1/hms/dbmi/farhat/mm774/Projects/230905_Ecoli_PG_Analysis/Data/Shaw2021.50Genomes.Data/data/GCA_014109125.1/GCA_014109125.1_ASM1410912v1_genomic.fna


In [39]:
!wc -l $Shaw2021_Ecoli_50Asm_Meta_TSV

51 ../../Data/230905_Ecoli_50CI_Metadata_Shaw2021/231011.Shaw2021.Ecoli.50I.MetaData.tsv


In [41]:
#!head -n 4 $Shaw2021_Ecoli_50Asm_Meta_TSV