# Create input TSV for PG analysis - Houtak2023 - Saureus 68 Asm Dataset

In [88]:
import numpy as np
import pandas as pd
from tqdm import tqdm
#import matplotlib.pyplot as plt
#import seaborn as sns
#import pickle

%matplotlib inline

In [89]:
import glob

# 0) Define directories

In [90]:

Proj_MainDir = "/n/data1/hms/dbmi/farhat/mm774/Projects"

SA_PG_MainDir = f"{Proj_MainDir}/241206_Saureus_PG_Analysis"  

Data_MainDir = f"{SA_PG_MainDir}/Data"  

SA_68CI_GenomeFASTAs_Dir = f"{Data_MainDir}/CRS.Saureus.68CI.GenomesFromGitRepo"

Saureus_Repo_Meta_Dir = f"../../Data/241206_Saureus_HoutakBouras2023_Metadata"  

Sa_ENA_RunData_TSV = f"{Saureus_Repo_Meta_Dir}/ENA.RunMetadata.PRJNA914892.tsv"

Sa_HoutakBouras2023_Isolate_Metadata_TSV = f"{Saureus_Repo_Meta_Dir}/HoutakBouras2023.metadata.csv"

Sa_HoutakBouras2023_NCBI_GenomeInfo_TSV  = f"{Saureus_Repo_Meta_Dir}/NCBI.HoutakBouras2023.Saureus.174CI.HybridGenomes.tsv"

In [91]:
!ls -1 $Data_MainDir

CRS.Saureus.68CI.GenomesFromGitRepo
ENA.RunMetadata.PRJNA914892.tsv
HoutakBouras2023.metadata.csv
NCBI.HoutakBouras2023.Saureus.174CI.HybridGenomes.tsv


In [92]:
!ls -1 $SA_68CI_GenomeFASTAs_Dir | head 

C100.fasta
C113.fasta
C121.fasta
C133.fasta
C136.fasta
C13.fasta
C148.fasta
C149.fasta
C155.fasta
C16.fasta


# Part 1: Parse metadata for data & genomes from `HoutakBouras-2023` paper

### [paper link](https://www.microbiologyresearch.org/content/journal/mgen/10.1099/mgen.0.001128#tab2)
### [Git Repo Link](https://github.com/gbouras13/CRS_Saureus_Evolutionary_Landscape/tree/main)


### a) Parse Genome Assembly info (NCBI)

In [93]:
GenomeInfo_DF = pd.read_csv(Sa_HoutakBouras2023_NCBI_GenomeInfo_TSV, sep = "\t")
GenomeInfo_DF.shape

(175, 22)

In [94]:
GenomeInfo_Trim_DF = GenomeInfo_DF[["Organism Infraspecific Names Strain", "Assembly Accession", "Assembly Name", "Organism Name"]]
GenomeInfo_Trim_DF.columns = ["IsolateID", "Assembly_Accession", "Assembly_Name", "Organism"]
GenomeInfo_Trim_DF.shape

(175, 4)

In [95]:
GenomeInfo_Trim_DF.head(2)

Unnamed: 0,IsolateID,Assembly_Accession,Assembly_Name,Organism
0,C308,GCA_027920385.1,ASM2792038v1,Staphylococcus aureus
1,C308,GCF_027920385.1,ASM2792038v1,Staphylococcus aureus


### b) Parse sequencing run info (ENA)

In [96]:
ENA_RunAcc_Info_DF = pd.read_csv(Sa_ENA_RunData_TSV, sep="\t")
ENA_RunAcc_Info_DF.shape

(366, 19)

In [97]:
ENA_RunAcc_Info_DF.head(3)

Unnamed: 0,run_accession,sample_accession,experiment_accession,study_accession,tax_id,scientific_name,instrument_platform,instrument_model,nominal_length,library_layout,read_count,base_count,study_alias,fastq_bytes,fastq_ftp,submitted_ftp,sra_ftp,sample_alias,bam_ftp
0,SRR22859702,SAMN32360851,SRX18818535,PRJNA914892,1280,Staphylococcus aureus,OXFORD_NANOPORE,MinION,,SINGLE,46294,205749631,PRJNA914892,188942850,ftp.sra.ebi.ac.uk/vol1/fastq/SRR228/002/SRR228...,,ftp.sra.ebi.ac.uk/vol1/srr/SRR228/002/SRR22859702,C148,
1,SRR22859703,SAMN32360850,SRX18818534,PRJNA914892,1280,Staphylococcus aureus,OXFORD_NANOPORE,MinION,,SINGLE,32389,87157236,PRJNA914892,79890033,ftp.sra.ebi.ac.uk/vol1/fastq/SRR228/003/SRR228...,,ftp.sra.ebi.ac.uk/vol1/srr/SRR228/003/SRR22859703,C318,
2,SRR22859708,SAMN32360846,SRX18818529,PRJNA914892,1280,Staphylococcus aureus,OXFORD_NANOPORE,MinION,,SINGLE,42719,139895165,PRJNA914892,128110615,ftp.sra.ebi.ac.uk/vol1/fastq/SRR228/008/SRR228...,,ftp.sra.ebi.ac.uk/vol1/srr/SRR228/008/SRR22859708,C100,


In [98]:
ENA_RunAcc_Info_DF["instrument_model"].value_counts()

instrument_model
MinION                 174
NextSeq 550            147
Illumina HiSeq 2000     45
Name: count, dtype: int64

In [99]:
ENA_RunAcc_Illumina_DF = ENA_RunAcc_Info_DF.query("instrument_model != 'MinION'")
ENA_RunAcc_Illumina_DF.shape

(192, 19)

In [100]:
ENA_RunAcc_Illumina_DF["sample_alias"].nunique()

192

In [101]:
ENA_RunAcc_Illumina_Trim_DF = ENA_RunAcc_Illumina_DF[["sample_alias", "study_accession", "sample_accession", "run_accession", "read_count"]]
ENA_RunAcc_Illumina_Trim_DF.shape

(192, 5)

### c) Parse isolate metadata (From publication Git repo)

In [102]:

IsolateMetadata_DF = pd.read_csv(Sa_HoutakBouras2023_Isolate_Metadata_TSV)
IsolateMetadata_DF.shape

(68, 9)

# Part 1: Merge & subset metadata for 68 `S aureus` isolates from paper

In [103]:
IsolateMetadata_DF.head(1)

Unnamed: 0,rid,timepoint,Cnumber_ID,time_between_pairs,sex,age,aspirin_sensitivity,asthma,CRS_pheno
0,276,T0,C100,start,F,38,0,1,CRSwNP


In [104]:
ENA_RunAcc_Illumina_Trim_DF.head(1)

Unnamed: 0,sample_alias,study_accession,sample_accession,run_accession,read_count
7,C344,PRJNA914892,SAMN32360971,SRR22859722,2032071


In [105]:
GenomeInfo_Trim_DF.head(1)

Unnamed: 0,IsolateID,Assembly_Accession,Assembly_Name,Organism
0,C308,GCA_027920385.1,ASM2792038v1,Staphylococcus aureus


In [106]:
Sa_68CI_Info_DF = pd.merge(IsolateMetadata_DF, ENA_RunAcc_Illumina_Trim_DF, how = "left",
                           left_on="Cnumber_ID", right_on = "sample_alias" )

Sa_68CI_Info_DF = pd.merge(Sa_68CI_Info_DF, GenomeInfo_Trim_DF, how = "left",
                           left_on="Cnumber_ID", right_on = "IsolateID" )

Target_Cols = ["Cnumber_ID", "rid", "timepoint", "study_accession", "run_accession", "Assembly_Accession", "Organism"]

Sa_68CI_Info_Trim_DF = Sa_68CI_Info_DF[Target_Cols]

Sa_68CI_Info_Trim_DF.columns = ["SampleID", "PatientID", "Timepoint", "Bioproject", "SR_RunAccession", "Assembly_Accession", "Organism"]

Sa_68CI_Info_Trim_DF.shape

(68, 7)

In [107]:
Sa_68CI_Info_Trim_DF.head(10)

Unnamed: 0,SampleID,PatientID,Timepoint,Bioproject,SR_RunAccession,Assembly_Accession,Organism
0,C100,276,T0,PRJNA914892,SRR22859841,GCA_030290035.1,Staphylococcus aureus
1,C364,276,T1,PRJNA914892,SRR22859839,GCA_030289195.1,Staphylococcus aureus
2,C22,420,T0,PRJNA914892,SRR22859927,GCA_030290075.1,Staphylococcus aureus
3,C320,420,T1,PRJNA914892,SRR22859838,GCA_030289995.1,Staphylococcus aureus
4,C235,539,T0,PRJNA914892,SRR22859837,GCA_030290115.1,Staphylococcus aureus
5,C318,539,T1,PRJNA914892,SRR22859836,GCA_030290095.1,Staphylococcus aureus
6,C79,1170,T0,PRJNA914892,SRR22859834,GCA_030289815.1,Staphylococcus aureus
7,C148,1170,T1,PRJNA914892,SRR22859835,GCA_030290055.1,Staphylococcus aureus
8,C265,1415,T0,PRJNA914892,SRR22859928,GCA_030289955.1,Staphylococcus aureus
9,C324,1415,T1,PRJNA914892,SRR22859833,GCA_030289755.1,Staphylococcus aureus


# Part 2: Create a dict of Asm FAs 

In [108]:
dictOf_AsmFA_PATHs = {}

i_GenomesFA_Dir = SA_68CI_GenomeFASTAs_Dir

for i, row in Sa_68CI_Info_Trim_DF.iterrows():

    i_SampleID = row["SampleID"]

    i_FA_PATH = f"{i_GenomesFA_Dir}/{i_SampleID}.fasta"

    dictOf_AsmFA_PATHs[i_SampleID] = i_FA_PATH


In [109]:
len(dictOf_AsmFA_PATHs.keys())

68

In [110]:
dictOf_AsmFA_PATHs['C100']

'/n/data1/hms/dbmi/farhat/mm774/Projects/241206_Saureus_PG_Analysis/Data/CRS.Saureus.68CI.GenomesFromGitRepo/C100.fasta'

In [111]:
!ls -lah /n/data1/hms/dbmi/farhat/mm774/Projects/241206_Saureus_PG_Analysis/Data/CRS.Saureus.68CI.GenomesFromGitRepo/C100.fasta

-rw-rw-r-- 1 mm774 farhat 2.8M Nov 28  2023 /n/data1/hms/dbmi/farhat/mm774/Projects/241206_Saureus_PG_Analysis/Data/CRS.Saureus.68CI.GenomesFromGitRepo/C100.fasta


In [112]:
!head -n 2 /n/data1/hms/dbmi/farhat/mm774/Projects/241206_Saureus_PG_Analysis/Data/CRS.Saureus.68CI.GenomesFromGitRepo/C100.fasta

>C100 [organism=Staphylococcus aureus] [location=chromosome] [topology=circular] [completeness=complete]
ATGTCGGAAAAAGAAATTTGGGAAAAAGTGCTTGAAATTGCTCAAGAAAAATTATCAGCT


# Part 3: Add `Genome_ASM_PATH` to the DF

In [113]:
Sa_68CI_Info_Trim_DF["Genome_ASM_PATH"] = Sa_68CI_Info_Trim_DF["SampleID"].map(dictOf_AsmFA_PATHs)  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Sa_68CI_Info_Trim_DF["Genome_ASM_PATH"] = Sa_68CI_Info_Trim_DF["SampleID"].map(dictOf_AsmFA_PATHs)


In [114]:
Sa_68CI_Info_Trim_DF.head(4)

Unnamed: 0,SampleID,PatientID,Timepoint,Bioproject,SR_RunAccession,Assembly_Accession,Organism,Genome_ASM_PATH
0,C100,276,T0,PRJNA914892,SRR22859841,GCA_030290035.1,Staphylococcus aureus,/n/data1/hms/dbmi/farhat/mm774/Projects/241206...
1,C364,276,T1,PRJNA914892,SRR22859839,GCA_030289195.1,Staphylococcus aureus,/n/data1/hms/dbmi/farhat/mm774/Projects/241206...
2,C22,420,T0,PRJNA914892,SRR22859927,GCA_030290075.1,Staphylococcus aureus,/n/data1/hms/dbmi/farhat/mm774/Projects/241206...
3,C320,420,T1,PRJNA914892,SRR22859838,GCA_030289995.1,Staphylococcus aureus,/n/data1/hms/dbmi/farhat/mm774/Projects/241206...


# 3) Output TSV w/ sample info and Assembly FASTA PATHs

In [115]:

HoutakBouras2023_Saureus_68CI_AsmPATH_TSV = f"{Saureus_Repo_Meta_Dir}/241206.HoutakBouras2023.Saureus.68I.MetaData.And.AsmFA.tsv"

HoutakBouras2023_Saureus_68CI_AsmPATH_AltPath_TSV = f"{SA_PG_MainDir}/241206.HoutakBouras2023.Saureus.68I.MetaData.And.AsmFA.tsv"

Sa_68CI_Info_Trim_DF.to_csv(HoutakBouras2023_Saureus_68CI_AsmPATH_TSV,
                            sep = "\t",
                            index=False)

Sa_68CI_Info_Trim_DF.to_csv(HoutakBouras2023_Saureus_68CI_AsmPATH_AltPath_TSV,
                            sep = "\t",
                            index=False)


In [116]:
!wc -l $HoutakBouras2023_Saureus_68CI_AsmPATH_TSV  

69 ../../Data/241206_Saureus_HoutakBouras2023_Metadata/241206.HoutakBouras2023.Saureus.68I.MetaData.And.AsmFA.tsv


In [117]:
!wc -l $HoutakBouras2023_Saureus_68CI_AsmPATH_AltPath_TSV  

69 /n/data1/hms/dbmi/farhat/mm774/Projects/241206_Saureus_PG_Analysis/241206.HoutakBouras2023.Saureus.68I.MetaData.And.AsmFA.tsv
