# Parsing AVAILABLE SARS-CoV-2 sequencing metadata (ONT + Illumina WGS)

### Maximillian Marin (mgmarin@g.harvard.edu)


In [23]:
import pandas as pd

In [24]:
# List the contents of the parent directory, one entry per line.
!ls -1 ../

clusterConfig_Slurm_V1.json
config_V1.txt
CoronaSV_AssemblyBasedSVCalling_V1.smk
CoronaSV_AssemblyBasedSVCalling_V2.smk
DAGs
Data
Envs
JupyterNotebooks
logs
Notes
Plots
README.md
References
runInfo_TSVs
Scripts
SMK_config_V1.txt


In [25]:
# Relative path of the directory that holds the run-info TSV files.
RunInfo_TSVs_Dir = "../runInfo_TSVs"

# Path of the 201013 V1 metadata TSV inside that directory.
CoronaSV_Metadata_V1_201013_TSV_PATH = RunInfo_TSVs_Dir + "/201013_CoronaSV_Metadata_V1.tsv"

In [26]:
# List the files in the run-info TSV directory, one per line
# (IPython substitutes the Python variable RunInfo_TSVs_Dir into the command).
!ls -1 $RunInfo_TSVs_Dir

201013_CoronaSV_Metadata_V1.tsv
201013_CoronaSV_Metadata_V2_Ten_Illumina_PE_Runs_Only_TestSubset.tsv
Test_RunInfo_1Illumina_1ONT.tsv
Test_RunInfo_1Illumina_1ONT_V2.tsv


In [27]:
# Load the full metadata table; the file is read as tab-delimited text.
CoronaSV_Metadata_All_DF = pd.read_csv(
    CoronaSV_Metadata_V1_201013_TSV_PATH,
    sep="\t",
)

In [None]:
# Keep Oxford Nanopore rows whose assay type is anything other than RNA-Seq.
# NOTE: the same assignment appears again further down in this notebook.
CoronaSV_Metadata_ONT_DNA_DF = CoronaSV_Metadata_All_DF.loc[
    CoronaSV_Metadata_All_DF["Platform"].eq("OXFORD_NANOPORE")
    & CoronaSV_Metadata_All_DF["Assay_Type"].ne("RNA-Seq")
]

In [None]:
# Keep paired-layout Illumina rows whose assay type is anything other than RNA-Seq.
# NOTE: the same assignment appears again further down in this notebook.
CoronaSV_Metadata_Illumina_PE_DF = CoronaSV_Metadata_All_DF.loc[
    CoronaSV_Metadata_All_DF["LibraryLayout"].eq("PAIRED")
    & CoronaSV_Metadata_All_DF["Platform"].eq("ILLUMINA")
    & CoronaSV_Metadata_All_DF["Assay_Type"].ne("RNA-Seq")
]


In [28]:
# Dimensions of the full metadata table as (rows, columns).
CoronaSV_Metadata_All_DF.shape

(482, 14)

In [29]:
# Preview the first 10 rows of the full metadata table.
CoronaSV_Metadata_All_DF.head(10)

Unnamed: 0,BioProject,SRA_Study,Run,Experiment,BioSample,SRA_Sample,Assay_Type,Platform,Instrument,LibraryLayout,Library_Name,Sample_Name,Organism,isolate
0,PRJEB33797,ERP116617,ERR3460958,ERX3482709,SAMEA5841278,ERS3629828,RNA-Seq,ILLUMINA,Illumina HiSeq 2500,SINGLE,Illumina,160_WT_II_p21_illumina,Human coronavirus 229E,
1,PRJEB33797,ERP116617,ERR3460959,ERX3482710,SAMEA5841279,ERS3629829,RNA-Seq,ILLUMINA,Illumina HiSeq 2500,SINGLE,Illumina,SL2_BCoV_III_p12pool_illumina,Human coronavirus 229E,
2,PRJEB33797,ERP116617,ERR3460960,ERX3482711,SAMEA5841280,ERS3629830,RNA-Seq,ILLUMINA,Illumina HiSeq 2500,SINGLE,Illumina,SL2_SARS_II_p12pool_illumina,Human coronavirus 229E,
3,PRJEB33797,ERP116617,ERR3460961,ERX3482712,SAMEA5841281,-,RNA-Seq,OXFORD_NANOPORE,MinION,SINGLE,-,WT_direct-rna_nanopore,Human coronavirus 229E,-
4,PRJEB33797,ERP116617,ERR3460962,ERX3482713,SAMEA5841282,-,RNA-Seq,OXFORD_NANOPORE,MinION,SINGLE,-,SL2_direct-rna_nanopore,Human coronavirus 229E,-
5,PRJEB38459,ERP121890,ERR4173381,ERX4136993,SAMEA6853602,ERS4581192,RNA-Seq,ILLUMINA,Illumina NovaSeq 6000,PAIRED,unspecified,SAMEA6853602 S-42,Severe acute respiratory syndrome coronavirus 2,
6,PRJEB38459,ERP121890,ERR4173382,ERX4136994,SAMEA6853602,ERS4581192,AMPLICON,OXFORD_NANOPORE,GridION,SINGLE,unspecified,SAMEA6853602 S-42,Severe acute respiratory syndrome coronavirus 2,
7,PRJNA607948,SRP250294,SRR11140744,SRX7777166,SAMN14154205,SRS6189924,WGS,ILLUMINA,Illumina MiSeq,PAIRED,veroSTAT-1KO_illumina,veroSTAT-1KO_illumina,Severe acute respiratory syndrome coronavirus 2,2019-nCoV/USA-WI1/2020 Illumina replicate - Ve...
8,PRJNA607948,SRP250294,SRR11140745,SRX7777165,SAMN14154204,SRS6189920,WGS,OXFORD_NANOPORE,GridION,SINGLE,veroSTAT-1KO_ONT,veroSTAT-1KO_ONT,Severe acute respiratory syndrome coronavirus 2,2019-nCoV/USA-WI1/2020 ONT replicate - Vero ST...
9,PRJNA607948,SRP250294,SRR11140746,SRX7777164,SAMN14154203,SRS6189919,WGS,ILLUMINA,Illumina MiSeq,PAIRED,veroE6_illumina,veroE6_illumina,Severe acute respiratory syndrome coronavirus 2,2019-nCoV/USA-WI1/2020 Illumina replicate - Ve...


In [30]:
# Count rows per distinct value of the "Platform" column in the full table.
CoronaSV_Metadata_All_DF["Platform"].value_counts()

OXFORD_NANOPORE    308
ILLUMINA           174
Name: Platform, dtype: int64

In [31]:
# Count rows per distinct value of the "Assay_Type" column in the full table.
CoronaSV_Metadata_All_DF["Assay_Type"].value_counts()

AMPLICON            416
WGS                  36
Targeted-Capture     24
RNA-Seq               6
Name: Assay_Type, dtype: int64

In [54]:
# Subset of the full table: Oxford Nanopore rows, excluding RNA-Seq assays.
CoronaSV_Metadata_ONT_DNA_DF = CoronaSV_Metadata_All_DF.loc[
    CoronaSV_Metadata_All_DF["Platform"].eq("OXFORD_NANOPORE")
    & CoronaSV_Metadata_All_DF["Assay_Type"].ne("RNA-Seq")
]

In [55]:
# Dimensions of the Oxford Nanopore (non-RNA-Seq) subset as (rows, columns).
CoronaSV_Metadata_ONT_DNA_DF.shape

(306, 14)

In [56]:
# Count rows per distinct "Assay_Type" value within the Oxford Nanopore subset.
CoronaSV_Metadata_ONT_DNA_DF["Assay_Type"].value_counts()

AMPLICON    299
WGS           7
Name: Assay_Type, dtype: int64

In [57]:
# Subset of the full table: paired-layout Illumina rows, excluding RNA-Seq assays.
CoronaSV_Metadata_Illumina_PE_DF = CoronaSV_Metadata_All_DF.loc[
    CoronaSV_Metadata_All_DF["LibraryLayout"].eq("PAIRED")
    & CoronaSV_Metadata_All_DF["Platform"].eq("ILLUMINA")
    & CoronaSV_Metadata_All_DF["Assay_Type"].ne("RNA-Seq")
]


In [43]:
# Dimensions of the paired-end Illumina (non-RNA-Seq) subset as (rows, columns).
CoronaSV_Metadata_Illumina_PE_DF.shape

(170, 14)

In [47]:
# Sanity check: count rows per "Platform" value in the paired-end Illumina subset.
CoronaSV_Metadata_Illumina_PE_DF["Platform"].value_counts()

ILLUMINA    170
Name: Platform, dtype: int64

In [48]:
# Count rows per distinct "Assay_Type" value within the paired-end Illumina subset.
CoronaSV_Metadata_Illumina_PE_DF["Assay_Type"].value_counts()

AMPLICON            117
WGS                  29
Targeted-Capture     24
Name: Assay_Type, dtype: int64

In [49]:
# Display the paired-end Illumina subset; the saved output below shows a truncated view of it.
CoronaSV_Metadata_Illumina_PE_DF

Unnamed: 0,BioProject,SRA_Study,Run,Experiment,BioSample,SRA_Sample,Assay_Type,Platform,Instrument,LibraryLayout,Library_Name,Sample_Name,Organism,isolate
7,PRJNA607948,SRP250294,SRR11140744,SRX7777166,SAMN14154205,SRS6189924,WGS,ILLUMINA,Illumina MiSeq,PAIRED,veroSTAT-1KO_illumina,veroSTAT-1KO_illumina,Severe acute respiratory syndrome coronavirus 2,2019-nCoV/USA-WI1/2020 Illumina replicate - Ve...
9,PRJNA607948,SRP250294,SRR11140746,SRX7777164,SAMN14154203,SRS6189919,WGS,ILLUMINA,Illumina MiSeq,PAIRED,veroE6_illumina,veroE6_illumina,Severe acute respiratory syndrome coronavirus 2,2019-nCoV/USA-WI1/2020 Illumina replicate - Ve...
11,PRJNA607948,SRP250294,SRR11140748,SRX7777162,SAMN14154201,SRS6189918,WGS,ILLUMINA,Illumina MiSeq,PAIRED,vero76_illumina,vero76_illumina,Severe acute respiratory syndrome coronavirus 2,2019-nCoV/USA-WI1/2020 Illumina replicate - Ve...
13,PRJNA607948,SRP250294,SRR11140750,SRX7777160,SAMN14154199,SRS6189914,WGS,ILLUMINA,Illumina MiSeq,PAIRED,swab_illumina,swab_illumina,Severe acute respiratory syndrome coronavirus 2,2019-nCoV/USA-WI1/2020 Illumina replicate - Swab
17,PRJNA616147,SRP254488,SRR11514749,SRX8086361,SAMN14483190,SRS6395996,AMPLICON,ILLUMINA,Illumina iSeq 100,PAIRED,S2,EPI_ISL_417918,Severe acute respiratory syndrome coronavirus 2,SARS-CoV-2/188407/human/2020/Malaysia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,PRJNA650037,SRP277377,SRR12480559,SRX8941960,SAMN15691558,SRS7197848,AMPLICON,ILLUMINA,Illumina MiSeq,PAIRED,MDHP-00036_Illumina,hCoV-19/USA/MD-HP00036/2020,Severe acute respiratory syndrome coronavirus 2,SARS-CoV-2/human/USA/MD-HP00036/2020
478,PRJNA650037,SRP277377,SRR12480558,SRX8941961,SAMN15691559,SRS7197849,AMPLICON,ILLUMINA,Illumina MiSeq,PAIRED,MDHP-00037_Illumina,hCoV-19/USA/MD-HP00037/2020,Severe acute respiratory syndrome coronavirus 2,SARS-CoV-2/human/USA/MD-HP00037/2020
479,PRJNA650037,SRP277377,SRR12480549,SRX8941962,SAMN15691544,SRS7197850,AMPLICON,ILLUMINA,Illumina MiSeq,PAIRED,MDHP-00015_Illumina,hCoV-19/USA/MD-HP00015/2020,Severe acute respiratory syndrome coronavirus 2,SARS-CoV-2/human/USA/MD-HP00015/2020
480,PRJNA650037,SRP277377,SRR12480557,SRX8941963,SAMN15691560,SRS7197851,AMPLICON,ILLUMINA,Illumina MiSeq,PAIRED,MDHP-00038_Illumina,hCoV-19/USA/MD-HP00038/2020,Severe acute respiratory syndrome coronavirus 2,SARS-CoV-2/human/USA/MD-HP00038/2020
