In [1]:
# Third-party dependencies used by this notebook.
import pandas as pd
import pyskim
from pysradb.search import SraSearch
from tqdm.autonotebook import tqdm


# Query SRA to find relevant sample accessions

## Define query term

In [22]:
# Search term restricted to NCBI taxon 10244 (reported as "Monkeypox virus" in
# the search results below).
# A narrower variant that additionally required
#   "filetype cram"[Properties] OR "filetype bam"[Properties] OR "filetype fastq"[Properties]
# used to be assigned here first, but it was overwritten on the very next line
# and never reached the search, so only the effective term is kept.
query = 'txid10244[Organism:noexp]'

# Passed to the search as `return_max`, i.e. the cap on returned records.
max_query_num = 1_000_000

## Search

In [38]:
# Configure the SRA search from the query term and result cap defined above.
instance = SraSearch(
    query=query,
    return_max=max_query_num,
    verbosity=3,
    # platform="illumina" was tried as an extra restriction and is left disabled.
)

# Execute the search, then pull the hits out as a DataFrame.
instance.search()
df_search = instance.get_df()

100%|███████████████████████████████████████████| 63/63 [00:03<00:00, 18.36it/s]


## Save output

In [39]:
# Persist the full (still unfiltered) search table, then preview its first row.
search_csv_path = "sra_search.csv.gz"
df_search.to_csv(search_csv_path, index=False)
df_search.head(1)

Unnamed: 0,study_accession,experiment_accession,experiment_title,sample_taxon_id,sample_scientific_name,experiment_library_strategy,experiment_library_source,experiment_library_selection,sample_accession,sample_alias,...,study_study_type_existing_study_type,submission_accession,submission_alias,submission_attributes_1_tag,submission_attributes_1_value,submission_attributes_2_tag,submission_attributes_2_value,submission_center_name,submission_lab_name,submission_title
0,ERP138137,ERX9368426,MinION sequencing,10244,Monkeypox virus,WGS,GENOMIC,other,ERS12148191,ena-SAMPLE-TAB-06-06-2022-15:54:10:513-3669,...,Other,ERA15198661,ena-SUBMISSION-TAB-06-06-2022-15:58:08:829-3673,,,,,Hospital General Universitario Gregorio Maranon,European Nucleotide Archive,Submitted by Hospital General Universitario Gr...


In [40]:
# Keep only the rows whose library source is exactly "METAGENOMIC";
# note that this rebinds `df_search` to the filtered table.
df_search = df_search.loc[df_search["experiment_library_source"].eq("METAGENOMIC")]
df_search.shape

(34, 210)

## Save all run accessions

In [41]:
# Select the columns whose name matches the run-accession pattern and keep
# just their names as a plain list.
run_columns_frame = df_search.filter(regex="run_?.*_accession", axis="columns")
run_accession_columns = list(run_columns_frame.columns)
run_accession_columns

['run_1_accession']

In [42]:
# Pool the non-missing values of every run-accession column into one set,
# which also removes duplicates within and across columns.
accession_set = set().union(
    *(df_search[column_name].dropna() for column_name in run_accession_columns)
)

In [43]:
# Report how many distinct run accessions were collected.
print(f"Found {len(accession_set)} accessions")

Found 34 accessions


In [44]:
# Write the accessions in sorted order as a single-column CSV (header "accession").
accession_series = pd.Series(sorted(accession_set), name="accession")
accession_series.to_csv("accession_list.csv", index=False)

## Quick overview

In [45]:
# Show a per-column summary (missing counts, unique counts, top values) of the
# filtered search table.
pyskim.skim(df_search)

── Data Summary ────────────────────────────────────────────────────────────────────────────────────
type                 value
-----------------  -------
Number of rows          34
Number of columns      210
──────────────────────────────────────────────────
Column type frequency:
          Count
------  -------
object      210

── Variable type: object ───────────────────────────────────────────────────────────────────────────
     name                                         na_count    n_unique  top_counts
---  -----------------------------------------  ----------  ----------  ---------------------------------------------------------------------------------------------------------
  0  study_accession                                     0           3  ERP137826: 29, SRP377367: 3, ERP137905: 2
  1  experiment_accession                                0          34  ERX9357608: 1, ERX9317370: 1, SRX15482481: 1
  2  experiment_title                                    0          32  Met