# Download Dataset

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

## Prelude

In [2]:
# All raw downloads go under ../raw, relative to the notebook's working directory.
root_dir = Path("..") / "raw"
root_dir.mkdir(exist_ok=True)
root_dir

PosixPath('../raw')

In [3]:
import os, subprocess
from multiprocessing import Pool

def subprocs(cmd):
    """Run one shell command string and return its ``CompletedProcess``.

    The command is executed through the shell. stdout and stderr are captured
    (as bytes) on the returned object instead of being printed, and a
    non-zero exit status does not raise.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed
def execute_command_with_env(cmds, env, pool_num):
    """Run shell commands in parallel, optionally inside a conda environment.

    Parameters
    ----------
    cmds : iterable of str
        Shell command strings to execute.
    env : str or None
        Name of a conda environment. When given, every command is wrapped as
        ``conda run -n <env> /bin/bash -c <cmd>``; when ``None`` the commands
        are run unchanged.
    pool_num : int
        Number of worker processes in the pool.

    Returns
    -------
    list
        One result per command, in input order, as returned by ``subprocs``.
        An empty list when there is nothing to run.
    """
    import shlex  # local import: only needed for the quoting below

    cmds = list(cmds)
    if env is not None:
        # shlex.quote keeps commands that themselves contain single quotes
        # intact; naive '...' wrapping would end the bash -c argument early.
        cmds = [
            f"conda run -n {shlex.quote(str(env))} /bin/bash -c {shlex.quote(cmd)}"
            for cmd in cmds
        ]
    if not cmds:
        # Nothing to execute: do not spawn worker processes for an empty job list.
        return []
    with Pool(pool_num) as p:
        output = p.map(subprocs, cmds)
    return output

def get_cmds_for_download(
    data_dir: Path,
    run_ids: list,
):
    """Build ``fasterq-dump`` commands for the runs not yet present in ``data_dir``.

    A run counts as downloaded when ``data_dir`` contains any file whose name,
    with extensions and the ``_1``/``_2`` mate suffix removed, equals the run
    accession.

    Parameters
    ----------
    data_dir : Path
        Existing directory that receives the FASTQ files.
    run_ids : iterable of str
        Run accessions to download.

    Returns
    -------
    list of str
        One ``fasterq-dump -O <data_dir> --split-3 <run>`` command per missing
        run, in the order of ``run_ids``.
    """
    import shlex  # local import: only needed for quoting the output directory

    def get_downloaded_samples(data_dir: Path):
        # Drop the extensions first, then the mate suffix, so that both
        # "RUN_1.fastq.gz" (paired) and "RUN.fastq.gz" (single-end) map to "RUN".
        # A set keeps the membership test below constant-time.
        return {f.name.split('.')[0].split('_')[0] for f in data_dir.iterdir()}

    downloaded_samples = get_downloaded_samples(data_dir)
    # Quote the directory so paths containing spaces survive the shell.
    out_dir = shlex.quote(str(data_dir))
    cmds = [
        f"fasterq-dump -O {out_dir} --split-3 {f}"
        for f
        in run_ids
        if f not in downloaded_samples
    ]
    return cmds
def get_cmds_for_gzcomp(data_dir: Path):
    """Build one ``gzip`` command per ``.fastq`` file directly inside ``data_dir``.

    Files are selected by their final suffix only, so already-compressed
    ``.fastq.gz`` files are skipped.
    """
    gzip_cmds = []
    for entry in data_dir.iterdir():
        if entry.suffix != '.fastq':
            continue
        gzip_cmds.append(f"gzip {entry}")
    return gzip_cmds

## 1. WGS

In [4]:
# WGS reads get their own subfolder of the raw-data root.
wgs_dir = root_dir.joinpath('wgs')
wgs_dir.mkdir(exist_ok=True)
wgs_dir

PosixPath('../raw/wgs')

### Load data

In [5]:
# SRA run metadata table for BioProject PRJEB2779 (read relative to the working directory).
wgs_meta_path = "sra_meta_PRJEB2779.csv"
meta_df = pd.read_csv(wgs_meta_path)
meta_df.head()

Unnamed: 0,Run,Assay Type,AssemblyName,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,...,Isolate,PMID,Region,Tax_ID,Alias,ENA_checklist,SRA_accession,Title,Serovar,selection_source
0,ERR2759447,WGS,GCA_000069185.1,250,247922750,PRJEB2779,SAMEA104138834,46759301,WELLCOME SANGER INSTITUTE,public,...,,,,,,,,,,
1,ERR2759448,WGS,GCA_000069185.1,250,234726000,PRJEB2779,SAMEA104138835,50996446,WELLCOME SANGER INSTITUTE,public,...,,,,,,,,,,
2,ERR2759449,WGS,GCA_000069185.1,250,235427500,PRJEB2779,SAMEA104138836,53358376,WELLCOME SANGER INSTITUTE,public,...,,,,,,,,,,
3,ERR2759450,WGS,GCA_000069185.1,250,244737500,PRJEB2779,SAMEA104138837,54572561,WELLCOME SANGER INSTITUTE,public,...,,,,,,,,,,
4,ERR2759451,WGS,GCA_000069185.1,250,274817500,PRJEB2779,SAMEA104138838,60009131,WELLCOME SANGER INSTITUTE,public,...,,,,,,,,,,


In [6]:
# Keep only M. abscessus subsp. abscessus WGS runs sequenced on an Illumina HiSeq 2000.
is_target_organism = meta_df['Organism'] == "Mycobacteroides abscessus subsp. abscessus"
is_wgs = meta_df['Assay Type'] == "WGS"
is_hiseq_2000 = meta_df['Instrument'] == "Illumina HiSeq 2000"

# .copy() detaches the result from meta_df.
filtered_df = meta_df[is_target_organism & is_wgs & is_hiseq_2000].copy()
filtered_df.head()

Unnamed: 0,Run,Assay Type,AssemblyName,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,...,Isolate,PMID,Region,Tax_ID,Alias,ENA_checklist,SRA_accession,Title,Serovar,selection_source
878,ERR340492,WGS,GCA_000069185.1,200,498521000,PRJEB2779,SAMEA2069133,268620579,SC,public,...,,,,,,,,,,
879,ERR340493,WGS,GCA_000069185.1,200,477866600,PRJEB2779,SAMEA2071548,268237879,SC,public,...,,,,,,,,,,
881,ERR340495,WGS,GCA_000069185.1,200,435711400,PRJEB2779,SAMEA2069134,246387845,SC,public,...,,,,,,,,,,
882,ERR340496,WGS,GCA_000069185.1,200,437309600,PRJEB2779,SAMEA2071549,245012671,SC,public,...,,,,,,,,,,
883,ERR340497,WGS,GCA_000069185.1,200,455790600,PRJEB2779,SAMEA2070891,253592238,SC,public,...,,,,,,,,,,


In [7]:
# Shuffle every row with a fixed seed, so taking the first n rows later
# gives a reproducible random subset.
shuffled_df = filtered_df.sample(random_state=0, frac=1)
shuffled_df.head()

Unnamed: 0,Run,Assay Type,AssemblyName,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,...,Isolate,PMID,Region,Tax_ID,Alias,ENA_checklist,SRA_accession,Title,Serovar,selection_source
2113,ERR459869,WGS,GCA_000069185.1,200,459520400,PRJEB2779,SAMEA2259719,225986897,SC,public,...,,,,,,,,,,
2385,ERR343204,WGS,GCA_000069185.1,200,665807400,PRJEB2779,SAMEA2071454,363695101,SC,public,...,,,,,,,,,,
1978,ERR494887,WGS,GCA_000069185.1,200,404522200,PRJEB2779,SAMEA2275753,183055967,SC,public,...,,,,,,,,,,
1984,ERR494905,WGS,GCA_000069185.1,200,358762200,PRJEB2779,SAMEA2275772,157041995,SC,public,...,,,,,,,,,,
2895,ERR374119,WGS,GCA_000069185.1,200,501470600,PRJEB2779,SAMEA2069132,259625801,SC,public,...,,,,,,,,,,


### Download

In [8]:
# Number of WGS runs to download (the first n rows of the shuffled table).
n = 20

run_to_down = shuffled_df['Run'].head(n).values
len(run_to_down)

20

In [9]:
# Only runs without files in wgs_dir yield a command, so 0 means nothing is left to fetch.
cmds = get_cmds_for_download(data_dir=wgs_dir, run_ids=run_to_down)
len(cmds)

0

In [10]:
# Download with 4 parallel workers inside the `sra` conda environment.
res = execute_command_with_env(cmds, 'sra', 4)

# stdout/stderr are captured, so a failed download would otherwise go unnoticed:
# list every command that exited with a non-zero status (empty list = all good).
failed = [
    (r.args, r.stderr.decode(errors="replace"))
    for r in res
    if r.returncode != 0
]
failed

In [11]:
# Queue every still-uncompressed .fastq file in the WGS folder for gzip.
gzip_cmds = get_cmds_for_gzcomp(data_dir=wgs_dir)
len(gzip_cmds)

0

In [12]:
# Compress with 4 parallel workers; no conda environment is needed for gzip.
res = execute_command_with_env(gzip_cmds, None, 4)

# Output is captured, so report any gzip call that exited with a non-zero
# status instead of failing silently (empty list = all good).
failed = [
    (r.args, r.stderr.decode(errors="replace"))
    for r in res
    if r.returncode != 0
]
failed

## 2. RNA-Seq

In [13]:
# RNA-Seq reads get their own subfolder of the raw-data root.
rna_dir = root_dir.joinpath('rna')
rna_dir.mkdir(exist_ok=True)
rna_dir

PosixPath('../raw/rna')

### Load data

In [14]:
# SRA run metadata table for BioProject PRJNA686956; from here on meta_df refers to the RNA-Seq project.
rna_meta_path = "sra_meta_PRJNA686956.csv"
meta_df = pd.read_csv(rna_meta_path)
meta_df.head()

Unnamed: 0,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,...,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study,Time,treatment
0,SRR13281064,RNA-Seq,152,3319853736,PRJNA686956,SAMN17128499,1621101030,GEO,public,"sra,run.zq,fastq",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:17:00Z,1,GSM4982991,Frozen reference strain,SRP298717,24 hours,Growth Control
1,SRR13281065,RNA-Seq,152,386384000,PRJNA686956,SAMN17128496,188380732,GEO,public,"sra,run.zq,fastq",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:06:00Z,1,GSM4982992,Frozen reference strain,SRP298717,24 hours,Growth Control
2,SRR13281066,RNA-Seq,152,3798586248,PRJNA686956,SAMN17128493,1853687389,GEO,public,"run.zq,sra,fastq",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:19:00Z,1,GSM4982993,Frozen reference strain,SRP298717,24 hours,Growth Control
3,SRR13281067,RNA-Seq,152,5698160952,PRJNA686956,SAMN17128492,2164963359,GEO,public,"fastq,run.zq,sra",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:32:00Z,1,GSM4982994,Frozen reference strain,SRP298717,24 hours,Growth Control
4,SRR13281068,RNA-Seq,152,2329595776,PRJNA686956,SAMN17128490,884884285,GEO,public,"fastq,sra,run.zq",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:11:00Z,1,GSM4982995,Frozen reference strain,SRP298717,24 hours,Growth Control


In [15]:
# Number of runs per assay type.
assay_type = meta_df['Assay Type']
assay_type.value_counts()

Assay Type
RNA-Seq    50
Name: count, dtype: int64

In [16]:
# Number of runs per sampling time point.
time_point = meta_df['Time']
time_point.value_counts()

Time
24 hours    26
4 hours     24
Name: count, dtype: int64

In [17]:
# Number of runs per treatment condition.
treatment = meta_df['treatment']
treatment.value_counts()

treatment
Growth Control        15
Clarithromycin         6
Amikacin               6
Cefoxitin              6
Clofazimine            6
Growth Control 7H9     6
Tigecycline            5
Name: count, dtype: int64

In [18]:
# Keep the 24-hour RNA-Seq runs treated with clarithromycin or labelled as growth control.
is_rna_seq = meta_df['Assay Type'] == "RNA-Seq"
is_24h = meta_df['Time'] == "24 hours"
is_clr_or_control = meta_df['treatment'].isin(['Clarithromycin', 'Growth Control'])

selected_df = meta_df[is_rna_seq & is_24h & is_clr_or_control]
selected_df.head()

Unnamed: 0,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,...,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study,Time,treatment
0,SRR13281064,RNA-Seq,152,3319853736,PRJNA686956,SAMN17128499,1621101030,GEO,public,"sra,run.zq,fastq",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:17:00Z,1,GSM4982991,Frozen reference strain,SRP298717,24 hours,Growth Control
1,SRR13281065,RNA-Seq,152,386384000,PRJNA686956,SAMN17128496,188380732,GEO,public,"sra,run.zq,fastq",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:06:00Z,1,GSM4982992,Frozen reference strain,SRP298717,24 hours,Growth Control
2,SRR13281066,RNA-Seq,152,3798586248,PRJNA686956,SAMN17128493,1853687389,GEO,public,"run.zq,sra,fastq",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:19:00Z,1,GSM4982993,Frozen reference strain,SRP298717,24 hours,Growth Control
3,SRR13281067,RNA-Seq,152,5698160952,PRJNA686956,SAMN17128492,2164963359,GEO,public,"fastq,run.zq,sra",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:32:00Z,1,GSM4982994,Frozen reference strain,SRP298717,24 hours,Growth Control
4,SRR13281068,RNA-Seq,152,2329595776,PRJNA686956,SAMN17128490,884884285,GEO,public,"fastq,sra,run.zq",...,Mycobacteroides abscessus,ILLUMINA,2022-01-27T00:00:00Z,2020-12-22T01:11:00Z,1,GSM4982995,Frozen reference strain,SRP298717,24 hours,Growth Control


### Download

In [19]:
# Every selected RNA-Seq run is downloaded (no subsampling here).
run_to_down = selected_df.loc[:, 'Run'].values
len(run_to_down)

12

In [20]:
# Only runs without files in rna_dir yield a command, so 0 means nothing is left to fetch.
cmds = get_cmds_for_download(data_dir=rna_dir, run_ids=run_to_down)
len(cmds)

0

In [21]:
# Download with 4 parallel workers inside the `sra` conda environment.
res = execute_command_with_env(cmds, 'sra', 4)

# stdout/stderr are captured, so a failed download would otherwise go unnoticed:
# list every command that exited with a non-zero status (empty list = all good).
failed = [
    (r.args, r.stderr.decode(errors="replace"))
    for r in res
    if r.returncode != 0
]
failed

In [22]:
# Queue every still-uncompressed .fastq file in the RNA-Seq folder for gzip.
gzip_cmds = get_cmds_for_gzcomp(data_dir=rna_dir)
len(gzip_cmds)

0

In [23]:
# Compress with 4 parallel workers; no conda environment is needed for gzip.
res = execute_command_with_env(gzip_cmds, None, 4)

# Output is captured, so report any gzip call that exited with a non-zero
# status instead of failing silently (empty list = all good).
failed = [
    (r.args, r.stderr.decode(errors="replace"))
    for r in res
    if r.returncode != 0
]
failed

### Metadata

In [24]:
# Sample sheet: one row per run with its treatment and the two mate files.
meta_for_rna_df = (
    selected_df[['Run', 'treatment']]
    .rename(columns={'treatment': 'Treatment'})
    .reset_index(drop=True)
)


def find_paired_reads(run):
    """Return the resolved R1/R2 paths of `run` found in rna_dir.

    Files are matched on the exact run accession (file name up to the first
    '.' or '_'), not on a substring, so an accession that is a prefix of
    another one cannot pick up the wrong files.

    Raises
    ------
    ValueError
        If the run does not have exactly two files in rna_dir.
    """
    reads = sorted(
        f.resolve()
        for f in rna_dir.iterdir()
        if f.name.split('.')[0].split('_')[0] == run
    )
    if len(reads) != 2:
        # Fail with a clear message instead of a length mismatch or a silent NaN in R2.
        raise ValueError(
            f"Expected exactly 2 read files for run {run} in {rna_dir}, "
            f"found {len(reads)}: {[p.name for p in reads]}"
        )
    # Sorted order puts the *_1 file before the *_2 file.
    return pd.Series(reads, index=['R1', 'R2'])


meta_for_rna_df[['R1', 'R2']] = meta_for_rna_df['Run'].apply(find_paired_reads)
meta_for_rna_df

Unnamed: 0,Run,Treatment,R1,R2
0,SRR13281064,Growth Control,/data/raw/rna/SRR13281064_1.fastq.gz,/data/raw/rna/SRR13281064_2.fastq.gz
1,SRR13281065,Growth Control,/data/raw/rna/SRR13281065_1.fastq.gz,/data/raw/rna/SRR13281065_2.fastq.gz
2,SRR13281066,Growth Control,/data/raw/rna/SRR13281066_1.fastq.gz,/data/raw/rna/SRR13281066_2.fastq.gz
3,SRR13281067,Growth Control,/data/raw/rna/SRR13281067_1.fastq.gz,/data/raw/rna/SRR13281067_2.fastq.gz
4,SRR13281068,Growth Control,/data/raw/rna/SRR13281068_1.fastq.gz,/data/raw/rna/SRR13281068_2.fastq.gz
5,SRR13281069,Growth Control,/data/raw/rna/SRR13281069_1.fastq.gz,/data/raw/rna/SRR13281069_2.fastq.gz
6,SRR13281073,Clarithromycin,/data/raw/rna/SRR13281073_1.fastq.gz,/data/raw/rna/SRR13281073_2.fastq.gz
7,SRR13281074,Clarithromycin,/data/raw/rna/SRR13281074_1.fastq.gz,/data/raw/rna/SRR13281074_2.fastq.gz
8,SRR13281075,Clarithromycin,/data/raw/rna/SRR13281075_1.fastq.gz,/data/raw/rna/SRR13281075_2.fastq.gz
9,SRR13281108,Growth Control,/data/raw/rna/SRR13281108_1.fastq.gz,/data/raw/rna/SRR13281108_2.fastq.gz


In [25]:
# Save the sample sheet next to the reads; the DataFrame index is written as the first column.
meta_path = rna_dir / 'meta.csv'
meta_for_rna_df.to_csv(meta_path)