# Description

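This notebook installs `pysradb` in a dedicated conda environment and uses it to download the processed rice gene expression data (GEO accession GSE74793) and the corresponding sample metadata (BioProject PRJNA301554). The dataset is described in: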
* https://pubmed.ncbi.nlm.nih.gov/27655842/
* https://doi.org/10.1093/bib/bbab495

# Define magic

In [1]:
from IPython import get_ipython
from IPython.core.magic import register_cell_magic

ipython = get_ipython()


@register_cell_magic
def pybash(line, cell):
    """Run a cell as a bash script after interpolating notebook variables.

    The cell text is passed through ``str.format`` with this module's global
    namespace, so ``{name}`` is replaced by ``str(name)`` before the result is
    handed to the built-in ``%%bash`` cell magic.

    Because ``str.format`` is used, braces that must reach the shell verbatim
    have to be doubled in the cell, and a placeholder that does not match a
    global name raises ``KeyError``.

    Parameters
    ----------
    line : str
        Text that follows ``%%pybash`` on the magic line. It is forwarded
        unchanged to ``%%bash`` so that its options are honoured instead of
        being silently discarded.
    cell : str
        Body of the cell, i.e. the bash script template.
    """
    # Interpolate first so that formatting errors surface before bash starts.
    script = cell.format(**globals())
    ipython.run_cell_magic('bash', line, script)

# Modules

In [2]:
from ccc import conf

# Install pysradb

In [3]:
# Prefix of the conda environment dedicated to pysradb; later cells
# interpolate this name into their shell commands.
conda_env = conf.CONDA_ENVS_DIR.joinpath("pysradb")
# Only the parent directory is created here; the environment directory itself
# is left for `conda create` to populate.
conda_env.parent.mkdir(exist_ok=True, parents=True)
display(conda_env)

PosixPath('/opt/data/software/conda_envs/pysradb')

In [4]:
%%pybash
# Source the user's bash startup file before calling conda
# (presumably this is what makes the conda shell function available - confirm).
source ~/.bashrc
# Create the environment at the interpolated prefix, answering yes to prompts.
conda create --yes --prefix {conda_env} --channel bioconda python=3.10.* pysradb=2.0.*

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /opt/data/software/conda_envs/pysradb

  added / updated specs:
    - pysradb=2.0
    - python=3.10


The following NEW packages will be INSTALLED:

  _libgcc_mutex      pkgs/main/linux-64::_libgcc_mutex-0.1-main
  _openmp_mutex      pkgs/main/linux-64::_openmp_mutex-5.1-1_gnu
  blas               pkgs/main/linux-64::blas-1.0-mkl
  bottleneck         pkgs/main/linux-64::bottleneck-1.3.5-py310ha9d4c09_0
  brotlipy           pkgs/main/linux-64::brotlipy-0.7.0-py310h7f8727e_1002
  bzip2              pkgs/main/linux-64::bzip2-1.0.8-h7b6447c_0
  ca-certificates    pkgs/main/linux-64::ca-certificates-2023.01.10-h06a4308_0
  certifi            pkgs/main/linux-64::certifi-2023.5.7-py310h06a4308_0
  cffi               pkgs/main/linux-64::cffi-1.15.1-py310h5eee18b_3
  charset-normalizer pkgs/main/noarch::charset-normalizer-2.0.4-pyhd3eb1b0_0



  current version: 4.11.0
  latest version: 23.3.1

Please update conda by running

    $ conda update -n base -c defaults conda




In [5]:
%%pybash
source ~/.bashrc
conda activate {conda_env}
# Report the interpreter version found after activating the environment.
python -V

Python 3.10.11


In [6]:
%%pybash
source ~/.bashrc
conda activate {conda_env}
# Report the pysradb version found after activating the environment.
pysradb --version

pysradb 2.0.2


# Download processed gene expression data

In [7]:
# Directory that receives every file downloaded by this notebook; later cells
# interpolate this name into their shell commands.
download_dir = conf.DATA_DIR.joinpath("rice")
download_dir.mkdir(exist_ok=True, parents=True)
display(download_dir)

PosixPath('/opt/data/data/rice')

In [8]:
%%pybash
. ~/.bashrc
conda activate {conda_env}

# Abort on the first failing command from here on. Without this, a failed
# download or rename is masked by the exit status of the final `rm`, and the
# cell appears to succeed without producing the expression file.
# It is enabled only after the environment setup so the sourced startup files
# are not affected by it.
set -e

# download (pysradb places the files in a GSE74793 subdirectory of --out-dir,
# as the paths used below show)
pysradb download -g GSE74793 --out-dir "{download_dir}"

# rename file, then drop the now-empty accession directory; paths are quoted
# so that a directory name containing whitespace is not split by the shell
mv "{download_dir}/GSE74793/GSE74793_processed-data.txt.gz" "{download_dir}/gene_expr_processed-data.txt.gz"
rm -rf "{download_dir}/GSE74793/"


The following files will be downloaded: 

GSE74793_processed-data.txt.gz




Downloading GSE74793_processed-data.txt.gz: 95.4MB [00:00, 1.14GB/s]                   


# Download metadata

In [9]:
%%pybash
# Invoke the pysradb executable of the environment directly by its path, so no
# activation is needed in this cell. Paths are quoted so that directory names
# containing whitespace are not split by the shell.
"{conda_env}/bin/pysradb" metadata PRJNA301554 --detailed --saveto "{download_dir}/metadata.tsv"