From a32ad93c6c39111ccd9a9183ecfe04efcf1db03b Mon Sep 17 00:00:00 2001 From: eboileau Date: Fri, 19 May 2023 15:12:37 +0200 Subject: [PATCH] MNT patch 3.0.2 --- .pre-commit-config.yaml | 12 +- CHANGELOG.md | 13 ++ MANIFEST.in | 1 + docs/source/api-app-profiles.rst | 11 +- docs/source/api-app-rpbp.rst | 11 +- docs/source/apps.rst | 4 +- docs/source/faq.rst | 28 ++++- docs/source/index.rst | 2 +- docs/source/installation.rst | 28 ++--- docs/source/tutorial-cel.rst | 7 +- docs/source/tutorial-containers.rst | 115 ++++++++++++++++++ docs/source/tutorial.rst | 1 + docs/source/user-guide.rst | 4 +- src/rpbp/__init__.py | 2 +- .../collect_read_length_orf_profiles.py | 4 - .../create_read_length_orf_profiles.py | 1 - .../rpbp_profile_construction_dashboard.py | 32 ++--- .../summarize_rpbp_profile_construction.py | 8 -- .../dashboard/rpbp_predictions_dashboard.py | 43 +++++-- .../summarize_rpbp_predictions.py | 11 +- .../visualize_orf_type_metagene_profiles.py | 2 - .../create_base_genome_profile.py | 21 +++- .../create_orf_profiles.py | 1 - ...estimate_metagene_profile_bayes_factors.py | 2 - .../extract_metagene_profiles.py | 4 - .../extract_orf_profiles.py | 1 - .../select_periodic_offsets.py | 1 - .../reference_preprocessing/label_orfs.py | 4 - .../prepare_rpbp_genome.py | 39 ++++-- src/rpbp/ribo_utils/filenames.py | 22 ---- src/rpbp/ribo_utils/utils.py | 6 - src/rpbp/run_all_rpbp_instances.py | 1 - .../estimate_orf_bayes_factors.py | 1 - .../predict_translated_orfs.py | 2 - .../select_final_prediction_set.py | 2 - tests/regression/conftest.py | 6 - tests/regression/test_rpbp.py | 3 - 37 files changed, 295 insertions(+), 161 deletions(-) create mode 100644 docs/source/tutorial-containers.rst diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 427ff31..fdea0cb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v4.4.0 hooks: - id: check-yaml - id: end-of-file-fixer @@ -10,18 +10,18 @@ repos: - id: check-toml - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 23.3.0 hooks: - id: black - id: black-jupyter - repo: https://github.com/kynan/nbstripout - rev: 0.5.0 + rev: 0.6.1 hooks: - id: nbstripout - repo: https://github.com/pre-commit/mirrors-prettier - rev: v2.7.1 + rev: v3.0.0-alpha.6 hooks: - id: prettier additional_dependencies: @@ -29,13 +29,13 @@ repos: - "prettier-plugin-toml" - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.17.1 + rev: 0.22.0 hooks: - id: check-github-workflows - id: check-readthedocs - repo: https://github.com/pycqa/flake8 - rev: "5.0.4" + rev: "6.0.0" hooks: - id: flake8 args: diff --git a/CHANGELOG.md b/CHANGELOG.md index 71a9b28..c8fa3ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased] - started 2023-02 +## [3.0.2] 2023-05-19 + +### Added + +- `riboseq_sample_name_map`, `riboseq_condition_name_map` to `rpbp_predictions_dashboard.py` +- Exception for duplicated transcript ids with _de novo_ annotation. + +### Fixed + +- STAR output +- Redundant transcripts with _de novo_ annotation in `summarize_rpbp_predictions.py` +- ORF numbers in labels + ## [3.0.1] 2023-02-10 ### Changed diff --git a/MANIFEST.in b/MANIFEST.in index f689b66..76d423e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include README.md include src/rpbp/models/*/*.stan include src/rpbp/analysis/rpbp_predictions/dashboard/assets/*.* +include src/rpbp/analysis/profile_construction/dashboard/assets/*.* diff --git a/docs/source/api-app-profiles.rst b/docs/source/api-app-profiles.rst index 324b51e..4e7a802 100644 --- a/docs/source/api-app-profiles.rst +++ b/docs/source/api-app-profiles.rst @@ -5,18 +5,15 @@ Launch a Dash app for quality control and visualization of ribosome profiling da .. code-block:: bash - usage: rpbp-profile-construction-dashboard [-h] [-d] [--host] [--port] config + usage: rpbp-profile-construction-dashboard [-h] [-c CONFIG] [-d] [--host] [--port] -Positional Arguments --------------------- - --config - A YAML configuration file. The same used to run the pipeline. - Named Arguments --------------- +-c, --config + A YAML configuration file (required). The same used to run the pipeline. + -d, --debug Enable debug mode. diff --git a/docs/source/api-app-rpbp.rst b/docs/source/api-app-rpbp.rst index 3d5cdd5..704094e 100644 --- a/docs/source/api-app-rpbp.rst +++ b/docs/source/api-app-rpbp.rst @@ -5,18 +5,15 @@ Launch a Dash app to visualize Ribo-seq ORF predicted with Rp-Bp. .. code-block:: bash - usage: rpbp-predictions-dashboard [-h] [-d] [--host] [--port] config + usage: rpbp-predictions-dashboard [-h] [-c CONFIG] [-d] [--host] [--port] -Positional Arguments --------------------- - --config - A YAML configuration file. The same used to run the pipeline. - Named Arguments --------------- +-c, --config + A YAML configuration file (required). The same used to run the pipeline. + -d, --debug Enable debug mode. diff --git a/docs/source/apps.rst b/docs/source/apps.rst index 3891fe8..6a7cddd 100644 --- a/docs/source/apps.rst +++ b/docs/source/apps.rst @@ -87,7 +87,7 @@ To launch the *profile construction dashboard* .. code-block:: bash - rpbp-profile-construction-dashboard + rpbp-profile-construction-dashboard -c CONFIG The application has multiple views to facilitate quality control, *e.g.* @@ -104,7 +104,7 @@ To launch the *predictions dashboard* .. code-block:: bash - rpbp-predictions-dashboard + rpbp-predictions-dashboard -c CONFIG The application has multiple views to facilitate ORF discovery, including an integrated `IGV browser `_ for the visual exploration of predicted Ribo-seq ORFs, *e.g.* diff --git a/docs/source/faq.rst b/docs/source/faq.rst index f846d2c..5845631 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -5,6 +5,9 @@ Frequently asked questions * :ref:`q2` * :ref:`q3` * :ref:`q4` +* :ref:`q5` +* :ref:`q6` +* :ref:`q7` .. _q1: @@ -19,7 +22,7 @@ Example calls are also given in the user guide and tutorials. How do I launch the app remotely? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -By default, the application is opened in a browser page on *localhost:8050*. You don't have to actually worry about this. But you can also specify a ``--host`` and a ``--port`` when calling the app, enabling you to launch it from a remote server. In the latter case, however, you have to open a browser page at the correct address. For example, if you use ``--host 123.123.123.123``, then open a page on *http://123.123.123.123:8050/*. +By default, the application is opened in a browser page on *localhost:8050*. You don't have to actually worry about this. But you can also specify a ``--host`` and a ``--port`` when calling the app, enabling you to launch it from a remote server. In the latter case, however, you have to open a browser page at the correct address. For example, if you use ``--host 123.123.123.123``, then open a page on *http://123.123.123.123:8050/*. To launch the app from a container, see :ref:`tutorial_containers`. .. _q3: @@ -34,3 +37,26 @@ I have my own alignments, can I use **Rp-Bp**? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The short answer is yes. The pipeline is designed to handle all steps from raw FASTQ files up to the final list of translated Ribo-seq ORFs, but you can start the pipeline from any step. Check the tutorial :ref:`existing-alignment`. + + +.. _q5: + +I get errors when calling ``summarize-rpbp-predictions`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The most common errors *e.g.* ``AttributeError: 'float' object has no attribute 'left'`` are due to the bin width for the `Circos `_ plot. Try reducing it using ``--circos-bin-width VALUE`` (default VALUE: 10000000). You may also have to use ``--circos-show-chroms`` if your organism has a different nomenclature. Use ``summarize-rpbp-predictions --help`` for details. + +.. _q6: + +The IGV browser does not load or the predictions app fails to start +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The most likely reason is that your reference genome sequence (given by the config key ``fasta``) is not indexed, *i.e* the file **\*.fasta.fai** is missing. You can create the missing index using `samtools faidx `_. + + +.. _q7: + +I get ``RuntimeWarning: invalid value encountered in divide res, _ = _lowess(y, x, x, np.ones_like(x),`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This happens for 3 nt ORFs. diff --git a/docs/source/index.rst b/docs/source/index.rst index de8264d..031caa7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -9,7 +9,7 @@ Rp-Bp: Ribosome profiling with Bayesian predictions Introduction ------------ -Ribosome profiling (Ribo-seq) is an RNA-sequencing-based readout of RNA translation. Isolation and deep-sequencing of ribosome-protected RNA fragments (ribosome footprints) provides a genome-wide snapshot of the translatome at sub-codon resolution. Aligned by their P-site positions, the footprints from actively translating ribosomes should exhibit a 3-nt periodicity along the ORF (the ribosome moves by steps of a codon, *i.e.* 3-nt). To select reads and predict translation, most methods, including **Rp-Bp**, take advantage of this periodic signal. +Ribosome profiling (Ribo-seq) is an RNA-sequencing-based readout of RNA translation. Isolation and deep-sequencing of ribosome-protected RNA fragments (ribosome footprints) provides a genome-wide snapshot of the translatome at sub-codon resolution. Aligned by their P-site positions, the footprints from actively translating ribosomes should exhibit a 3-nt periodicity. To select reads and predict translation, most methods, including **Rp-Bp**, take advantage of this periodic signal. **Rp-Bp** is an unsupervised Bayesian approach to predict translated open reading frames (ORFs) from ribosome profiles, using the automatic Bayesian Periodic fragment length and ribosome P-site offset Selection (BPPS), *i.e.* read lengths and ribosome P-site offsets are inferred from the data, without supervision. **Rp-Bp** is able to handle *de novo* translatome annotation by directly assessing the periodicity of the Ribo-seq signal. diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 5dc123f..aa5c887 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -15,7 +15,7 @@ To use a container (Docker or Singularity) with **Rp-Bp** pre-installed, simply # ...singularity singularity pull rpbp.sif docker://quay.io/biocontainers/rpbp: -There is no *latest* tag, you need to specify the version tag. See `rpbp/tags `_ for valid values for . +There is no *latest* tag, you need to specify the version tag. See `rpbp/tags `_ for valid values for . Check the `Tutorials `_ on how to use the containers. .. _conda_install: @@ -36,6 +36,10 @@ or create an environment, called *rpbp*, containing the **Rp-Bp** package conda create -n rpbp rpbp +.. tip:: + + `Mamba `_ can be used as a drop-in replacement, you can swap almost all commands between conda and mamba. + .. _pypi_install: Contributing to **Rp-Bp** @@ -49,12 +53,12 @@ To install the local VCS project in development mode conda create -n rpbp_dev # ...activate it... conda activate rpbp_dev - # ... and only install dependencies - (rpbp_dev) conda install --only-deps rpbp + # ... and only install dependencies (rpbp_dev is now activated) + conda install --only-deps rpbp # clone the git repository - (rpbp_dev) git clone https://github.com/dieterich-lab/rp-bp.git && cd rp-bp + git clone https://github.com/dieterich-lab/rp-bp.git && cd rp-bp # install - (rpbp_dev) pip --verbose install --no-deps -e . 2>&1 | tee install.log + pip --verbose install --no-deps -e . 2>&1 | tee install.log PyPI installation @@ -69,17 +73,11 @@ However, if you already have the required dependencies installed on your system, python3 -m venv rpbp_pypi # ... activate it ... source rpbp_pypi/bin/activate - # ... and install Rp-Bp - (rpbp_pypi) pip install rpbp - + # ... and install Rp-Bp (rpbp_pypi is now activated) + pip install rpbp -Required dependencies -""""""""""""""""""""" -* `Flexbar `_ -* `Bowtie 2 `_ -* `STAR `_ -* `Samtools `_ +**Required dependencies:** `Flexbar `_, `Bowtie 2 `_, `STAR `_, `Samtools `_. .. warning:: @@ -101,7 +99,7 @@ or remove the package installed in another environment .. code-block:: bash # remove the rpbp package from myenv environment... - (myenv) conda remove -n myenv rpbp + conda remove -n myenv rpbp To remove **Rp-Bp** if installed with pip diff --git a/docs/source/tutorial-cel.rst b/docs/source/tutorial-cel.rst index c5f3355..56cf5dd 100644 --- a/docs/source/tutorial-cel.rst +++ b/docs/source/tutorial-cel.rst @@ -60,17 +60,20 @@ Launch any of the web applications with .. code-block:: bash - rpbp-profile-construction-dashboard c-elegans-test.yaml + rpbp-profile-construction-dashboard -c c-elegans-test.yaml or .. code-block:: bash - rpbp-predictions-dashboard c-elegans-test.yaml + rpbp-predictions-dashboard -c c-elegans-test.yaml To navigate the apps is easy, just follow the "hints". Most items are interactive. Press ``CTRL+C`` to quit. +.. attention:: + + For the apps only, the configuration file is passed using a (required) named argument ``-c/--config CONFIG``. .. note:: diff --git a/docs/source/tutorial-containers.rst b/docs/source/tutorial-containers.rst new file mode 100644 index 0000000..ab58fd5 --- /dev/null +++ b/docs/source/tutorial-containers.rst @@ -0,0 +1,115 @@ +.. _tutorial_containers: + +How to use the containers +========================= + +First pull a Docker or Singularity container, see `installation `_. For this tutorial, we use a general mechanism for persisting data, which allows to create and modify files on the host system from within the container. We use the *c-elegans-chrI-example* with the *c-elegans-test.yaml* configuration file, see also `How to run the pipeline `_. + +.. note:: + + You can also launch a container with an interactive shell *e.g.* ``docker run -it quay.io/biocontainers/rpbp:3.0.1--py310h30d9df9_0 /bin/bash`` or ``singularity shell rpbp.sif``. With ``singularity shell``, ``$HOME`` is mounted by default. + +.. attention:: + + In the following, do not forget to modify the tag ``3.0.1--py310h30d9df9_0`` according to what you pulled! For Singularity, adjust the name of the Singularity image format file ``rpbp.sif`` and/or the path according to your needs. + + +How to run the pipeline +----------------------- + +To run the pipeline, change the paths in the configuration file to point to the location where the directory is mounted in the container. You can do this using a text editor, or simply by modifying the file in place + +.. code-block:: bash + + sed -i 's|/path/to/your/c-elegans-example|/data|g' c-elegans-test.yaml + + +.. important:: + + Default parameters were modified for the example and included in the configuration file. If you use this configuration file as a general template for your data, do not forget to remove everything below the line "REMOVE BELOW THIS LINE IF YOU USE THIS CONFIGURATION FILE AS TEMPLATE FOR YOUR DATA". + + +You can now create the genome indices and annotations. For this small example, it is important to downscale the STAR ``--genomeSAindexNbases`` parameter as follows + + +.. code-block:: bash + + docker run --volume `pwd`:/data quay.io/biocontainers/rpbp:3.0.1--py310h30d9df9_0 prepare-rpbp-genome /data/c-elegans-test.yaml --star-options "--genomeSAindexNbases 10" --num-cpus 4 --logging-level INFO --log-file /data/rpbp-genome.log + + +.. code-block:: bash + + singularity run --bind `pwd`:/data rpbp.sif prepare-rpbp-genome /data/c-elegans-test.yaml --star-options "--genomeSAindexNbases 10" --num-cpus 4 --logging-level INFO --log-file /data/rpbp-genome.log + + +The file *rpbp-genome.log* contains logging output for the reference preprocessing. You now have a new directory called *WBcel235.79.chrI* with genome indices and annotations. + +Finally, run the ORF discovery pipeline + + +.. code-block:: bash + + docker run --volume `pwd`:/data quay.io/biocontainers/rpbp:3.0.1--py310h30d9df9_0 run-all-rpbp-instances /data/c-elegans-test.yaml --merge-replicates --run-replicates --keep-intermediate-files --num-cpus 4 --logging-level INFO --log-file /data/rpbp-pipeline.log + + +.. code-block:: bash + + singularity run --bind `pwd`:/data rpbp.sif run-all-rpbp-instances /data/c-elegans-test.yaml --merge-replicates --run-replicates --keep-intermediate-files --num-cpus 4 --logging-level INFO --log-file /data/rpbp-pipeline.log + + +The file *rpbp-pipeline.log* contains logging output for the different processing steps. You now have four new directories (*with-*, *without-*) including output from Flexbar, Bowtie2, and STAR, and directories with **Rp-Bp** output: *metagene-profiles*, *orf-profiles*, and *orf-predictions*. The *orf-predictions* include the output for each sample *c-elegans-rep-1* and *c-elegans-rep-2* as well as for the merged replicates *c-elegans-test*. + +How to summarize the results and launch the apps +------------------------------------------------ + +Prepare the summary output for the *profile construction dashboard* + +.. code-block:: bash + + docker run --volume `pwd`:/data quay.io/biocontainers/rpbp:3.0.1--py310h30d9df9_0 summarize-rpbp-profile-construction /data/c-elegans-test.yaml --num-cpus 4 --logging-level INFO --log-file /data/rpbp-profile-summary.log + + +.. code-block:: bash + + singularity run --bind `pwd`:/data rpbp.sif summarize-rpbp-profile-construction /data/c-elegans-test.yaml --num-cpus 4 --logging-level INFO --log-file /data/rpbp-profile-summary.log + + +and for the *predictions dashboard* + +.. code-block:: bash + + docker run --volume `pwd`:/data quay.io/biocontainers/rpbp:3.0.1--py310h30d9df9_0 summarize-rpbp-predictions /data/c-elegans-test.yaml --no-replicates --circos-bin-width 10000 --circos-show-chroms I --logging-level INFO --log-file /data/rpbp-predictions-summary.log + +.. code-block:: bash + + singularity run --bind `pwd`:/data rpbp.sif summarize-rpbp-predictions /data/c-elegans-test.yaml --no-replicates --circos-bin-width 10000 --circos-show-chroms I --logging-level INFO --log-file /data/rpbp-predictions-summary.log + + +Due to the size of the data, we reduce the bin width for the `Circos `_ plot. We also need to specify which sequences or chromosomes we want to include (by default, only numbered chromosomes and X/x, Y/y are shown). You now have a new directory *analysis* with *profile_construction* and *rpbp_predictions* output. + +Launch any of the web applications with + +.. code-block:: bash + + docker run -p 8050:8050 --volume `pwd`:/data quay.io/biocontainers/rpbp:3.0.1--py310h30d9df9_0 rpbp-profile-construction-dashboard -c /data/c-elegans-test.yaml --host="0.0.0.0" + +.. code-block:: bash + + singularity run --bind `pwd`:/data rpbp.sif rpbp-profile-construction-dashboard -c /data/c-elegans-test.yaml --host="0.0.0.0" + +or + +.. code-block:: bash + + docker run -p 8050:8050 --volume `pwd`:/data quay.io/biocontainers/rpbp:3.0.1--py310h30d9df9_0 rpbp-predictions-dashboard -c /data/c-elegans-test.yaml --host="0.0.0.0" + +.. code-block:: bash + + singularity run --bind `pwd`:/data rpbp.sif rpbp-predictions-dashboard -c /data/c-elegans-test.yaml --host="0.0.0.0" + + +You then have to open a browser page at the correct address, *e.g.* you see ``Running on http://127.0.0.1:8050``, click on this link, or open a browser page at this address. To navigate the apps is easy, just follow the "hints". Most items are interactive. Press ``CTRL+C`` to quit. + +.. attention:: + + For the apps only, the configuration file is passed using a (required) named argument ``-c/--config CONFIG``. diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 7d33271..9d5a360 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -6,6 +6,7 @@ Tutorials tutorial-cel existing-alignments + tutorial-containers The tutorials includes a small test Ribo-seq dataset and reference **Rp-Bp** output that is also used for regression tests. It has been constructed to include some reads which uniquely map to the annotated transcripts, some reads which map to ribosomal sequences, some reads which do not uniquely map to the genome, and some reads which are filtered due to quality issues. diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst index 11b1899..4fd87d3 100644 --- a/docs/source/user-guide.rst +++ b/docs/source/user-guide.rst @@ -38,7 +38,7 @@ A single YAML configuration file can be used for both index creation and running In addition to the above required keys: -* ``riboseq_samples`` *(required, input)* A dictionary *key: value*, where *key* is used to construct filenames, and *value* is the full path to the FASTQ.GZ file for a given sample. The *key* should not contain spaces or special characters. +* ``riboseq_samples`` *(required, input)* A dictionary *key: value*, where *key* is used to construct filenames, and *value* is the full path to the FASTQ.gz file for a given sample. The *key* should not contain spaces or special characters. * ``riboseq_biological_replicates`` *(optional, input)* A dictionary *key: value*, where *key* is a condition, and *value* contains all samples which are replicates of the condition. Items of the *value* list must match the ``riboseq_samples`` *key*. * ``adapter_file`` *(optional, input)* Path to adapter sequences to be removed (FASTA). @@ -279,7 +279,7 @@ Without the ``--profiles-only`` option, the pipeline will predict which ORFs sho .. tip:: - If you first created profiles and estimated periodicity using the ``--profiles-only`` option, you can decide to continue with the translation prediction step at a later stage. You only have to ```run-all-rpbp-instances [--merge-replicates] [--run-replicates]``` using the same configuration file. Steps for which output files already exists will be skipped, unless the ``--overwrite`` option is set. + If you first created profiles and estimated periodicity using the ``--profiles-only`` option, you can decide to continue with the translation prediction step at a later stage. You only have to ``run-all-rpbp-instances [--merge-replicates] [--run-replicates]`` using the same configuration file. Steps for which output files already exists will be skipped, unless the ``--overwrite`` option is set. Required input diff --git a/src/rpbp/__init__.py b/src/rpbp/__init__.py index 7d4f050..e6f8e06 100644 --- a/src/rpbp/__init__.py +++ b/src/rpbp/__init__.py @@ -1,2 +1,2 @@ -__version_info__ = ("3", "0", "1") +__version_info__ = ("3", "0", "2") __version__ = ".".join(__version_info__) diff --git a/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py b/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py index 853cdc5..1a15edd 100644 --- a/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py +++ b/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py @@ -85,7 +85,6 @@ def main(): length_profile_map = {} for name in names: - msg = "Processing sample: {}".format(name) logger.info(msg) @@ -104,7 +103,6 @@ def main(): return for length, offset in zip(lengths, offsets): - mtx = filenames.get_riboseq_profiles( config["riboseq_data"], name, @@ -125,7 +123,6 @@ def main(): if args.add_ids: with gzip.open(args.out, "wb") as target_gz: - for length, mtx in length_profile_map.items(): mtx = mtx.tocoo() @@ -139,7 +136,6 @@ def main(): target_gz.write(s.encode()) else: with gzip.open(args.out, "wb") as target_gz: - for length, mtx in length_profile_map.items(): mtx = mtx.tocoo() diff --git a/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py b/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py index cd94246..e734692 100644 --- a/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py +++ b/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py @@ -98,7 +98,6 @@ def main(): job_ids = [] for name in names: - msg = "Processing sample: {}".format(name) logger.info(msg) diff --git a/src/rpbp/analysis/profile_construction/dashboard/rpbp_profile_construction_dashboard.py b/src/rpbp/analysis/profile_construction/dashboard/rpbp_profile_construction_dashboard.py index 8648ef1..83e2a07 100644 --- a/src/rpbp/analysis/profile_construction/dashboard/rpbp_profile_construction_dashboard.py +++ b/src/rpbp/analysis/profile_construction/dashboard/rpbp_profile_construction_dashboard.py @@ -44,6 +44,12 @@ # ------------------------------------------------------ Functions ------------------------------------------------------ +def parse_env(key): + return ( + {"default": os.environ.get(key)} if os.environ.get(key) else {"required": True} + ) + + def get_parser(): parser = argparse.ArgumentParser( description="Launch a Dash app for quality " @@ -52,8 +58,10 @@ def get_parser(): ) parser.add_argument( - "config", + "--config", + "-c", type=str, + **parse_env("RPBP_CFG"), help="A YAML configuration file." "The same used to run the pipeline.", ) @@ -63,13 +71,13 @@ def get_parser(): parser.add_argument("--port", type=int, default=8050, help="Port number.") - args = parser.parse_args() + # args = parser.parse_args() + args, unkn = parser.parse_known_args() return args.config, args.debug, args.host, args.port def get_diff_counts(data_np): - # add an extra column so the diff counts will work zeros = np.zeros((data_np.shape[0], 1)) data_np = np.append(zeros, data_np, axis=1) @@ -87,7 +95,6 @@ def get_window_counts( end_upstream_window, end_downstream_window, ): - # profile around start codon start_upstream = start_upstream_window - offset start_downstream = start_downstream_window - offset @@ -120,7 +127,6 @@ def get_profiles_bars( end_upstream_window, end_downstream_window, ): - start_window_size = start_downstream_window - start_upstream_window + 1 end_window_size = end_downstream_window - end_upstream_window + 1 @@ -128,7 +134,6 @@ def get_profiles_bars( metagene_profile_end = np.zeros(end_window_size) for sample in samples: - sample_name = reverse_sample_name_map[sample] # TODO: if we have the df with all lengths and offsets ready, we don't need to @@ -149,7 +154,6 @@ def get_profiles_bars( ) for length, offset in zip(lengths, offsets): - m_length = metagene_profiles["length"] == int(length) metagene_profile = metagene_profiles[m_length] @@ -174,7 +178,6 @@ def get_profiles_lines( end_upstream_window, end_downstream_window, ): - start_window_size = start_downstream_window - start_upstream_window + 1 end_window_size = end_downstream_window - end_upstream_window + 1 @@ -182,7 +185,6 @@ def get_profiles_lines( all_ribo_metagene_profile_end_df = pd.DataFrame(columns=range(end_window_size)) for sample in samples: - sample_name = reverse_sample_name_map[sample] # TODO: if we have the df with all lengths and offsets ready, we don't need to @@ -207,7 +209,6 @@ def get_profiles_lines( metagene_profile_end = np.zeros(end_window_size) for length, offset in zip(lengths, offsets): - m_length = metagene_profiles["length"] == int(length) metagene_profile = metagene_profiles[m_length] @@ -250,7 +251,6 @@ def metagene_plot_bars( y_max=None, layout=None, ): - indices = [(0, 1, 2), (1, 0, 2), (2, 1, 0)] start_window_size = start_downstream_window - start_upstream_window + 1 end_window_size = end_downstream_window - end_upstream_window + 1 @@ -327,7 +327,6 @@ def metagene_plot_lines( y_label, y_max=None, ): - start_window_size = start_downstream_window - start_upstream_window + 1 end_window_size = end_downstream_window - end_upstream_window + 1 @@ -626,7 +625,7 @@ def fig_to_uri(in_fig, close_all=True, **save_args): app = dash.Dash(__name__) # do we have to expose the flask variable in the file? -# server = app.server +server = app.server app.layout = html.Div( [ @@ -949,7 +948,6 @@ def fig_to_uri(in_fig, close_all=True, **save_args): [Input("drop_samples", "value"), Input("radio_stacked_reads", "value")], ) def stacked_reads(selected, zoom): - selected_samples = selected.split(",") stack_cts_order = STACK_CTS_ORDER @@ -992,7 +990,6 @@ def stacked_reads(selected, zoom): ], ) def funnel_reads(hoverData, zoom, selected): - stack_cts_order = STACK_CTS_ORDER funnel_cts_name = FUNNEL_CTS_NAME if zoom == 1: # exclude ribosomal/poor quality @@ -1059,7 +1056,6 @@ def update_available_samples(selected): [Input("dropdown_available_samples", "value"), Input("drop_unique", "value")], ) def read_length_min_max(sample, unique): - unique = ast.literal_eval(unique) m_unique = read_length_distributions.is_unique == unique m_sample = read_length_distributions.Sample == sample @@ -1081,7 +1077,6 @@ def read_length_min_max(sample, unique): ], ) def length_bars(sample, unique, log_y, slider): - unique = ast.literal_eval(unique) pal = pal_bars[2] if unique: @@ -1133,7 +1128,6 @@ def length_bars(sample, unique, log_y, slider): ], ) def read_length_metagene_bars(hoverData, sample): - # TODO: how to get a default value, i.e. not hoverData['points'][0] is None? m_sample = lengths_and_offsets.Sample == sample @@ -1271,7 +1265,6 @@ def read_length_metagene_bars(hoverData, sample): # percentage of reads in each frame @app.callback(Output("stacked_frames_fig", "figure"), Input("drop_samples", "value")) def stacked_frames(selected): - selected_samples = selected.split(",") df = frame_counts[frame_counts.Sample.isin(selected_samples)] @@ -1305,7 +1298,6 @@ def stacked_frames(selected): ], ) def all_metagenes(window_upstream, window_downstream, step, plot, selected): - # instead of defining a default value and override placeholder... if window_upstream is None: window_upstream = 3 diff --git a/src/rpbp/analysis/profile_construction/summarize_rpbp_profile_construction.py b/src/rpbp/analysis/profile_construction/summarize_rpbp_profile_construction.py index 142d597..c657efe 100644 --- a/src/rpbp/analysis/profile_construction/summarize_rpbp_profile_construction.py +++ b/src/rpbp/analysis/profile_construction/summarize_rpbp_profile_construction.py @@ -29,7 +29,6 @@ def get_read_filtering_summary(filename, args): - overwrite_str = "" if args.overwrite: overwrite_str = "--overwrite" @@ -48,7 +47,6 @@ def get_read_filtering_summary(filename, args): def get_read_length_distributions(sample, config, is_unique, note, args): - logging_str = logging_utils.get_logging_options_string(args) cpus_str = "--num-cpus {}".format(args.num_cpus) @@ -83,7 +81,6 @@ def get_read_length_distributions(sample, config, is_unique, note, args): def collect_all_read_length_distributions(sample, config, note, args): - # distribution for this sample read_length_distribution = filenames.get_riboseq_read_length_distribution( config["riboseq_data"], sample, note=note @@ -96,7 +93,6 @@ def collect_all_read_length_distributions(sample, config, note, args): def collect_lengths_and_offsets(sample, config, is_unique, note, args): - # not only the periodic lengths and offsets, but all # and add status information based on filters @@ -123,7 +119,6 @@ def collect_lengths_and_offsets(sample, config, is_unique, note, args): lengths, offsets, statuses = [], [], [] for length in range(min_read_length, max_read_length + 1): - # check which offset is used # select the row for this length mask_length = offsets_df["length"] == length @@ -194,7 +189,6 @@ def get_profile(sample, config, is_unique, note): def get_frame_counts(sample, config, is_unique, note): - msg = "{}: extracting frame counts".format(sample) logger.info(msg) @@ -221,7 +215,6 @@ def get_frame_counts(sample, config, is_unique, note): def create_fastqc_reports(sample_data, config, note, is_unique, args): - sample, raw_data = sample_data msg = "{}: creating fastqc reports".format(sample) logger.info(msg) @@ -509,7 +502,6 @@ def main(): ) if args.create_fastqc_reports: - msg = "Calling FastQC..." logger.info(msg) diff --git a/src/rpbp/analysis/rpbp_predictions/dashboard/rpbp_predictions_dashboard.py b/src/rpbp/analysis/rpbp_predictions/dashboard/rpbp_predictions_dashboard.py index 06a0859..0d57b57 100644 --- a/src/rpbp/analysis/rpbp_predictions/dashboard/rpbp_predictions_dashboard.py +++ b/src/rpbp/analysis/rpbp_predictions/dashboard/rpbp_predictions_dashboard.py @@ -17,11 +17,19 @@ import pandas as pd import plotly.express as px +import rpbp.ribo_utils.utils as ribo_utils + from rpbp.defaults import orf_type_colors, orf_type_labels, orf_type_name_map # ------------------------------------------------------ Functions ------------------------------------------------------ +def parse_env(key): + return ( + {"default": os.environ.get(key)} if os.environ.get(key) else {"required": True} + ) + + def get_parser(): parser = argparse.ArgumentParser( description="Launch a Dash app to visualize " @@ -29,8 +37,10 @@ def get_parser(): ) parser.add_argument( - "config", + "--config", + "-c", type=str, + **parse_env("RPBP_CFG"), help="A YAML configuration file." "The same used to run the pipeline.", ) @@ -40,13 +50,13 @@ def get_parser(): parser.add_argument("--port", type=int, default=8050, help="Port number.") - args = parser.parse_args() + # args = parser.parse_args() + args, unkn = parser.parse_known_args() return args.config, args.debug, args.host, args.port def fmt_tooltip(row): - conditions = row.condition.split("|") bayes_factor_mean = row.bayes_factor_mean.split("|") bayes_factor_var = row.bayes_factor_var.split("|") @@ -63,7 +73,6 @@ def fmt_tooltip(row): def get_orf_type_counts(condition): - orf_type_counts = condition.groupby(["orf_type", "strand"]).size() orf_type_counts = orf_type_counts.reset_index(name="count") @@ -71,7 +80,6 @@ def get_orf_type_counts(condition): def filter_sort_table(filter_query, sort_by): - filtering_expressions = filter_query.split(" && ") df = display_table.copy() @@ -148,6 +156,14 @@ def filter_sort_table(filter_query, sort_by): **Novel**: Translation event inter- or intragenic (only when Rp-Bp is run with a *de novo* assembly) """ +# ribo_utils._return_key_dict +sample_name_map = ribo_utils.get_sample_name_map( + config +) # default to riboseq_samples.keys() +condition_name_map = ribo_utils.get_condition_name_map( + config +) # default to riboseq_biological_replicates.keys() + col_rev = {v: k for k, v in orf_type_colors.items()} row_col = {} for orf_type, labels in orf_type_labels.items(): @@ -158,6 +174,9 @@ def filter_sort_table(filter_query, sort_by): # *** load/wrangle data orfs = pd.read_csv(config["predicted_orfs"], sep="\t", low_memory=False) # bed_utils orfs.columns = orfs.columns.str.replace("#", "") +orfs["condition"] = orfs["condition"].apply(lambda x: sample_name_map[x]) +# apply condition name map, in case we also have conditions +orfs["condition"] = orfs["condition"].apply(lambda x: condition_name_map[x]) orfs["orf_len"] = orfs["orf_len"] / 3 orfs["profile_sum"] = orfs[["x_1_sum", "x_2_sum", "x_3_sum"]].sum(axis=1) orfs["profile_sum"] = orfs["profile_sum"].astype(int) @@ -438,10 +457,18 @@ def split_filter_part(filter_part): # ------------------------------------------------------ APP ------------------------------------------------------ app = dash.Dash(__name__) +server = app.server + +# we need this for serving the app +base_url = ( + os.environ["DASH_URL_BASE_PATHNAME"] + if "DASH_URL_BASE_PATHNAME" in os.environ + else "" +) -@app.server.route("/data/", defaults={"suffix": None}) -@app.server.route("/data//") +@app.server.route(f"{base_url}/data/", defaults={"suffix": None}) +@app.server.route(f"{base_url}/data//") def config_data(configval, suffix): """Serve the file specified for the given key in the configuration. Potentially apply a suffix for derived files like an index. @@ -874,7 +901,6 @@ def refmt(trx): State("circos_fig", "tracks"), ) def hist_orf_type(value, current): - tracks_config = { "innerRadius": circos_innerRadius, "outerRadius": circos_outerRadius, @@ -909,7 +935,6 @@ def hist_orf_type(value, current): prevent_initial_call=True, ) def func(n_clicks, sort_by, filter_query): # table_data - # df = pd.DataFrame.from_dict(table_data) changed_inputs = [x["prop_id"] for x in ctx.triggered] if "btn_csv.n_clicks" in changed_inputs: diff --git a/src/rpbp/analysis/rpbp_predictions/summarize_rpbp_predictions.py b/src/rpbp/analysis/rpbp_predictions/summarize_rpbp_predictions.py index 049326f..0b2301f 100644 --- a/src/rpbp/analysis/rpbp_predictions/summarize_rpbp_predictions.py +++ b/src/rpbp/analysis/rpbp_predictions/summarize_rpbp_predictions.py @@ -92,7 +92,6 @@ def get_predictions_file( config, ftype="predictions", ): - if is_sample: try: lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets( @@ -169,7 +168,6 @@ def cut_it(x, seqname, karyotype, args): def get_circos_graph(orfs, sub_folder, config, args): - orf_types = orfs.orf_type.unique() df = orfs.drop_duplicates(subset=bed_utils.bed12_field_names).copy() df.rename(columns={"seqname": "block_id"}, inplace=True) @@ -242,7 +240,6 @@ def add_data( is_filtered, config, ): - orfs_file = get_predictions_file( name, is_sample, @@ -261,7 +258,6 @@ def add_data( def get_bed_blocks(row): - # convert genomic coordinates to bed blocks # ad-hoc for standardized ORFs @@ -298,7 +294,6 @@ def get_bed_blocks(row): def get_standardized_orfs(filen, sheet): - # ad-hoc for standardized ORFs colmap = [ @@ -318,7 +313,6 @@ def get_standardized_orfs(filen, sheet): def _create_figures(name_pretty_name_is_sample, config, args): - name, pretty_name, is_sample = name_pretty_name_is_sample is_unique = not config.get("keep_riboseq_multimappers", False) @@ -404,7 +398,6 @@ def _create_figures(name_pretty_name_is_sample, config, args): def create_all_figures(config, sample_name_map, condition_name_map, args): - is_sample = True sample_names = sorted(config["riboseq_samples"].keys()) samples = [(name, sample_name_map[name], is_sample) for name in sample_names] @@ -694,6 +687,10 @@ def main(): bed_df_dn = bed_utils.read_bed(transcript_bed_dn, low_memory=False)[cols] bed_df_dn.rename(columns={"id": "transcript_id"}, inplace=True) bed_df = pd.concat([bed_df, bed_df_dn]) + # now we have the problem that bed_df may contain duplicate transcript ids + # for the purpose of display/annotation, we favour the annotated one + # in general the novel transcript will be a variant (matched introns, extension, etc.) + bed_df.drop_duplicates(subset=["transcript_id"], inplace=True) labeled_orfs = filenames.get_labels( config["genome_base_path"], config["genome_name"], note=orf_note diff --git a/src/rpbp/analysis/rpbp_predictions/visualize_orf_type_metagene_profiles.py b/src/rpbp/analysis/rpbp_predictions/visualize_orf_type_metagene_profiles.py index e156189..854f7a9 100644 --- a/src/rpbp/analysis/rpbp_predictions/visualize_orf_type_metagene_profiles.py +++ b/src/rpbp/analysis/rpbp_predictions/visualize_orf_type_metagene_profiles.py @@ -31,7 +31,6 @@ def get_windows(profile): - profile = profile / np.max(profile) orf_len = len(profile) @@ -66,7 +65,6 @@ def get_profile(orf, profiles, min_profile): def plot_windows(windows, title, out): - if len(windows) == 0: msg = "Did not find any windows for: {}".format(title) logger.warning(msg) diff --git a/src/rpbp/orf_profile_construction/create_base_genome_profile.py b/src/rpbp/orf_profile_construction/create_base_genome_profile.py index 8311426..d682566 100644 --- a/src/rpbp/orf_profile_construction/create_base_genome_profile.py +++ b/src/rpbp/orf_profile_construction/create_base_genome_profile.py @@ -253,11 +253,26 @@ def main(): # rename STAR output to that expected by the pipeline genome_star_bam = Path(genome_star_bam) - genome_star_bam.replace(genome_sorted_bam) + # otherwise check and replace + if genome_star_bam.is_file(): + if not Path(genome_sorted_bam).is_file() or args.overwrite: + genome_star_bam.replace(genome_sorted_bam) + msg = f"Replacing {genome_star_bam.as_posix()} by {genome_sorted_bam}" + logger.info(msg) + else: + msg = ( + f"Both files {genome_star_bam.as_posix()} and {genome_sorted_bam} " + f"already exist! You should re-run using [--overwrite] to avoid " + f"data corruption!" + ) + logger.error(msg) + else: + # re-run w/o overwrite + msg = f"The file {genome_sorted_bam} already exists. Continuing..." + logger.warning(msg) # create the bamtools index - cmd = "samtools index -b {}".format(genome_sorted_bam) - shell_utils.check_call(cmd, call=call) + bam_utils.index_bam_file(genome_sorted_bam, args) # check if we want to keep multimappers if config.get("keep_riboseq_multimappers", False): diff --git a/src/rpbp/orf_profile_construction/create_orf_profiles.py b/src/rpbp/orf_profile_construction/create_orf_profiles.py index f5fc6e6..8856b17 100644 --- a/src/rpbp/orf_profile_construction/create_orf_profiles.py +++ b/src/rpbp/orf_profile_construction/create_orf_profiles.py @@ -40,7 +40,6 @@ def main(): - parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="""This script runs all of the processing necessary to diff --git a/src/rpbp/orf_profile_construction/estimate_metagene_profile_bayes_factors.py b/src/rpbp/orf_profile_construction/estimate_metagene_profile_bayes_factors.py index c21c593..1ceb487 100644 --- a/src/rpbp/orf_profile_construction/estimate_metagene_profile_bayes_factors.py +++ b/src/rpbp/orf_profile_construction/estimate_metagene_profile_bayes_factors.py @@ -41,7 +41,6 @@ def estimate_marginal_likelihoods( signal, periodic_models, nonperiodic_models, iterations, chains, seed ): - # construct the input for the models x_1 = signal[0::3] x_2 = signal[1::3] @@ -93,7 +92,6 @@ def estimate_marginal_likelihoods( def estimate_profile_bayes_factors(profile, args): - # logging cmdstanpy_logger.disabled = True if args.enable_ext_logging: diff --git a/src/rpbp/orf_profile_construction/extract_metagene_profiles.py b/src/rpbp/orf_profile_construction/extract_metagene_profiles.py index 6353179..1091d94 100644 --- a/src/rpbp/orf_profile_construction/extract_metagene_profiles.py +++ b/src/rpbp/orf_profile_construction/extract_metagene_profiles.py @@ -34,7 +34,6 @@ def get_interval_df(start, end, seqname, strand): - interval_df = pd.DataFrame() interval_df["start"] = start interval_df["end"] = end @@ -47,7 +46,6 @@ def get_interval_df(start, end, seqname, strand): def get_length_strand_profiles(matches, profile_length): - init = lambda: np.zeros(profile_length, int) length_strand_profiles = collections.defaultdict(init) @@ -68,7 +66,6 @@ def get_length_strand_profiles(matches, profile_length): def get_metagene_profile_df( length, type_label, length_strand_profiles, upstream, downstream ): - reverse_metagene_profile = length_strand_profiles[(length, "-")] forward_metagene_profile = length_strand_profiles[(length, "+")] @@ -89,7 +86,6 @@ def get_metagene_profile_df( def main(): - parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="""This script extracts the metagene profile diff --git a/src/rpbp/orf_profile_construction/extract_orf_profiles.py b/src/rpbp/orf_profile_construction/extract_orf_profiles.py index 52b2a67..2b05fad 100644 --- a/src/rpbp/orf_profile_construction/extract_orf_profiles.py +++ b/src/rpbp/orf_profile_construction/extract_orf_profiles.py @@ -30,7 +30,6 @@ def get_p_site_intersections(seqname, strand, p_sites, exons_df): - # only the things in the right direction, etc. m_exons_seqname = exons_df["seqname"] == seqname m_p_sites_seqname = p_sites["seqname"] == seqname diff --git a/src/rpbp/orf_profile_construction/select_periodic_offsets.py b/src/rpbp/orf_profile_construction/select_periodic_offsets.py index 8377440..b541449 100644 --- a/src/rpbp/orf_profile_construction/select_periodic_offsets.py +++ b/src/rpbp/orf_profile_construction/select_periodic_offsets.py @@ -11,7 +11,6 @@ def get_most_periodic_offset(profile_df): - # mask_largest_count = profile_df['profile_sum'] == profile_df['profile_sum'].max() # largest_count_row = profile_df[mask_largest_count].iloc[0] m_highest_peak = profile_df["profile_peak"] == profile_df["profile_peak"].max() diff --git a/src/rpbp/reference_preprocessing/label_orfs.py b/src/rpbp/reference_preprocessing/label_orfs.py index 4942970..1c231e7 100644 --- a/src/rpbp/reference_preprocessing/label_orfs.py +++ b/src/rpbp/reference_preprocessing/label_orfs.py @@ -331,7 +331,6 @@ def main(): # transcript structure (i.e start upstream but otherwise # have the same structure). if args.nonoverlapping_label is None: - transcript_matches = bed_utils.get_bed_overlaps( annotated_exons, extracted_orf_exons, min_b_overlap=1 ) @@ -345,7 +344,6 @@ def main(): } else: - extended_match_ids = { m.b_info for m in extended_matches @@ -414,7 +412,6 @@ def main(): trailer_match_pairs = {(m.a_info, m.b_info) for m in trailer_matches} if args.nonoverlapping_label is None: - # For standard assembly, we also need to make sure that # all overlap matches are fully contained within the # transcript structure. @@ -491,7 +488,6 @@ def main(): logger.info(msg) else: - overlap_ids = {m.b_info for m in out_of_frame_matches} overlap_ids |= {m.b_info for m in leader_matches} overlap_ids |= {m.b_info for m in trailer_matches} diff --git a/src/rpbp/reference_preprocessing/prepare_rpbp_genome.py b/src/rpbp/reference_preprocessing/prepare_rpbp_genome.py index 258131e..24fbbcc 100644 --- a/src/rpbp/reference_preprocessing/prepare_rpbp_genome.py +++ b/src/rpbp/reference_preprocessing/prepare_rpbp_genome.py @@ -16,6 +16,8 @@ from pathlib import Path +import pandas as pd + import pbiotools.misc.logging_utils as logging_utils import pbiotools.misc.shell_utils as shell_utils import pbiotools.misc.slurm as slurm @@ -38,6 +40,11 @@ logger = logging.getLogger(__name__) +class DuplicateIdsError(Exception): + def __init__(self, message): + self.message = message + + def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False): """Process a GTF file into its ORFs.""" @@ -333,11 +340,20 @@ def main(): logger.info(msg) if call: - concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True) - concatenated_bed["orf_num"] = range(len(concatenated_bed)) + concatenated_orfs = bed_utils.concatenate(orfs_files, sort_bed=True) + # this can happen... and is not currently well handled + if not concatenated_orfs.id.is_unique: + msg = ( + "Duplicate ORF ids were found when merging annotated and de novo ORFs. " + "This is due to matching transcript ids and start/stop boundaries. " + "Check you de novo annotation, and remove (or rename) these transcripts." + ) + logger.error(msg) + raise DuplicateIdsError(msg) + concatenated_orfs["orf_num"] = range(len(concatenated_orfs)) additional_columns = ["orf_num", "orf_len"] fields = bed_utils.bed12_field_names + additional_columns - bed_utils.write_bed(concatenated_bed[fields], orfs_genomic) + bed_utils.write_bed(concatenated_orfs[fields], orfs_genomic) else: msg = "Skipping concatenation due to --call value" logger.info(msg) @@ -359,9 +375,9 @@ def main(): logger.info(msg) if call: - concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True) + concatenated_exons = bed_utils.concatenate(exons_files, sort_bed=True) fields = bed_utils.bed6_field_names + ["exon_index", "transcript_start"] - bed_utils.write_bed(concatenated_bed[fields], exons_file) + bed_utils.write_bed(concatenated_exons[fields], exons_file) else: msg = "Skipping concatenation due to --call value" logger.info(msg) @@ -384,8 +400,17 @@ def main(): if call: # not a BED file - concatenated_bed = bed_utils.concatenate(label_files, sort_bed=False) - bed_utils.write_bed(concatenated_bed, labeled_orfs) + concatenated_labels = bed_utils.concatenate(label_files, sort_bed=False)[ + ["id", "orf_type", "transcripts"] + ] + # make sure the orf numbering is the same + concatenated_labels = pd.merge( + concatenated_labels, + concatenated_orfs[["id", "orf_num"]], + how="left", + on="id", + ) + bed_utils.write_bed(concatenated_labels, labeled_orfs) else: msg = "Skipping concatenation due to --call value" logger.info(msg) diff --git a/src/rpbp/ribo_utils/filenames.py b/src/rpbp/ribo_utils/filenames.py index 8bf45b7..0472c23 100644 --- a/src/rpbp/ribo_utils/filenames.py +++ b/src/rpbp/ribo_utils/filenames.py @@ -106,7 +106,6 @@ def get_riboseq_base( is_filtered=False, note=None, ): - unique = get_unique_string(is_unique) l = get_length_string(length) o = get_offset_string(offset) @@ -122,14 +121,12 @@ def get_riboseq_base( def get_riboseq_bam_base(riboseq_base, name, **kwargs): - bam_base = get_riboseq_base(riboseq_base, name, "without-rrna-mapping", **kwargs) return bam_base def get_riboseq_bam(riboseq_base, name, **kwargs): - s = get_riboseq_bam_base(riboseq_base, name, **kwargs) s = s + ".bam" return s @@ -147,7 +144,6 @@ def get_riboseq_bam_fastqc_data( note=None, is_chisq=False, ): - unique = get_unique_string(is_unique) l = get_length_string(length) n = get_note_string(note) @@ -166,7 +162,6 @@ def get_bed( is_annotated=False, is_de_novo=False, ): - c = get_annotated_string(is_annotated) d = get_de_novo_string(is_de_novo) fn = "{}{}{}.bed.gz".format(name, c, d) @@ -174,7 +169,6 @@ def get_bed( def get_riboseq_bayes_factors(riboseq_base, name, **kwargs): - s = get_riboseq_base(riboseq_base, name, "orf-predictions", **kwargs) s = s + ".bayes-factors.bed.gz" @@ -185,7 +179,6 @@ def get_riboseq_bayes_factors(riboseq_base, name, **kwargs): def get_riboseq_cell_type_protein(riboseq_base, name, **kwargs): - s = get_riboseq_base(riboseq_base, name, "cell-types", **kwargs) s = s + ".predicted-orfs.protein.fa" return s @@ -195,7 +188,6 @@ def get_riboseq_cell_type_protein(riboseq_base, name, **kwargs): def get_exons(base_path, name, is_annotated=False, is_de_novo=False, note=None): - note_str = get_note_string(note) c = get_annotated_string(is_annotated) d = get_de_novo_string(is_de_novo) @@ -204,7 +196,6 @@ def get_exons(base_path, name, is_annotated=False, is_de_novo=False, note=None): def get_labels(base_path, name, is_annotated=False, is_de_novo=False, note=None): - note_str = get_note_string(note) c = get_annotated_string(is_annotated) d = get_de_novo_string(is_de_novo) @@ -260,7 +251,6 @@ def get_riboseq_frame_counts(riboseq_base, name, **kwargs): def get_gtf( config, ): - if "de_novo_gtf" in config: base_path = config["genome_base_path"] name = config["genome_name"] @@ -274,21 +264,18 @@ def get_gtf( def get_metagene_profile_image(base, name, image_type="eps", **kwargs): - s = get_riboseq_base(base, name, "metagene-profiles", **kwargs) s = s + "." + image_type return s def get_metagene_profiles(riboseq_base, name, **kwargs): - s = get_riboseq_base(riboseq_base, name, "metagene-profiles", **kwargs) s = s + ".metagene-profile.csv.gz" return s def get_metagene_profiles_bayes_factors(riboseq_base, name, **kwargs): - s = get_riboseq_base(riboseq_base, name, "metagene-profiles", **kwargs) s = s + ".metagene-periodicity-bayes-factors.csv.gz" return s @@ -315,7 +302,6 @@ def get_models(models_base, model_type): def get_metagene_profile_bayes_factor_image( riboseq_base, name, image_type="eps", **kwargs ): - s = get_riboseq_base(riboseq_base, name, "metagene-profiles", **kwargs) s = s + ".bayes-factors." + image_type return s @@ -344,7 +330,6 @@ def get_orf_type_profile_base( is_chisq=False, subfolder="orf-predictions", ): - subfolder = os.path.join(subfolder, "plots") s = get_riboseq_base( riboseq_base, @@ -370,7 +355,6 @@ def get_orf_type_profile_image(base_path, orf_type, strand, image_type="eps"): def get_periodic_offsets(riboseq_base, name, **kwargs): - sub_folder = kwargs.pop("sub_folder", "metagene-profiles") s = get_riboseq_base(riboseq_base, name, sub_folder, **kwargs) s = s + ".periodic-offsets.csv.gz" @@ -378,7 +362,6 @@ def get_periodic_offsets(riboseq_base, name, **kwargs): def get_riboseq_peptide_matches(riboseq_base, name, peptide_name, **kwargs): - n = "{}-{}".format(name, peptide_name) s = get_riboseq_base(riboseq_base, n, "peptide-matches", **kwargs) @@ -394,21 +377,18 @@ def get_riboseq_predicted_orfs(riboseq_base, name, **kwargs): def get_riboseq_predicted_orfs_dna(riboseq_base, name, **kwargs): - s = get_riboseq_base(riboseq_base, name, "orf-predictions", **kwargs) s = s + ".predicted-orfs.dna.fa" return s def get_riboseq_predicted_orfs_protein(riboseq_base, name, **kwargs): - s = get_riboseq_base(riboseq_base, name, "orf-predictions", **kwargs) s = s + ".predicted-orfs.protein.fa" return s def get_riboseq_profiles(riboseq_base, name, **kwargs): - s = get_riboseq_base(riboseq_base, name, "orf-profiles", **kwargs) s = s + ".profiles.mtx.gz" return s @@ -441,7 +421,6 @@ def get_raw_data_fastqc_path(base_path): def get_raw_data_fastqc_data(base_path, filename): - name = get_fastqc_name(filename) fastqc_folder = "{}_fastqc".format(name) rdp = get_raw_data_fastqc_path(base_path) @@ -467,7 +446,6 @@ def get_transcript_fasta( is_annotated=False, is_de_novo=False, ): - c = get_annotated_string(is_annotated) d = get_de_novo_string(is_de_novo) fn = "{}.transcripts{}{}.fa".format(name, c, d) diff --git a/src/rpbp/ribo_utils/utils.py b/src/rpbp/ribo_utils/utils.py index 3513ced..a691f4f 100644 --- a/src/rpbp/ribo_utils/utils.py +++ b/src/rpbp/ribo_utils/utils.py @@ -22,12 +22,10 @@ def __missing__(self, key): def get_transcript_id(orf_id, sep="_"): - return orf_id.split(sep)[0] def get_all_transcript_ids(orfs, sep="_", num_cpus=1, progress_bar=False): - import pbiotools.misc.parallel as parallel transcript_ids = parallel.apply_parallel_iter( @@ -43,7 +41,6 @@ def get_all_transcript_ids(orfs, sep="_", num_cpus=1, progress_bar=False): def get_riboseq_replicates(config): - if "riboseq_biological_replicates" in config: if config["riboseq_biological_replicates"] is not None: msg = "Found 'riboseq_biological_replicates' key in config file" @@ -131,7 +128,6 @@ def get_periodic_lengths_and_offsets( is_unique=True, default_params=None, ): - """This function applies a set of filters to metagene profiles to select those which are "periodic" based on the read counts and Bayes factor estimates. @@ -469,7 +465,6 @@ def smooth_profile( reweighting_iterations=translation_options["smoothing_reweighting_iterations"], fraction=translation_options["smoothing_fraction"], ): - """This function smoothes the given ORF profile using the frame-specific approach. It assumes the profile is a dense numpy array and that any filtering due to differences of counts in reading frames, lengths, etc., @@ -592,7 +587,6 @@ def get_bf_filter( max_bf_var=translation_options["max_bf_var"], min_bf_likelihood=translation_options["min_bf_likelihood"], ): - """This function applies filters to the Bayes factor estimates to find all ORFs which should be predicted as translated. This does not consider the length and profile sums, so this filter would need to be combined with diff --git a/src/rpbp/run_all_rpbp_instances.py b/src/rpbp/run_all_rpbp_instances.py index 192e9ed..7c648b5 100644 --- a/src/rpbp/run_all_rpbp_instances.py +++ b/src/rpbp/run_all_rpbp_instances.py @@ -215,7 +215,6 @@ def main(): merge_replicates_str = "--merge-replicates" for condition_name in sorted(riboseq_replicates.keys()): - # then we predict the ORFs cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {} {}".format( args.config, diff --git a/src/rpbp/translation_prediction/estimate_orf_bayes_factors.py b/src/rpbp/translation_prediction/estimate_orf_bayes_factors.py index 1cf7a4f..5b99644 100644 --- a/src/rpbp/translation_prediction/estimate_orf_bayes_factors.py +++ b/src/rpbp/translation_prediction/estimate_orf_bayes_factors.py @@ -207,7 +207,6 @@ def get_bayes_factor(profile, translated_models, untranslated_models, args): def get_all_bayes_factors(orfs): - """This function calculates the Bayes' factor term for each region in regions. See the description of the script for the Bayes' factor calculations. diff --git a/src/rpbp/translation_prediction/predict_translated_orfs.py b/src/rpbp/translation_prediction/predict_translated_orfs.py index a78145f..a753cfd 100644 --- a/src/rpbp/translation_prediction/predict_translated_orfs.py +++ b/src/rpbp/translation_prediction/predict_translated_orfs.py @@ -64,7 +64,6 @@ def get_profile(name, config, args): def main(): - parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=""""This script runs the second part of the pipeline: @@ -332,7 +331,6 @@ def main(): if args.write_unfiltered: filters.append(False) for is_filtered in filters: - filtered_str = "" if is_filtered: filtered_str = "--select-longest-by-stop --select-best-overlapping" diff --git a/src/rpbp/translation_prediction/select_final_prediction_set.py b/src/rpbp/translation_prediction/select_final_prediction_set.py index f539d1d..92a8d84 100644 --- a/src/rpbp/translation_prediction/select_final_prediction_set.py +++ b/src/rpbp/translation_prediction/select_final_prediction_set.py @@ -29,7 +29,6 @@ def get_best_overlapping_orf(merged_ids, predicted_orfs): - if len(merged_ids) == 1: m_id = predicted_orfs["id"] == merged_ids[0] return predicted_orfs[m_id].iloc[0] @@ -192,7 +191,6 @@ def main(): logger.info(msg) if args.select_best_overlapping: - msg = "Finding overlapping ORFs" logger.info(msg) diff --git a/tests/regression/conftest.py b/tests/regression/conftest.py index 33a6679..38d0754 100644 --- a/tests/regression/conftest.py +++ b/tests/regression/conftest.py @@ -13,7 +13,6 @@ @pytest.fixture(scope="session") def data_loc(tmp_path_factory): - """\ Download reference dataset for regression testing. @@ -41,7 +40,6 @@ def data_loc(tmp_path_factory): @pytest.fixture(scope="session") def getf_config(data_loc): - """\ Set configuration file. @@ -77,7 +75,6 @@ def getf_config(data_loc): @pytest.fixture(scope="session") def get_genome(getf_config): - """\ Run `prepare-rpbp-genome`. @@ -107,7 +104,6 @@ def get_genome(getf_config): @pytest.fixture(scope="session") def getf_genome(get_genome): - """\ Get all the Rp-Bp outpout file names for the reference genome indices, for the current output @@ -179,7 +175,6 @@ def getf_genome(get_genome): @pytest.fixture(scope="session") def get_pipeline(getf_config): - """\ Run `run-all-rpbp-instances`. @@ -213,7 +208,6 @@ def get_pipeline(getf_config): @pytest.mark.depends(on=["getf_genome"]) @pytest.fixture(scope="session") def getf_pipeline(get_pipeline): - """\ Get all the Rp-Bp outpout file names for the ORF periodicity estimates, ORF profiles, diff --git a/tests/regression/test_rpbp.py b/tests/regression/test_rpbp.py index fc70a4c..177fa89 100644 --- a/tests/regression/test_rpbp.py +++ b/tests/regression/test_rpbp.py @@ -30,7 +30,6 @@ def test_pipeline_part1(getf_genome): # test output of `run-all-rpbp-instances` @pytest.mark.depends(on=["test_pipeline_part1"]) def test_pipeline_part2(getf_pipeline): - from pbiotools.utils.fastx_utils import get_fasta_dict # deterministic output - profiles should match exactly @@ -43,7 +42,6 @@ def test_pipeline_part2(getf_pipeline): # periodic-offsets - compare only the periodic lengths for file, ref_file in periodics: - file_df = to_df(file[0]) file_lengths = file[1] @@ -82,7 +80,6 @@ def test_pipeline_part2(getf_pipeline): "profile_sum", ] for file, ref_file in predictions: - file_df = to_df(file) ref_file_df = to_df(ref_file)