resolwe_bio/processes/expression/cuffquant.yml

# ======================
# Expression - Cuffquant
# ======================
---

# Process definition for Cuffquant: declares the execution requirements and
# the metadata of the process whose inputs, outputs and script follow below.
- slug: cuffquant
  name: Cuffquant 2.2
  requirements:
    # Templated values in this entry are written as Jinja expressions.
    expression-engine: jinja
    executor:
      docker:
        image: public.ecr.aws/genialis/resolwebio/rnaseq:6.0.0
    resources:
      # Also handed to cuffquant as --num-threads by the run script.
      cores: 4
  # Name derived from the alignment input; the Jinja `default` filter
  # substitutes '?' when no name is available.
  data_name: "{{ alignment|name|default('?') }}"
  # NOTE(review): presumably the version of this process definition rather
  # than of the Cuffquant tool named above - confirm.
  version: 2.3.1
  type: data:cufflinks:cuffquant
  category: Quantify
  flow_collection: sample
  persistence: CACHED
  # Literal block scalar: line breaks below are part of the stored text.
  description: |
    Cuffquant allows you to compute the gene and transcript expression
    profiles and save these profiles to files that you can analyze later with
    Cuffdiff or Cuffnorm. See
    [here](http://cole-trapnell-lab.github.io/cufflinks/manual/) for more
    information.
  # Inputs of the process. `alignment` and `annotation` are mandatory data
  # inputs; `genome` and `mask_file` are optional and only add a command-line
  # flag to cuffquant when they are provided.
  input:
    # BAM file to quantify; also the origin of the output file name and of
    # the saved species/build values.
    - name: alignment
      label: Aligned reads
      type: data:alignment:bam
    # Reference annotation passed to cuffquant as a positional argument.
    - name: annotation
      label: Annotation (GTF/GFF3)
      type: data:annotation
    # Optional genome sequence; when set, enables --frag-bias-correct.
    - name: genome
      label: Run bias detection and correction algorithm
      type: data:seq:nucleotide
      required: false
      description: |
        Provide Cufflinks with a multifasta file (genome file) via this
        option to instruct it to run a bias detection and correction
        algorithm which can significantly improve accuracy of transcript
        abundance estimates.
    # Optional GTF of transcripts to ignore; when set, passed with -M.
    - name: mask_file
      label: Mask file
      type: data:annotation:gtf
      required: false
      description: |
        Ignore all reads that could have come from transcripts in this
        GTF file. We recommend including any annotated rRNA,
        mitochondrial transcripts and other abundant transcripts you wish to
        ignore in your analysis in this file. Due to variable efficiency
        of mRNA enrichment methods and rRNA depletion kits, masking
        these transcripts often improves the overall robustness of
        transcript abundance estimates.
    # Forwarded verbatim to cuffquant as --library-type; restricted to the
    # three choices listed below.
    - name: library_type
      label: Library type
      type: basic:string
      description: |
        In cases where Cufflinks cannot determine the platform and
        protocol used to generate input reads, you can supply this
        information manually, which will allow Cufflinks to infer source
        strand information with certain protocols. The available options
        are listed below. For paired-end data, we currently only support
        protocols where reads point towards each other:
        fr-unstranded - Reads from the left-most end of the fragment
        (in transcript coordinates) map to the transcript strand, and
        the right-most end maps to the opposite strand; fr-firststrand
        - Same as above except we enforce the rule that the right-most
        end of the fragment (in transcript coordinates) is the first
        sequenced (or only sequenced for single-end reads).
        Equivalently, it is assumed that only the strand generated
        during first strand synthesis is sequenced; fr-secondstrand -
        Same as above except we enforce the rule that the left-most end
        of the fragment (in transcript coordinates) is the first
        sequenced (or only sequenced for single-end reads).
        Equivalently, it is assumed that only the strand generated
        during second strand synthesis is sequenced.
      default: fr-unstranded
      choices:
        - label: fr-unstranded
          value: fr-unstranded
        - label: fr-firststrand
          value: fr-firststrand
        - label: fr-secondstrand
          value: fr-secondstrand
    # When true, adds --multi-read-correct to the cuffquant command line.
    - name: multi_read_correct
      label: Do initial estimation procedure to more accurately weight reads with multiple genome mappings
      type: basic:boolean
      default: false
      description: |
        Run an initial estimation procedure that weights reads mapping
        to multiple locations more accurately.
  # Outputs of the process; each field is filled by a re-save / re-save-file
  # call at the end of the run script.
  output:
    # The abundances.cxb file, renamed after the input BAM file.
    - name: cxb
      label: Abundances (.cxb)
      type: basic:file
    # Copied from the `source` value of the annotation input.
    - name: source
      label: Gene ID database
      type: basic:string
    # Copied from the `species` value of the alignment input.
    - name: species
      label: Species
      type: basic:string
    # Copied from the `build` value of the alignment input.
    - name: build
      label: Build
      type: basic:string
  # Execution section. The program is a Bash script containing Jinja
  # expressions and conditionals. Steps:
  #   1. derive the output base name from the input BAM file name,
  #   2. run cuffquant, adding the optional flags selected by the inputs,
  #   3. rename abundances.cxb and save it together with the
  #      source/species/build values as process outputs.
  # NOTE(review): template values are intentionally left unquoted, as in the
  # previous revision; presumably the expression engine shell-escapes them -
  # confirm before adding shell quotes around them.
  # NOTE(review): the mask file is read through the `annot` field, the same
  # field used for the `annotation` input, since data:annotation:gtf is a
  # subtype of data:annotation - confirm against the GTF upload process.
  run:
    runtime: polyglot
    language: bash
    program: |
      NAME=$(basename {{ alignment.bam.file }} .bam)

      cuffquant \
        {% if genome %} --frag-bias-correct {{ genome.fasta.file }} {% endif %} \
        {% if multi_read_correct %} --multi-read-correct {% endif %} \
        {% if mask_file %} -M {{ mask_file.annot.file }} {% endif %} \
        --library-type {{ library_type }} \
        --num-threads {{requirements.resources.cores}} \
        --quiet \
        {{ annotation.annot.file }} \
        {{ alignment.bam.file }}
      re-checkrc "Cuffquant analysis failed."

      mv abundances.cxb "${NAME}.cxb"
      re-save-file cxb "${NAME}.cxb"
      re-save source {{ annotation.source }}
      re-save species {{ alignment.species }}
      re-save build {{ alignment.build }}