resolwe_bio/processes/alignment/subread.yml

# ================
# Aligner - Subread
# ================
---

- slug: alignment-subread
  name: Subread
  requirements:
    expression-engine: jinja
    executor:
      docker:
        image: resolwebio/legacy:latest
  data_name: "Alignment ({{ reads.fastq.0.file|basename|default('?') }})"
  version: 0.0.7
  type: data:alignment:bam:subread
  category: analyses:alignment
  flow_collection: sample
  persistence: CACHED
  description: >
    An accurate and efficient aligner for mapping both genomic DNA-seq reads and RNA-seq reads (for the purpose of expression analysis).
  input:
    - name: genome
      label: Reference genome
      type: data:genome:fasta
    - name: reads
      label: Reads
      type: data:reads:fastq
    - name: options
      label: Options
      group:
        - name: indel
          label: Number of INDEL bases
          type: basic:integer
          required: false
          default: 5
          description: >
            Specify the number of INDEL bases allowed in the mapping.
        - name: consensus
          label: Consensus threshold
          type: basic:integer
          required: false
          default: 3
          description: >
            Specify the consensus threshold, which is the minimal number of consensus subreads required for reporting a hit.
        - name: mis_matched_bp
          label: Max number of mis-matched bases
          type: basic:integer
          required: false
          default: 3
          description: >
            Specify the maximum number of mis-matched bases allowed in the alignment.
        - name: cpu_number
          label: Number of threads/CPUs
          type: basic:integer
          required: false
          default: 1
          description: >
            Specify the number of threads/CPUs used for mapping
        - name: unique_reads
          label: Output uniquely mapped reads
          type: basic:boolean
          required: false
          description: >
            Reads that were found to have more than one best mapping location will not be reported.
    - name: PE_options
      label: Paired end alignment options
      group:
        - name: reads_orientation
          label: reads orientation
          type: basic:string
          required: false
          description: >
            Specify the orientation of the two reads from the same pair.
          default: "fr"
          choices:
            - label: ff
              value: ff
            - label: fr
              value: fr
            - label: rf
              value: rf
        - name: consensus_subreads
          label: Minimum number of consensus subreads
          type: basic:integer
          required: false
          default: 1
          description: >
            Specify the minimum number of consensus subreads both reads from the sam pair must have.
  output:
    - name: bam
      label: Alignment file
      type: basic:file
      description: Position sorted alignment
    - name: bai
      label: Index BAI
      type: basic:file
    - name: unmapped
      label: Unmapped reads
      type: basic:file
      required: false
    - name: stats
      label: Statistics
      type: basic:file
  run:
    runtime: polyglot
    language: bash
    program: |
      GENOME_NAME=`basename {{ genome.fasta.file }} .fasta`
      INDEX={{genome.index_subread.dir}}"/${GENOME_NAME}_index"

      if [ -d {{ genome.index_subread.dir }} ]; then
        echo "Genome index found."
      else
        re-error "Index not found, provide index file."
      fi

      re-progress 0.2

      FW_READS=()
      RW_READS=()

      {% for r in reads.fastq %}
        FW_READS+=({{ r.file }})
      {% endfor %}

      cat "${FW_READS[@]}" > fw_reads.fastq.gz

      {% if reads|type|subtype('data:reads:fastq:paired:') %}
        {% for r in reads.fastq2 %}
          READS_NAME=`basename {{ r.file }} .fastq.gz`
          RW_READS+=({{ r.file }})
        {% endfor %}

        cat "${RW_READS[@]}" > rw_reads.fastq.gz
      {% endif %}

      NAME=`basename {{ reads.fastq.0.file }} .fastq.gz`

      re-progress 0.3

      UNIQUE_READS="{% if options.unique_reads %}-u {% endif %}"

      echo "Align with subread:"
      {% if reads|type|subtype('data:reads:fastq:single:') %}
        subread-align \
          -i "${INDEX}" \
          -o "${NAME}_align_unsorted.bam" \
          -r "fw_reads.fastq.gz" \
          -t 0 \
          -I {{ options.indel }} \
          -m {{ options.consensus }} \
          -M {{ options.mis_matched_bp }} \
          -T {{ options.cpu_number }} \
          ${UNIQUE_READS}
      {% else %}
        subread-align \
          -i "${INDEX}" \
          -o "${NAME}_align_unsorted.bam" \
          -r "fw_reads.fastq.gz" \
          -R "rw_reads.fastq.gz" \
          -t 0 \
          -I {{ options.indel }} \
          -m {{ options.consensus }} \
          -M {{ options.mis_matched_bp }} \
          -T {{ options.cpu_number }} \
          ${UNIQUE_READS} \
          -S {{ PE_options.reads_orientation }} \
          -S {{ PE_options.reads_orientation }}
      {% endif %}

      re-checkrc
      re-progress 0.5

      echo "Sorting BAM file by chromosomal coordinates:"
      samtools sort -o "${NAME}.bam" "${NAME}_align_unsorted.bam"
      re-checkrc
      re-progress 0.6

      echo -e "\nINDEXING bam:"
      samtools index "${NAME}.bam" "${NAME}.bam.bai"
      re-checkrc

      echo "Calculating statistics"
      samtools flagstat "${NAME}.bam" > "${NAME}_report.txt"
      re-checkrc
      re-progress 0.75

      echo "Creating unmapped fastq file"
      samtools view -u  -f 4 -F 264 "${NAME}.bam" > "${NAME}_unmapped.bam"
      bamToFastq -i "${NAME}_unmapped.bam" -fq "${NAME}_unmapped.fastq"
      gzip -c "${NAME}_unmapped.fastq" > "${NAME}_unmapped.fastq.gz"

      re-save-file bam "${NAME}.bam"
      re-save-file bai "${NAME}.bam.bai"
      re-save-file stats "${NAME}_report.txt"

      if [ -f "${NAME}_unmapped.fastq.gz" ]; then
        re-save-file unmapped "${NAME}_unmapped.fastq.gz"
      fi