# workflows/xenbase-rnaseq-se.cwl

# Document header: this file describes a CWL Workflow written against spec v1.0.
class: Workflow
cwlVersion: "v1.0"


# Features this workflow relies on:
#   - SubworkflowFeatureRequirement: the bam_to_bigwig step runs a subworkflow
#   - StepInputExpressionRequirement: several step inputs use valueFrom
#   - MultipleInputFeatureRequirement: get_stat/param lists several sources
#   - InlineJavascriptRequirement: valueFrom expressions call default_output_name
requirements:
  - class: SubworkflowFeatureRequirement
  - class: StepInputExpressionRequirement
  - class: MultipleInputFeatureRequirement
  - class: InlineJavascriptRequirement
    expressionLib:
    # default_output_name(named_input, ext, segment)
    #   named_input - a File object, or an array of File objects (first one is used)
    #   ext         - suffix appended to the result (defaults to "")
    #   segment     - number of leading dot-separated name parts to keep (defaults to 1)
    # Returns the first `segment` dot-separated parts of the file name followed by `ext`.
    #
    # Kept as a literal block scalar (|) so the JavaScript retains its line breaks;
    # a plain multi-line scalar would be folded into a single line by the YAML parser.
    - |
      var default_output_name = function(named_input, ext, segment) {
          ext = ext || "";
          segment = segment || 1;
          // Accept either a single File or a non-empty array of Files.
          var file = Array.isArray(named_input) ? named_input[0] : named_input;
          if (!file || !(file.basename || file.location)) {
            throw new Error("default_output_name: input File has neither basename nor location");
          }
          // Prefer basename: location is a URI and may be percent-encoded.
          // Fall back to the last path component of location when basename is absent.
          var file_name = file.basename || file.location.split('/').slice(-1)[0];
          return file_name.split('.').slice(0, segment).join('.') + ext;
      };


inputs:

  # Single-end reads. Fed to the extract_fastq step; its name is also used to
  # derive the names of the renamed outputs.
  fastq_file:
    label: 'FASTQ input file'
    doc: 'Reads data in a FASTQ format'
    type: File
    format: 'http://edamontology.org/format_1930'

  # Adapter sequences passed to the trim_adapters step.
  fasta_file_adapters:
    label: 'Adapters FASTA file'
    doc: 'Adapters FASTA file to be used by Trimmomatic'
    type: File
    format: 'http://edamontology.org/format_1929'

  # Used by rsem_calculate_expression and by both get-file-by-name steps
  # (regexes "chrlist$" and "ti$").
  rsem_indices_folder:
    label: 'RSEM indices folder'
    doc: 'Path to RSEM indices generated with BowTie2'
    type: Directory

  # Used only by the ribo_bowtie_aligner step.
  bowtie_indices_folder:
    label: 'BowTie Ribosomal Indices'
    doc: 'Path to Bowtie generated indices for ribosomal FASTA'
    type: Directory

  # Optional; falls back to 2 when not provided. Shared by trim_adapters,
  # rsem_calculate_expression and ribo_bowtie_aligner.
  threads:
    label: 'Number of threads'
    doc: 'Number of threads for those steps that support multithreading'
    type: int?
    default: 2


outputs:

  # RSEM isoform results, renamed after the input FASTQ file.
  rsem_isoforms_file:
    label: 'RSEM isoforms expression file'
    doc: 'RSEM isoforms expression file'
    type: File
    format: 'http://edamontology.org/format_3475'
    outputSource: rename_rsem_isoforms_file/target_file

  # Produced by make_biowardrobe_isoforms from the renamed RSEM isoforms file.
  biowardrobe_isoforms_file:
    label: 'Biowardrobe compatible isoforms expression file'
    doc: 'Biowardrobe compatible isoforms expression file'
    type: File
    format: 'http://edamontology.org/format_3752'
    outputSource: make_biowardrobe_isoforms/biowardrobe_isoforms_file

  # RSEM gene results, renamed after the input FASTQ file.
  rsem_genes_file:
    label: 'RSEM genes expression file'
    doc: 'RSEM genes expression file'
    type: File
    format: 'http://edamontology.org/format_3475'
    outputSource: rename_rsem_genes_file/target_file

  # Genome BAM from RSEM, renamed after the input FASTQ file.
  bambai_pair:
    label: 'Coordinate sorted BAM alignment file (+index BAI)'
    doc: 'Coordinate sorted BAM file and BAI index file'
    type: File
    format: 'http://edamontology.org/format_2572'
    outputSource: rename_rsem_bambai_pair/target_file

  # Result of the bam_to_bigwig subworkflow.
  bigwig_file:
    label: 'BigWig file'
    doc: 'Generated BigWig file'
    type: File
    format: 'http://edamontology.org/format_3006'
    outputSource: bam_to_bigwig/bigwig_file

  # Statistics computed by the fastx_quality_stats step.
  fastx_statistics:
    label: 'FASTQ statistics'
    doc: 'fastx_quality_stats generated quality statistics file'
    type: File
    format: 'http://edamontology.org/format_2330'
    outputSource: fastx_quality_stats/statistics_file

  # One-line summary written by the get_stat step.
  get_stat_log:
    label: 'RSEM & Bowtie combined log'
    doc: 'Mapping statistics from RSEM & Bowtie logs'
    type: File
    format: 'http://edamontology.org/format_2330'
    outputSource: get_stat/output_file

  # Statistics folder exposed directly from rsem_calculate_expression.
  rsem_stat_folder:
    label: 'RSEM alignment statistics'
    doc: 'RSEM generated statistics folder. Mostly for debug purposes'
    type: Directory
    outputSource: rsem_calculate_expression/stat_folder

  # Log of the ribosomal alignment performed by ribo_bowtie_aligner.
  bowtie_log:
    label: 'Ribo Bowtie alignment log'
    doc: 'Ribo Bowtie alignment log file. Mostly for debug purposes'
    type: File
    format: 'http://edamontology.org/format_2330'
    outputSource: ribo_bowtie_aligner/log_file


steps:

  # Passes the workflow input fastq_file to the extract-fastq tool as its
  # compressed_file; the resulting fastq_file feeds the QC and trimming steps.
  extract_fastq:
    run: ../tools/extract-fastq.cwl
    in:
      compressed_file:
        source: fastq_file
    out:
      - fastq_file

  # Quality statistics for the extracted FASTQ file; the statistics_file is
  # published as the fastx_statistics workflow output.
  fastx_quality_stats:
    run: ../tools/fastx-quality-stats.cwl
    in:
      input_file:
        source: extract_fastq/fastq_file
    out:
      - statistics_file

  # Runs the fastqc tool on the extracted FASTQ file; only its summary_file is
  # used downstream (by fastqc_results_trigger).
  fastqc_stats:
    run: ../tools/fastqc.cwl
    in:
      fastq_file:
        source: extract_fastq/fastq_file
    out:
      - summary_file

  # Converts the FastQC summary into a `trigger` value consumed by trim_adapters.
  fastqc_results_trigger:
    run: ../expressiontools/fastqc-results-trigger.cwl
    in:
      summary_file:
        source: fastqc_stats/summary_file
    out:
      - trigger

  # Trimmomatic step configured for single-end data (lib_type "SE").
  # NOTE(review): `trigger` presumably decides whether trimming is actually
  # applied - confirm in ../tools/trimmomatic.cwl.
  trim_adapters:
    run: ../tools/trimmomatic.cwl
    in:
      fastq_file_upstream:
        source: extract_fastq/fastq_file
      adapters_file:
        source: fasta_file_adapters
      trigger:
        source: fastqc_results_trigger/trigger
      threads:
        source: threads
      lib_type:
        default: 'SE'
      # Must stay quoted: an unquoted digits-and-colons scalar can be parsed
      # as a sexagesimal number by YAML 1.1 loaders.
      illuminaclip_step_param:
        default: "2:30:15"
    out:
      - upstream_trimmed_file

  # RSEM run on the trimmed reads against rsem_indices_folder, with the bowtie2,
  # coordinate-sorted BAM and genome BAM options switched on.
  rsem_calculate_expression:
    run: ../tools/rsem-calculate-expression.cwl
    in:
      upstream_read_file:
        source: trim_adapters/upstream_trimmed_file
      indices_folder:
        source: rsem_indices_folder
      threads:
        source: threads
      bowtie2:
        default: true
      sort_bam_by_coordinate:
        default: true
      output_genome_bam:
        default: true
    out:
      # Expression tables (renamed by later steps)
      - isoform_results_file
      - gene_results_file
      # Alignment and statistics
      - genome_sorted_bam_bai_pair
      - stat_folder
      # Read counters consumed by get_stat and bam_to_bigwig
      - total_reads_number
      - mapped_reads_number
      - multimapped_reads_number

  # Renames the RSEM genome BAM to a name derived from the input FASTQ file
  # plus the ".bam" suffix.
  rename_rsem_bambai_pair:
    run: ../tools/rename.cwl
    in:
      source_file:
        source: rsem_calculate_expression/genome_sorted_bam_bai_pair
      target_filename:
        source: fastq_file
        valueFrom: $(default_output_name(self, ".bam"))
    out:
      - target_file

  # Renames the RSEM isoform results to a name derived from the input FASTQ
  # file plus the ".isoforms.tsv" suffix.
  rename_rsem_isoforms_file:
    run: ../tools/rename.cwl
    in:
      source_file:
        source: rsem_calculate_expression/isoform_results_file
      target_filename:
        source: fastq_file
        valueFrom: $(default_output_name(self, ".isoforms.tsv"))
    out:
      - target_file

  # Renames the RSEM gene results to a name derived from the input FASTQ file
  # plus the ".genes.tsv" suffix.
  rename_rsem_genes_file:
    run: ../tools/rename.cwl
    in:
      source_file:
        source: rsem_calculate_expression/gene_results_file
      target_filename:
        source: fastq_file
        valueFrom: $(default_output_name(self, ".genes.tsv"))
    out:
      - target_file

  # Provides the chrom_length_file for bam_to_bigwig.
  # NOTE(review): presumably picks the file in rsem_indices_folder whose
  # basename matches "chrlist$" - confirm in get-file-by-name.cwl.
  get_chr_length_file:
    run: ../expressiontools/get-file-by-name.cwl
    in:
      input_files:
        source: rsem_indices_folder
      basename_regex:
        default: "chrlist$"
    out:
      - selected_file

  # Subworkflow producing the bigwig_file output from the renamed BAM, the
  # chromosome length file and the mapped reads number reported by RSEM.
  bam_to_bigwig:
    run: ../subworkflows/bam-bedgraph-bigwig.cwl
    in:
      bam_file:
        source: rename_rsem_bambai_pair/target_file
      chrom_length_file:
        source: get_chr_length_file/selected_file
      mapped_reads_number:
        source: rsem_calculate_expression/mapped_reads_number
    out:
      - bigwig_file

  # Bowtie alignment of the trimmed reads against bowtie_indices_folder.
  # Its mapped_reads_number goes to get_stat and its log_file is published
  # as the bowtie_log workflow output.
  ribo_bowtie_aligner:
    run: ../tools/bowtie-alignreads.cwl
    in:
      upstream_filelist:
        source: trim_adapters/upstream_trimmed_file
      indices_folder:
        source: bowtie_indices_folder
      threads:
        source: threads
      # Output name derived from the input FASTQ file plus ".txt"
      output_filename:
        source: fastq_file
        valueFrom: $(default_output_name(self, ".txt"))
      v:
        default: 3
      best:
        default: true
    out:
      - mapped_reads_number
      - log_file

  # Writes a one-line statistics file: the script echoes $1..$4 followed by the
  # integer sum of column 5 of $0 (lines containing "expected_count" are
  # skipped) and redirects the result into $5.
  # NOTE(review): presumably custom-bash.cwl passes input_file as $0 and the
  # `param` values as $1..$5 - confirm in ../tools/custom-bash.cwl.
  get_stat:
    run: ../tools/custom-bash.cwl
    in:
      input_file:
        source: rsem_calculate_expression/isoform_results_file
      script:
        default: "echo -n $1 $2 $3 $4 `cat $0 | cut -f 5 | grep -v expected_count | awk '{sum+=$1} END {print int(sum)}'` > $5"
      param:
        # Order matters: positions 0-3 are read counters, position 4 is the
        # input FASTQ file that only serves to build the output file name.
        source:
          - rsem_calculate_expression/total_reads_number
          - rsem_calculate_expression/mapped_reads_number
          - ribo_bowtie_aligner/mapped_reads_number
          - rsem_calculate_expression/multimapped_reads_number
          - fastq_file
        valueFrom: |
          ${
            var script_args = self.slice(0, 4).map(String);
            script_args.push(String(default_output_name(self[4], ".stat")));
            return script_args;
          }
    out:
      - output_file

  # Provides the rsem_annotation_file for make_biowardrobe_isoforms.
  # NOTE(review): presumably picks the file in rsem_indices_folder whose
  # basename matches "ti$" - confirm in get-file-by-name.cwl.
  get_annotation_file:
    run: ../expressiontools/get-file-by-name.cwl
    in:
      input_files:
        source: rsem_indices_folder
      basename_regex:
        default: "ti$"
    out:
      - selected_file

  # Builds the biowardrobe_isoforms_file output from the renamed RSEM isoforms
  # file and the annotation file selected from the RSEM indices folder.
  make_biowardrobe_isoforms:
    run: ../tools/python-make-biowardrobe-isoforms.cwl
    in:
      rsem_isoforms_file:
        source: rename_rsem_isoforms_file/target_file
      rsem_annotation_file:
        source: get_annotation_file/selected_file
    out:
      - biowardrobe_isoforms_file

# schema.org metadata: the "s:" prefix used below is bound here.
$namespaces:
  s: "http://schema.org/"

$schemas:
  - "http://schema.org/docs/schema_org_rdfa.html"

# Identity, origin and license of this workflow description.
s:name: 'xenbase-rnaseq-se'
s:downloadUrl: "https://raw.githubusercontent.com/Barski-lab/workflows/master/workflows/xenbase-rnaseq-se.cwl"
s:codeRepository: "https://github.com/Barski-lab/workflows"
s:license: "http://www.apache.org/licenses/LICENSE-2.0"

s:isPartOf:
  class: s:CreativeWork
  s:name: 'Common Workflow Language'
  s:url: "http://commonwl.org/"

# Authoring organization, nested as organization -> department -> lab -> member.
s:creator:
  - class: s:Organization
    s:legalName: "Cincinnati Children's Hospital Medical Center"
    s:location:
      - class: s:PostalAddress
        s:addressCountry: 'USA'
        s:addressLocality: 'Cincinnati'
        s:addressRegion: 'OH'
        # Quoted so these stay strings rather than being typed as numbers
        s:postalCode: '45229'
        s:streetAddress: '3333 Burnet Ave'
        s:telephone: '+1(513)636-4200'
    s:logo: 'https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png'
    s:department:
      - class: s:Organization
        s:legalName: 'Allergy and Immunology'
        s:department:
          - class: s:Organization
            s:legalName: 'Barski Research Lab'
            s:member:
              - class: s:Person
                s:name: 'Michael Kotliar'
                s:email: "mailto:misha.kotliar@gmail.com"
                s:sameAs:
                  - id: "http://orcid.org/0000-0002-6486-3898"

doc: |
  XenBase workflow for analysing RNA-Seq single-end data

s:about: |
  1. Extract the input FASTQ file (run extract-fastq)
  2. Analyze quality of FASTQ files (run fastqc with each of the FASTQ files)
  3. If any of the following fields in fastqc generated report is marked as failed for at least one of input FASTQ files:
        "Per base sequence quality",
        "Per sequence quality scores",
        "Overrepresented sequences",
        "Adapter Content",
    - trim adapters (run trimmomatic)
  4. Align original or trimmed FASTQ files to reference genome, calculate genes and isoforms expression (run RSEM)
  5. Count mapped reads number in sorted BAM file (run bamtools stats)
  6. Generate genome coverage BED file (run bedtools genomecov)
  7. Sort generated BED file (run sort)
  8. Generate genome coverage bigWig file from BED file (run bedGraphToBigWig)