# Processing Whole Exome Sequencing (WES/WXS) Data

In [1]:
!conda info


     active environment : sra-tools
    active env location : /home/fmbuga/.conda/envs/sra-tools
            shell level : 3
       user config file : /home/fmbuga/.condarc
 populated config files : /home/fmbuga/.condarc
          conda version : 4.10.3
    conda-build version : not installed
         python version : 3.6.9.final.0
       virtual packages : __linux=3.10.0=0
                          __glibc=2.17=0
                          __unix=0=0
                          __archspec=1=x86_64
       base environment : /opt/intel/intelpython3  (read only)
      conda av data dir : /opt/intel/intelpython3/etc/conda
  conda av metadata url : None
           channel URLs : https://conda.anaconda.org/conda-forge/linux-64
                          https://conda.anaconda.org/conda-forge/noarch
                          https://conda.anaconda.org/bioconda/linux-64
                          https://conda.anaconda.org/bioconda/noarch
                          https://repo.anaconda.com/pkgs/m

In [2]:
!pwd

/home/fmbuga/ERR1831349


In [3]:
!ls

230722_P_fastq_2_vcf.ipynb


## Download data as fastq from NCBI SRA

In [6]:
!cat fasterq_dump_230722_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=fasterq_dump_230722
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/fasterq_dump_230722_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/fasterq_dump_230722_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=28
#SBATCH --mem=0

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate sra-tools                      ### CONDA ENVIRONMENT NAME
echo ""

work_dir=$(pwd)
echo "working directory: $work_dir"     ### WORKING DIRECTORY
echo ""

ncbi_sra_accession=$1                   ### NCBI SRA archive accession ID to download

# fasterq-dump
call=$(fasterq-dump $ncbi_sra_accession \
    --threads 28 \
    --progress \
    --details \
    --split-files)                                   ### split-files for paired-end data
    
echo $call
eval $call
echo "fasterq-dump

In [7]:
# submit fasterq-dump job to SLURM

!sbatch fasterq_dump_230722_slurm.sh \
    ERR1831349

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 645837
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645836     nodes     bash   fmbuga  R    1:08:45      1 node18
            645837     nodes fasterq_   fmbuga  R       0:05      1 node35
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645836     nodes     bash   fmbuga  R    1:08:55      1 node18
            645837     nodes fasterq_   fmbuga  R       0:15      1 node35
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645836     nodes     bash   fmbuga  R    1:09:11      1 node18
            645837     nodes fasterq_   fmbuga  R       0:31      1 node35
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645836     nodes     bash   fmbuga  R    1:09:41      1 node18
            645837     nodes fasterq_   fmbuga  R       1:01      1 node35


In [3]:
# create folder to store raw fastq and move downloaded files there
!mkdir ./00_raw_fastq
!mv *.fastq ./00_raw_fastq
!ls ./00_raw_fastq

ERR1831349_1.fastq  ERR1831349_2.fastq


## Check Quality of fastq with FastQC

In [6]:
!cat fastqc_allinFolder_230723_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=fastqc_allInFolder_230723
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/fastqc_allInFolder_230723_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/fastqc_allInFolder_230723_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem=0

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate sra-tools                      ### CONDA ENVIRONMENT NAME
conda info
echo ""


call="fastqc --version"
echo $call
eval $call
echo ""

input_dir=$1             ### INPUT DIRECTORY
output_dir=$2            ### OUTPUT DIRECTORY 
echo "output directory: $output_dir"
echo ""

# fastqc

call=$(find $input_dir -name "*.fastq" | \
        xargs fastqc \
                --outdir $output_dir \
                --threads 2)                                   ### INPUT FASTQs


In [7]:
# submit fastqc job to SLURM
!sbatch fastqc_allinFolder_230723_slurm.sh \
    ./00_raw_fastq \
    ./00_raw_fastq

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 645885
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645884     nodes     bash   fmbuga  R      33:05      1 node59
            645885     nodes fastqc_a   fmbuga  R       0:05      1 node60
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645884     nodes     bash   fmbuga  R      33:15      1 node59
            645885     nodes fastqc_a   fmbuga  R       0:15      1 node60
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645884     nodes     bash   fmbuga  R      33:30      1 node59
            645885     nodes fastqc_a   fmbuga  R       0:30      1 node60
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645884     nodes     bash   fmbuga  R      34:01      1 node59
            645885     nodes fastqc_a   fmbuga  R       1:01      1 node60


In [11]:
# Import libraries for HTML visualization
from IPython.display import IFrame

In [12]:
IFrame(src='./00_raw_fastq/ERR1831349_1_fastqc.html', width=1500, height=2000)

In [13]:
IFrame(src='./00_raw_fastq/ERR1831349_2_fastqc.html', width=1500, height=2000)

- since no red flags in FastQC, proceed with mapping
- if mapping doesn't pass QC checks may return and see if some trimming helps

## map reads to human reference hg38 with bwa mem

In [16]:
!cat bwa_mem_hg38_samtools_fastq_to_bam_230723_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=bwa_mem_hg38_samtools_fastq_to_bam_230723
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/bwa_mem_hg38_samtools_fastq_to_bam_230723_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/bwa_mem_hg38_samtools_fastq_to_bam_230723_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=28
##SBATCH --mem-per-cpu=4G

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate gatk4                      ### CONDA ENVIRONMENT NAME
conda info
echo ""


call="bwa" # to get the version
echo "bwa version"
echo $call
eval $call
echo ""

call="samtools --version"
echo $call
eval $call
echo ""

fastq1=$1                             ### INPUT FASTQ1
fastq2=$2                             ### INPUT FASTQ2
output_dir=$3                         ### OUTPUT DIRECTORY 

echo "output directory:

In [15]:
# create directory to output bam files to
!mkdir ./01_raw_bam

In [1]:
# submit mapping job to SLURM HPC
!sbatch bwa_mem_hg38_samtools_fastq_to_bam_230723_slurm.sh \
    ./00_raw_fastq/ERR1831349_1.fastq \
    ./00_raw_fastq/ERR1831349_2.fastq \
    ./01_raw_bam/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 645893
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R      28:21      1 node58
            645893     nodes bwa_mem_   fmbuga  R       0:05      1 node05
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R      28:31      1 node58
            645893     nodes bwa_mem_   fmbuga  R       0:15      1 node05
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R      28:46      1 node58
            645893     nodes bwa_mem_   fmbuga  R       0:30      1 node05
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R      29:16      1 node58
            645893     nodes bwa_mem_   fmbuga  R       1:00      1 node05


In [2]:
# QC check output bam file
!samtools flagstat \
    -@ 28 \
    ./01_raw_bam/ERR1831349.bam

93603695 + 0 in total (QC-passed reads + QC-failed reads)
93556700 + 0 primary
0 + 0 secondary
46995 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
93403023 + 0 mapped (99.79% : N/A)
93356028 + 0 primary mapped (99.79% : N/A)
93556700 + 0 paired in sequencing
46778350 + 0 read1
46778350 + 0 read2
92600884 + 0 properly paired (98.98% : N/A)
93279060 + 0 with itself and mate mapped
76968 + 0 singletons (0.08% : N/A)
593850 + 0 with mate mapped to a different chr
517790 + 0 with mate mapped to a different chr (mapQ>=5)


- based on the high mapping rate (~99%) no need to go back and trim fastq reads
- proceed with marking duplicates

## Mark Duplicates

In [3]:
!cat ~/tools/slurm_scripts/samtools_collate_fixmate_sort_markdup_221222_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=samtools_collate_fixmate_sort_markdup_221222
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_collate_fixmate_sort_markdup_221222_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_collate_fixmate_sort_markdup_221222_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=28
#SBATCH --mem-per-cpu=4G

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate minimap2_224                      ### CONDA ENVIRONMENT NAME
conda info
echo ""

call="samtools --version" # to get the version
echo $call
eval $call
echo ""

input_bam=$1
output_dir=$2

out_name=$(basename $input_bam .bam)

call=$(samtools collate \
    -Ou \
    --threads 28 \
    $input_bam | \
        samtools fixmate \
            - \
            - \
            --threads 28 \
            -u 

In [4]:
# copy mark duplicates shell script
!cp ~/tools/slurm_scripts/samtools_collate_fixmate_sort_markdup_221222_slurm.sh .

In [5]:
# make folder to store bam with duplicates marked
!mkdir ./02_bam_markdup

In [7]:
# submit mark duplicates job to SLURM HPC
!sbatch samtools_collate_fixmate_sort_markdup_221222_slurm.sh \
    ./01_raw_bam/ERR1831349.bam \
    ./02_bam_markdup/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 645897
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    1:44:02      1 node58
            645897     nodes samtools   fmbuga  R       0:05      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    1:44:12      1 node58
            645897     nodes samtools   fmbuga  R       0:15      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    1:44:27      1 node58
            645897     nodes samtools   fmbuga  R       0:30      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    1:44:57      1 node58
            645897     nodes samtools   fmbuga  R       1:00      1 node62


In [8]:
# submit mark duplicates job to SLURM HPC
!sbatch samtools_collate_fixmate_sort_markdup_221222_slurm.sh \
    ~/nanopore/rogstrix/dorado/01_bam_raw/FAF01132-84868110.bam \
    ./02_bam_markdup/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 645898
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    1:54:17      1 node58
            645898     nodes samtools   fmbuga  R       0:05      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    1:54:27      1 node58
            645898     nodes samtools   fmbuga  R       0:15      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    1:54:43      1 node58
            645898     nodes samtools   fmbuga  R       0:31      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    1:55:13      1 node58
            645898     nodes samtools   fmbuga  R       1:01      1 node62


In [9]:
# QC check output bam file
!samtools flagstat \
    -@ 28 \
    ./02_bam_markdup/FAF01132-84868110_collate_fixmate_sort_markdup.bam

1681556 + 0 in total (QC-passed reads + QC-failed reads)
719983 + 0 primary
505430 + 0 secondary
456143 + 0 supplementary
17979 + 0 duplicates
17979 + 0 primary duplicates
1617881 + 0 mapped (96.21% : N/A)
656308 + 0 primary mapped (91.16% : N/A)
0 + 0 paired in sequencing
0 + 0 read1
0 + 0 read2
0 + 0 properly paired (N/A : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (N/A : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


In [10]:
# QC check output bam file
!samtools flagstat \
    -@ 28 \
    ~/nanopore/rogstrix/dorado/01_bam_raw/FAF01132-84868110.bam

1681556 + 0 in total (QC-passed reads + QC-failed reads)
719983 + 0 primary
505430 + 0 secondary
456143 + 0 supplementary
0 + 0 duplicates
0 + 0 primary duplicates
1617881 + 0 mapped (96.21% : N/A)
656308 + 0 primary mapped (91.16% : N/A)
0 + 0 paired in sequencing
0 + 0 read1
0 + 0 read2
0 + 0 properly paired (N/A : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (N/A : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


In [11]:
!cat samtools_collate_fixmate_sort_markdup_230723_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=samtools_collate_fixmate_sort_markdup_230723
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_collate_fixmate_sort_markdup_230723_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_collate_fixmate_sort_markdup_230723_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=28
#SBATCH --mem=0

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate minimap2_224                      ### CONDA ENVIRONMENT NAME
conda info
echo ""

call="samtools --version" # to get the version
echo $call
eval $call
echo ""

input_bam=$1
output_dir=$2

out_name=$(basename $input_bam .bam)

call=$(samtools collate \
    -Ou \
    --threads 28 \
    $input_bam | \
        samtools fixmate \
            - \
            - \
            --threads 28 \
            -u | \
     

In [12]:
# submit mark duplicates job to SLURM HPC
!sbatch samtools_collate_fixmate_sort_markdup_221222_slurm.sh \
    ./01_raw_bam/ERR1831349.bam \
    ./02_bam_markdup/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 645899
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    2:26:44      1 node58
            645899     nodes samtools   fmbuga  R       0:05      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    2:26:54      1 node58
            645899     nodes samtools   fmbuga  R       0:15      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    2:27:09      1 node58
            645899     nodes samtools   fmbuga  R       0:30      1 node62
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            645891     nodes     bash   fmbuga  R    2:27:39      1 node58
            645899     nodes samtools   fmbuga  R       1:00      1 node62


In [1]:
!cat samtools_collate_fixmate_sort_markdup_HIMEM_230724_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=samtools_collate_fixmate_sort_markdup_HIMEM_230724
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_collate_fixmate_sort_markdup_HIMEM_230724_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_collate_fixmate_sort_markdup_HIMEM_230724_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=12
#SBATCH --mem=0
#SBATCH --partition=himem

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate minimap2_224                      ### CONDA ENVIRONMENT NAME
conda info
echo ""

call="samtools --version" # to get the version
echo $call
eval $call
echo ""

input_bam=$1
output_dir=$2

out_name=$(basename $input_bam .bam)

call=$(samtools collate \
    -Ou \
    --threads 12 \
    $input_bam | \
        samtools fixmate \
            - \
            - \
       

In [2]:
# submit mark duplicates job to SLURM HPC
!sbatch samtools_collate_fixmate_sort_markdup_230723_slurm.sh \
    ./01_raw_bam/ERR1831349.bam \
    ./02_bam_markdup/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 646005
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646002     nodes     bash   fmbuga  R      10:17      1 node26
            646005     nodes samtools   fmbuga  R       0:05      1 node05
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646002     nodes     bash   fmbuga  R      10:27      1 node26
            646005     nodes samtools   fmbuga  R       0:15      1 node05
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646002     nodes     bash   fmbuga  R      10:42      1 node26
            646005     nodes samtools   fmbuga  R       0:30      1 node05
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646002     nodes     bash   fmbuga  R      11:12      1 node26
            646005     nodes samtools   fmbuga  R       1:00      1 node05


In [3]:
# submit mark duplicates job to SLURM HPC
!sbatch samtools_collate_fixmate_sort_markdup_HIMEM_230724_slurm.sh \
    ./01_raw_bam/ERR1831349.bam \
    ./02_bam_markdup/

Submitted batch job 646007


In [4]:
!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646002     nodes     bash   fmbuga  R      26:06      1 node26
            646007     himem samtools   fmbuga  R       0:52      1 himem01
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646002     nodes     bash   fmbuga  R      26:16      1 node26
            646007     himem samtools   fmbuga  R       1:02      1 himem01
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646002     nodes     bash   fmbuga  R      26:32      1 node26
            646007     himem samtools   fmbuga  R       1:18      1 himem01
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646002     nodes     bash   fmbuga  R      27:02      1 node26
            646007     himem samtools   fmbuga  R       1:48      1 himem01


- ERROR
`[bam_sort_core] merging from 24 files and 12 in-memory blocks...
[markdup] error: no ms score tag. Please run samtools fixmate on file first.
[markdup] error: no ms score tag. Please run samtools fixmate on file first.
`

In [5]:
!samtools collate \
    -Ou \
    --threads 26 \
    ./01_raw_bam/ERR1831349.bam | \
        samtools fixmate \
            - \
            - \
            --threads 26 \
            -u | \
            samtools sort \
                - \
                --threads 26 \
                -u |\
                samtools markdup \
                    - \
                    --threads 26 \
                    input_collate_fixmate_sort_markdup.bam


[bam_sort_core] merging from 26 files and 26 in-memory blocks...
[markdup] error: no ms score tag. Please run samtools fixmate on file first.
[markdup] error: no ms score tag. Please run samtools fixmate on file first.
samtools sort: failed writing to "-": Illegal seek


In [6]:
!samtools sort

Usage: samtools sort [options...] [in.bam]
Options:
  -l INT     Set compression level, from 0 (uncompressed) to 9 (best)
  -u         Output uncompressed data (equivalent to -l 0)
  -m INT     Set maximum memory per thread; suffix K/M/G recognized [768M]
  -M         Use minimiser for clustering unaligned/unplaced reads
  -K INT     Kmer size to use for minimiser [20]
  -n         Sort by read name (not compatible with samtools index command)
  -t TAG     Sort by value of TAG. Uses position as secondary index (or read name if -n is set)
  -o FILE    Write final output to FILE rather than standard output
  -T PREFIX  Write temporary files to PREFIX.nnnn.bam
  --no-PG    do not add a PG line
      --input-fmt-option OPT[=VAL]
               Specify a single input file format option in the form
               of OPTION or OPTION=VALUE
  -O, --output-fmt FORMAT[,OPT[=VAL]]...
               Specify output format (SAM, BAM, CRAM)
      --output-fmt-option OPT[=VAL]
               Specify a

In [None]:
!samtools collate \
    -Ou \
    --threads 28 \
    ./01_raw_bam/ERR1831349.bam | \
        samtools fixmate \
            - \
            - \
            --threads 28 \
            -u | \
            samtools sort \
                - \
                --threads 28 \
                -u \
                input_collate_fixmate_sort.sam

In [3]:
!cat samtools_collate_fixmate_sort_230723_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=samtools_collate_fixmate_sort_markdup_230723
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_collate_fixmate_sort_markdup_230723_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_collate_fixmate_sort_markdup_230723_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=28
#SBATCH --mem=0

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate minimap2_224                      ### CONDA ENVIRONMENT NAME
conda info
echo ""

call="samtools --version" # to get the version
echo $call
eval $call
echo ""

input_bam=$1
output_dir=$2

out_name=$(basename $input_bam .bam)

call=$(samtools collate \
    -Ou \
    --threads 28 \
    $input_bam | \
        samtools fixmate \
            - \
            - \
            --threads 28 \
            -u | \
     

In [4]:
# submit samtools_collate_fixmate_sort job to SLURM HPC
!sbatch samtools_collate_fixmate_sort_230723_slurm.sh \
    ./01_raw_bam/ERR1831349.bam \
    ./02_bam_markdup/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 646011
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R      52:10      1 node58
            646011     nodes samtools   fmbuga  R       0:06      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R      52:20      1 node58
            646011     nodes samtools   fmbuga  R       0:16      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R      52:35      1 node58
            646011     nodes samtools   fmbuga  R       0:31      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R      53:05      1 node58
            646011     nodes samtools   fmbuga  R       1:01      1 node25


In [1]:
!cat samtools_markdup_230723_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=samtools_markdup_230723
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_markdup_230723_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/samtools_markdup_230723_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=28
#SBATCH --mem=0

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate minimap2_224                      ### CONDA ENVIRONMENT NAME
conda info
echo ""

call="samtools --version" # to get the version
echo $call
eval $call
echo ""

input_bam=$1
output_dir=$2

out_name=$(basename $input_bam .bam)

call=$(samtools markdup \
                    $input_bam \
                    --threads 28 \
                    -sf "$output_dir$out_name"_markdup_bam.markdupstats \
                    --write-index \
                    "$output_dir$out_name"_

In [4]:
# submit mark duplicates job to SLURM HPC
!sbatch samtools_markdup_230723_slurm.sh \
    ./02_bam_markdup/ERR1831349_collate_fixmate_sort.bam \
    ./02_bam_markdup/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 646015
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R    1:16:25      1 node58
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R    1:16:35      1 node58
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R    1:16:50      1 node58
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R    1:17:21      1 node58


```
ERROR

[markdup] error: no ms score tag. Please run samtools fixmate on file first.
[markdup] error: no ms score tag. Please run samtools fixmate on file first.

```

In [6]:
!cat ~/tools/slurm_scripts/gatk_markDuplicates_221003_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=gatk_markDuplicates_221003
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/gatk_markDuplicates_221003_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/gatk_markDuplicates_221003_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=28
#SBATCH --mem=120G

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate gatk4                      ### CONDA ENVIRONMENT NAME
conda info
echo ""


call="gatk MarkDuplicates --version" # to get the version
echo $call
eval $call
echo ""

call="gatk MarkDuplicatesSpark --version" # to get the version
echo $call
eval $call
echo ""

call="gatk MarkDuplicates --help" # help
echo $call
eval $call
echo ""

call="gatk MarkDuplicatesSpark --help" # help
echo $call
eval $call
echo ""


output_dir="./04_bam_dedup/"            ### OUTPUT D

In [7]:
!cp ~/tools/slurm_scripts/gatk_markDuplicates_221003_slurm.sh \
    gatk_markDuplicates_230724_slurm.sh

In [9]:
!cat gatk_markDuplicates_230724_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=gatk_markDuplicates_221003
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/gatk_markDuplicates_221003_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/gatk_markDuplicates_221003_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=28
#SBATCH --mem=0

datenow=$(date)
echo $datenow
echo ""
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo ""
echo "hostname: $HOSTNAME"
echo ""


##################

eval "$(conda shell.bash hook)"
conda activate gatk4                      ### CONDA ENVIRONMENT NAME
conda info
echo ""


call="gatk MarkDuplicates --version" # to get the version
echo $call
eval $call
echo ""

call="gatk MarkDuplicatesSpark --version" # to get the version
echo $call
eval $call
echo ""

call="gatk MarkDuplicates --help" # help
echo $call
eval $call
echo ""

call="gatk MarkDuplicatesSpark --help" # help
echo $call
eval $call
echo ""


input_bam=$1                            ### INPUT BAM
o

In [10]:
# submit mark duplicates job to SLURM HPC
!sbatch gatk_markDuplicates_230724_slurm.sh \
    ./01_raw_bam/ERR1831349.bam \
    ./02_bam_markdup/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 646016
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R    1:31:36      1 node58
            646016     nodes gatk_mar   fmbuga  R       0:05      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R    1:31:46      1 node58
            646016     nodes gatk_mar   fmbuga  R       0:15      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R    1:32:02      1 node58
            646016     nodes gatk_mar   fmbuga  R       0:31      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646009     nodes     bash   fmbuga  R    1:32:32      1 node58
            646016     nodes gatk_mar   fmbuga  R       1:01      1 node25


In [11]:
!samtools flagstat \
    -@ 28 \
    ./02_bam_markdup/ERR1831349_markdup.bam

93603695 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
46995 + 0 supplementary
34258723 + 0 duplicates
93403023 + 0 mapped (99.79% : N/A)
93556700 + 0 paired in sequencing
46778350 + 0 read1
46778350 + 0 read2
92600884 + 0 properly paired (98.98% : N/A)
93279060 + 0 with itself and mate mapped
76968 + 0 singletons (0.08% : N/A)
593850 + 0 with mate mapped to a different chr
517790 + 0 with mate mapped to a different chr (mapQ>=5)


## Variant Calling

In [12]:
# make folder to store vcf
!mkdir 03_vcf_raw

In [1]:
!cat ~/tools/slurm_scripts/bcftools_mpileup_call_ONT_P099_221128_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=bcftools_mpileup_call_ONT_P099_221129
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/bcftools_mpileup_call_ONT_P099_221129_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/bcftools_mpileup_call_ONT_P099_221129_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=120G


# "...multithreading ...is currently used only for the compression of the output stream, only when --output-type is b or z"
# https://samtools.github.io/bcftools/bcftools.html
# "Use the -Ou option when piping between bcftools subcommands to speed up performance by removing unnecessary compression/decompression and VCF←→BCF conversion"
# for ONT https://github.com/samtools/bcftools/issues/1666

# "For ONT it may be beneficial to also run bcftools call with a higher -P, eg -P0.01 or -P 0.1"


datenow=$(date)
echo $datenow
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo "hostname: $HOSTNAME"
echo ""


#####

In [3]:
!cp ~/tools/slurm_scripts/bcftools_mpileup_call_ONT_P099_221128_slurm.sh \
    bcftools_mpileup_call_P099_230724_slurm.sh

In [4]:
!cat bcftools_mpileup_call_P099_230724_slurm.sh

#!/bin/bash
#
#SBATCH --job-name=bcftools_mpileup_call_ONT_P099_221129
#SBATCH --output=/home/fmbuga/tools/slurm_scripts/slurm_out_err/bcftools_mpileup_call_ONT_P099_221129_%j.out
#SBATCH --error=/home/fmbuga/tools/slurm_scripts/slurm_out_err/bcftools_mpileup_call_ONT_P099_221129_%j.err
#
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=0


# "...multithreading ...is currently used only for the compression of the output stream, only when --output-type is b or z"
# https://samtools.github.io/bcftools/bcftools.html
# "Use the -Ou option when piping between bcftools subcommands to speed up performance by removing unnecessary compression/decompression and VCF←→BCF conversion"
# for ONT https://github.com/samtools/bcftools/issues/1666

# "For ONT it may be beneficial to also run bcftools call with a higher -P, eg -P0.01 or -P 0.1"


datenow=$(date)
echo $datenow
srun hostname
echo ""

start=$(date +%s)
echo "start time: $start"
echo "hostname: $HOSTNAME"
echo ""


################

In [5]:
# submit bcftools mpileup call variants job to SLURM HPC
!sbatch bcftools_mpileup_call_P099_230724_slurm.sh \
    ./02_bam_markdup/ERR1831349_markdup.bam \
    ./03_vcf_raw/

!sleep 5
!squeue -u fmbuga
!sleep 10
!squeue -u fmbuga
!sleep 15
!squeue -u fmbuga
!sleep 30
!squeue -u fmbuga

Submitted batch job 646019
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646017     nodes     bash   fmbuga  R      17:31      1 node58
            646019     nodes bcftools   fmbuga  R       0:04      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646017     nodes     bash   fmbuga  R      17:42      1 node58
            646019     nodes bcftools   fmbuga  R       0:15      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646017     nodes     bash   fmbuga  R      17:57      1 node58
            646019     nodes bcftools   fmbuga  R       0:30      1 node25
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            646017     nodes     bash   fmbuga  R      18:27      1 node58
            646019     nodes bcftools   fmbuga  R       1:00      1 node25
