In [1]:
from pathlib import Path

import pandas as pd
from config import RUN_GATK, QC, DOWNLOAD_GERMLINE_RESOURCE, DOWNLOAD_REF_GENOME, RESULTS_FOLDER, DATA_FOLDER, NORMAL_R1, NORMAL_R2, TUMOR_R1, TUMOR_R2, REFERENCE_CSV_FILE

In [2]:
# Download reference human genome
if DOWNLOAD_REF_GENOME:
    !wget -O "$DATA_FOLDER/hg38.fa.gz" http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
    !gunzip "$DATA_FOLDER/hg38.fa.gz"

In [3]:
if DOWNLOAD_GERMLINE_RESOURCE:
    BASE_URL="https://storage.googleapis.com/gatk-best-practices/somatic-hg38"
    GNOMAD_VCF="af-only-gnomad.hg38.vcf.gz"
    GNOMAD_INDEX="af-only-gnomad.hg38.vcf.gz.tbi"
    !wget -O $DATA_FOLDER/$GNOMAD_VCF $BASE_URL/$GNOMAD_VCF
    !wget -O $DATA_FOLDER/$GNOMAD_INDEX $BASE_URL/$GNOMAD_INDEX

# Quality control

In [4]:
# Perform quality checks using tools like FastQC and summarize quality metrics (e.g., sequence counts, 
# per-base quality, read duplication levels).

# Decompress if needed
!gunzip "$DATA_FOLDER"/*.fastq.gz

if QC:
    # Run FASTQC
    # The summary of quality metrics is provided in the pdf report
    !fastqc "$DATA_FOLDER/$NORMAL_R1" "$DATA_FOLDER/$NORMAL_R2" "$DATA_FOLDER/$TUMOR_R1" "$DATA_FOLDER/$TUMOR_R2" -o "$RESULTS_FOLDER"

zsh:1: no matches found: data/*.fastq.gz


# Alignment and mutation calling

In [5]:
# Align the samples to the human genome using tools like Bowtie2 or BWA.
# https://gatk.broadinstitute.org/hc/en-us/articles/360035535912-Data-pre-processing-for-variant-discovery

# Identify somatic mutations present in the cancer sample but absent in the normal tissue.
# https://gatk.broadinstitute.org/hc/en-us/articles/360035894731-Somatic-short-variant-discovery-SNVs-Indels
if RUN_GATK:
    !chmod +x somatic_mutations.sh
    !./somatic_mutations.sh "$DATA_FOLDER/$NORMAL_R1" "$DATA_FOLDER/$NORMAL_R2" "$DATA_FOLDER/$TUMOR_R1" "$DATA_FOLDER/$TUMOR_R2" "$DATA_FOLDER/hg38.fa" "$RESULTS_FOLDER" "$DATA_FOLDER"

In [6]:
# Use the normal tissue to calculate the median background mutation level.
# The background mutation level accounts for sequencing errors or biases
# that can mimic true mutations. Determine how many reads per million are
# required to confidently call a given mutation.