# Full AoU pipeline
This script will:
<br>
- calculate the MAFs for each variant
- filter the samples to those with a CMT diagnosis
- extract the gene regions
- output a VCF of variant counts and the MAF data

In [None]:
%%writefile ~/CMT_pipeline_1.sh
#!/bin/bash
#
# CMT gene-region pipeline for a single chromosome (run as one dsub task).
#
# Required environment variables:
#   plink            path to the plink2 binary
#   chrom            chromosome label: an autosome (with or without a "chr"
#                    prefix) or one of 23, X, x, chrX, chr23 for chromosome X
#   bgenfile         BGEN file for this chromosome
#   samplefile       .sample file that accompanies bgenfile
#   gene_info        range file handed to --extract range
#   keep_samples     sample IDs handed to --keep
#   flagged_samples  sample IDs handed to --remove (may be an empty file)
#   OUTPUT_PATH      directory that receives the final outputs
#
# Files moved to OUTPUT_PATH:
#   <prefix>_rare_cmt_variants.vcf  gene-region variants for the kept samples
#   <prefix>_maf.afreq              --freq output over every sample in bgenfile
set -o pipefail
set -o errexit

# Fail early with a readable message instead of letting plink choke on an
# empty argument.
: "${plink:?plink must be set}"
: "${chrom:?chrom must be set}"
: "${bgenfile:?bgenfile must be set}"
: "${samplefile:?samplefile must be set}"
: "${gene_info:?gene_info must be set}"
: "${keep_samples:?keep_samples must be set}"
: "${flagged_samples:?flagged_samples must be set (it can be an empty file)}"
: "${OUTPUT_PATH:?OUTPUT_PATH must be set}"

# Make sure plink is executable
chmod +x "${plink}"

# Drop a trailing slash so the paths built below never contain "//".
outdir="${OUTPUT_PATH%/}"
mkdir -p "${outdir}"

# Normalize the chromosome label into a file prefix. For chromosome X the
# --split-par flag is added with build code b38; it is kept in an array so it
# expands to either two words or nothing, without relying on word splitting.
case "${chrom}" in
    23 | X | x | chrX | chr23)
        prefix="chrX"
        splitpar_args=(--split-par b38)
        ;;
    *)
        # Accept both "7" and "chr7".
        prefix="chr${chrom#chr}"
        splitpar_args=()
        ;;
esac

echo "Running gene extraction for ${prefix} (split-par: '${splitpar_args[*]}')..."

# Step 1: Restrict to the individuals in keep_samples, minus flagged samples
"${plink}" --bgen "${bgenfile}" ref-first \
    --sample "${samplefile}" \
    --keep "${keep_samples}" \
    --remove "${flagged_samples}" \
    "${splitpar_args[@]}" \
    --make-pgen \
    --out "${prefix}_filtered"

# Step 2: Extract the gene regions from the filtered dataset to VCF
"${plink}" --pfile "${prefix}_filtered" \
    --extract range "${gene_info}" \
    --export vcf \
    --out "${prefix}_gene"

# Step 3: Build the "<variant ID><TAB><REF allele>" file from the VCF.
# A single awk replaces 'grep -v | awk': grep exits 1 when it selects no
# lines, which pipefail + errexit would turn into a script failure.
# NOTE(review): IDs synthesized here for "." records will not match the "."
# IDs that Step 4 writes to the snplist - confirm missing IDs cannot occur.
awk -v OFS='\t' '
    /^#/ { next }
    {
        if ($3 == ".") {
            # No ID in the VCF: build one as chrom:pos:REF:ALT
            print $1 ":" $2 ":" $4 ":" $5, $4
        } else {
            print $3, $4
        }
    }
' "${prefix}_gene.vcf" > "${prefix}_refalleles.txt"

# Step 4: Save the variant IDs found in the gene regions
"${plink}" --vcf "${prefix}_gene.vcf" \
    --write-snplist \
    --out "${prefix}_variants"

# Step 5: Rebuild the filtered dataset for those variants with REF pinned
"${plink}" --pfile "${prefix}_filtered" \
    --extract "${prefix}_variants.snplist" \
    --ref-allele "${prefix}_refalleles.txt" \
    --make-pgen \
    --out "${prefix}_pfile"

# Step 6: Frequencies over every sample in the BGEN (no --keep / --remove)
"${plink}" --bgen "${bgenfile}" ref-first \
    --sample "${samplefile}" \
    --extract "${prefix}_variants.snplist" \
    --ref-allele "${prefix}_refalleles.txt" \
    "${splitpar_args[@]}" \
    --make-pgen \
    --out "${prefix}_wholepop_pfile"

"${plink}" --pfile "${prefix}_wholepop_pfile" \
    --freq \
    --out "${prefix}_maf"

# Step 7: Export the kept-sample dataset to VCF. No frequency threshold is
# applied in this script.
"${plink}" --pfile "${prefix}_pfile" \
    --export vcf \
    --out "${prefix}_rare_cmt_variants"

# Step 8: Move outputs to the final directory
mv "${prefix}_rare_cmt_variants.vcf" "${outdir}/${prefix}_rare_cmt_variants.vcf"
mv "${prefix}_maf.afreq" "${outdir}/${prefix}_maf.afreq"

echo "Pipeline complete for ${prefix}."


## TSV to iterate through chromosomes
Lines should be run individually via bash terminal

In [None]:
# Build the dsub tasks file for the pipeline (run in a bash terminal).
# Row 1 names the dsub flag for each column; every following row is one task:
# chromosomes 1-22, then X.
cmt_root="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt"
tasks_tsv="CMT_gene_info.tsv"

# Print one tab-separated task row for the chromosome label given as $1.
cmt_task_row() {
  local label=$1
  printf '%s\t%s\t%s\t%s\t%s\n' \
    "${label}" \
    "${cmt_root}/counts/cmt.ids" \
    "${cmt_root}/counts/gene_counts/bgen/chr${label}_cmt.bgen" \
    "${cmt_root}/counts/gene_counts/bgen/chr${label}_cmt.sample" \
    "${cmt_root}/input/chr${label}.tsv"
}

{
  printf '%s\t%s\t%s\t%s\t%s\n' \
    '--env chrom' \
    '--input keep_samples' \
    '--input bgenfile' \
    '--input samplefile' \
    '--input gene_info'
  for label in {1..22} X; do
    cmt_task_row "${label}"
  done
} > "${tasks_tsv}"

gsutil -m cp "${tasks_tsv}" "${cmt_root}/scripts/"

In [None]:
%%bash --out LINE_COUNT_JOB_ID

# Submit the CMT pipeline script to Google Batch through dsub; dsub starts one
# task per data row of TASK_FILE.
# Reads from the environment: OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET.

# dsub user name = the part of the owner e-mail before the "@".
DSUB_USER_NAME="${OWNER_EMAIL%%@*}"
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork
MACHINE_TYPE="n2-standard-16"
BASH_SCRIPT="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/scripts/CMT_pipeline_1.sh"
# NOTE(review): the tasks file built earlier in this notebook is uploaded as
# CMT_gene_info.tsv, not CMT_gene_info_x.tsv - confirm this is the intended file.
TASK_FILE="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/scripts/CMT_gene_info_x.tsv"
JOB_NAME="cmt_pipeline_I"

# The network names come from the variables above (previously defined but
# unused), and every expansion is quoted so no value can be word-split.
dsub \
    --provider google-batch \
    --user-project "${GOOGLE_PROJECT}" \
    --project "${GOOGLE_PROJECT}" \
    --use-private-address \
    --network "global/networks/${AOU_NETWORK}" \
    --subnetwork "regions/us-central1/subnetworks/${AOU_SUBNETWORK}" \
    --image 'marketplace.gcr.io/google/ubuntu1804:latest' \
    --service-account "$(gcloud config get-value account)" \
    --user "${DSUB_USER_NAME}" \
    --regions us-central1 \
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/{job-id}-{task-id}-{task-attempt}.log" \
    --boot-disk-size 3000 \
    --disk-size 1200 \
    --machine-type "${MACHINE_TYPE}" \
    --name "${JOB_NAME}" \
    --script "${BASH_SCRIPT}" \
    --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
    --input plink="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/plink2" \
    --input flagged_samples="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/als/plink_qc/flagged_samples.fam.ids" \
    --tasks "${TASK_FILE}" \
    --output-recursive OUTPUT_PATH="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/counts/gene_regions/pipeline_out/"

## Downloading and combining VCFs in local directory

In [None]:
!gsutil -m cp gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/counts/gene_regions/pipeline_out/*.vcf

In [None]:
# combining all gene vcf files into a single file for analysis
# loading necessary packages
import pandas as pd
import glob

def read_vcf(file):
    """Load the variant table of a VCF file into a DataFrame.

    Lines before the ``#CHROM`` line are skipped, and the ``#CHROM`` line
    itself becomes the column header.

    Parameters
    ----------
    file : str or path-like
        Path to an uncompressed, tab-separated VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per line following the ``#CHROM`` header.

    Raises
    ------
    ValueError
        If no line in the file starts with ``#CHROM``.
    """
    # Scan lazily and stop at the header, so the whole file is never held in
    # memory just to find one line number.
    header_line = None
    with open(file, 'r') as f:
        for line_number, line in enumerate(f):
            if line.startswith('#CHROM'):
                header_line = line_number
                break

    if header_line is None:
        raise ValueError(f"{file}: no '#CHROM' header line found")

    # Read the VCF from the header line onward
    return pd.read_csv(file, sep='\t', skiprows=header_line, header=0)

# Collect the per-chromosome VCFs to combine. Files named '*gene.vcf' are used
# when present; otherwise fall back to '*_rare_cmt_variants.vcf', the name the
# pipeline script gives the VCF it moves to its output directory. Sorting makes
# the row order of the combined table reproducible.
vcf_file_list = sorted(glob.glob("*gene.vcf")) or sorted(glob.glob("*_rare_cmt_variants.vcf"))
if not vcf_file_list:
    raise FileNotFoundError(
        "No '*gene.vcf' or '*_rare_cmt_variants.vcf' files in the current directory"
    )

# Read and combine all VCF files
vcf_data_list = [read_vcf(file) for file in vcf_file_list]
gene_vcf_unfiltered = pd.concat(vcf_data_list, ignore_index=True)

## Downloading and combining afreq files in local directory

In [None]:
import pandas as pd
import glob

def read_freq_clean(file):
    """Read one tab-separated frequency table, using its first row as the
    column header, and return it as a DataFrame."""
    return pd.read_csv(file, sep='\t', header=0)

# Every .afreq file in the current directory
freq_file_list = glob.glob("*.afreq")

# Load each file and stack them into one frequency table
freq_data_list = list(map(read_freq_clean, freq_file_list))
freq = pd.concat(freq_data_list, ignore_index=True)

# Keep the IDs of variants whose ALT_FREQS value is below 0.001.
# NOTE(review): this thresholds the ALT allele frequency, not the minor
# allele frequency - confirm that is the intended definition of "rare".
freq_filtered = freq.loc[freq["ALT_FREQS"] < 0.001]
freq_subset = freq_filtered.loc[:, ["ID"]]

# Inner join: only VCF rows whose ID passed the frequency filter survive
gene_vcf = gene_vcf_unfiltered.merge(freq_subset, on='ID', how='inner')

In [None]:
# Save the merged table as tab-separated text, without the DataFrame index.
gene_vcf.to_csv("variants_raw.vcf", sep="\t", index=False)

# Annotation
Pipeline using Ensembl's VEP

In [None]:
input_vcf = "variants_raw.vcf"
output_vcf = "variants_input.vcf"

# Copy input_vcf to output_vcf, rewriting chromosome code "23" as "X" in the
# first column of every data line. Lines starting with "#" are copied as-is.
with open(input_vcf, "r") as infile, open(output_vcf, "w") as outfile:
    for line in infile:
        if line.startswith("#"):
            # Write header lines unchanged
            outfile.write(line)
            continue

        # Remove only the line terminator: a blanket strip() would also eat
        # trailing tabs and so drop empty trailing columns from the record.
        record = line.rstrip("\r\n")
        # Only the first column matters, so split it off and keep the rest
        # of the record untouched.
        chrom, tab, rest = record.partition("\t")
        if chrom == "23":
            chrom = "X"
        outfile.write(chrom + tab + rest + "\n")

In [None]:
%%bash

# Install Ensembl VEP (release/115 checkout), unpack the GRCh38 cache into
# ensembl-vep/.vep and download a reference FASTA next to it.
# Stop at the first failure so later steps never run on a partial download.
set -euo pipefail

# Re-running the cell must not fail just because the checkout already exists.
if [[ ! -d ensembl-vep ]]; then
    git clone https://github.com/Ensembl/ensembl-vep.git
fi
cd ensembl-vep
git checkout release/115
mkdir -p .vep
# -f: treat an HTTP error as a failure instead of saving the error page as the
# tarball; -L: follow redirects.
curl -fLO https://ftp.ensembl.org/pub/release-115/variation/indexed_vep_cache/homo_sapiens_vep_115_GRCh38.tar.gz
tar xzf homo_sapiens_vep_115_GRCh38.tar.gz -C .vep
perl INSTALL.pl
# NOTE(review): the FASTA is taken from Ensembl release 108 while the cache is
# release 115 - confirm the version mismatch is intended.
wget ftp://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
gunzip Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz

In [None]:
%%bash
# Annotate ../variants_input.vcf with VEP, then flatten the pipe-delimited
# annotation into tab-separated columns.
# Stop on the first failure so sed never runs on a missing or partial VEP output.
set -euo pipefail
cd ensembl-vep/

# VEP's stderr goes to ensembl-vep/debug.log.
perl ./vep \
  --cache \
  --offline \
  --dir_cache .vep \
  --assembly GRCh38 \
  --species homo_sapiens \
  --fasta ./Homo_sapiens.GRCh38.dna.primary_assembly.fa \
  --input_file ../variants_input.vcf \
  --output_file ../variants_out.vcf \
  --vcf \
  --pick \
  --everything \
  --canonical \
  --force_overwrite \
  --verbose \
  --fork 8 \
  --buffer_size 5000 \
  2> debug.log

# VEP wrote its output one directory up, so sed must read ../variants_out.vcf;
# a bare "variants_out.vcf" does not exist inside ensembl-vep/.
# The first sed replaces every occurrence of "INFO" (on any line) with the
# pipe-delimited list of annotation field names; the second turns every "|"
# into a tab. Both result files are written in ensembl-vep/.
sed 's/INFO/Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|GENE_PHENO|SIFT|PolyPhen|DOMAINS|miRNA|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|gnomADe_ASJ_AF|gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|gnomADe_REMAINING_AF|gnomADe_SAS_AF|gnomADg_AF|gnomADg_AFR_AF|gnomADg_AMI_AF|gnomADg_AMR_AF|gnomADg_ASJ_AF|gnomADg_EAS_AF|gnomADg_FIN_AF|gnomADg_MID_AF|gnomADg_NFE_AF|gnomADg_REMAINING_AF|gnomADg_SAS_AF|MAX_AF|MAX_AF_POPS|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS/g' ../variants_out.vcf > variants_out1.vcf
sed 's/|/\t/g' variants_out1.vcf > variants_anno.vcf

# Variant Extraction

In [None]:
%%writefile ~/CMT_variant_extraction.sh
#!/bin/bash
#
# Export the listed variants of one chromosome from a BGEN file to VCF, after
# removing the flagged samples (run as one dsub task).
#
# Required environment variables:
#   plink            path to the plink2 binary
#   chrom            chromosome label, with or without a "chr" prefix
#   bgenfile         BGEN file for this chromosome
#   samplefile       .sample file that accompanies bgenfile
#   variants         variant ID list handed to --extract
#   flagged_samples  sample IDs handed to --remove
#   OUTPUT_PATH      directory that receives <prefix>_full.vcf
set -o pipefail
set -o errexit

# Fail early with a readable message when an input is missing.
: "${plink:?plink must be set}"
: "${chrom:?chrom must be set}"
: "${bgenfile:?bgenfile must be set}"
: "${samplefile:?samplefile must be set}"
: "${variants:?variants must be set}"
: "${flagged_samples:?flagged_samples must be set}"
: "${OUTPUT_PATH:?OUTPUT_PATH must be set}"

# Make sure plink is executable
chmod +x "${plink}"

# Name the outputs per chromosome. Without a prefix every task would write
# "_full.vcf", and tasks sharing one output directory would overwrite each other.
prefix="chr${chrom#chr}"

# Drop a trailing slash so the destination path never contains "//".
outdir="${OUTPUT_PATH%/}"
mkdir -p "${outdir}"

"${plink}" --bgen "${bgenfile}" ref-first \
    --sample "${samplefile}" \
    --remove "${flagged_samples}" \
    --extract "${variants}" \
    --export vcf \
    --out "${prefix}_full"

# Move the exported VCF to the final directory
mv "${prefix}_full.vcf" "${outdir}/${prefix}_full.vcf"

echo "Pipeline complete for ${prefix}."

In [None]:
# Build the dsub tasks file for variant extraction (run in a bash terminal).
# Row 1 names the dsub flag for each column; every following row is one task.
aou_bgen_dir="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/bgen"
cmt_data_dir="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt"
extraction_tsv="variant_extraction.tsv"

{
  printf '%s\t%s\t%s\t%s\n' \
    '--env chrom' \
    '--input bgenfile' \
    '--input samplefile' \
    '--input variants'
  for chrom_label in 1 2 3 5 10 12 14 15 19 22; do
    printf '%s\t%s\t%s\t%s\n' \
      "${chrom_label}" \
      "${aou_bgen_dir}/chr${chrom_label}.bgen" \
      "${aou_bgen_dir}/chr${chrom_label}.sample" \
      "${cmt_data_dir}/counts/summary/variant_files/chr${chrom_label}variants.txt"
  done
} > "${extraction_tsv}"

gsutil -m cp "${extraction_tsv}" "${cmt_data_dir}/scripts/"

# CNV calling

In [None]:
%%writefile ~/CNV_filtering.sh
#!/bin/bash
#
# Keep the header plus the large <DUP> records that fall inside a fixed
# coordinate window of a structural-variant VCF.
#
# Required environment variables:
#   VCF_FILE     input VCF, plain or gzip-compressed (name ending in .gz)
#   OUTPUT_PATH  directory that receives chr17_dup_filtered.vcf
set -euo pipefail

# Drop a trailing slash so the output path never contains "//".
outdir="${OUTPUT_PATH%/}"
mkdir -p "${outdir}"

vcf="${VCF_FILE}"
outvcf="${outdir}/chr17_dup_filtered.vcf"

start=14100000
end=15700000
minlen=1000000  # minimum SV length in bp

# If compressed, decompress; otherwise read directly
if [[ "$vcf" == *.gz ]]; then
    gunzip -c "$vcf"
else
    cat "$vcf"
fi | awk -v start="$start" -v end="$end" -v minlen="$minlen" '
# Tab is the field separator, so POS, ALT and INFO are read straight from
# $2, $5 and $8; the sample columns are never copied into a second array.
BEGIN { FS = "\t" }

# Header lines pass through untouched.
/^#/ { print; next }

{
    # Take the end coordinate from the END= entry of INFO; -1 means "absent".
    endpos = -1
    n_info = split($8, info_fields, ";")
    for (i = 1; i <= n_info; i++) {
        if (info_fields[i] ~ /^END=/) {
            endpos = substr(info_fields[i], 5) + 0
        }
    }

    if (endpos > 0) {
        svlen = endpos - $2
        # Keep duplications that start at or after "start", end within
        # [start, end], and are longer than minlen. The record is printed
        # exactly as it was read.
        if ($5 == "<DUP>" && $2 >= start && endpos >= start && endpos <= end && svlen > minlen) {
            print
        }
    }
}
' > "$outvcf"

echo "✅ Filtered VCF written to $outvcf"

In [None]:
# Upload the CNV filtering script to the bucket's data/cmt/scripts/ folder.
!gsutil cp /home/jupyter/CNV_filtering.sh gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/scripts/

In [None]:
%%bash --out LINE_COUNT_JOB_ID

# Submit the CNV filtering script to Google Batch through dsub as a single job.
# Reads from the environment: OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET.

# dsub user name = the part of the owner e-mail before the "@".
DSUB_USER_NAME="${OWNER_EMAIL%%@*}"
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork
MACHINE_TYPE="n2-standard-16"
BASH_SCRIPT="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/scripts/CNV_filtering.sh"
JOB_NAME="cnv_filtering"
VCF_PATH="gs://fc-aou-datasets-controlled/v8/wgs/short_read/structural_variants/vcf/full/AoU_srWGS_SV.v8.chr17.vcf.gz"

# The network names come from the variables above (previously defined but
# unused), and every expansion is quoted so no value can be word-split.
dsub \
    --provider google-batch \
    --user-project "${GOOGLE_PROJECT}" \
    --project "${GOOGLE_PROJECT}" \
    --use-private-address \
    --network "global/networks/${AOU_NETWORK}" \
    --subnetwork "regions/us-central1/subnetworks/${AOU_SUBNETWORK}" \
    --image 'marketplace.gcr.io/google/ubuntu1804:latest' \
    --service-account "$(gcloud config get-value account)" \
    --user "${DSUB_USER_NAME}" \
    --regions us-central1 \
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/{job-id}-{task-id}-{task-attempt}.log" \
    --boot-disk-size 3000 \
    --disk-size 1200 \
    --machine-type "${MACHINE_TYPE}" \
    --name "${JOB_NAME}" \
    --input VCF_FILE="${VCF_PATH}" \
    --script "${BASH_SCRIPT}" \
    --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
    --output-recursive OUTPUT_PATH="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/counts/pmp22/"