In [None]:
%%writefile ~/CNV_filtering.sh
#!/bin/bash
# CNV_filtering.sh: keep the header plus every <DUP> record of a VCF that lies
# inside a fixed coordinate window and is longer than a length threshold.
#
# Environment variables read:
#   VCF_FILE     input VCF; a name ending in .gz is decompressed on the fly
#   OUTPUT_PATH  directory that receives chr17_dup_filtered.vcf
#
# No CHROM filter is applied; the coordinates below are compared against
# whatever contig(s) the input contains.
set -euo pipefail

# Drop one trailing slash so the joined path has a single separator.
outdir="${OUTPUT_PATH%/}"
mkdir -p "${outdir:-/}"

vcf="${VCF_FILE}"
outvcf="${outdir}/chr17_dup_filtered.vcf"

start=14100000   # window start: POS must be >= this
end=15700000     # window end:   END must be <= this
minlen=1000000   # length threshold in bp; END - POS must be strictly greater

# Stream the VCF to stdout, decompressing when the name ends in .gz.
read_vcf() {
    if [[ "$1" == *.gz ]]; then
        gunzip -c "$1"
    else
        cat "$1"
    fi
}

read_vcf "$vcf" | awk -v start="$start" -v end="$end" -v minlen="$minlen" '
# Header lines pass through unchanged.
/^#/ { print; next }
{
    # Isolate CHROM..INFO (the first 8 tab-separated columns) without touching
    # the genotype columns: match the 8-column prefix, then split only that.
    if (!match($0, /^[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*\t[^\t]*/)) {
        next    # fewer than 8 columns: not a usable record
    }
    split(substr($0, 1, RLENGTH), col, "\t")

    # Only symbolic duplication alleles are of interest.
    if (col[5] != "<DUP>") next

    # Pull END=<n> out of the INFO column; -1 means "not present".
    endpos = -1
    m = split(col[8], info, ";")
    for (i = 1; i <= m; i++) {
        if (info[i] ~ /^END=/) {
            endpos = substr(info[i], 5) + 0
            break
        }
    }
    if (endpos <= 0) next

    pos = col[2] + 0        # force numeric comparison
    svlen = endpos - pos

    # Entirely inside [start, end] and strictly longer than minlen.
    # (END >= start is implied by POS >= start together with svlen > minlen.)
    if (pos >= start && endpos <= end && svlen > minlen) print
}
' > "$outvcf"

echo "Filtered VCF written to $outvcf"

In [None]:
# Copy the filtering script to the bucket location that the dsub job's --script
# argument points at. Uses ~ so the path matches where %%writefile put the file.
!gsutil cp ~/CNV_filtering.sh gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/scripts/

In [None]:
%%bash --out LINE_COUNT_JOB_ID
# Submit CNV_filtering.sh as a dsub job on Google Batch; the cell's stdout is
# captured into the notebook variable named on the magic line above.
# NOTE(review): LINE_COUNT_JOB_ID looks like a name carried over from a
# different job; left unchanged in case later cells read it.
#
# Fail fast if OWNER_EMAIL, GOOGLE_PROJECT or WORKSPACE_BUCKET is unset, or if
# any setup command fails, instead of passing empty arguments to dsub.
set -euo pipefail

DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"
SERVICE_ACCOUNT="$(gcloud config get-value account)"
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork
REGION="us-central1"
MACHINE_TYPE="n2-standard-16"
BASH_SCRIPT="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/scripts/CNV_filtering.sh"
JOB_NAME="cnv_filtering"
VCF_PATH="gs://fc-aou-datasets-controlled/v8/wgs/short_read/structural_variants/vcf/full/AoU_srWGS_SV.v8.chr17.vcf.gz"
OUTPUT_DIR="gs://fc-secure-1ee5bf43-0b6b-4164-a355-ff45dfe2ae3a/data/cmt/counts/pmp22/"

# VCF_FILE and OUTPUT_PATH are the variable names the script reads.
# NOTE(review): a 3000 GB boot disk looks oversized for this job; confirm it is
# intentional before reducing it.
dsub \
    --provider google-batch \
    --user-project "${GOOGLE_PROJECT}" \
    --project "${GOOGLE_PROJECT}" \
    --use-private-address \
    --network "global/networks/${AOU_NETWORK}" \
    --subnetwork "regions/${REGION}/subnetworks/${AOU_SUBNETWORK}" \
    --image 'marketplace.gcr.io/google/ubuntu1804:latest' \
    --service-account "${SERVICE_ACCOUNT}" \
    --user "${DSUB_USER_NAME}" \
    --regions "${REGION}" \
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/{job-id}-{task-id}-{task-attempt}.log" \
    --boot-disk-size 3000 \
    --disk-size 1200 \
    --machine-type "${MACHINE_TYPE}" \
    --name "${JOB_NAME}" \
    --input VCF_FILE="${VCF_PATH}" \
    --script "${BASH_SCRIPT}" \
    --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
    --output-recursive OUTPUT_PATH="${OUTPUT_DIR}"