In [10]:
import subprocess
import os

# === Config ===
# Reference FASTA; passed to GATK via -R when the gVCFs are combined.
reference = (
    "/mnt/jupiter/johnsonlab/Capstone_proj/"
    "annotation_jasmine-the2899-mb-hirise-ahy4t__02-20-2022__hic_output.fasta"
)
# Directory holding the per-sample "<sample>.final.g.vcf" files; the combined
# cohort gVCF is written here as well.
out_dir = "/mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR"
# GATK launcher script that every subprocess call in this cell invokes.
gatk_path = "/bin/gatk-4.6.2.0/gatk"
# NOTE(review): not referenced by the code visible in this cell — presumably
# consumed by another cell; confirm before removing.
threads = "15"

# === Sample list ===
# One entry per sample; each name is expanded to
# f"{out_dir}/{name}.final.g.vcf" when the gVCFs are collected.
files = [
    "10KP_RG",
    "14KP_RG",
    "15AZ",
    "28KP",
    "31SMW",
    "32SMW",
    "34SMW",
    "35SMW",
    "36SMW_RG",
    "38SMW_RG",
    "39SMW_RG",
    "46SMW_RG",
    "52SMW",
    "55SMW",
    "57SMW",
    "69SMF_RG",
    "70SMF_RG",
    "8KP_RG",
    "9KP_RG",
    "MOR021_RG",
    "MOR023_RG",
]

# === Collect gVCFs and ensure indexing ===
# Walk the sample list, keep every per-sample gVCF that exists on disk, and
# create its ".idx" companion with GATK IndexFeatureFile when it is absent.
# Samples whose gVCF is missing are reported and left out of final_gvcfs.
final_gvcfs = []
for sample in files:
    print(f"🔍 Checking sample: {sample}")
    gvcf_path = f"{out_dir}/{sample}.final.g.vcf"
    index_path = f"{gvcf_path}.idx"

    # Guard clause: nothing to index or collect when the gVCF itself is absent.
    if not os.path.exists(gvcf_path):
        print(f"⚠️ Missing GVCF: {gvcf_path}")
        continue

    if os.path.exists(index_path):
        print(f"✅ Index already exists for {sample}.")
    else:
        print(f"Indexing missing for {sample} — indexing now.")
        index_cmd = [gatk_path, "IndexFeatureFile", "-I", gvcf_path]
        # check=True raises CalledProcessError if GATK exits non-zero, which
        # stops the cell before the sample is added to the list.
        subprocess.run(index_cmd, check=True)

    final_gvcfs.append(gvcf_path)

# === Combine GVCFs ===
# Merge every per-sample gVCF gathered in final_gvcfs into a single cohort
# gVCF written to out_dir.
combined_gvcf = os.path.join(out_dir, "cohort_combined.g.vcf")

# Fail fast with a readable message: with an empty list the command line would
# contain no -V input at all and would only fail after GATK has been launched.
if not final_gvcfs:
    raise RuntimeError(
        f"No per-sample gVCFs were found in {out_dir}; nothing to combine."
    )

# Flatten to one "-V <path>" pair per input, preserving sample order.
variant_args = [arg for gvcf in final_gvcfs for arg in ("-V", gvcf)]

# Run CombineGVCFs; check=True raises CalledProcessError on a non-zero exit,
# so the success message below is only printed when GATK reports success.
subprocess.run(
    [gatk_path, "CombineGVCFs", "-R", reference, "-O", combined_gvcf]
    + variant_args,
    check=True,
)

print(f"✅ Combined GVCF written to: {combined_gvcf}")


🔍 Checking sample: 10KP_RG
✅ Index already exists for 10KP_RG.
🔍 Checking sample: 14KP_RG
✅ Index already exists for 14KP_RG.
🔍 Checking sample: 15AZ
✅ Index already exists for 15AZ.
🔍 Checking sample: 28KP
✅ Index already exists for 28KP.
🔍 Checking sample: 31SMW
✅ Index already exists for 31SMW.
🔍 Checking sample: 32SMW
✅ Index already exists for 32SMW.
🔍 Checking sample: 34SMW
✅ Index already exists for 34SMW.
🔍 Checking sample: 35SMW
✅ Index already exists for 35SMW.
🔍 Checking sample: 36SMW_RG
✅ Index already exists for 36SMW_RG.
🔍 Checking sample: 38SMW_RG
✅ Index already exists for 38SMW_RG.
🔍 Checking sample: 39SMW_RG
✅ Index already exists for 39SMW_RG.
🔍 Checking sample: 46SMW_RG
✅ Index already exists for 46SMW_RG.
🔍 Checking sample: 52SMW
✅ Index already exists for 52SMW.
🔍 Checking sample: 55SMW
✅ Index already exists for 55SMW.
🔍 Checking sample: 57SMW
✅ Index already exists for 57SMW.
🔍 Checking sample: 69SMF_RG
✅ Index already exists for 69SMF_RG.
🔍 Checking sample: 70S

22:55:01.613 INFO  NativeLibraryLoader - Loading libgkl_compression.so from jar:file:/usr/bin/gatk-4.6.2.0/gatk-package-4.6.2.0-local.jar!/com/intel/gkl/native/libgkl_compression.so
22:55:01.776 INFO  CombineGVCFs - ------------------------------------------------------------
22:55:01.781 INFO  CombineGVCFs - The Genome Analysis Toolkit (GATK) v4.6.2.0
22:55:01.781 INFO  CombineGVCFs - For support and documentation go to https://software.broadinstitute.org/gatk/
22:55:01.781 INFO  CombineGVCFs - Executing as chdahl@genomics1 on Linux v5.4.0-215-generic amd64
22:55:01.781 INFO  CombineGVCFs - Java runtime: OpenJDK 64-Bit Server VM v17.0.15+6-Ubuntu-0ubuntu120.04
22:55:01.781 INFO  CombineGVCFs - Start Date/Time: May 25, 2025 at 10:55:01 PM UTC
22:55:01.782 INFO  CombineGVCFs - ------------------------------------------------------------
22:55:01.782 INFO  CombineGVCFs - ------------------------------------------------------------
22:55:01.783 INFO  CombineGVCFs - HTSJDK Version: 4.2.0
2

✅ Combined GVCF written to: /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/cohort_combined.g.vcf


Using GATK jar /usr/bin/gatk-4.6.2.0/gatk-package-4.6.2.0-local.jar
Running:
    java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /usr/bin/gatk-4.6.2.0/gatk-package-4.6.2.0-local.jar CombineGVCFs -R /mnt/jupiter/johnsonlab/Capstone_proj/annotation_jasmine-the2899-mb-hirise-ahy4t__02-20-2022__hic_output.fasta -O /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/cohort_combined.g.vcf -V /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/10KP_RG.final.g.vcf -V /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/14KP_RG.final.g.vcf -V /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/15AZ.final.g.vcf -V /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/28KP.final.g.vcf -V /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/31SMW.final.g.vcf -V /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/32SMW.final.g.vcf -V /mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR/34SMW.fina