# In [None]:
import subprocess
import os

# Config: fixed locations of the reference genome, the directory holding the
# per-sample gVCFs, and the GATK launcher.
reference = (
    "/mnt/jupiter/johnsonlab/Capstone_proj/"
    "annotation_jasmine-the2899-mb-hirise-ahy4t__02-20-2022__hic_output.fasta"
)
out_dir = "/mnt/jupiter/johnsonlab/Capstone_proj/results/BQSR"
gatk_path = "/bin/gatk-4.6.2.0/gatk"
threads = "15"  # stored as a string; not referenced in this part of the file

# Sample list: each name is looked up as <out_dir>/<name>.final.g.vcf.
files = """
    10KP_RG 14KP_RG 15AZ 28KP 31SMW 32SMW
    34SMW 35SMW 36SMW_RG 38SMW_RG 39SMW_RG
    46SMW_RG 52SMW 55SMW 57SMW 69SMF_RG
    70SMF_RG 8KP_RG 9KP_RG MOR021_RG MOR023_RG
""".split()

# Walk the sample list, keep every gVCF that exists on disk, and make sure each
# kept file has a companion ".idx" index (built with GATK IndexFeatureFile when
# absent). Samples whose gVCF is missing are reported and skipped.
final_gvcfs = []
for file in files:
    print(f"🔍 Checking sample: {file}")
    final_gvcf = f"{out_dir}/{file}.final.g.vcf"
    final_idx = f"{final_gvcf}.idx"

    # Guard clause: nothing to index or collect when the gVCF is absent.
    if not os.path.exists(final_gvcf):
        print(f"Missing GVCF: {final_gvcf}")
        continue

    if os.path.exists(final_idx):
        print(f"Index already exists for {file}.")
    else:
        print(f"Indexing missing for {file} — indexing now.")
        index_cmd = [gatk_path, "IndexFeatureFile", "-I", final_gvcf]
        # check=True aborts the run if GATK exits with a non-zero status.
        subprocess.run(index_cmd, check=True)

    final_gvcfs.append(final_gvcf)

# Combine GVCFs: merge every collected per-sample gVCF into one cohort gVCF
# written next to the inputs.
combined_gvcf = os.path.join(out_dir, "cohort_combined.g.vcf")

# CombineGVCFs needs at least one -V input. Fail early with a clear message
# instead of launching GATK with an incomplete command line when none of the
# expected gVCFs was found.
if not final_gvcfs:
    raise RuntimeError(
        f"No per-sample gVCFs were found in {out_dir}; nothing to combine."
    )

# Prepare list of "-V file" args (one "-V <path>" pair per gVCF).
variant_args = []
for gvcf in final_gvcfs:
    variant_args.extend(["-V", gvcf])

# Run CombineGVCFs; check=True raises CalledProcessError on a non-zero exit.
subprocess.run([
    gatk_path, "CombineGVCFs",
    "-R", reference,
    "-O", combined_gvcf
] + variant_args, check=True)
