In [11]:
import json, os, sys, argparse, gzip, glob

def get_vcfs_file_list(files_dir):
    file_list = []
    filename_list = []
    for root, dirs, files in os.walk(files_dir):
        for file in files:
            if file.endswith(".vcf.gz"):
                file_list.append(os.path.join(root, file))
                filename_list.append(file)
                    
    return file_list, filename_list


def write_kourami_result_for_sample(out_dir, vcf_filename, rs_hla_list):
    DQA1, DQB1 = [], []
    path_to_kourami_file = os.path.join(out_dir, f"{vcf_filename}.kourami")
    if len(rs_hla_list) > 2:
        raise ValueError(f"More than four HLA alleles for {vcf_filename} sample were typed. {rs_hla_h}")
    else:
        for hla_allele in rs_hla_list:
            DQA1.append(hla_allele.split(";")[0])
            DQB1.append(hla_allele.split(";")[1])
            
    DQA1.extend(DQB1)
    with open(path_to_kourami_file, 'w') as kourami_res:
        for allele in DQA1:
            allele = allele.split("-")[1] + ":01G\n"
            kourami_res.write(allele)
        
    return path_to_kourami_file

def get_sampleID_rs_hla_list(vcf_file, rs_hla_dict):
    rs_hla_list = []
    with gzip.open(vcf_file, "rt") as vcf_file:
        for line in vcf_file:
            if line.startswith("#CHROM"):
                sample_id = line.split()[-1]
            elif line.startswith("chr6"):
                if line.split()[2] in rs_hla_dict.keys():
                    rs_hla_list.append(rs_hla_dict[line.split()[2]])
            elif line.startswith("chr7"):
                break
    
    if not rs_hla_list:
        rs_hla_list.append(rs_hla_dict["default"])
        rs_hla_list.append(rs_hla_dict["default"])
    
    return sample_id, rs_hla_list
    

def write_json(vcf_file, sample_ID, path_to_copy, copy, phenotype, genome_ver, 
                   summary_stat, kourami_res, HLA_inter, HLA_standalone, script_estimate_hla, out_dir, vcf_filename):
    base = "PRS_single_sample_calculation."
    dict_to_json = {
        base + "vcf": vcf_file,
        base + "ID": sample_ID.replace("_", "-"),
        base + "path_to_copy": path_to_copy,
        base + "copy": copy,
        base + "phenotype": phenotype,
        base + "genome_version": genome_ver,
        base + "summary_stat": summary_stat,
        base + "kuorami_results": kourami_res,
        base + "HLA_interaction": HLA_inter,
        base + "HLA_standalone": HLA_standalone,
        base + "script_estimate_hla": script_estimate_hla
    }
    
    with open(os.path.join(out_dir, f"{vcf_filename}.json"), 'w') as input_file:
        input_file.write(json.dumps(dict_to_json))
    

def main(args):
    parser = argparse.ArgumentParser(description='Script for generating inputs for wdl script')
    parser.add_argument("-vd", '--path_to_vcfs_dir', type=str, required=True,
                        help='Path to directory with vcfs file.')
    parser.add_argument("-od", "--output_dir", type=str,
                        default="/home/vcheranev/wdl_inputs_files/",
                        help="Full path to output directory.")
    parser.add_argument("--copy_path", type=str,
                        default="/home/vcheranev/wdl_results/",
                        help="Path to coping results.")
    parser.add_argument("--no_copy", action="store_false",
                        help="Path to HLA-HD tool directory.")
    parser.add_argument("--phenotype", nargs="+",
                        default=["T1D"],
                        help="Phenotype of patients.")
    parser.add_argument("--genome_version", type=str,
                        default="hg38",
                        help="Path to hlaScan tool directory.")
    parser.add_argument("--summary_stat", nargs="+",
                        default=["/home/vcheranev/PRS/PRS_calculation_summary_stat/TD1/sharp_2019/PGS000024_SNPs.tsv"],
                        help="Path to weights of SNV combination.")
    parser.add_argument("--kourami_res", type=str,
                        default="",
                        help="Path to kourami result file.")
    parser.add_argument("--HLA_interaction", type=str,
                        default="/home/vcheranev/PRS/PRS_calculation_summary_stat/TD1/sharp_2019/PGS000024_HLA_combination.tsv",
                        help="Path to weights of HLA combination.")
    parser.add_argument("--HLA_standalone", type=str,
                        default="/home/vcheranev/PRS/PRS_calculation_summary_stat/TD1/sharp_2019/PGS000024_HLA_standalone.tsv",
                        help="Path to weights of HLA standalone.")
    parser.add_argument("--script_estimate_hla", default="/home/vcheranev/PRS/HLA_impact_PRS.py",
                        help="Path to script which process hla results.")
    parser.add_argument("-j", "--rs_hla_json_path", type=str, 
                        default="/home/vcheranev/PRS/PGS000024.json",
                        help="Full path to json file with rsID and HLA allele.")
    args = parser.parse_args(args)
    files_dir = args.path_to_vcfs_dir
    out_dir = args.output_dir
    copy_path = args.copy_path
    kourami_res = args.kourami_res
    script_estimate_hla = args.script_estimate_hla
    rs_hla = args.rs_hla_json_path
    
    vcf_files, vcf_filenames = get_vcfs_file_list(files_dir)
    
    with open(rs_hla, "r") as j:
        rs_hla_dict = json.loads(j.read())
    
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        
    if not os.path.exists(copy_path):
        os.makedirs(copy_path)
    
    for vcf_file, vcf_filename in zip(vcf_files, vcf_filenames):
        sample_ID, rs_hla_list = get_sampleID_rs_hla_list(vcf_file, rs_hla_dict)
        kourami_res = write_kourami_result_for_sample(out_dir, vcf_filename, rs_hla_list)
        write_json(vcf_file, sample_ID, copy_path, args.no_copy, args.phenotype, args.genome_version, 
                   args.summary_stat, kourami_res, args.HLA_interaction, args.HLA_standalone, script_estimate_hla, out_dir, \
                   vcf_filename)

        
if __name__ == "__main__":
    main(sys.argv[1:])

IndentationError: expected an indented block (<ipython-input-11-da32d938f8ed>, line 50)