In [None]:
[global]
# Genotype input: one or more VCF (.vcf.gz) / PLINK (.bed) files, or a single
# text file listing them (accepted layouts are described in get_genotype_file)
parameter: genoFile = paths
# Phenotype input file (declared with a type only, so a value must be supplied)
parameter: phenoFile = path
# Covariate input file (declared with a type only, so a value must be supplied)
parameter: covFile = path
# Region list file (declared with a type only, so a value must be supplied).
# NOTE(review): not used by the code visible in this section -- confirm its format
parameter: region_list = path
import os
def get_genotype_file(geno_file_paths):
    """Resolve the genotype input into concrete genotype file path(s).

    ``geno_file_paths`` is a sequence holding either

    * two or more genotype files, each ending in ``.bed`` or ``.vcf.gz``;
    * exactly one genotype file; or
    * exactly one text file that lists genotype files, as one column of file
      names or as two columns of chromosome and file name. Blank lines and
      lines starting with ``#`` are skipped, and a leading ``chr`` is removed
      from chromosome names. A listed file that does not exist as written is
      looked up relative to the directory of the list file.

    Returns:
        ``paths`` for several files (or a one-column list), a single ``path``
        for one genotype file, or a ``dict`` mapping chromosome to ``path``
        for a two-column list.

    Raises:
        ValueError: empty input, a file with an unsupported extension, a
            listed file that cannot be found, or a list file whose rows do
            not consistently have one or two columns.
    """
    def valid_geno_file(x):
        # True only for PLINK bed (.bed) or bgzipped VCF (.vcf.gz) names.
        suffixes = path(x).suffixes
        if not suffixes:
            # No extension at all (e.g. a plain list file): not a genotype
            # file. Without this guard suffixes[-1] raises IndexError.
            return False
        return suffixes[-1] == '.bed' or ''.join(suffixes[-2:]) == '.vcf.gz'

    def complete_geno_path(x, geno_file):
        # Validate one listed file name and resolve it to an existing file.
        if not valid_geno_file(x):
            raise ValueError(f"Genotype file {x} should be VCF (end with .vcf.gz) or PLINK bed file (end with .bed)")
        if os.path.isfile(x):
            return x
        # Not found as given: retry relative to the list file's directory
        # (`:ad` is the SoS path format for absolute path + dirname).
        candidate = f'{geno_file:ad}/' + x
        if not os.path.isfile(candidate):
            raise ValueError(f"Cannot find genotype file {x}")
        return candidate

    def format_chrom(chrom):
        # Normalise "chr1" to "1"; other names are returned unchanged.
        return chrom[3:] if chrom.startswith('chr') else chrom

    if len(geno_file_paths) == 0:
        raise ValueError("No genotype file is specified")

    # Inputs are either VCF or bed, or a vector of them
    if len(geno_file_paths) > 1:
        if all(valid_geno_file(x) for x in geno_file_paths):
            return paths(geno_file_paths)
        raise ValueError(f"Invalid input {geno_file_paths}")

    # Input is one genotype file or text list of genotype files
    geno_file = geno_file_paths[0]
    if valid_geno_file(geno_file):
        return path(geno_file)

    # Read the list file, closing the handle deterministically.
    with open(geno_file) as list_handle:
        units = [
            line.split()
            for line in (raw.strip() for raw in list_handle)
            if line and not line.startswith('#')
        ]
    if all(len(x) == 1 for x in units):
        return paths([complete_geno_path(x[0], geno_file) for x in units])
    if all(len(x) == 2 for x in units):
        return {
            format_chrom(x[0]): path(complete_geno_path(x[1], geno_file))
            for x in units
        }
    raise ValueError(f"{geno_file} should contain one column of file names, or two columns of chrom number and corresponding file name")
    
# Replace the raw genoFile parameter with its resolved form: a single path,
# paths, or a chromosome-keyed dict of paths, depending on what was supplied.
genoFile = get_genotype_file(genoFile)


In [None]:
[data_merger]
# For each genotype file: load the PLINK data, the phenotype table and the
# covariates, then regress the covariates out of genotype and phenotype.
# NOTE(review): cwd, name, job_size, walltime, mem, numThreads and container are
# not defined in the visible part of this notebook -- they must come from elsewhere.
# NOTE(review): no visible line of the R script writes the declared .rds output;
# confirm the script saves it, otherwise the step cannot complete.
input: genoFile, group_by = 1
output: f'{cwd:a}/{name}.{_input:bnnn}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: container = container, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library("plink2R")
    library("bedr")
    library("dplyr")
    library("readr")
    library("purrr")
    library("tidyr")
    ## Define function
    # TODO: stub -- currently returns NULL. It has to load the phenotype of
    # `gene_id` from `path` before the phenotype branch below can produce data.
    read_gene_pheno = function(path,gene_id){}
    ## Data Loader
    # Gene id = last dot-separated field of the input basename once the final
    # extension is removed (a "<prefix>.<gene>.bed" input gives "<gene>").
    # Computed in R because a format spec cannot be used inside a call in the
    # interpolated expression, and the value must be a quoted R string.
    gene_id = tail(strsplit("${_input:bn}", ".", fixed = TRUE)[[1]], 1)
    # Covariates are read from the covFile parameter (the genotype input is a
    # PLINK file set, not a delimited table).
    covar = read_delim("${covFile}", delim = "\t")
    # Covariate columns start at column 3 (layout taken from the original code).
    covar_mat = as.matrix(covar[,3:ncol(covar)])
    ### Genotype
    geno = read_plink("${_input:n}")
    X = geno$bed
    ### Phenotype
    Phenotype_list = read_delim("${phenoFile}", delim = "\t")
    Phenotype_list = Phenotype_list %>%
        mutate(data = map(dir, ~read_gene_pheno(.x, gene_id))) %>%
        unnest(cols = data)
    Y = Phenotype_list%>%select(-dir)
    ## Processing of X and Y 



    ## Remove covariate
    ### Genotype
    # Residualise every variant on the covariates (lm adds the intercept).
    for ( i in seq_len(ncol(X)) ) {
        X[,i] = residuals(lm( X[,i] ~ covar_mat ))
        }
    X_resid = scale(X)
    ### Phenotype
    # NOTE(review): the original referenced `factor` (the base R function) and an
    # undefined `pheno`. This assumes Y is numeric with rows aligned to covar --
    # confirm once read_gene_pheno is implemented. .lm.fit adds no intercept,
    # hence the explicit column of ones.
    Y_resid = .lm.fit(x = cbind(1, covar_mat), y = as.matrix(Y))$residuals