## UKB Multivariate fine-mapping workflow

1. Sufficient statistics input: `XtX`, `XtY`, `YtY` and the sample size `n`. We assume the covariates `C` have been regressed out of `X` and `Y`; this workflow provides a procedure to do so.
2. GWAS summary statistics input: z-scores `z` and the LD (correlation) matrix `R`. We assume the z-scores were computed after removing the covariates `C`.

In [None]:
[global]
# Settings shared by every step of the workflow.
# Folder where the per-region LD matrices (*.ld.rds) are written
parameter: LD_folder = path('/project2/mstephens/yuxin/ukb-bloodcells/LD')
# Prefix of the per-region file names; the region fields are appended to it
parameter: name = 'bloodcells_chr'
# Text file listing the analysis regions, one region per line. Each line is split on
# whitespace and the steps below use its first three fields
# (presumably chromosome, start and end -- TODO confirm).
# NOTE(review): the file is named .csv but lines are split on whitespace, not commas;
# confirm the file really is whitespace-delimited.
parameter: analysis_units = path('/project2/mstephens/yuxin/ukb-bloodcells/regions.csv')
# One list of fields per region; blank lines and lines starting with '#' are skipped.
regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]

In [None]:
[LDstore_rds]
# For every region, read the text LD matrix (presumably written by LDstore, judging
# by the step name and folder) and store it as an R matrix in an .rds file.
input: for_each = "regions"
output: f"{LD_folder:a}/{name}{_regions[0]}.{_regions[1]}.{_regions[2]}.ld.rds"
task: trunk_workers = 1, trunk_size = 1, walltime = '12h', mem = '20G', cores = 2, tags = f'{step_name}_{_output:bn}'
R: expand = "${ }"
    library(data.table)

    # Text matrix of the current region (the region fields are filled in by SoS).
    matrix_path <- '/project2/mstephens/yuxin/ukb-bloodcells/LD_ldstore/bloodcells_chr${_regions[0]}.${_regions[1]}.${_regions[2]}.matrix'

    # fread returns a data.table; downstream code expects a plain matrix.
    ld_matrix <- as.matrix(fread(matrix_path))

    saveRDS(ld_matrix, file = ${_output:r})

In [None]:
[sufficient_summary_stats_preprocessing]
# For every region, remove the covariates from genotypes and phenotypes and save
#   * suffstats: list(XtX, XtY, YtY, N, meta)  -- sufficient statistics
#   * sumstats:  list(Z, LD, meta)             -- z-scores and covariate-adjusted LD
# Inputs per region: the genotype dosage file (<geno_file>.raw.gz), the z-score file
# (a list with at least XtXD, XtY, Z and pos) and the LD matrix from [LDstore_rds].
# Phenotype table with FID, IID and one column per trait
parameter: phenoFile = path('/project2/mstephens/yuxin/ukb-bloodcells/bloodcells.pheno.txt')
# Covariate table with FID, IID and one column per covariate
parameter: covarFile = path('/project2/mstephens/yuxin/ukb-bloodcells/bloodcells.covar.txt')
input: for_each = "regions"
output: suffstats = f"/project2/mstephens/yuxin/ukb-bloodcells/ukbbloodcells_suff_stats/bloodcells_chr{_regions[0]}.{_regions[1]}.{_regions[2]}.sufficient_stats.rds", 
        sumstats =  f"/project2/mstephens/yuxin/ukb-bloodcells/ukbbloodcells_summary_stats/bloodcells_chr{_regions[0]}.{_regions[1]}.{_regions[2]}.summary_stats.rds"
task: trunk_workers = 1, trunk_size = 1, walltime = '4h', mem = '50G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output[0]:n}.stdout", stderr = f"{_output[0]:n}.stderr"
    geno_file <- '/project2/mstephens/yuxin/ukb-bloodcells/genotypes/bloodcells_chr${_regions[0]}.${_regions[1]}.${_regions[2]}'
    z.file <- '/project2/mstephens/yuxin/ukb-bloodcells/zscores/bloodcells_chr${_regions[0]}.${_regions[1]}.${_regions[2]}.z.rds'
    # Same location and naming as the output of the LDstore_rds step.
    ld.file <- '${LD_folder:a}/${name}${_regions[0]}.${_regions[1]}.${_regions[2]}.ld.rds'
    library(data.table)
    library(dplyr)

    # Genotypes: the first six columns are sample metadata, the rest are SNP columns.
    X <- fread(paste0(geno_file, '.raw.gz'), sep = "\t", header = TRUE, stringsAsFactors = FALSE)
    map <- X[, 1:6]
    X <- X[, c('FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE') := NULL]
    X <- as.matrix(X)

    # Read phenotype data (the path is a SoS parameter, so it is interpolated here).
    cat("Reading phenotype data.\n")
    pheno <- suppressMessages(fread(${phenoFile:r}))

    cat("Reading covariate file.\n")
    Z <- suppressMessages(fread(${covarFile:r}))

    # Put phenotype and covariate rows in the sample order of the genotype file.
    pheno.idx <- match(map$IID, pheno$IID)
    covar.idx <- match(map$IID, Z$IID)
    # An unmatched sample would add an all-NA row and turn every statistic into NA.
    if (anyNA(pheno.idx)) {
      stop("Some genotyped samples are missing from the phenotype file.", call. = FALSE)
    }
    if (anyNA(covar.idx)) {
      stop("Some genotyped samples are missing from the covariate file.", call. = FALSE)
    }
    pheno <- pheno[pheno.idx, ]
    Z <- Z[covar.idx, ]

    Y <- pheno %>% select(-FID, -IID) %>% as.matrix
    Z <- Z %>% select(-FID, -IID) %>% as.matrix

    # centering
    Y <- sweep(Y, 2, colMeans(Y), '-')
    Z <- sweep(Z, 2, colMeans(Z), '-')

    A <- crossprod(Z) # Z'Z
    # chol decomposition for (Z'Z)^(-1)
    R.chol <- chol(solve(A)) # R'R = (Z'Z)^(-1)
    W <- R.chol %*% crossprod(Z, X) # RZ'X
    S <- R.chol %*% crossprod(Z, Y) # RZ'Y

    # Only the SNP names are needed from here on; free the large matrices.
    SNPnames <- colnames(X)
    rm(X, Z)

    zscores <- readRDS(z.file)

    # Load LD matrix from raw genotype
    ld <- readRDS(ld.file)
    # Rebuild X'X as D^(1/2) LD D^(1/2), where D = zscores$XtXD (presumably the
    # diagonal of X'X -- TODO confirm), then subtract the part explained by covariates.
    XtX <- sqrt(zscores$XtXD) * t(ld * sqrt(zscores$XtXD)) - crossprod(W) # W'W = X'ZR'RZ'X = X'Z(Z'Z)^{-1}Z'X
    XtX <- as.matrix(XtX)
    rownames(XtX) <- colnames(XtX) <- SNPnames
    # Covariate-adjusted LD (correlation) matrix.
    ld.adj <- cov2cor(XtX)

    # X'Y
    XtY <- as.matrix(zscores$XtY - crossprod(W, S)) # W'S = X'ZR'RZ'y = X'Z(Z'Z)^{-1}Z'y

    # YtY
    YtY <- as.matrix(crossprod(Y) - crossprod(S))

    # NOTE(review): the sign of the z-scores is flipped here, presumably to match
    # the allele coding of the genotype file -- confirm.
    Z <- -as.matrix(zscores$Z)

    saveRDS(list(XtX = XtX, XtY = XtY, YtY = YtY, N = nrow(Y), meta = zscores$pos), ${_output["suffstats"]:r})
    saveRDS(list(Z = Z, LD = ld.adj, meta = zscores$pos), ${_output["sumstats"]:r})
    