In [None]:
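## CAFEH
1. Generate the LD matrix for each selected gene from its per-gene genotype files.
2. Extract the summary statistics (bhat, sbhat, Z) from each RDS file of the selected genes and save them as TSV files.
3. Run CAFEH on the LD matrix and summary statistics.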

In [None]:
# Launch the CAFEH workflow defined in this notebook in the background (nohup ... &).
#   --genoFile       table listing the per-gene genotype (PLINK) file for each gene
#   --analysis-unit  table listing the merged summary-statistics RDS file for each gene
#   --region_list    table of the genes/regions to analyse
# NOTE(review): `-n` is presumably SoS's dry-run switch; drop it to actually execute — confirm.
# NOTE(review): the notebook path below is an absolute, user-specific location; adjust it to your checkout.
nohup sos run /home/hs3163/GIT/xqtl-pipeline/code/integrative_analysis/CAFEH/CAFEH.ipynb CAFEH \
    --genoFile GRCh38_plink_files_list.txt  \
    --analysis-unit ALL_Ast_End_Exc_Inh_Mic_OPC_Oli.merged_rds.list \
    --region_list AD_genes.region_list -n  &
    

In [2]:
[global]
import glob
import pandas as pd
# Helper applied to the `mem` string when building PLINK's --memory argument
from sos.utils import expand_size

# Tab-separated table with a header row; its two columns are read as gene_id and geno_path
parameter: genoFile = path
# Tab-separated table with a header row; its two columns are read as gene_name and rds_path
parameter: analysis_unit = path
# Tab-separated table of selected genes; must provide the gene_id and gene_name columns used for joining
parameter: region_list = path
# Directory under which the CAFEH results are written
parameter: cwd = path("output")
parameter: name = "demo"

# Join genotype paths -> selected regions -> summary-statistics paths (inner joins),
# so only genes present in all three tables are analysed.
region_tbl = pd.read_csv(region_list, sep = "\t")
geno_tbl = pd.read_csv(genoFile, sep = "\t", names = ["gene_id", "geno_path"], header = 0)
unit_tbl = pd.read_csv(analysis_unit, sep = "\t", names = ["gene_name", "rds_path"], header = 0)
input_inv = geno_tbl.merge(region_tbl, on = "gene_id").merge(unit_tbl, on = "gene_name")

# Parallel per-gene lists consumed by the steps below
gene_inv = input_inv["gene_id"].tolist()
geno_inv = input_inv["geno_path"].tolist()
ss_inv = input_inv["rds_path"].tolist()

# File passed to PLINK --keep (samples to retain), e.g. sampleSheetAfterQC.filtered_geno.txt
parameter: sample_to_keep = "sampleSheetAfterQC.filtered_geno.txt"
# Sample size written into every cell of the per-gene n table
parameter: sample_size = 415
# Container that provides the necessary packages
parameter: container = ""
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Expected wall clock time
parameter: walltime = "5h"
# Expected memory
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Row positions of studies to exclude; nothing is dropped unless the first entry is greater than -1
parameter: drop_study = ["-1","0"]

In [None]:
[LD]
# Compute the square LD (r) matrix of every per-gene genotype file with PLINK,
# then write a copy whose rows and columns are labelled with the variant IDs from the .bim file.
input: geno_inv, group_by = 1
output: f'{_input:n}.ld',f'{_input:n}.named.ld.tsv'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "${ }", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{_input:ad}:{_input:ad}']
    plink \
          --bfile ${_input:n} \
          --keep ${sample_to_keep} \
          --out ${_output[0]:n} \
          --threads ${numThreads} \
          --memory ${int(expand_size(mem) * 0.9)/1e6} --r square
python: expand= "${ }", stderr = f'{_output[1]}.stderr', stdout = f'{_output[1]}.stdout', container = container, volumes = [f'{_input:ad}:{_input:ad}']
    import pandas as pd
    # `sep` is passed by keyword: positional arguments after the path were
    # deprecated in pandas 1.x and are rejected by read_csv in pandas >= 2.0.
    ld_matrix = pd.read_csv("${_output[0]}", sep="\t", header=None)
    # Second column of the .bim file holds the variant IDs, in the same order as the LD matrix
    variant_ids = pd.read_csv("${_input:n}.bim", sep="\t", header=None).iloc[:, 1].tolist()
    ld_matrix.index = variant_ids
    ld_matrix.columns = variant_ids
    ld_matrix.to_csv("${_output[1]}", sep="\t")

In [None]:
[Sumstat]
# Export the transposed bhat, sbhat and Z tables of each RDS file as TSV,
# plus a table of the same shape as Z filled with the sample size.
input: ss_inv, group_by = 1
output: f'{_input:n}.bhat.tsv',f'{_input:n}.sbhat.tsv',f'{_input:n}.z.tsv',f'{_input:n}.n.tsv'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand= "${ }", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{_input:ad}:{_input:ad}']
    sumstat <- readRDS("${_input}")
    # Shared writer: tab-separated, unquoted, with row and column names
    write_tsv <- function(tbl, path) write.table(tbl, path, sep = "\t", quote = FALSE)
    write_tsv(t(sumstat$bhat), "${_output[0]}")
    write_tsv(t(sumstat$sbhat), "${_output[1]}")
    write_tsv(t(sumstat$Z), "${_output[2]}")
    # Reuse the layout (dimensions and names) of the transposed Z table for the sample-size table
    n_tbl <- t(sumstat$Z)
    n_tbl[,] <- ${sample_size}
    write_tsv(n_tbl, "${_output[3]}")

In [1]:
[CAFEH]
# Fit CAFEH from the labelled LD matrix and the bhat / sbhat / n tables of a gene,
# then save the fitted model and the variant-level summary table.
# NOTE(review): the _input indices below assume the step input is the two LD outputs
# followed by the four Sumstat outputs of the same gene — confirm against output_from grouping.
input: output_from(["LD","Sumstat"])
output: f'{cwd}/{_input[0]:bn}/cafeh.beta.results',f'{cwd}/{_input[0]:bn}/cafeh.beta.model'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
python: expand= "${ }", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout', container = container, volumes = [f'{_input:ad}:{_input:ad}']
    import numpy as np
    # Imported explicitly: the script previously relied on `pd` leaking in through the wildcard import below
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    from cafeh.cafeh_summary import fit_cafeh_summary, fit_cafeh_z
    from cafeh.cafeh_genotype import fit_cafeh_genotype
    
    from cafeh.model_queries import *
    LD_df = pd.read_csv(${_input[1]:r},sep='\t',index_col=0)
    beta_df = pd.read_csv(${_input[2]:r},sep='\t',index_col=0)
    stderr_df = pd.read_csv(${_input[3]:r},sep='\t',index_col=0)
    n_df = pd.read_csv(${_input[5]:r},sep='\t',index_col=0)
    # Drop the studies at the requested row positions (skipped when the first entry is not > -1)
    if ${drop_study[0]} > -1:
        beta_df = beta_df.drop(beta_df.iloc[[${','.join(drop_study)}],].index)
    # Drop SNPs that are empty in all studies, then drop studies that have NA values.
    # Keyword arguments are required here: positional axis/how were removed in pandas 2.0.
    beta_df = beta_df.dropna(axis=1, how="all").dropna()
    LD_df = LD_df.dropna(axis=1, how="all").dropna()
    
    # Build ONE ordered list of shared SNPs and reuse it everywhere, so that the rows/columns
    # of LD, beta, stderr and n are guaranteed to line up (independent list(set(...)) calls
    # give no such ordering guarantee).
    ld_snps = set(LD_df.index) & set(LD_df.columns)
    shared_snps = [snp for snp in beta_df.columns if snp in ld_snps]
    LD_df = LD_df.loc[shared_snps, shared_snps]
    beta_df = beta_df.loc[:, shared_snps]
    
    # Keep only studies with complete standard errors, in the same study order for every table
    stderr_df = stderr_df.loc[beta_df.index.unique(), shared_snps].dropna()
    beta_df = beta_df.loc[stderr_df.index.unique()]
    n_df = n_df.loc[beta_df.index, beta_df.columns]
    
    cafeh = fit_cafeh_summary(LD_df, beta_df, stderr_df, n=n_df)
    cafeh.save('${_output[1]}', save_data=True)
    variant_report = summary_table(cafeh)
    variant_report.to_csv(${_output[0]:r}, sep='\t')