# Fine-mapping with SuSiE RSS model

This notebook take a list of LD file and a list of sumstat file and do salmon QC and susie RSS for each overlap LD block.

In [None]:
[global]

parameter: container = ""
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20

# getting the overlapped input
parameter: LD_list = path
parameter: sumstat_list = path
import pandas as pd
LD_list = pd.read_csv(LD_list,"\t")
sumstat_list = pd.read_csv(sumstat_list,"\t")
LD_list["#chr"] = [x[0].replace("chr", "") for x in  LD_list["#id"].str.split("_") ]
sumstat_list["#chr"] = [str(x).replace("chr", "") for x in  sumstat_list["#chr"] ]
input_inv = LD_list.merge(sumstat_list)
input_list = input_inv.iloc[:,[1,3]].values.tolist()
parameter: lead_idx_choice = "pvalue"
parameter: abf_prior_variance = 0.4
parameter: nlog10p_dentist_s_threshold = 4
parameter: r2_threshold = 0.6


[SuSiE_RSS_1]
input: input_list, group_by = 2
output: f'{cwd:a}/{_input[1]:bn}.{_input[0].split(".")[-3]}.unisusie_rss.fit.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
python: expand = '${ }', stdout = f"{_output:nn}.stdout", stderr = f"{_output:nn}.stderr", container = container
    import pandas as pd
    import numpy as np
    def load_npz_ld(path):
        np_ld_loaded = np.load(path,allow_pickle=True)
        # sort by start position
        snp_id = [x.replace(":","_") for x in np_ld_loaded.get("arr_1")]
        np_ld_loaded = np_ld_loaded.get("arr_0")
        new = np_ld_loaded + np_ld_loaded.T
        np.fill_diagonal(new, np.diag(new)/2)
        return new,snp_id

    def get_bcor_meta(bcor_obj):
        df_ld_snps = bcor_obj.getMeta()
        df_ld_snps.rename(columns={'rsid':'SNP', 'position':'BP', 'chromosome':'CHR', 'allele1':'A1', 'allele2':'A2'}, inplace=True, errors='raise')
        ###df_ld_snps['CHR'] = df_ld_snps['CHR'].astype(np.int64)
        df_ld_snps['BP'] = df_ld_snps['BP'].astype(np.int64)
        return df_ld_snps

    def load_ld_bcor(ld_prefix):
        bcor_file = ld_prefix+'.bcor'
        import os
        import time
        from ldstore.bcor import bcor
        if not os.path.exists(bcor_file):
            raise IOError('%s not found'%(bcor_file))
        t0 = time.time()
        bcor_obj = bcor(bcor_file)
        df_ld_snps = get_bcor_meta(bcor_obj)
        ld_arr = bcor_obj.readCorr([])
        assert np.all(~np.isnan(ld_arr))
        return ld_arr, df_ld_snps

    def abf(beta, se, W=0.04):
        from scipy import special 
        z = beta / se
        V = se ** 2
        r = W / (W + V)
        lbf = 0.5 * (np.log(1 - r) + (r * z ** 2))
        denom = special.logsumexp(lbf)
        prob = np.exp(lbf - denom)
        return lbf, prob
    
    def get_cs(variant, prob, coverage=0.95):
        ordering = np.argsort(prob)[::-1]
        idx = np.where(np.cumsum(prob[ordering]) > coverage)[0][0]
        cs = variant[ordering][: (idx + 1)]
        return cs
    def slalom(df,LD,abf_prior_variance = 0.4 ,nlog10p_dentist_s_threshold = 4, r2_threshold = 0.6  ):
        from scipy import stats
        lbf, prob = abf(df.beta, df.se, W=abf_prior_variance)
        cs = get_cs(df.variant, prob, coverage=0.95)
        cs_99 = get_cs(df.variant, prob, coverage=0.99)
        df["lbf"] = lbf
        df["prob"] = prob
        df["cs"] = df.variant.isin(cs)
        df["cs_99"] = df.variant.isin(cs_99)
        
        if ${lead_idx_choice} == "pvalue":
            lead_idx_snp = df.pvalue.idxmin()
        else:
            lead_idx_snp = df.prob.idxmax()
        
        
        
        lead_variant = df.variant[lead_idx_snp]
        df["lead_variant"] = False
        df["lead_variant"].iloc[lead_idx_snp] = True
        # annotate LD     
        ## This is to identify the R for each snp vs the lead snp
        df["r"] = [LD[np.where(np.in1d(df.variant,lead_variant))][:,np.where(np.in1d(df.variant,x))][0][0][0] for x in df.variant]
        lead_z = (df.beta / df.se).iloc[lead_idx_snp]
        df["t_dentist_s"] = ((df.beta / df.se) - df.r * lead_z) ** 2 / (1 - df.r ** 2)
        df["t_dentist_s"] = np.where(df["t_dentist_s"] < 0, np.inf, df["t_dentist_s"])
        df["t_dentist_s"].iloc[lead_idx_snp] = np.nan
        df["nlog10p_dentist_s"] = stats.chi2.logsf(df["t_dentist_s"], df=1) / -np.log(10)
        df["r2"] = df.r ** 2
        df["outliers"] = (df.r2 > r2_threshold) & (df.nlog10p_dentist_s > nlog10p_dentist_s_threshold)
        df_output = df
        n_r2 = np.sum(df.r2 > r2_threshold)
        n_dentist_s_outlier = np.sum(
            (df.r2 > r2_threshold) & (df.nlog10p_dentist_s > nlog10p_dentist_s_threshold)
        )
        max_pip_idx = df.prob.idxmax()
        df_summary = pd.DataFrame(
            {
                "lead_pip_variant": [df.variant.iloc[max_pip_idx]],
                "n_total": [len(df.index)],
                "n_r2": [n_r2],
                "n_dentist_s_outlier": [n_dentist_s_outlier],
                "fraction": [n_dentist_s_outlier / n_r2 if n_r2 > 0 else 0],
                "max_pip": [np.max(df.prob)]
            }
            )
        return df, df_summary
    
    ## Load LD
    if "${_input[0]}".endswith("npz"):
        LD,snp_id = load_npz_ld(${_input[0]:r}) 
    if "${_input[0]}".endswith("bcor"):
        LD,snp_id = load_ld_bcor(${_input[0]:nr}) 
        
    sumstat = pd.read_csv(${_input[1]:r}, "\t")
    ## Get only intersect snp
    intersct = np.intersect1d(sumstat.variant.to_numpy(),snp_id)
    sumstat = sumstat.query("variant in @intersct").reset_index()
    indice = np.where(np.in1d(snp_id, intersct))
    LD = LD[np.ix_(indice[0].tolist(), indice[0].tolist())]    
    ## slalom
    ss_qc,ss_qc_sum = slalom(sumstat,LD,${abf_prior_variance},${nlog10p_dentist_s_threshold},${r2_threshold})
    
    ## Filter out outlier
    LD = LD[np.ix_(ss_qc[~ss_qc.outliers].index,ss_qc[~ss_qc.outliers].index)]  
    ss_qc = ss_qc[~ss_qc.outliers]
    
    ## SuSiERSS

In [153]:
LD[np.ix_(ss_qc[~ss_qc.outliers].index,ss_qc[~ss_qc.outliers].index)]

array([[ 1.      ,  0.007248, -0.02019 , ...,  0.05038 , -0.01564 ,
         0.0509  ],
       [ 0.007248,  1.      , -0.02397 , ...,  0.01753 , -0.02008 ,
         0.01118 ],
       [-0.02019 , -0.02397 ,  1.      , ...,  0.0109  ,  0.0851  ,
         0.03336 ],
       ...,
       [ 0.05038 ,  0.01753 ,  0.0109  , ...,  1.      , -0.03488 ,
         0.9097  ],
       [-0.01564 , -0.02008 ,  0.0851  , ..., -0.03488 ,  1.      ,
        -0.0333  ],
       [ 0.0509  ,  0.01118 ,  0.03336 , ...,  0.9097  , -0.0333  ,
         1.      ]], dtype=float16)

In [152]:
np.shape(LD[np.ix_(ss_qc[~ss_qc.outliers].index,ss_qc[~ss_qc.outliers].index)])

(6668, 6668)

In [147]:
np.shape(LD)

(6668, 6668)

In [134]:
np.delete(LD, ss_qc[ss_qc.outliers].index, ss_qc[ss_qc.outliers].index)

TypeError: an integer is required (got type Int64Index)

# Draft

In [None]:
    elif "${_input[0]}".endswith("bcor"):
        LD,snp_id = load_bcor_ld(${_input[0]:r})

In [2]:
import pandas as pd
import numpy as np


In [3]:
LD_list = pd.read_csv("/mnt/vast/hpc/csg/molecular_phenotype_calling/LD/output_npz_2/1300_hg38_EUR_LD_blocks_npz_files/ROSMAP_NIA_WGS.leftnorm.filtered.filtered.ld.list","\t")
sumstat_list = pd.read_csv("/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS/data_intergration/ADGWAS2022/qced_sumstat_list.txt","\t")


In [96]:
sumstat

Unnamed: 0,index,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency
0,4,1,727233,G,A,chr1_727233_G_A,-0.0210,0.0453,0.6424,0.0191,81166,386644,0.0191
1,9,1,733014,AG,A,chr1_733014_AG_A,0.0108,0.0413,0.7935,0.0267,78401,374200,0.0267
2,13,1,758213,AT,A,chr1_758213_AT_A,0.1318,0.1069,0.2173,0.0038,79473,380718,0.0038
3,16,1,758540,T,C,chr1_758540_T_C,0.0429,0.1113,0.6996,0.0040,77766,372990,0.9960
4,19,1,766399,GAATA,G,chr1_766399_GAATA_G,0.0055,0.0249,0.8246,0.0678,80801,380538,0.0678
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6663,15105,1,2886544,G,A,chr1_2886544_G_A,0.0071,0.0451,0.8746,0.0110,85502,400497,0.0110
6664,15109,1,2887241,C,T,chr1_2887241_C_T,-0.0055,0.0090,0.5417,0.3818,84196,393797,0.3818
6665,15114,1,2887908,T,C,chr1_2887908_T_C,-0.0194,0.0138,0.1594,0.1043,85665,400902,0.8957
6666,15116,1,2887986,T,C,chr1_2887986_T_C,0.0809,0.1020,0.4275,0.0034,81503,380555,0.9966


In [12]:
LD_list["#chr"] = [x[0].replace("chr", "") for x in  LD_list["#id"].str.split("_") ]

In [13]:
sumstat_list["#chr"] = [str(x).replace("chr", "") for x in  sumstat_list["#chr"] ]

In [14]:
input_inv = LD_list.merge(sumstat_list)

In [15]:
input_inv

Unnamed: 0,#id,dir,#chr,ADGWAS_Bellenguez_2022
0,chr1_16103_2888443,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1,chr1_2888443_4320284,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
2,chr1_4320284_5853833,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
3,chr1_5853833_7110219,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
4,chr1_7110219_9473386,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
...,...,...,...,...
1356,chr22_43251864_44603286,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,22,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1357,chr22_44603286_46177037,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,22,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1358,chr22_46177037_47876022,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,22,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1359,chr22_47876022_49355984,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,22,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...


In [16]:
LD_file = input_inv.iloc[:,1].values.tolist()
sumstat_file = input_inv.iloc[:,3].values.tolist()

In [17]:
sumstat_file = input_inv.iloc[:,3].values.tolist()

In [18]:
test = input_inv.iloc[:,[1,3]].values.tolist()[0]

In [154]:
test = input_inv.iloc[:,[1,3]].values.tolist()[1208]

In [155]:
test

['/mnt/vast/hpc/csg/molecular_phenotype_calling/LD/output_npz_2/1300_hg38_EUR_LD_blocks_npz_files/ROSMAP_NIA_WGS.leftnorm.filtered.filtered.chr17_60570445_65149278.flt16.npz',
 '/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS/data_intergration/ADGWAS2022/ADGWAS_Bellenguez_2022.17/ADGWAS2022.chr17.sumstat.tsv']

In [156]:
test[0].endswith("npz")

True

In [98]:
a = ['Brian', 'Steve', 'Andrew', 'Craig']
b = ['Andrew','Steve']
c = np.setxor1d(a, b)


In [99]:
c

array(['Brian', 'Craig'], dtype='<U6')

In [116]:
type(intersct)

numpy.ndarray

In [115]:
type()

numpy.ndarray

In [104]:
ss_qc[ss_qc.outliers].variant

Series([], Name: variant, dtype: object)

In [21]:
def load_npz_ld(path):
        np_ld_loaded = np.load(path,allow_pickle=True)
        # sort by start position
        snp_id = [x.replace(":","_") for x in np_ld_loaded.get("arr_1")]
        np_ld_loaded = np_ld_loaded.get("arr_0")
        new = np_ld_loaded + np_ld_loaded.T
        np.fill_diagonal(new, np.diag(new)/2)
        return new,snp_id


def load_ld_bcor(ld_prefix):
    bcor_file = ld_prefix+'.bcor'
    import os
    import time
    from ldstore.bcor import bcor
    if not os.path.exists(bcor_file):
        raise IOError('%s not found'%(bcor_file))
    t0 = time.time()
    bcor_obj = bcor(bcor_file)
    df_ld_snps = get_bcor_meta(bcor_obj)
    ld_arr = bcor_obj.readCorr([])
    assert np.all(~np.isnan(ld_arr))
    return ld_arr, df_ld_snps

In [142]:
load_ld_bcor("/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/22/22_16050408_17674295.bcor")

NameError: name 'get_bcor_meta' is not defined

In [146]:
from ldstore.bcor import bcor

In [147]:
bcor("/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/22/22_16050408_17674295.bcor")


<ldstore.bcor.bcor at 0x2b8dcebaa910>

In [157]:
new,snp_id = load_npz_ld(test[0])

In [158]:
sumstat = pd.read_csv(test[1], "\t")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [159]:
sumstat.variant

0               chr17_129957_G_A
1               chr17_130935_T_C
2               chr17_130971_T_C
3               chr17_131543_T_C
4            chr17_138820_C_CCCG
                   ...          
503590        chr17_83205421_G_A
503591        chr17_83206344_G_A
503592        chr17_83206991_C_T
503593        chr17_83214512_G_A
503594    chr17_83219409_GGAGT_G
Name: variant, Length: 503595, dtype: object

In [125]:
a = np.intersect1d(sumstat.variant.to_numpy(),[x.replace(":","_") for x in snp_id])

In [126]:
len(a)

6668

In [120]:
snp_id

array(['chr1:16206_T_A', 'chr1:16433_C_G', 'chr1:16619_C_T', ...,
       'chr1:2887986_T_C', 'chr1:2888101_C_G', 'chr1:2888245_G_A'],
      dtype=object)

In [111]:
sumstat.variant.to_numpy()

array(['chr1_594445_C_T', 'chr1_595762_C_CTG', 'chr1_630947_G_A', ...,
       'chr1_248936715_C_T', 'chr1_248937246_A_G', 'chr1_248941807_AG_A'],
      dtype=object)

In [77]:
sumstat.variant.to_numpy()

array(['chr1_100000223_C_T', 'chr1_100000723_G_A', 'chr1_100000874_G_T',
       ..., 'chr1_248936715_C_T', 'chr1_248937246_A_G',
       'chr1_248941807_AG_A'], dtype=object)

In [94]:
snp_id = pd.read_csv("/mnt/vast/hpc/csg/molecular_phenotype_calling/LD/output_npz_2/1300_hg38_EUR_LD_blocks_plink_files/ROSMAP_NIA_WGS.leftnorm.filtered.filtered.chr1_16103_2888443.bim","\t",header = None)[1].to_numpy()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [95]:
snp_id

array(['chr1:16206_T_A', 'chr1:16433_C_G', 'chr1:16619_C_T', ...,
       'chr1:2887986_T_C', 'chr1:2888101_C_G', 'chr1:2888245_G_A'],
      dtype=object)

In [88]:
test[0]

'/mnt/vast/hpc/csg/molecular_phenotype_calling/LD/output_npz_2/1300_hg38_EUR_LD_blocks_npz_files/ROSMAP_NIA_WGS.leftnorm.filtered.filtered.chr1_16103_2888443.flt16.npz'

In [96]:
np.intersect1d(sumstat.variant.to_numpy(),snp_id)

array([], dtype=object)

In [97]:
snp_id

array(['chr1:16206_T_A', 'chr1:16433_C_G', 'chr1:16619_C_T', ...,
       'chr1:2887986_T_C', 'chr1:2888101_C_G', 'chr1:2888245_G_A'],
      dtype=object)

In [98]:
sumstat.variant.to_numpy()

array(['chr1_100000223_C_T', 'chr1_100000723_G_A', 'chr1_100000874_G_T',
       ..., 'chr1_248936715_C_T', 'chr1_248937246_A_G',
       'chr1_248941807_AG_A'], dtype=object)

In [161]:
sumstat.sort_values("position")

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency
0,17,129957,G,A,chr17_129957_G_A,0.0288,0.0262,0.27320,0.0501,34556,49251,0.0501
1,17,130935,T,C,chr17_130935_T_C,-0.0252,0.0134,0.05962,0.4255,34556,49251,0.4255
2,17,130971,T,C,chr17_130971_T_C,-0.0259,0.0134,0.05388,0.4197,34556,49251,0.4197
3,17,131543,T,C,chr17_131543_T_C,-0.0273,0.0135,0.04305,0.4171,34556,49251,0.4171
4,17,138820,C,CCCG,chr17_138820_C_CCCG,-0.7155,0.4643,0.12340,0.0005,20301,21839,0.0005
...,...,...,...,...,...,...,...,...,...,...,...,...
503590,17,83205421,G,A,chr17_83205421_G_A,-0.0960,0.2479,0.69860,0.0008,69576,360279,0.0008
503591,17,83206344,G,A,chr17_83206344_G_A,0.0662,0.0804,0.41080,0.0057,84196,393797,0.0057
503592,17,83206991,C,T,chr17_83206991_C_T,-0.0015,0.0823,0.98550,0.0054,83196,386481,0.0054
503593,17,83214512,G,A,chr17_83214512_G_A,-0.0051,0.0205,0.80510,0.1278,34921,55357,0.1278


In [160]:
sumstat

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency
0,17,129957,G,A,chr17_129957_G_A,0.0288,0.0262,0.27320,0.0501,34556,49251,0.0501
1,17,130935,T,C,chr17_130935_T_C,-0.0252,0.0134,0.05962,0.4255,34556,49251,0.4255
2,17,130971,T,C,chr17_130971_T_C,-0.0259,0.0134,0.05388,0.4197,34556,49251,0.4197
3,17,131543,T,C,chr17_131543_T_C,-0.0273,0.0135,0.04305,0.4171,34556,49251,0.4171
4,17,138820,C,CCCG,chr17_138820_C_CCCG,-0.7155,0.4643,0.12340,0.0005,20301,21839,0.0005
...,...,...,...,...,...,...,...,...,...,...,...,...
503590,17,83205421,G,A,chr17_83205421_G_A,-0.0960,0.2479,0.69860,0.0008,69576,360279,0.0008
503591,17,83206344,G,A,chr17_83206344_G_A,0.0662,0.0804,0.41080,0.0057,84196,393797,0.0057
503592,17,83206991,C,T,chr17_83206991_C_T,-0.0015,0.0823,0.98550,0.0054,83196,386481,0.0054
503593,17,83214512,G,A,chr17_83214512_G_A,-0.0051,0.0205,0.80510,0.1278,34921,55357,0.1278


In [176]:
sumstat.query("variant in @a")

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency,lbf,prob,cs,cs_99
4,1,727233,G,A,chr1_727233_G_A,-0.0210,0.0453,0.6424,0.0191,81166,386644,0.0191,-1.407817,9.027621e-32,False,False
9,1,733014,AG,A,chr1_733014_AG_A,0.0108,0.0413,0.7935,0.0267,78401,374200,0.0267,-1.565541,7.710363e-32,False,False
13,1,758213,AT,A,chr1_758213_AT_A,0.1318,0.1069,0.2173,0.0038,79473,380718,0.0038,-0.160906,3.141232e-31,False,False
16,1,758540,T,C,chr1_758540_T_C,0.0429,0.1113,0.6996,0.0040,77766,372990,0.9960,-0.664266,1.898863e-31,False,False
19,1,766399,GAATA,G,chr1_766399_GAATA_G,0.0055,0.0249,0.8246,0.0678,80801,380538,0.0678,-2.067118,4.669203e-32,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15105,1,2886544,G,A,chr1_2886544_G_A,0.0071,0.0451,0.8746,0.0110,85502,400497,0.0110,-1.502443,8.212549e-32,False,False
15109,1,2887241,C,T,chr1_2887241_C_T,-0.0055,0.0090,0.5417,0.3818,84196,393797,0.3818,-2.915753,1.998412e-32,False,False
15114,1,2887908,T,C,chr1_2887908_T_C,-0.0194,0.0138,0.1594,0.1043,85665,400902,0.8957,-1.692573,6.790557e-32,False,False
15116,1,2887986,T,C,chr1_2887986_T_C,0.0809,0.1020,0.4275,0.0034,81503,380555,0.9966,-0.539330,2.151555e-31,False,False


In [172]:
len(new)

15423

In [174]:
len(snp_id)

15423

In [175]:
len(sumstat)

1451880

In [162]:
LD = new

In [167]:
np.shape(LD)

(9346, 9346)

In [166]:
intersct = np.intersect1d(sumstat.variant.to_numpy(),snp_id)
sumstat = sumstat.query("variant in @intersct")
indice = np.where(np.in1d(snp_id, intersct))
LD=LD[np.ix_(indice[0].tolist(), indice[0].tolist())]

In [150]:
LD[[1,2,3],[4,5,6]]

array([-0.02258, -0.03102, -0.01105], dtype=float16)

In [238]:
sumstat["r"] = [LD[np.where(np.in1d(intersct,lead_variant))][:,np.where(np.in1d(intersct,x))][0][0][0] for x in sumstat.variant]

In [240]:
sumstat.query('variant == @lead_variant')

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency,lbf,prob,cs,cs_99,r
1385,1,1000335,C,T,chr1_1000335_C_T,-0.0241,0.0106,0.02273,0.2123,82635,393749,0.2123,-0.361512,2.570266e-31,False,False,1.0


In [27]:
LD[indice,:]

array([[[-1.4275e-02, -1.4275e-02,  4.0283e-02, ..., -1.5640e-02,
         -1.2062e-02,  5.0903e-02],
        [-1.6953e-02, -1.6953e-02,  2.8625e-02, ..., -2.0081e-02,
         -1.4320e-02,  1.1177e-02],
        [-6.1226e-03, -6.1226e-03, -6.7329e-03, ...,  8.5083e-02,
         -5.1727e-03,  3.3356e-02],
        ...,
        [ 2.7790e-03, -2.9465e-02, -2.9564e-03, ..., -3.4882e-02,
          2.3479e-03,  9.0967e-01],
        [-5.1270e-03, -5.1270e-03, -5.6381e-03, ...,  1.0000e+00,
         -4.3297e-03, -3.3295e-02],
        [ 5.5084e-03, -2.8122e-02, -2.1267e-04, ..., -3.3295e-02,
          4.6539e-03,  1.0000e+00]]], dtype=float16)

In [211]:
LD = LD[np.ix_(indice[0].tolist(), indice[0].tolist())]

In [229]:
sumstat = sumstat.query("variant in @intersct")

In [195]:
np.ix_(indice.values, indice)

AttributeError: 'tuple' object has no attribute 'values'

In [140]:
where = np.where(np.in1d(snp_id, a))
new[where,where]

In [138]:
snp_id = [x.replace(":","_") for x in snp_id]

In [None]:
    hl._set_flags(no_whole_stage_codegen="1")
    reference_genome = args.reference_genome
    gnomad_version = gnomad_latest_versions[reference_genome]
    gnomad_ht_path = f"gs://finucane-requester-pays/slalom/gnomad/release/{gnomad_version}/ht/genomes/gnomad.genomes.r{gnomad_version}.sites.most_severe.ht"

    ht_snp = hl.import_table(args.snp, impute=True, types={"chromosome": hl.tstr}, delimiter="\s+")
    ht_snp = ht_snp.annotate(
        locus=hl.parse_locus(
            hl.delimit([ht_snp.chromosome, hl.str(ht_snp.position)], delimiter=":"), reference_genome=reference_genome
        ),
        alleles=[ht_snp.allele1, ht_snp.allele2],
    )
    if args.align_alleles:
        ht_gnomad = hl.read_table(gnomad_ht_path)
        ht_snp = align_alleles(ht_snp, ht_gnomad, flip_rows=["beta"])

    ht_snp = ht_snp.annotate(variant=hl.variant_str(ht_snp.locus, ht_snp.alleles))
    ht_snp = ht_snp.key_by("locus", "alleles")
    ht_snp = ht_snp.add_index("idx_snp")

    # annotate in novel CUPs and reject
    cup = hl.read_table(f"gs://finucane-requester-pays/slalom/cup_files/FASTA_BED.ALL_{reference_genome}.novel_CUPs.ht")
    reject = hl.read_table(
        f"gs://finucane-requester-pays/slalom/cup_files/FASTA_BED.ALL_{reference_genome}.reject_2.ht"
    )
    ht_snp = ht_snp.annotate(in_cups=hl.is_defined(cup[ht_snp.locus]) | hl.is_defined(reject[ht_snp.locus]))

    # annotate vep and freq
    if args.annotate_consequence or args.annotate_gnomad_freq:
        ht_gnomad = hl.read_table(gnomad_ht_path)
        consequences = ["most_severe", "gene_most_severe", "consequence"] if args.annotate_consequence else []
        freq_expr = (
            {f"gnomad_v{gnomad_version[0]}_af_{pop}": ht_gnomad.freq[pop].AF for pop in gnomad_pops[reference_genome]}
            if args.annotate_gnomad_freq
            else {}
        )
        ht_gnomad = ht_gnomad.select(*consequences, **freq_expr)
        ht_snp = ht_snp.join(ht_gnomad, how="left")
    ht_snp = ht_snp.checkpoint(new_temp_file())


In [226]:
LD[np.where(np.in1d(intersct,lead_variant))]

array([[ 0.007248,  1.      , -0.02397 , ...,  0.01753 , -0.02008 ,
         0.01118 ]], dtype=float16)

In [228]:
sumstat.variant

4            chr1_727233_G_A
9           chr1_733014_AG_A
13          chr1_758213_AT_A
16           chr1_758540_T_C
19       chr1_766399_GAATA_G
                ...         
15105       chr1_2886544_G_A
15109       chr1_2887241_C_T
15114       chr1_2887908_T_C
15116       chr1_2887986_T_C
15117       chr1_2888245_G_A
Name: variant, Length: 6668, dtype: object

In [224]:

lead_variant = "chr1_1000335_C_T"


In [218]:
intersct

array(['chr1_1000018_G_A', 'chr1_1000335_C_T', 'chr1_1001034_G_A', ...,
       'chr1_998364_C_T', 'chr1_999005_G_A', 'chr1_999842_C_A'],
      dtype=object)

In [None]:
import scipy as sp

In [155]:
lbf, prob = abf(sumstat.beta, sumstat.se)
sumstat["lbf"] = lbf
sumstat["prob"] = prob
sumstat["cs"] = sumstat.variant.isin(cs)
cs_99 = get_cs(sumstat.variant, prob, coverage=0.99)
sumstat["cs_99"] = sumstat.variant.isin(cs_99)

In [158]:
cs = get_cs(sumstat.variant, prob, coverage=0.95)


In [166]:
lead_idx_snp = sumstat.prob.idxmax()

In [168]:
sumstat.prob.max()

0.5219822472224083

In [167]:
lead_idx_snp

1164649

In [208]:
np.shape(LD[2,:])

(15423,)

In [173]:
def slalom(df,LD,abf_prior_variance = 0.4 ,nlog10p_dentist_s_threshold = 4, r2_threshold = 0.6  ):
    from scipy import stats
    lbf, prob = abf(df.beta, df.se, W=abf_prior_variance)
    cs = get_cs(df.variant, prob, coverage=0.95)
    cs_99 = get_cs(df.variant, prob, coverage=0.99)
    df["lbf"] = lbf
    df["prob"] = prob
    df["cs"] = df.variant.isin(cs)
    df["cs_99"] = df.variant.isin(cs_99)
    lead_idx_snp = df.prob.idxmax()
    lead_variant = df.variant[lead_idx_snp]
    df["lead_variant"] = False
    df["lead_variant"].iloc[lead_idx_snp] = True
    # annotate LD     
    ## This is to identify the R for each snp vs the lead snp
    df["r"] = [LD[np.where(np.in1d(df.variant,lead_variant))][:,np.where(np.in1d(df.variant,x))][0][0][0] for x in df.variant]
    lead_z = (df.beta / df.se).iloc[lead_idx_snp]
    df["t_dentist_s"] = ((df.beta / df.se) - df.r * lead_z) ** 2 / (1 - df.r ** 2)
    df["t_dentist_s"] = np.where(df["t_dentist_s"] < 0, np.inf, df["t_dentist_s"])
    df["t_dentist_s"].iloc[lead_idx_snp] = np.nan
    df["nlog10p_dentist_s"] = stats.chi2.logsf(df["t_dentist_s"], df=1) / -np.log(10)
    df["r2"] = df.r ** 2
    df["outliers"] = (df.r2 > r2_threshold) & (df.nlog10p_dentist_s > nlog10p_dentist_s_threshold)
    df_output = df
    n_r2 = np.sum(df.r2 > r2_threshold)
    n_dentist_s_outlier = np.sum(
        (df.r2 > r2_threshold) & (df.nlog10p_dentist_s > nlog10p_dentist_s_threshold)
    )
    max_pip_idx = df.prob.idxmax()
    df_summary = pd.DataFrame(
        {
            "lead_pip_variant": [df.variant.iloc[max_pip_idx]],
            "n_total": [len(df.index)],
            "n_r2": [n_r2],
            "n_dentist_s_outlier": [n_dentist_s_outlier],
            "fraction": [n_dentist_s_outlier / n_r2 if n_r2 > 0 else 0],
            "max_pip": [np.max(df.prob)]
        }
        )
    return df, df_summary


In [186]:
sumstat

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency
346227,17,60570542,C,T,chr17_60570542_C_T,0.0447,0.0294,0.12900,0.0225,85502,400497,0.0225
346229,17,60571044,C,A,chr17_60571044_C_A,0.0253,0.0352,0.47270,0.0151,85137,394391,0.0151
346234,17,60572325,C,T,chr17_60572325_C_T,0.0304,0.1131,0.78820,0.0026,77766,372990,0.0026
346244,17,60575249,G,A,chr17_60575249_G_A,0.0270,0.0171,0.11450,0.0687,85934,401577,0.0687
346246,17,60576951,T,G,chr17_60576951_T_G,-0.0273,0.2901,0.92510,0.0004,69576,360279,0.9996
...,...,...,...,...,...,...,...,...,...,...,...,...
370420,17,65146804,C,T,chr17_65146804_C_T,0.0034,0.0105,0.74350,0.1902,85569,395471,0.1902
370422,17,65147121,C,T,chr17_65147121_C_T,-0.0265,0.0174,0.12780,0.0791,85569,395471,0.0791
370426,17,65147645,C,T,chr17_65147645_C_T,0.0036,0.0105,0.73070,0.1900,85569,395471,0.1900
370431,17,65148432,G,A,chr17_65148432_G_A,-0.2126,0.1057,0.04432,0.0025,83196,386481,0.0025


In [89]:
sum((ss_qc.r2 > 0.4) & (ss_qc.nlog10p_dentist_s > 4))

0

In [39]:
LD = LD[np.ix_(indice[0].tolist(), indice[0].tolist())]    

In [41]:
len(LD)

6668

In [43]:
cs = get_cs(sumstat.variant, prob, coverage=0.95)

NameError: name 'prob' is not defined

In [46]:
abf_prior_variance = 0.4

In [52]:
lbf, prob = abf(sumstat.beta, sumstat.se, W=abf_prior_variance)

In [53]:
prob

0       0.000059
1       0.000050
2       0.000259
3       0.000138
4       0.000030
          ...   
6663    0.000053
6664    0.000013
6665    0.000043
6666    0.000161
6667    0.000018
Length: 6668, dtype: float64

In [51]:
sumstat = sumstat.reset_index()

In [54]:
cs = get_cs(sumstat.variant, prob, coverage=0.95)

In [174]:
ss_qc,ss_qc_sum = slalom(sumstat.reset_index(),LD)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["lead_variant"].iloc[lead_idx_snp] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["t_dentist_s"].iloc[lead_idx_snp] = np.nan


In [183]:
ss_qc_sum

Unnamed: 0,lead_pip_variant,n_total,n_r2,n_dentist_s_outlier,fraction,max_pip
0,chr17_63492371_G_A,9346,22,5,0.227273,0.322572


In [181]:
ss_qc.query("r2 > 0.6").query("nlog10p_dentist_s > 4")

Unnamed: 0,index,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,...,lbf,prob,cs,cs_99,lead_variant,r,t_dentist_s,nlog10p_dentist_s,r2,outliers
6074,361975,17,63498093,ACT,A,chr17_63498093_ACT_A,-0.0547,0.0084,8.092e-11,0.4673,...,16.877246,0.003979,False,False,False,0.945312,1657.15651,inf,0.893555,True
6081,361992,17,63499673,G,A,chr17_63499673_G_A,0.0488,0.0082,2.917e-09,0.4775,...,13.359971,0.000118,False,False,False,-0.850098,522.431636,114.90249,0.722656,True
6082,361994,17,63499854,G,A,chr17_63499854_G_A,0.0488,0.0082,2.786e-09,0.4774,...,13.359971,0.000118,False,False,False,-0.850586,524.618012,115.378159,0.723633,True
6085,362002,17,63500497,A,G,chr17_63500497_A_G,0.0495,0.0082,1.602e-09,0.4777,...,13.871559,0.000197,False,False,False,-0.849609,527.668525,116.041824,0.72168,True
6086,362005,17,63500774,G,A,chr17_63500774_G_A,0.0493,0.0082,1.832e-09,0.4777,...,13.724648,0.00017,False,False,False,-0.849121,524.287428,115.306238,0.721191,True


In [177]:
ss_qc.prob.idxmax()

6060

In [178]:
ss_qc.pvalue.idxmin()

6060

In [185]:
ss_qc.query("position == 63476980")

Unnamed: 0,index,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,...,lbf,prob,cs,cs_99,lead_variant,r,t_dentist_s,nlog10p_dentist_s,r2,outliers


In [79]:
ss_qc[ss_qc.nlog10p_dentist_s.isna()]

Unnamed: 0,level_0,index,chromosome,position,ref,alt,variant,beta,se,pvalue,...,original_effect_allele_frequency,lbf,prob,cs,cs_99,lead_variant,r,t_dentist_s,nlog10p_dentist_s,r2
4838,4838,11200,1,2314630,G,A,chr1_2314630_G_A,0.2503,0.0764,0.001059,...,0.0054,3.168614,0.017654,True,True,True,1.0,,,1.0


In [32]:
def abf(beta, se, W=0.04):
        from scipy import special 
        z = beta / se
        V = se ** 2
        r = W / (W + V)
        lbf = 0.5 * (np.log(1 - r) + (r * z ** 2))
        denom = special.logsumexp(lbf)
        prob = np.exp(lbf - denom)
        return lbf, prob
    
def get_cs(variant, prob, coverage=0.95):
        ordering = np.argsort(prob)[::-1]
        idx = np.where(np.cumsum(prob[ordering]) > coverage)[0][0]
        cs = variant[ordering][: (idx + 1)]
        return cs


In [189]:
np.where(np.in1d(snp_id,"chr17_63476980_C_T"))

(array([], dtype=int64),)

In [192]:
snp_id[-1]

'chr17_65149142_GTTGTTATGTT_*'

In [202]:
import rpy2

ModuleNotFoundError: No module named 'rpy2'

In [12]:
import rpy2
import rpy2.robjects.numpy2ri as numpy2ri
import rpy2.robjects as ro
ro.conversion.py2ri = numpy2ri
numpy2ri.activate()
from rpy2.robjects.packages import importr
susieR = importr('susieR')
R_null = ro.rinterface.NULL

R[write to console]: Error in dyn.load(file, DLLpath = DLLpath, ...) : 
  unable to load shared object '/mnt/mfs/cluster/bin/R-4.0.0/library/methods/libs/methods.so':
  libR.so: cannot open shared object file: No such file or directory



RRuntimeError: Error in dyn.load(file, DLLpath = DLLpath, ...) : 
  unable to load shared object '/mnt/mfs/cluster/bin/R-4.0.0/library/methods/libs/methods.so':
  libR.so: cannot open shared object file: No such file or directory


In [9]:
import os
os.environ['R_HOME'] = '/mnt/mfs/cluster/bin/R-4.0.0/'

In [204]:
import rpy2

In [11]:
import rpy2.robjects

R[write to console]: Error in dyn.load(file, DLLpath = DLLpath, ...) : 
  unable to load shared object '/mnt/mfs/cluster/bin/R-4.0.0/library/methods/libs/methods.so':
  libR.so: cannot open shared object file: No such file or directory



RRuntimeError: Error in dyn.load(file, DLLpath = DLLpath, ...) : 
  unable to load shared object '/mnt/mfs/cluster/bin/R-4.0.0/library/methods/libs/methods.so':
  libR.so: cannot open shared object file: No such file or directory


In [1]:
import pandas as pd

In [2]:
a = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/ADx/ukb_mfi_chr1_22_v3.tsv.ref","\t")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
a

Unnamed: 0,CHR,POS,REF,ALT
0,1,10177.0,A,AC
1,1,10235.0,T,TA
2,1,10352.0,T,TA
3,1,10505.0,A,T
4,1,10506.0,C,G
...,...,...,...,...
93095618,22,51241342.0,C,A
93095619,22,51241386.0,C,G
93095620,22,51244163.0,A,G
93095621,22,51244205.0,C,T


In [None]:
{ 
"language": "python",
 "argv": [
  "/mnt/mfs/cluster/bin/Singularity-ce-3.9.4/bin/singularity",
   "exec",
   "-B", 
   "/mnt/:/mnt/",
   "/home/hs3163/GIT/xqtl-pipeline/container/singularity/stephenslab_rpy2.sif",
   "/opt/conda/bin/python",
   "-m",
  "ipykernel",
  "-f",
  "{connection_file}"
 ],
 "display_name": "Singularity Susie_rpy2",
 "metadata": {
  "debugger": true
 }
}

In [None]:
{
 "language": "python",
 "argv": ["/usr/local/bin/singularity",
   "exec",
   "-B",
   "/mnt/:/mnt/"
   "/home/hs3163/GIT/xqtl-pipeline/container/singularity/stephenslab_rpy2.sif",
   "/opt/conda/bin/python",
   "-m",
   "ipykernel",
   "-f",
   "{connection_file}"
 ],
 "display_name": "Python 3 (Singularity)"
}