# Fine-mapping with SuSiE RSS model

This notebook take a list of LD file and a list of sumstat file and do salmon QC and susie RSS for each overlap LD block.

In [None]:
[global]

parameter: container = ""
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20

# getting the overlapped input
parameter: LD_list = path
parameter: sumstat_list = path
import pandas as pd
LD_list = pd.read_csv(LD_list,"\t")
sumstat_list = pd.read_csv(sumstat_list,"\t")
LD_list["#chr"] = [x[0].replace("chr", "") for x in  LD_list["#id"].str.split("_") ]
sumstat_list["#chr"] = [str(x).replace("chr", "") for x in  sumstat_list["#chr"] ]
input_inv = LD_list.merge(sumstat_list)
input_list = input_inv.iloc[:,[1,3]].values.tolist()
parameter: cwd = path("output")


[SuSiE_RSS_1]
input: input_list, group_by = 2
output: f'{cwd:a}/{_input[1]:bn}.{_input[0].split(".")[-3]}.unisusie_rss.fit.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
python: expand = '${ }', stdout = f"{_output:nn}.stdout", stderr = f"{_output:nn}.stderr", container = container
    import pandas as pd
    import numpy as np
    def load_npz_ld(path):
        np_ld_loaded = np.load(path,allow_pickle=True)
        # sort by start position
        snp_id = [x.replace(":","_") for x in np_ld_loaded.get("arr_1")]
        np_ld_loaded = np_ld_loaded.get("arr_0")
        new = np_ld_loaded + np_ld_loaded.T
        np.fill_diagonal(new, np.diag(new)/2)
        return new,snp_id

    def get_bcor_meta(bcor_obj):
        df_ld_snps = bcor_obj.getMeta()
        df_ld_snps.rename(columns={'rsid':'SNP', 'position':'BP', 'chromosome':'CHR', 'allele1':'A1', 'allele2':'A2'}, inplace=True, errors='raise')
        ###df_ld_snps['CHR'] = df_ld_snps['CHR'].astype(np.int64)
        df_ld_snps['BP'] = df_ld_snps['BP'].astype(np.int64)
        return df_ld_snps

    def load_ld_bcor(ld_prefix):
        bcor_file = ld_prefix+'.bcor'
        import os
        import time
        from ldstore.bcor import bcor
        if not os.path.exists(bcor_file):
            raise IOError('%s not found'%(bcor_file))
        t0 = time.time()
        bcor_obj = bcor(bcor_file)
        df_ld_snps = get_bcor_meta(bcor_obj)
        ld_arr = bcor_obj.readCorr([])
        assert np.all(~np.isnan(ld_arr))
        return ld_arr, df_ld_snps

    def abf(beta, se, W=0.04):
        from scipy import special 
        z = beta / se
        V = se ** 2
        r = W / (W + V)
        lbf = 0.5 * (np.log(1 - r) + (r * z ** 2))
        denom = special.logsumexp(lbf)
        prob = np.exp(lbf - denom)
        return lbf, prob
    
    def get_cs(variant, prob, coverage=0.95):
        ordering = np.argsort(prob)[::-1]
        idx = np.where(np.cumsum(prob[ordering]) > coverage)[0][0]
        cs = variant[ordering][: (idx + 1)]
        return cs
    def slalom(df,LD):
        lbf, prob = abf(df.beta, df.se, W=args.abf_prior_variance)
        cs = get_cs(df.variant, prob, coverage=0.95)
        cs_99 = get_cs(df.variant, prob, coverage=0.99)
        df["lbf"] = lbf
        df["prob"] = prob
        df["cs"] = df.variant.isin(cs)
        df["cs_99"] = df.variant.isin(cs_99)
        lead_idx_snp = df.prob.idxmax()
        lead_variant = df.variant[lead_idx_snp]
        df["lead_variant"] = False
        df["lead_variant"].iloc[lead_idx_snp] = True
        # annotate LD     
        ## This is to identify the R for each snp vs the lead snp
        df["r"] = [LD[np.where(np.in1d(df.variant,lead_variant))][:,np.where(np.in1d(df.variant,x))][0][0][0] for x in df.variant]
        lead_z = (df.beta / df.se).iloc[lead_idx_snp]
        # Dentist_test     
        ## This is to identify the outlier
        df["t_dentist_s"] = ((df.beta / df.se) - df.r * lead_z) ** 2 / (1 - df.r ** 2)
        df["t_dentist_s"] = np.where(df["t_dentist_s"] < 0, np.inf, df["t_dentist_s"])
        df["t_dentist_s"].iloc[lead_idx_snp] = np.nan
        df["nlog10p_dentist_s"] = sp.stats.chi2.logsf(df["t_dentist_s"], df=1) / -np.log(10)
        df_output = df
        df["r2"] = df.r ** 2
        n_r2 = np.sum(df.r2 > args.r2_threshold)
        n_dentist_s_outlier = np.sum(
            (df.r2 > args.r2_threshold) & (df.nlog10p_dentist_s > args.nlog10p_dentist_s_threshold)
        )
        max_pip_idx = df.prob.idxmax()
        variant = df.chromosome.str.cat([df.position.astype(str), df.allele1, df.allele2], sep=":")
        df_summary = pd.DataFrame(
            {
                "lead_pip_variant": [variant.iloc[max_pip_idx]],
                "n_total": [len(df.index)],
                "n_r2": [n_r2],
                "n_dentist_s_outlier": [n_dentist_s_outlier],
                "fraction": [n_dentist_s_outlier / n_r2 if n_r2 > 0 else 0],
                "max_pip": [np.max(df.prob)]
            }
            )
    return df, df_summary
    
    ## Load LD
    if "${_input[0]}".endswith("npz"):
        LD,snp_id = load_npz_ld(${_input[0]:r}) 
    if "${_input[0]}".endswith("bcor"):
        LD,snp_id = load_ld_bcor(${_input[0]:nr}) 
        
    sumstat = pd.read_csv(${_input[1]:r}, "\t")
    ## Get only intersect snp
    intersct = np.intersect1d(sumstat.variant.to_numpy(),snp_id)
    sumstat = sumstat.query("variant in @intersct")
    indice = np.where(np.in1d(snp_id, intersct))
    LD = LD[np.ix_(indice[0].tolist(), indice[0].tolist())]    
    ## slalom
    sumstat_qc = slalom(sumstat,LD)
    ## SuSiERSS
    

# Draft

In [None]:
    elif "${_input[0]}".endswith("bcor"):
        LD,snp_id = load_bcor_ld(${_input[0]:r})

In [53]:
import pandas as pd
import numpy as np

In [105]:
LD_list = pd.read_csv("/mnt/vast/hpc/csg/molecular_phenotype_calling/LD/output_npz_2/1300_hg38_EUR_LD_blocks_npz_files/ROSMAP_NIA_WGS.leftnorm.filtered.filtered.ld.list","\t")
sumstat_list = pd.read_csv("/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS/data_intergration/ADGWAS2022/qced_sumstat_list.txt","\t")


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [106]:
sumstat_list

Unnamed: 0,#chr,ADGWAS_Bellenguez_2022
0,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1,2,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
2,3,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
3,4,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
4,5,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
5,6,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
6,7,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
7,8,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
8,9,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
9,10,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...


In [25]:
LD_list["#chr"] = [x[0].replace("chr", "") for x in  LD_list["#id"].str.split("_") ]

In [24]:
sumstat_list["#chr"] = [str(x).replace("chr", "") for x in  sumstat_list["#chr"] ]

In [27]:
input_inv = LD_list.merge(sumstat_list)

In [141]:
input_inv

Unnamed: 0,#id,dir,#chr,ADGWAS_Bellenguez_2022
0,chr1_16103_2888443,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1,chr1_2888443_4320284,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
2,chr1_4320284_5853833,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
3,chr1_5853833_7110219,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
4,chr1_7110219_9473386,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,1,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
...,...,...,...,...
1356,chr22_43251864_44603286,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,22,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1357,chr22_44603286_46177037,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,22,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1358,chr22_46177037_47876022,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,22,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...
1359,chr22_47876022_49355984,/mnt/vast/hpc/csg/molecular_phenotype_calling/...,22,/mnt/vast/hpc/csg/xqtl_workflow_testing/ADGWAS...


In [33]:
LD_file = input_inv.iloc[:,1].values.tolist()
sumstat_file = input_inv.iloc[:,3].values.tolist()

In [36]:
sumstat_file = input_inv.iloc[:,3].values.tolist()

In [47]:
test = input_inv.iloc[:,[1,3]].values.tolist()[0]

In [50]:
test[0].endswith("npz")

True

In [55]:
new

array([[ 1.000e+00, -4.330e-03, -4.761e-03, ..., -5.127e-03, -3.656e-03,
         5.508e-03],
       [-4.330e-03,  1.000e+00, -4.761e-03, ..., -5.127e-03, -3.656e-03,
        -2.812e-02],
       [-4.761e-03, -4.761e-03,  1.000e+00, ..., -5.638e-03, -4.021e-03,
        -2.127e-04],
       ...,
       [-5.127e-03, -5.127e-03, -5.638e-03, ...,  1.000e+00, -4.330e-03,
        -3.329e-02],
       [-3.656e-03, -3.656e-03, -4.021e-03, ..., -4.330e-03,  1.000e+00,
         4.654e-03],
       [ 5.508e-03, -2.812e-02, -2.127e-04, ..., -3.329e-02,  4.654e-03,
         1.000e+00]], dtype=float16)

In [127]:
def load_npz_ld(path):
        np_ld_loaded = np.load(path,allow_pickle=True)
        # sort by start position
        snp_id = [x.replace(":","_") for x in np_ld_loaded.get("arr_1")]
        np_ld_loaded = np_ld_loaded.get("arr_0")
        new = np_ld_loaded + np_ld_loaded.T
        np.fill_diagonal(new, np.diag(new)/2)
        return new,snp_id


def load_ld_bcor(ld_prefix):
    bcor_file = ld_prefix+'.bcor'
    import os
    import time
    from ldstore.bcor import bcor
    if not os.path.exists(bcor_file):
        raise IOError('%s not found'%(bcor_file))
    t0 = time.time()
    bcor_obj = bcor(bcor_file)
    df_ld_snps = get_bcor_meta(bcor_obj)
    ld_arr = bcor_obj.readCorr([])
    assert np.all(~np.isnan(ld_arr))
    return ld_arr, df_ld_snps

In [142]:
load_ld_bcor("/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/22/22_16050408_17674295.bcor")

NameError: name 'get_bcor_meta' is not defined

In [146]:
from ldstore.bcor import bcor

In [147]:
bcor("/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/22/22_16050408_17674295.bcor")


<ldstore.bcor.bcor at 0x2b8dcebaa910>

In [80]:
new,snp_id = load_npz_ld(test[0])

In [82]:
np_ld_loaded = np.load(test[0],allow_pickle=True)

In [87]:
np_ld_loaded.get("arr_1")

array('chr1:16433_C_G', dtype='<U14')

In [171]:
new

array([[ 1.000e+00, -4.330e-03, -4.761e-03, ..., -5.127e-03, -3.656e-03,
         5.508e-03],
       [-4.330e-03,  1.000e+00, -4.761e-03, ..., -5.127e-03, -3.656e-03,
        -2.812e-02],
       [-4.761e-03, -4.761e-03,  1.000e+00, ..., -5.638e-03, -4.021e-03,
        -2.127e-04],
       ...,
       [-5.127e-03, -5.127e-03, -5.638e-03, ...,  1.000e+00, -4.330e-03,
        -3.329e-02],
       [-3.656e-03, -3.656e-03, -4.021e-03, ..., -4.330e-03,  1.000e+00,
         4.654e-03],
       [ 5.508e-03, -2.812e-02, -2.127e-04, ..., -3.329e-02,  4.654e-03,
         1.000e+00]], dtype=float16)

In [107]:
sumstat = pd.read_csv(test[1], "\t")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [108]:
sumstat.variant

0              chr1_594445_C_T
1            chr1_595762_C_CTG
2              chr1_630947_G_A
3              chr1_646157_G_A
4              chr1_727233_G_A
                  ...         
1451875     chr1_248934257_A_C
1451876     chr1_248935675_T_C
1451877     chr1_248936715_C_T
1451878     chr1_248937246_A_G
1451879    chr1_248941807_AG_A
Name: variant, Length: 1451880, dtype: object

In [125]:
a = np.intersect1d(sumstat.variant.to_numpy(),[x.replace(":","_") for x in snp_id])

In [126]:
len(a)

6668

In [120]:
snp_id

array(['chr1:16206_T_A', 'chr1:16433_C_G', 'chr1:16619_C_T', ...,
       'chr1:2887986_T_C', 'chr1:2888101_C_G', 'chr1:2888245_G_A'],
      dtype=object)

In [111]:
sumstat.variant.to_numpy()

array(['chr1_594445_C_T', 'chr1_595762_C_CTG', 'chr1_630947_G_A', ...,
       'chr1_248936715_C_T', 'chr1_248937246_A_G', 'chr1_248941807_AG_A'],
      dtype=object)

In [77]:
sumstat.variant.to_numpy()

array(['chr1_100000223_C_T', 'chr1_100000723_G_A', 'chr1_100000874_G_T',
       ..., 'chr1_248936715_C_T', 'chr1_248937246_A_G',
       'chr1_248941807_AG_A'], dtype=object)

In [94]:
snp_id = pd.read_csv("/mnt/vast/hpc/csg/molecular_phenotype_calling/LD/output_npz_2/1300_hg38_EUR_LD_blocks_plink_files/ROSMAP_NIA_WGS.leftnorm.filtered.filtered.chr1_16103_2888443.bim","\t",header = None)[1].to_numpy()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [95]:
snp_id

array(['chr1:16206_T_A', 'chr1:16433_C_G', 'chr1:16619_C_T', ...,
       'chr1:2887986_T_C', 'chr1:2888101_C_G', 'chr1:2888245_G_A'],
      dtype=object)

In [88]:
test[0]

'/mnt/vast/hpc/csg/molecular_phenotype_calling/LD/output_npz_2/1300_hg38_EUR_LD_blocks_npz_files/ROSMAP_NIA_WGS.leftnorm.filtered.filtered.chr1_16103_2888443.flt16.npz'

In [96]:
np.intersect1d(sumstat.variant.to_numpy(),snp_id)

array([], dtype=object)

In [97]:
snp_id

array(['chr1:16206_T_A', 'chr1:16433_C_G', 'chr1:16619_C_T', ...,
       'chr1:2887986_T_C', 'chr1:2888101_C_G', 'chr1:2888245_G_A'],
      dtype=object)

In [98]:
sumstat.variant.to_numpy()

array(['chr1_100000223_C_T', 'chr1_100000723_G_A', 'chr1_100000874_G_T',
       ..., 'chr1_248936715_C_T', 'chr1_248937246_A_G',
       'chr1_248941807_AG_A'], dtype=object)

In [104]:
sumstat.sort_values("position")

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency
0,1,100000223,C,T,chr1_100000223_C_T,0.1758,0.2049,0.391100,0.0011,69576,360279,0.0011
1,1,100000723,G,A,chr1_100000723_G_A,-0.0031,0.0134,0.817700,0.1023,85934,401577,0.1023
2,1,100000874,G,T,chr1_100000874_G_T,0.2609,0.0749,0.000499,0.0038,83561,392587,0.0038
3,1,100001444,AT,A,chr1_100001444_AT_A,-0.0070,0.2303,0.975800,0.0009,26798,28624,0.0009
4,1,100001756,T,C,chr1_100001756_T_C,-0.0312,0.0344,0.364400,0.0145,85934,401577,0.9855
...,...,...,...,...,...,...,...,...,...,...,...,...
811292,1,248934257,A,C,chr1_248934257_A_C,-0.0051,0.0302,0.866400,0.0386,34556,49251,0.9614
811293,1,248935675,T,C,chr1_248935675_T_C,0.1627,0.3225,0.614100,0.0007,20301,21839,0.9993
811294,1,248936715,C,T,chr1_248936715_C_T,-0.0207,0.0552,0.707500,0.0065,83831,387691,0.0065
811295,1,248937246,A,G,chr1_248937246_A_G,0.1434,0.1920,0.455200,0.0012,76073,367064,0.9988


In [149]:
sumstat

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency
0,1,594445,C,T,chr1_594445_C_T,-0.0152,0.3236,0.9626,0.0006,20301,21839,0.0006
1,1,595762,C,CTG,chr1_595762_C_CTG,-0.2494,0.2247,0.2670,0.0014,26798,28624,0.9986
2,1,630947,G,A,chr1_630947_G_A,0.0975,0.4320,0.8214,0.0003,20301,21839,0.0003
3,1,646157,G,A,chr1_646157_G_A,-0.3028,0.3180,0.3411,0.0013,20301,21839,0.0013
4,1,727233,G,A,chr1_727233_G_A,-0.0210,0.0453,0.6424,0.0191,81166,386644,0.0191
...,...,...,...,...,...,...,...,...,...,...,...,...
1451875,1,248934257,A,C,chr1_248934257_A_C,-0.0051,0.0302,0.8664,0.0386,34556,49251,0.9614
1451876,1,248935675,T,C,chr1_248935675_T_C,0.1627,0.3225,0.6141,0.0007,20301,21839,0.9993
1451877,1,248936715,C,T,chr1_248936715_C_T,-0.0207,0.0552,0.7075,0.0065,83831,387691,0.0065
1451878,1,248937246,A,G,chr1_248937246_A_G,0.1434,0.1920,0.4552,0.0012,76073,367064,0.9988


In [176]:
sumstat.query("variant in @a")

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency,lbf,prob,cs,cs_99
4,1,727233,G,A,chr1_727233_G_A,-0.0210,0.0453,0.6424,0.0191,81166,386644,0.0191,-1.407817,9.027621e-32,False,False
9,1,733014,AG,A,chr1_733014_AG_A,0.0108,0.0413,0.7935,0.0267,78401,374200,0.0267,-1.565541,7.710363e-32,False,False
13,1,758213,AT,A,chr1_758213_AT_A,0.1318,0.1069,0.2173,0.0038,79473,380718,0.0038,-0.160906,3.141232e-31,False,False
16,1,758540,T,C,chr1_758540_T_C,0.0429,0.1113,0.6996,0.0040,77766,372990,0.9960,-0.664266,1.898863e-31,False,False
19,1,766399,GAATA,G,chr1_766399_GAATA_G,0.0055,0.0249,0.8246,0.0678,80801,380538,0.0678,-2.067118,4.669203e-32,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15105,1,2886544,G,A,chr1_2886544_G_A,0.0071,0.0451,0.8746,0.0110,85502,400497,0.0110,-1.502443,8.212549e-32,False,False
15109,1,2887241,C,T,chr1_2887241_C_T,-0.0055,0.0090,0.5417,0.3818,84196,393797,0.3818,-2.915753,1.998412e-32,False,False
15114,1,2887908,T,C,chr1_2887908_T_C,-0.0194,0.0138,0.1594,0.1043,85665,400902,0.8957,-1.692573,6.790557e-32,False,False
15116,1,2887986,T,C,chr1_2887986_T_C,0.0809,0.1020,0.4275,0.0034,81503,380555,0.9966,-0.539330,2.151555e-31,False,False


In [172]:
len(new)

15423

In [174]:
len(snp_id)

15423

In [175]:
len(sumstat)

1451880

In [209]:
LD = new

In [212]:
np.shape(LD)

(6668, 6668)

In [178]:
intersct = np.intersect1d(sumstat.variant.to_numpy(),snp_id)
sumstat = sumstat.query("variant in @intersct")
indice = np.where(np.in1d(snp_id, intersct))
LD[np.ix_(indice[0].tolist(), indice[0].tolist())]

In [238]:
sumstat["r"] = [LD[np.where(np.in1d(intersct,lead_variant))][:,np.where(np.in1d(intersct,x))][0][0][0] for x in sumstat.variant]

In [240]:
sumstat.query('variant == @lead_variant')

Unnamed: 0,chromosome,position,ref,alt,variant,beta,se,pvalue,maf,n_cases,n_controls,original_effect_allele_frequency,lbf,prob,cs,cs_99,r
1385,1,1000335,C,T,chr1_1000335_C_T,-0.0241,0.0106,0.02273,0.2123,82635,393749,0.2123,-0.361512,2.570266e-31,False,False,1.0


In [191]:
LD[indice,:]

array([[[-1.4275e-02, -1.4275e-02,  4.0283e-02, ..., -1.5640e-02,
         -1.2062e-02,  5.0903e-02],
        [-1.6953e-02, -1.6953e-02,  2.8625e-02, ..., -2.0081e-02,
         -1.4320e-02,  1.1177e-02],
        [-6.1226e-03, -6.1226e-03, -6.7329e-03, ...,  8.5083e-02,
         -5.1727e-03,  3.3356e-02],
        ...,
        [ 2.7790e-03, -2.9465e-02, -2.9564e-03, ..., -3.4882e-02,
          2.3479e-03,  9.0967e-01],
        [-5.1270e-03, -5.1270e-03, -5.6381e-03, ...,  1.0000e+00,
         -4.3297e-03, -3.3295e-02],
        [ 5.5084e-03, -2.8122e-02, -2.1267e-04, ..., -3.3295e-02,
          4.6539e-03,  1.0000e+00]]], dtype=float16)

In [211]:
LD = LD[np.ix_(indice[0].tolist(), indice[0].tolist())]

In [229]:
sumstat = sumstat.query("variant in @intersct")

In [195]:
np.ix_(indice.values, indice)

AttributeError: 'tuple' object has no attribute 'values'

In [140]:
where = np.where(np.in1d(snp_id, a))
new[where,where]

In [138]:
snp_id = [x.replace(":","_") for x in snp_id]

In [None]:
    hl._set_flags(no_whole_stage_codegen="1")
    reference_genome = args.reference_genome
    gnomad_version = gnomad_latest_versions[reference_genome]
    gnomad_ht_path = f"gs://finucane-requester-pays/slalom/gnomad/release/{gnomad_version}/ht/genomes/gnomad.genomes.r{gnomad_version}.sites.most_severe.ht"

    ht_snp = hl.import_table(args.snp, impute=True, types={"chromosome": hl.tstr}, delimiter="\s+")
    ht_snp = ht_snp.annotate(
        locus=hl.parse_locus(
            hl.delimit([ht_snp.chromosome, hl.str(ht_snp.position)], delimiter=":"), reference_genome=reference_genome
        ),
        alleles=[ht_snp.allele1, ht_snp.allele2],
    )
    if args.align_alleles:
        ht_gnomad = hl.read_table(gnomad_ht_path)
        ht_snp = align_alleles(ht_snp, ht_gnomad, flip_rows=["beta"])

    ht_snp = ht_snp.annotate(variant=hl.variant_str(ht_snp.locus, ht_snp.alleles))
    ht_snp = ht_snp.key_by("locus", "alleles")
    ht_snp = ht_snp.add_index("idx_snp")

    # annotate in novel CUPs and reject
    cup = hl.read_table(f"gs://finucane-requester-pays/slalom/cup_files/FASTA_BED.ALL_{reference_genome}.novel_CUPs.ht")
    reject = hl.read_table(
        f"gs://finucane-requester-pays/slalom/cup_files/FASTA_BED.ALL_{reference_genome}.reject_2.ht"
    )
    ht_snp = ht_snp.annotate(in_cups=hl.is_defined(cup[ht_snp.locus]) | hl.is_defined(reject[ht_snp.locus]))

    # annotate vep and freq
    if args.annotate_consequence or args.annotate_gnomad_freq:
        ht_gnomad = hl.read_table(gnomad_ht_path)
        consequences = ["most_severe", "gene_most_severe", "consequence"] if args.annotate_consequence else []
        freq_expr = (
            {f"gnomad_v{gnomad_version[0]}_af_{pop}": ht_gnomad.freq[pop].AF for pop in gnomad_pops[reference_genome]}
            if args.annotate_gnomad_freq
            else {}
        )
        ht_gnomad = ht_gnomad.select(*consequences, **freq_expr)
        ht_snp = ht_snp.join(ht_gnomad, how="left")
    ht_snp = ht_snp.checkpoint(new_temp_file())


In [226]:
LD[np.where(np.in1d(intersct,lead_variant))]

array([[ 0.007248,  1.      , -0.02397 , ...,  0.01753 , -0.02008 ,
         0.01118 ]], dtype=float16)

In [228]:
sumstat.variant

4            chr1_727233_G_A
9           chr1_733014_AG_A
13          chr1_758213_AT_A
16           chr1_758540_T_C
19       chr1_766399_GAATA_G
                ...         
15105       chr1_2886544_G_A
15109       chr1_2887241_C_T
15114       chr1_2887908_T_C
15116       chr1_2887986_T_C
15117       chr1_2888245_G_A
Name: variant, Length: 6668, dtype: object

In [224]:

lead_variant = "chr1_1000335_C_T"


In [218]:
intersct

array(['chr1_1000018_G_A', 'chr1_1000335_C_T', 'chr1_1001034_G_A', ...,
       'chr1_998364_C_T', 'chr1_999005_G_A', 'chr1_999842_C_A'],
      dtype=object)

In [None]:
import scipy as sp

In [155]:
lbf, prob = abf(sumstat.beta, sumstat.se)
sumstat["lbf"] = lbf
sumstat["prob"] = prob
sumstat["cs"] = sumstat.variant.isin(cs)
cs_99 = get_cs(sumstat.variant, prob, coverage=0.99)
sumstat["cs_99"] = sumstat.variant.isin(cs_99)

In [158]:
cs = get_cs(sumstat.variant, prob, coverage=0.95)


In [166]:
lead_idx_snp = sumstat.prob.idxmax()

In [168]:
sumstat.prob.max()

0.5219822472224083

In [167]:
lead_idx_snp

1164649

In [202]:
len(LD)

15423

In [208]:
np.shape(LD[2,:])

(15423,)

In [169]:
def slalom(df,LD):
    lbf, prob = abf(df.beta, df.se, W=args.abf_prior_variance)
    cs = get_cs(df.variant, prob, coverage=0.95)
    cs_99 = get_cs(df.variant, prob, coverage=0.99)
    df["lbf"] = lbf
    df["prob"] = prob
    df["cs"] = df.variant.isin(cs)
    df["cs_99"] = df.variant.isin(cs_99)
    lead_idx_snp = df.prob.idxmax()
    lead_variant = df.variant[lead_idx_snp]
    df["lead_variant"] = False
    df["lead_variant"].iloc[lead_idx_snp] = True
    # annotate LD     
    ## This is to identify the R for each snp vs the lead snp
    df["r"] = [LD[np.where(np.in1d(df.variant,lead_variant))][:,np.where(np.in1d(df.variant,x))][0][0][0] for x in df.variant]
    lead_z = (df.beta / df.se).iloc[lead_idx_snp]
    df["t_dentist_s"] = ((df.beta / df.se) - df.r * lead_z) ** 2 / (1 - df.r ** 2)
    df["t_dentist_s"] = np.where(df["t_dentist_s"] < 0, np.inf, df["t_dentist_s"])
    df["t_dentist_s"].iloc[lead_idx_snp] = np.nan
    df["nlog10p_dentist_s"] = sp.stats.chi2.logsf(df["t_dentist_s"], df=1) / -np.log(10)
    df_output = df
    
    df.drop(columns=["variant"]).to_csv(f, sep="\t", na_rep="NA", index=False)

    df["r2"] = df.r ** 2
    n_r2 = np.sum(df.r2 > args.r2_threshold)
    n_dentist_s_outlier = np.sum(
        (df.r2 > args.r2_threshold) & (df.nlog10p_dentist_s > args.nlog10p_dentist_s_threshold)
    )
    max_pip_idx = df.prob.idxmax()
    variant = df.chromosome.str.cat([df.position.astype(str), df.allele1, df.allele2], sep=":")
    df_summary = pd.DataFrame(
        {
            "lead_pip_variant": [variant.iloc[max_pip_idx]],
            "n_total": [len(df.index)],
            "n_r2": [n_r2],
            "n_dentist_s_outlier": [n_dentist_s_outlier],
            "fraction": [n_dentist_s_outlier / n_r2 if n_r2 > 0 else 0],
            "max_pip": [np.max(df.prob)]
        }
        )
    return df, df_summary


SyntaxError: invalid syntax (784063505.py, line 22)