In [1]:
import pandas as pd
import re
import os
from functools import reduce
import numpy as np
from scipy.stats import norm

# Define the Bonferroni-corrected p-value threshold

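The threshold is 0.05 divided by the number of tests: 20000 genes × 3 gene mask models × 3. The final factor of 3 presumably corresponds to the three lifestyle factors analysed below (pa, smoke, alcohol) — TODO confirm.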

In [2]:
# Number of tests the family-wise error rate of 0.05 is spread over.
N_BONF_TESTS = 20000 * 3 * 3
# Bonferroni-corrected significance threshold used to filter results.
BONF_P = 0.05 / N_BONF_TESTS

In [3]:
def get_meta_stats_helper(ser, ancestry, alpha=0.05):
    """Fixed-effect inverse-variance-weighted meta-analysis of one row.

    Parameters
    ----------
    ser : pd.Series or other mapping-like row
        Must provide ``BETA_<a>``, ``SE_<a>`` and ``nsamples_<a>`` for every
        label ``a`` in ``ancestry``. Missing values are detected with
        ``pd.isnull``.
    ancestry : sequence of str
        Cohort labels used as column suffixes. Iterated more than once.
    alpha : float, default 0.05
        Two-sided significance level of the confidence interval.

    Returns
    -------
    tuple
        ``(beta, se, ci_low, ci_high, z_score, p_value, nsamples)``.
        All seven entries are ``pd.NA`` when no cohort has an estimate.

    Raises
    ------
    ValueError
        If a cohort has exactly one of BETA / SE missing. Filtering the two
        columns independently could otherwise pair one cohort's effect size
        with another cohort's standard error whenever the counts happen to
        match.
    """
    effect_sizes = []
    inverse_variances = []
    for a in ancestry:
        beta = ser[f"BETA_{a}"]
        se = ser[f"SE_{a}"]
        beta_missing = pd.isnull(beta)
        se_missing = pd.isnull(se)
        if beta_missing and se_missing:
            # Cohort has no estimate for this row; it does not contribute.
            continue
        if beta_missing or se_missing:
            raise ValueError(
                f"Cohort '{a}' has only one of BETA/SE present; "
                "cannot compute its inverse-variance weight."
            )
        # BETA and SE are read together so each weight stays aligned with
        # the effect size of the same cohort.
        effect_sizes.append(beta)
        inverse_variances.append(1 / se**2)

    if len(effect_sizes) == 0:
        return pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA

    effect_sizes = np.array(effect_sizes)
    inverse_variances = np.array(inverse_variances)

    weighted_effect_size = np.sum(effect_sizes * inverse_variances) / np.sum(inverse_variances)
    weighted_variance = 1 / np.sum(inverse_variances)
    weighted_se = np.sqrt(weighted_variance)

    # Calculate the Z-score
    z_score = weighted_effect_size / weighted_se
    # Two-tailed p-value; the survival function is used in place of
    # 1 - norm.cdf(abs(z_score)).
    p_value = 2 * norm.sf(abs(z_score))

    # Critical value for the (1 - alpha) confidence interval
    z_critical = norm.ppf(1 - alpha / 2)

    # Confidence interval bounds
    lower_ci = weighted_effect_size - z_critical * weighted_se
    upper_ci = weighted_effect_size + z_critical * weighted_se

    # Sample size is summed over every cohort with a non-missing nsamples
    # value, independently of whether that cohort supplied BETA/SE.
    nsamples = sum([ser[f"nsamples_{a}"] for a in ancestry if not pd.isnull(ser[f"nsamples_{a}"])])
    return weighted_effect_size, weighted_se, lower_ci, upper_ci, z_score, p_value, nsamples


def get_meta_stats(ser):
    """Meta-analyse one merged row overall, European-only and non-European-only.

    Parameters
    ----------
    ser : pd.Series
        Row exposing ``hgnc_gene`` and ``gene_mask`` plus the per-cohort
        columns read by ``get_meta_stats_helper``.

    Returns
    -------
    pd.Series
        ``gene`` and ``gene_mask`` followed by three groups of seven
        statistics: unprefixed (all cohorts), ``e``-prefixed (European
        cohorts) and ``ne``-prefixed (non-European cohorts).
    """
    european = ["eur_aou", "eur_ukb"]
    # mid_ukb is not part of the non-European set.
    non_european = [
        "afr_aou", "sas_aou", "eas_aou", "amr_aou", "mid_aou",
        "afr_ukb", "sas_ukb", "eas_ukb", "amr_ukb",
    ]

    eur_stats = get_meta_stats_helper(ser, european)
    non_eur_stats = get_meta_stats_helper(ser, non_european)
    all_stats = get_meta_stats_helper(ser, european + non_european)

    record = {"gene": ser.hgnc_gene, "gene_mask": ser.gene_mask}
    estimate_names = ("beta", "se", "ci_low", "ci_high", "z_score", "p_value")
    # The sample-count key does not follow the plain prefix rule
    # ("nsamples", "esamples", "nesamples"), so it is listed explicitly.
    groups = (
        ("", "nsamples", all_stats),
        ("e", "esamples", eur_stats),
        ("ne", "nesamples", non_eur_stats),
    )
    for prefix, samples_key, stats in groups:
        *estimates, sample_count = stats
        for name, value in zip(estimate_names, estimates):
            record[prefix + name] = value
        record[samples_key] = sample_count
    return pd.Series(record)


In [4]:
def create_meta_df(proj_dir, test_name, ancestry=["afr", "amr", "eas", "eur", "sas", "mid"], biobank=["aou", "ukb"]):
    """Load per-cohort association tables and outer-merge them into one frame.

    For every ancestry/biobank pair except ``mid``/``ukb``, the file
    ``bmi_rint_<ancestry>_<biobank>_<test_name>.tsv.gz`` is read from
    ``proj_dir``. Only the key and statistic columns are kept, and each
    statistic column gets the suffix ``_<ancestry>_<biobank>``.

    Parameters
    ----------
    proj_dir : str
        Directory holding the per-cohort ``.tsv.gz`` files.
    test_name : str
        Last component of the file name before the extension.
    ancestry : list of str
        Ancestry labels to load.
    biobank : list of str
        Biobank labels to load.

    Returns
    -------
    pd.DataFrame
        Outer merge of all loaded tables on the key columns.
    """
    key_columns = ["hgnc_gene", "gene_mask", "ID", "lf", "ALLELE0", "ALLELE1"]
    stat_columns = ["N", "BETA", "SE", "CHISQ", "LOG10P", "p_value", "nsamples"]

    def load_cohort(anc, bank):
        # Read one cohort file and tag its statistic columns with the cohort.
        path = os.path.join(proj_dir, f"bmi_rint_{anc}_{bank}_{test_name}.tsv.gz")
        table = pd.read_csv(path, sep="\t", usecols=key_columns + stat_columns)
        return table.rename(columns={col: f"{col}_{anc}_{bank}" for col in stat_columns})

    cohort_tables = [
        load_cohort(anc, bank)
        for anc in ancestry
        for bank in biobank
        if not (anc == "mid" and bank == "ukb")  # this combination is skipped
    ]

    def outer_join(left, right):
        # Keep rows present in either table.
        return left.merge(right, on=key_columns, how="outer")

    return reduce(outer_join, cohort_tables)


In [5]:
# For each value in the list below: merge the per-cohort tables, run the
# row-wise meta-analysis, and write the full and the significant results.
for lf in ["pa", "smoke", "alcohol"]:
    proj_dir = f"../data/meta/processed/{lf}/"
    results_dir = f"../data/meta/results/{lf}/ivw_fixed"
    # Create the output folder up front so a missing directory cannot fail
    # the run after the row-wise computation has already been done.
    os.makedirs(results_dir, exist_ok=True)

    meta_df = create_meta_df(proj_dir, "int")
    meta_res_df = meta_df.apply(get_meta_stats, axis=1)
    meta_res_df.to_csv(
        f"{results_dir}/monolf_meta.tsv.gz",
        index=False, sep="\t"
    )

    # Keep rows whose all-cohort p-value passes BONF_P and whose
    # European-only or non-European-only p-value also passes it.
    sig_meta_res_df = meta_res_df.loc[
        ((meta_res_df.ep_value<BONF_P)|(meta_res_df.nep_value<BONF_P))&
        (meta_res_df.p_value<BONF_P)
        ]
    sig_meta_res_df.to_csv(
        f"{results_dir}/monolf_meta_sig.tsv",
        index=False, sep="\t"
    )
