In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
def get_alternate_df(df, alternate_column):
    """Return one row per alternate symbol listed in ``alternate_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least ``hgnc_id``, ``symbol`` and
        ``alternate_column``.
    alternate_column : str
        Name of a column holding "|"-separated strings.

    Returns
    -------
    pandas.DataFrame
        The ``hgnc_id``, ``symbol`` and ``alternate_column`` columns, with rows
        lacking a value in ``alternate_column`` removed and every
        "|"-separated entry expanded onto its own row (the original index
        label is repeated for the expanded rows).
    """
    wanted_columns = ["hgnc_id", "symbol", alternate_column]
    # Rows without any alternate entry contribute nothing to the result.
    with_values = df[wanted_columns].dropna(subset=[alternate_column])
    # Turn "A|B|C" into ["A", "B", "C"] so that explode() can fan it out.
    split_values = with_values[alternate_column].str.split("|")
    return with_values.assign(**{alternate_column: split_values}).explode(alternate_column)

def get_hgnc_harmony_dict(hgnc_df):
    """Build a lookup from any known gene symbol to its current HGNC symbol.

    Parameters
    ----------
    hgnc_df : pandas.DataFrame
        HGNC table with ``hgnc_id``, ``symbol`` and ``prev_symbol`` columns,
        where ``prev_symbol`` holds "|"-separated previous symbols.

    Returns
    -------
    dict
        Maps every current symbol to itself and every previous symbol to the
        current symbol of the row that lists it. A symbol that is both a
        current symbol and a previous symbol of another gene maps to itself.
    """
    prev_df = get_alternate_df(hgnc_df, "prev_symbol")
    # Start from the previous-symbol aliases. If the same previous symbol is
    # listed for several genes, the last row seen wins.
    harmonized = dict(zip(prev_df.prev_symbol, prev_df.symbol))
    # Overlay the identity mapping last so that an approved symbol is never
    # redirected to another gene just because it was once used as that gene's
    # previous symbol.
    harmonized.update(zip(hgnc_df.symbol, hgnc_df.symbol))
    return harmonized

In [3]:
# Compiled once at definition time; a raw string keeps the "\." escapes from
# being interpreted (and warned about) as string escapes.
_GENE_MASK_PATTERN = re.compile(r"(.+)\.(pLoF|Missense_strict|Missense_lenient)\.0\.001")


def get_gene_mask(regenie_id):
    """Split a REGENIE ID of the form ``<gene>.<mask>.0.001`` into its parts.

    Parameters
    ----------
    regenie_id : str
        Identifier whose start matches ``<gene>.<mask>.0.001`` where ``<mask>``
        is one of ``pLoF``, ``Missense_strict`` or ``Missense_lenient``.

    Returns
    -------
    pandas.Series
        Series with entries ``gene`` and ``gene_mask``.

    Raises
    ------
    ValueError
        If ``regenie_id`` does not match the expected pattern.
    """
    m = _GENE_MASK_PATTERN.match(regenie_id)
    if m is None:
        # Fail with the offending ID in the message instead of an opaque
        # AttributeError on a None match object.
        raise ValueError(
            f"REGENIE ID {regenie_id!r} does not match "
            "'<gene>.<pLoF|Missense_strict|Missense_lenient>.0.001'"
        )
    gene, mask = m.groups()
    return pd.Series({"gene": gene, "gene_mask": mask})

In [4]:
def harmonize_gene_symbols_and_filter(meta_df, hgnc_df, lf):
    """Annotate results with current HGNC symbols, filter them and split by TEST.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Results table with at least ``ID``, ``CHROM``, ``GENPOS``, ``TEST``
        and ``lf`` columns. An ``EXTRA`` column is dropped when present.
        The frame passed in is not modified.
    hgnc_df : pandas.DataFrame
        HGNC table passed to ``get_hgnc_harmony_dict``.
    lf : str
        Label appended to the rebuilt ``ID`` and used to select the
        interaction rows (``TEST`` starting with ``ADD-INT_SNPx<lf>``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cond_df, gene_df, int_df, joint_df)``: the filtered rows whose
        ``TEST`` is ``ADD-CONDTL``, ``ADD-INT_SNP``, starts with
        ``ADD-INT_SNPx<lf>``, and ``ADD-INT_2DF`` respectively.
    """
    # Work on a copy so the new columns never leak into the caller's frame.
    meta_df = meta_df.copy()
    # Add gene name and mask parsed from the REGENIE ID
    meta_df[["gene", "gene_mask"]] = meta_df.ID.apply(get_gene_mask)
    # Add the current HGNC symbol (NaN when the gene symbol is unknown)
    hgnc_dict = get_hgnc_harmony_dict(hgnc_df)
    meta_df["hgnc_gene"] = meta_df.gene.map(hgnc_dict)
    # Keep rows which have
    #   - chromosome and gene position info,
    #   - an HGNC annotation,
    #   - a unique (hgnc_gene, gene_mask, lf, TEST) combination; every member
    #     of a duplicated group is discarded (keep=False).
    # Some variant-sample pairs may not have been filtered from the bim files
    # because the genotype was set to missing for low quality in hail but not
    # in bim - something to check.
    # NOTE(review): no explicit sample-count filter is applied here;
    # presumably the CHROM/GENPOS checks are meant to cover "at least one
    # sample per gene-mask pair" - confirm.
    keep_rows = (
        meta_df.CHROM.notna()
        & meta_df.GENPOS.notna()
        & meta_df.hgnc_gene.notna()
        & ~meta_df.duplicated(subset=["hgnc_gene", "gene_mask", "lf", "TEST"], keep=False)
    )
    # errors="ignore": tolerate inputs that have no EXTRA column.
    # .copy(): make the filtered frame independent before assigning into it.
    meta_df = meta_df.loc[keep_rows].drop(columns=["EXTRA"], errors="ignore").copy()
    # Replace REGENIE IDs with "<hgnc gene>::<gene mask>::<lf>"
    meta_df["ID"] = meta_df.hgnc_gene + "::" + meta_df.gene_mask + "::" + lf
    # Divide by TEST
    cond_df = meta_df.loc[meta_df.TEST == "ADD-CONDTL"]
    gene_df = meta_df.loc[meta_df.TEST == "ADD-INT_SNP"]
    # na=False: a missing TEST value is excluded, consistent with the equality
    # comparisons, instead of producing an NA mask that .loc rejects.
    int_df = meta_df.loc[meta_df.TEST.str.startswith(f"ADD-INT_SNPx{lf}", na=False)]
    joint_df = meta_df.loc[meta_df.TEST == "ADD-INT_2DF"]
    return cond_df, gene_df, int_df, joint_df



In [5]:
# Load the tab-separated HGNC gene file, reading only the identifier, symbol,
# name and alias/previous symbol and name columns.
hgnc_df = pd.read_csv(
    "../data/hgnc/protein-coding_gene.txt",
    sep="\t",
    usecols=[
        "hgnc_id",
        "symbol",
        "name",
        "alias_symbol",
        "alias_name",
        "prev_symbol",
        "prev_name",
    ],
)

In [6]:
# Harmonize and split the raw results for every biobank / ancestry / lf
# combination, writing one tab-separated file per test type.
for biobank in ["aou", "ukb"]:
    for ancestry in ["afr", "amr", "eas", "eur", "sas", "mid"]:
        # The mid-ancestry UKB run resulted in an error during HLM fitting,
        # so that combination is dropped.
        if biobank == "ukb" and ancestry == "mid":
            continue
        for lf in ["pa", "smoke", "alcohol"]:
            filename = f"../data/meta/raw/{lf}/bmi_rint_{ancestry}_{biobank}_meta_w_samples.tsv.gz"
            meta_df = pd.read_csv(filename, sep="\t")
            # cond  - marginal test: effect of the gene with lf as covariate
            # gene  - main effect of the gene
            # int   - effect of the interaction term
            # joint - joint model of the gene effect and the interaction term
            cond_df, gene_df, int_df, joint_df = harmonize_gene_symbols_and_filter(meta_df, hgnc_df, lf)
            frames_by_name = {
                "cond": cond_df,
                "gene": gene_df,
                "int": int_df,
                "joint": joint_df,
            }
            for name, processed_meta_df in frames_by_name.items():
                save_file = f"../data/meta/processed/{lf}/bmi_rint_{ancestry}_{biobank}_{name}.tsv.gz"
                processed_meta_df.to_csv(save_file, sep="\t", index=False)


In [7]:
# Display the interaction-test table left in `int_df` by the last iteration
# of the processing loop (the final biobank / ancestry / lf combination run).
int_df

Unnamed: 0,CHROM,GENPOS,ID,ALLELE0,ALLELE1,A1FREQ,N,TEST,BETA,SE,CHISQ,LOG10P,p_value,lf,nsamples,gene,gene_mask,hgnc_gene
2,1,33306784,A3GALT2::pLoF::alcohol,ref,pLoF.0.001,0.000861,8706,ADD-INT_SNPxalcohol=1,1.035850,0.957798,1.169610,0.553648,0.279481,alcohol,1.0,A3GALT2,pLoF,A3GALT2
6,1,33306784,A3GALT2::Missense_strict::alcohol,ref,Missense_strict.0.001,0.000861,8706,ADD-INT_SNPxalcohol=1,1.035850,0.957798,1.169610,0.553648,0.279481,alcohol,1.0,A3GALT2,Missense_strict,A3GALT2
10,1,33306784,A3GALT2::Missense_lenient::alcohol,ref,Missense_lenient.0.001,0.000861,8706,ADD-INT_SNPxalcohol=1,1.035850,0.957798,1.169610,0.553648,0.279481,alcohol,1.0,A3GALT2,Missense_lenient,A3GALT2
22,1,93993244,ABCA4::Missense_lenient::alcohol,ref,Missense_lenient.0.001,0.004422,8706,ADD-INT_SNPxalcohol=1,-0.161175,0.478517,0.113449,0.132974,0.736251,alcohol,4.0,ABCA4,Missense_lenient,ABCA4
31,1,94418499,ABCD3::Missense_lenient::alcohol,ref,Missense_lenient.0.001,0.001493,8706,ADD-INT_SNPxalcohol=1,1.213190,0.945580,1.646130,0.700087,0.199486,alcohol,1.0,ABCD3,Missense_lenient,ABCD3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86161,22,22514120,ZNF280A::Missense_strict::alcohol,ref,Missense_strict.0.001,0.000861,8706,ADD-INT_SNPxalcohol=1,-0.550456,0.961152,0.327990,0.246536,0.566845,alcohol,1.0,ZNF280A,Missense_strict,ZNF280A
86165,22,22514120,ZNF280A::Missense_lenient::alcohol,ref,Missense_lenient.0.001,0.000861,8706,ADD-INT_SNPxalcohol=1,-0.550456,0.961152,0.327990,0.246536,0.566845,alcohol,1.0,ZNF280A,Missense_lenient,ZNF280A
86169,22,22487774,ZNF280B::pLoF::alcohol,ref,pLoF.0.001,0.000172,8706,ADD-INT_SNPxalcohol=1,1.904760,1.148940,2.748410,1.011660,0.097351,alcohol,1.0,ZNF280B,pLoF,ZNF280B
86173,22,22487774,ZNF280B::Missense_strict::alcohol,ref,Missense_strict.0.001,0.000172,8706,ADD-INT_SNPxalcohol=1,1.904760,1.148940,2.748410,1.011660,0.097351,alcohol,1.0,ZNF280B,Missense_strict,ZNF280B
