In [1]:
# !pip install statsmodels
# !pip install scikit-learn

In [2]:
import re
import numpy as np
import pandas as pd
from scipy import stats
from functools import reduce
import statsmodels.api as sm
from patsy import dmatrices
from sklearn.preprocessing import StandardScaler
import json


In [3]:
def normalize_covariates(pheno_df, covariates, exclude=set(["genetic_sex"])):
    """Return a copy of ``pheno_df`` with the requested columns rescaled.

    Every column named in ``covariates`` that is not listed in ``exclude`` is
    replaced by the output of a freshly fitted ``StandardScaler``. All other
    columns, and the input frame itself, are left untouched.

    Args:
        pheno_df: DataFrame holding the columns to rescale.
        covariates: Iterable of column names to consider.
        exclude: Column names to leave unscaled (``genetic_sex`` by default).

    Returns:
        A new DataFrame with the selected columns rescaled.
    """
    scaled_df = pheno_df.copy()
    columns_to_scale = [name for name in covariates if name not in exclude]
    for name in columns_to_scale:
        # A new scaler per column keeps each column's fit independent.
        scaled_df[name] = StandardScaler().fit_transform(scaled_df.loc[:, [name]])
    return scaled_df

def create_gene_burden_table_helper(burden_df, annotations, maf, lf_samples_df, hgnc_dict):
    """Collapse variant-level burden rows into one carrier set per gene.

    Rows are kept when their ``annotation`` is in ``annotations`` and their
    ``maf_max`` is at most ``maf``. Gene identifiers are translated through
    ``hgnc_dict`` and the comma-separated ``samples`` strings of each gene are
    merged into a single set. Identifiers missing from ``hgnc_dict`` map to
    NaN and are therefore dropped by the group-by.

    ``burden_df`` is never modified. Writing the mapped names back into the
    caller's frame would make a second call on the same frame map the
    already-mapped names again, yielding NaN genes and an empty table unless
    ``hgnc_dict`` happens to map its own values to themselves.

    Args:
        burden_df: DataFrame with ``gene``, ``annotation``, ``maf_max`` and
            ``samples`` columns.
        annotations: Annotation labels to keep.
        maf: Upper bound (inclusive) on ``maf_max``.
        lf_samples_df: DataFrame appended below the per-gene rows; may be empty.
        hgnc_dict: Mapping from the identifiers in ``burden_df.gene`` to the
            gene names used in the output.

    Returns:
        DataFrame with ``gene`` and ``samples`` (a set per gene) columns,
        followed by the rows of ``lf_samples_df``.
    """
    keep = burden_df.annotation.isin(annotations) & (burden_df.maf_max <= maf)
    # Map on a filtered copy so the caller's frame keeps its original identifiers.
    selected = burden_df.loc[keep, ["gene", "samples"]].copy()
    selected["gene"] = selected.gene.map(hgnc_dict)
    masked_burden_df = (
        selected.groupby("gene")
        .agg({"samples": lambda x: set(",".join(x).split(","))})
        .reset_index()
    )
    masked_burden_df = pd.concat([masked_burden_df, lf_samples_df])
    return masked_burden_df

def create_gene_burden_tables(burden_df, maf, lf_samples_df, hgnc_dict):
    """Build one gene-level carrier table per variant mask.

    The masks are cumulative: ``pLoF`` uses only ``lof`` variants,
    ``Missense_strict`` adds ``missense_strict`` and ``Missense_lenient``
    additionally adds ``missense_lenient``.

    Args:
        burden_df: Variant-level burden DataFrame handed to
            ``create_gene_burden_table_helper``.
        maf: Allele-frequency ceiling forwarded to the helper.
        lf_samples_df: Extra rows the helper appends to every table.
        hgnc_dict: Gene-identifier mapping forwarded to the helper.

    Returns:
        Dict mapping mask name to the helper's output table, in the order
        ``pLoF``, ``Missense_strict``, ``Missense_lenient``.
    """
    mask_annotations = {
        "pLoF": ["lof"],
        "Missense_strict": ["lof", "missense_strict"],
        "Missense_lenient": ["lof", "missense_strict", "missense_lenient"],
    }
    gene_burden_dict = {}
    for mask_name, terms in mask_annotations.items():
        gene_burden_dict[mask_name] = create_gene_burden_table_helper(
            burden_df, terms, maf, lf_samples_df, hgnc_dict
        )
    return gene_burden_dict


def get_samples_helper(combos, genotype_df, cohort_samples):
    """Find cohort samples that appear under every key in ``combos``.

    Args:
        combos: Keys to look up in ``genotype_df.gene``.
        genotype_df: DataFrame with a ``gene`` column and a ``samples`` column
            holding a collection of sample IDs per row.
        cohort_samples: Set of sample IDs the result is restricted to.

    Returns:
        The set of IDs shared by all matching rows and ``cohort_samples``, or
        an empty list when at least one key is absent from ``genotype_df``.
    """
    known_keys = set(genotype_df.gene.values)
    if len(set(combos) & known_keys) != len(combos):
        # At least one requested key has no row, so nobody can carry the combination.
        return []
    carrier_sets = genotype_df.loc[genotype_df.gene.isin(combos)].samples.values
    shared = reduce(lambda left, right: set(left).intersection(set(right)), carrier_sets)
    return cohort_samples.intersection(shared)

def get_samples(ser, gene_burden_dict, pop_samples):
    """Look up the cohort samples for one row of the association table.

    Args:
        ser: Row with ``gene`` and ``gene_mask`` entries and, optionally, an
            ``lf`` entry.
        gene_burden_dict: Mapping from mask name to its carrier table.
        pop_samples: Set of sample IDs the result is restricted to.

    Returns:
        Whatever ``get_samples_helper`` returns for the gene (and ``lf`` key,
        when the row has one) in the table selected by ``ser.gene_mask``.
    """
    gene_name = ser.gene
    mask_table = gene_burden_dict[ser.gene_mask]
    # When the row carries an ``lf`` entry it is looked up as a second key in the same table.
    query_keys = [gene_name, ser.lf] if "lf" in ser.index else [gene_name]
    return get_samples_helper(query_keys, mask_table, pop_samples)

def train_model_sm(X, y, term="gene:bmi_prs"):
    """Fit an ordinary least squares model and report one term's statistics.

    Args:
        X: Design matrix with named columns, one of which is ``term``.
        y: Response passed to ``sm.OLS``.
        term: Name of the coefficient to report. Defaults to the gene x PRS
            interaction term ``"gene:bmi_prs"``.

    Returns:
        Tuple ``(coef, se, conf_int, p_value, t_stat, nobs)`` where
        ``conf_int`` is the ``term`` row of ``results.conf_int()`` as an array
        (lower bound first) and ``nobs`` is the number of observations used.

    Raises:
        KeyError: If ``term`` is not a column of the fitted design matrix.
    """
    results = sm.OLS(y, X).fit()
    return (
        results.params.loc[term],
        results.bse.loc[term],
        results.conf_int().loc[term].values,
        results.pvalues.loc[term],
        results.tvalues.loc[term],
        results.nobs,
    )

def create_feature_label(gene_pheno_df, label, covariates=None):
    """Fit ``label ~ gene + bmi_prs + gene:bmi_prs + covariates`` and return
    the interaction-term statistics.

    Args:
        gene_pheno_df: DataFrame containing ``label``, ``gene``, ``bmi_prs``
            and every covariate column.
        label: Name of the outcome column.
        covariates: Covariate column names added to the model. When omitted,
            the notebook-level ``covariates`` variable is used so existing
            two-argument calls keep working.

    Returns:
        The tuple produced by ``train_model_sm``:
        ``(coef, se, conf_int, p_value, t_stat, nobs)``.

    Raises:
        NameError: If ``covariates`` is omitted and no notebook-level
            ``covariates`` variable exists.
    """
    if covariates is None:
        try:
            covariates = globals()["covariates"]
        except KeyError:
            raise NameError("name 'covariates' is not defined; pass it explicitly") from None
    # Joining a term list avoids a dangling "+" when no covariates are given.
    terms = ["gene", "bmi_prs", "gene:bmi_prs", *covariates]
    equation = f"{label} ~ " + " + ".join(terms)
    y, X = dmatrices(equation, data=gene_pheno_df, return_type='dataframe')
    return train_model_sm(X, y)


def get_pgs_interaction(ser, gene_burden_dict, pop_samples, pheno_df, covariates):
    """Estimate the gene x PRS interaction on ``bmi`` and ``bmi_rint`` for one row.

    Args:
        ser: Row with ``gene`` and ``gene_mask`` entries (see ``get_samples``).
        gene_burden_dict: Mapping from mask name to its carrier table.
        pop_samples: Set of sample IDs considered part of the cohort.
        pheno_df: Phenotype DataFrame with ``sample_names``, ``bmi``,
            ``bmi_rint``, ``bmi_prs`` and the covariate columns.
        covariates: Covariate column names included in both models.

    Returns:
        ``pd.Series`` with ``gene``, ``gene_mask`` and, for each outcome prefix
        (``bmi``, ``bmi_rint``), the entries ``_coef``, ``_se``, ``_obs``,
        ``_t_stat``, ``_p_value``, ``_ci_low`` and ``_ci_high``.
    """
    sample_names = get_samples(ser, gene_burden_dict, pop_samples)
    gene_pheno_df = pheno_df.copy()
    # 0/1 indicator marking the samples returned for this gene/mask.
    gene_pheno_df["gene"] = pheno_df.sample_names.isin(sample_names).astype(int)

    record = {"gene": ser.gene, "gene_mask": ser.gene_mask}
    for label in ("bmi", "bmi_rint"):
        coef, se, conf, p_val, t_stat, nobs = create_feature_label(gene_pheno_df, label, covariates)
        # Key order is kept stable because it becomes the column order of the result table.
        record.update({
            f"{label}_coef": coef,
            f"{label}_se": se,
            f"{label}_obs": int(nobs),
            f"{label}_t_stat": t_stat,
            f"{label}_p_value": p_val,
            f"{label}_ci_low": conf[0],
            f"{label}_ci_high": conf[1],
        })
    return pd.Series(record)

def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder, then delete the local copy.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder handed to ``dxpy.upload_local_file``
            (``parents=True`` is forwarded with it).

    The local file is removed only after the upload call returns; if the
    upload raises, the file is left in place.
    """
    # Imported here because the notebook's import cell provides neither module;
    # without these the function fails with NameError on first use.
    import os

    import dxpy

    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    print(f"*********{filename} uploaded!!*********")
    os.remove(filename)


In [4]:
# Association table; its rows supply the gene / gene_mask pairs tested below.
monogenic_meta_df = pd.read_csv("./bmi_rint_monogenic_meta.tsv", sep="\t")

# Variant-level burden table, joined with the gnomAD popmax allele frequency.
gene_burden_df = pd.read_csv("/mnt/project/notebooks/wes/burden_preparation/data/ukb_burden.tsv.gz", sep="\t")
gnomad_df = pd.read_csv(
    "/mnt/project/notebooks/wes/burden_preparation/data/gnomad_annot.tsv.gz", sep="\t",
    usecols=["locus", "alleles", "maf_gnomad_popmax"]
)
# NOTE(review): merge defaults to an inner join, so burden rows without a
# matching gnomAD annotation row are dropped here — confirm this is intended.
gene_burden_df = gene_burden_df.merge(gnomad_df, on=["locus", "alleles"])
# Larger of the two frequencies, computed column-wise rather than with a
# row-wise apply. The gnomAD value is taken only when it is strictly greater,
# which reproduces max(maf, maf_gnomad_popmax) exactly, including for NaN.
gene_burden_df["maf_max"] = np.where(
    gene_burden_df.maf_gnomad_popmax > gene_burden_df.maf,
    gene_burden_df.maf_gnomad_popmax,
    gene_burden_df.maf,
)

# Phenotype table; FID / IID are read as strings and IID becomes sample_names.
pheno_df = pd.read_csv("/mnt/project/notebooks/bmi/data/processed/eur_phenotype.tsv.gz", dtype={"FID": str, "IID": str}, sep="\t")
pheno_df = pheno_df.rename(columns={"IID": "sample_names"})

# Gene-identifier mapping used when building the burden tables.
with open('hgnc_gene_map.json', 'r') as f:
    # Load the JSON data into a Python dictionary
    hgnc_dict = json.load(f)


In [5]:
# One carrier table per mask, keeping variants with maf_max <= 0.001; the
# empty DataFrame means no extra rows are appended to the tables.
gene_burden_dict = create_gene_burden_tables(
    burden_df=gene_burden_df,
    maf=0.001,
    lf_samples_df=pd.DataFrame(),
    hgnc_dict=hgnc_dict,
)
# Sample IDs of the phenotype table, as strings, used to restrict carriers to this cohort.
pop_samples = set(pheno_df["sample_names"].astype(str))


In [6]:
# Correlation (pandas default method) between the bmi_rint and bmi_prs columns.
pheno_df["bmi_rint"].corr(pheno_df["bmi_prs"])

0.27749386254450953

In [7]:
# List the phenotype columns available for the outcome and covariate selection below.
pheno_df.columns

Index(['FID', 'sample_names', 'bmi', 'bmi_rint', 'genetic_sex', 'age', 'age_2',
       'age_sex', 'exome_release_batch', 'genetic_pca1', 'genetic_pca2',
       'genetic_pca3', 'genetic_pca4', 'genetic_pca5', 'genetic_pca6',
       'genetic_pca7', 'genetic_pca8', 'genetic_pca9', 'genetic_pca10',
       'bmi_prs', 'pa', 'alcohol', 'smoke', 'sleep', 'sedentary', 'diet'],
      dtype='object')

In [8]:
# Model covariates: age terms, the BMI PRS, genetic sex and genetic PCs 1-10.
covariates = ["age", "age_2", "age_sex", "bmi_prs", "genetic_sex"]
covariates.extend(f"genetic_pca{i}" for i in range(1, 11))

# Rescale the covariates and raw bmi; genetic_sex is skipped by
# normalize_covariates' default exclude set and bmi_rint is not passed in.
norm_pheno_df = normalize_covariates(pheno_df, [*covariates, "bmi"])


In [9]:
# Fit the gene x PRS interaction models for every row of the association table.
monogenic_pgs_df = monogenic_meta_df.apply(
    lambda row: get_pgs_interaction(row, gene_burden_dict, pop_samples, norm_pheno_df, covariates),
    axis=1,
)


In [10]:
# Write the interaction results to the working directory without the row index.
monogenic_pgs_df.to_csv("monogenic_pgs_int_ukb_eur.csv.gz", index=False)
