In [None]:
import numpy as np
import pandas as pd
import json
from scipy.stats import pearsonr
import re
from functools import reduce
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 7, 'axes.linewidth': 1, 'xtick.major.width': 1, 'xtick.major.size': 5, 'ytick.major.width': 1, 'ytick.major.size': 5})
from matplotlib.backends.backend_pdf import PdfPages

In [None]:

def create_gene_burden_table_helper(burden_df, annotations, maf, lf_samples_df):
    """Collapse variant-level rows into one row of carrier samples per gene.

    Parameters
    ----------
    burden_df : pandas.DataFrame
        Table with ``gene``, ``annotation``, ``maf`` and ``samples`` columns,
        where ``samples`` is a comma-separated string of sample identifiers.
    annotations : list
        Values of ``annotation`` to keep.
    maf : float
        Upper bound (inclusive) applied to the ``maf`` column.
    lf_samples_df : pandas.DataFrame
        Extra rows appended below the per-gene table (may be empty).

    Returns
    -------
    pandas.DataFrame
        One row per retained gene with ``samples`` holding the set of all
        sample identifiers seen for that gene, followed by the rows of
        ``lf_samples_df``.
    """
    def _collect_samples(sample_strings):
        # Join first, then split, so every identifier across the gene's rows
        # lands in a single set.
        return set(",".join(sample_strings).split(","))

    keep_row = burden_df.annotation.isin(annotations) & (burden_df.maf <= maf)
    per_gene = (
        burden_df.loc[keep_row]
        .groupby("gene")
        .agg({"samples": _collect_samples})
        .reset_index()
    )
    return pd.concat([per_gene, lf_samples_df])

def create_gene_burden_tables(burden_df, maf, lf_samples_df):
    """Build the per-gene carrier table for each of the three variant masks.

    Each mask is a progressively wider set of annotation terms; the same
    ``maf`` threshold and ``lf_samples_df`` are forwarded to
    ``create_gene_burden_table_helper`` for every mask.

    Returns
    -------
    dict
        Mask name -> table returned by ``create_gene_burden_table_helper``,
        keyed by ``"PTV"``, ``"PTV_Missense_strict"`` and
        ``"PTV_Missense_lenient"`` (in that order).
    """
    annotations_by_mask = {
        "PTV": ["lof"],
        "PTV_Missense_strict": ["lof", "missense_strict"],
        "PTV_Missense_lenient": ["lof", "missense_strict", "missense_lenient"],
    }
    return {
        mask_name: create_gene_burden_table_helper(burden_df, terms, maf, lf_samples_df)
        for mask_name, terms in annotations_by_mask.items()
    }


def get_samples_helper(combos, genotype_df, cohort_samples):
    """Return the cohort samples shared by every identifier in ``combos``.

    Parameters
    ----------
    combos : list
        Identifiers looked up in ``genotype_df.gene``.
    genotype_df : pandas.DataFrame
        Table with a ``gene`` column and a ``samples`` column holding an
        iterable of sample identifiers per row.
    cohort_samples : set
        Samples the result is restricted to (``.intersection`` is called on it).

    Returns
    -------
    set
        Intersection of the ``samples`` of all rows whose ``gene`` is in
        ``combos``, restricted to ``cohort_samples``. An empty set is returned
        when the identifiers in ``combos`` are not all found in
        ``genotype_df.gene``.
    """
    requested = set(combos)
    available = set(genotype_df.gene.values)
    # The count is compared with len(combos), not len(requested), so a list
    # containing a repeated identifier is never considered fully matched.
    if len(requested & available) != len(combos):
        # Same type as the success branch, so callers always receive a set.
        return set()

    carrier_sets = genotype_df.loc[genotype_df.gene.isin(combos), "samples"].values
    shared = reduce(lambda acc, nxt: set(acc).intersection(nxt), carrier_sets)
    return cohort_samples.intersection(shared)


def get_samples(ser, gene_burden_dict, pop_samples):
    """Resolve the gene and its carrier samples for one association row.

    ``ser.ID`` is parsed as ``<gene>.<mask>.0.001`` where the mask starts with
    ``PTV``; the mask selects a table from ``gene_burden_dict``. When the row
    has an ``lf`` entry, it is added as a second identifier that the returned
    samples must also match.

    Parameters
    ----------
    ser : pandas.Series
        Row with an ``ID`` entry and, optionally, an ``lf`` entry.
    gene_burden_dict : dict
        Mask name -> table passed on to ``get_samples_helper``.
    pop_samples : set
        Samples the result is restricted to.

    Returns
    -------
    tuple
        ``(gene, samples)`` where ``samples`` is the value returned by
        ``get_samples_helper``.

    Raises
    ------
    ValueError
        If ``ser.ID`` does not match the expected pattern.
    KeyError
        If the parsed mask is not a key of ``gene_burden_dict``.
    """
    # Raw string so that "\." reaches the regex engine as an escaped dot
    # instead of being an invalid Python string escape.
    match = re.match(r"(.+)\.(PTV.*)\.0\.001", ser.ID)
    if match is None:
        # Fail with the offending ID rather than an AttributeError on None.
        raise ValueError(
            f"Cannot parse gene and mask from ID {ser.ID!r}; "
            "expected '<gene>.<PTV mask>.0.001'"
        )
    gene, mask = match.groups()
    gene_samples_df = gene_burden_dict[mask]

    combos = [gene]
    if "lf" in ser.index:
        combos.append(ser.lf)

    return gene, get_samples_helper(combos, gene_samples_df, pop_samples)

def get_bmi_pgs_info(ser, gene_burden_dict, pop_samples, pheno_df):
    """Collect the (BMI, polygenic score) pairs of the carriers for one row.

    Parameters
    ----------
    ser : pandas.Series
        Row providing ``ID`` and ``beta`` (and whatever ``get_samples`` reads).
    gene_burden_dict, pop_samples
        Forwarded unchanged to ``get_samples``.
    pheno_df : pandas.DataFrame
        Phenotypes with ``sample_names``, ``bmi`` and ``bmi_prs`` columns.

    Returns
    -------
    pandas.Series
        Entries ``ID``, ``gene``, ``beta`` and ``bmi_pgs``; the latter is a
        list of ``(bmi, bmi_prs)`` tuples, one per matching phenotype row.
    """
    gene, sample_names = get_samples(ser, gene_burden_dict, pop_samples)

    # Select the carrier rows once and read both columns from that selection.
    carriers = pheno_df.loc[pheno_df.sample_names.isin(sample_names)]
    pairs = list(zip(carriers["bmi"].values, carriers["bmi_prs"].values))

    record = {"ID": ser.ID, "gene": gene, "beta": ser.beta, "bmi_pgs": pairs}
    return pd.Series(record)
    

In [None]:
# Association rows to analyse; later cells apply functions row-wise that read
# each row's ``ID`` (and ``beta``) entry.
monogenic_meta_df = pd.read_excel("./monogenic_meta.xlsx")
# Variant-level burden table; create_gene_burden_table_helper reads its
# ``gene``, ``annotation``, ``maf`` and ``samples`` columns.
gene_burden_df = pd.read_csv("/mnt/project/notebooks/regenie/data/gene_burden.csv.gz")
# Phenotypes and covariates; sample identifiers are forced to strings so they
# compare equal to the identifiers split out of the burden table.
pheno_df = pd.read_csv("/mnt/project/notebooks/regenie/data/pheno.csv.gz", dtype={"sample_names": str})

In [None]:
# One carrier table per mask at MAF <= 0.001; the empty DataFrame means no
# extra rows are appended to the per-gene tables.
# NOTE(review): get_samples hard-codes the ".0.001" suffix in its ID regex, so
# this threshold presumably has to stay in sync with it — confirm.
gene_burden_dict = create_gene_burden_tables(gene_burden_df, 0.001, pd.DataFrame())
# All sample identifiers present in the phenotype table, as strings.
pop_samples = set(pheno_df.sample_names.astype(str))


In [None]:
# One output row per association row, with ``bmi_pgs`` holding the list of
# (bmi, bmi_prs) tuples of that row's carriers.
bmi_pgs_df = monogenic_meta_df.apply(get_bmi_pgs_info, axis=1, args=(gene_burden_dict, pop_samples, pheno_df))
# Long format: one row per (bmi, pgs) tuple, then drop repeated
# gene/beta/tuple combinations.
# NOTE(review): explode turns an empty list into NaN, so rows without any
# carriers would reach the tuple split below as NaN — confirm this cannot occur.
bmi_pgs_df = bmi_pgs_df.explode("bmi_pgs").reset_index(drop=True).drop_duplicates(["gene", "beta", "bmi_pgs"])
# Split each tuple into separate ``bmi`` and ``pgs`` columns, aligned on the
# current index.
bmi_pgs_df[['bmi', 'pgs']] = pd.DataFrame(bmi_pgs_df['bmi_pgs'].tolist(), index=bmi_pgs_df.index)

In [None]:
from patsy import dmatrices

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler instance that is fitted on the ``age`` column in a later cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on ``age`` and overwrite the column with the transformed
# values; the raw ages are no longer available in pheno_df after this cell.
pheno_df["age"] = scaler.fit_transform(pheno_df.loc[:, ["age"]])

In [None]:
def train_model_sm(X, y, term="gene:bmi_prs"):
    """Fit an OLS model and report the statistics of a single coefficient.

    Parameters
    ----------
    X
        Design matrix passed to ``sm.OLS`` as the regressors; it must contain
        a column labelled ``term``.
    y
        Response passed to ``sm.OLS``.
    term : str, default "gene:bmi_prs"
        Label of the coefficient whose statistics are returned. The default
        keeps the previously hard-coded interaction term.

    Returns
    -------
    tuple
        ``(coef, se, conf_int, p_value, t_stat, nobs)`` for ``term``, where
        ``conf_int`` is the array of the confidence-interval bounds and
        ``nobs`` is the number of observations reported by the fit.

    Raises
    ------
    KeyError
        If ``term`` is not among the fitted coefficients.
    """
    results = sm.OLS(y, X).fit()

    coef = results.params.loc[term]
    std_err = results.bse.loc[term]
    t_stat = results.tvalues.loc[term]
    conf = results.conf_int().loc[term].values
    p_value = results.pvalues.loc[term]
    # Return order is part of the contract: the interval comes before the
    # p-value and the t statistic.
    return coef, std_err, conf, p_value, t_stat, results.nobs

def get_pgs_interaction(ser, gene_burden_dict, pop_samples, pheno_df):
    """Estimate the carrier-by-polygenic-score interaction on BMI for one row.

    The carriers resolved by ``get_samples`` are encoded as a 0/1 ``gene``
    column on a copy of ``pheno_df``; BMI is then regressed on age, genetic
    sex, ten genetic principal components, ``gene``, ``bmi_prs`` and their
    interaction, and the statistics returned by ``train_model_sm`` are
    packaged into a Series.

    Returns
    -------
    pandas.Series
        Entries ``ID``, ``gene``, ``coef``, ``se``, ``obs``, ``t_stat``,
        ``p_value``, ``ci_low`` and ``ci_high``.
    """
    gene, sample_names = get_samples(ser, gene_burden_dict, pop_samples)

    # Carrier indicator lives on a new frame so pheno_df itself is untouched.
    is_carrier = pheno_df.sample_names.isin(sample_names).astype(int)
    gene_pheno_df = pheno_df.assign(gene=is_carrier)

    pc_terms = " + ".join(f"genetic_pca{idx}" for idx in range(1, 11))
    equation = "bmi ~ age + genetic_sex + " + pc_terms + " + gene + bmi_prs + gene*bmi_prs"

    y, X = dmatrices(equation, data=gene_pheno_df, return_type='dataframe')
    int_coef, int_se, int_conf, int_p_val, int_stat, nobs = train_model_sm(X, y)

    summary = {
        "ID": ser.ID,
        "gene": gene,
        "coef": int_coef,
        "se": int_se,
        "obs": int(nobs),
        "t_stat": int_stat,
        "p_value": int_p_val,
        "ci_low": int_conf[0],
        "ci_high": int_conf[1],
    }
    return pd.Series(summary)

In [None]:
# Fit one interaction model per association row; each call returns a Series,
# so the result is a DataFrame with one row of statistics per input row.
# When cells run top to bottom, pheno_df["age"] has already been overwritten
# with its scaled values at this point.
monogenic_pgs_df = monogenic_meta_df.apply(get_pgs_interaction, axis=1, args=(gene_burden_dict, pop_samples, pheno_df))

In [None]:
# Write the interaction statistics to the working directory without the row
# index; the ".gz" suffix lets pandas infer gzip compression.
monogenic_pgs_df.to_csv("monogenic_pgs_int_ukb.csv.gz", index=False)