In [None]:
# !pip install statsmodels
# !pip install openpyxl
# !pip install --upgrade seaborn
# !pip install scikit-learn

In [None]:
import os
import dxpy
import numpy as np
import pandas as pd
import json
from scipy.stats import pearsonr, ttest_ind
import re
import itertools as it
from functools import reduce
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols 
import seaborn as sns
from patsy import dmatrices
import matplotlib.pyplot as plt
import matplotlib
# Global figure styling for small print-size panels: 7 pt text, 1 pt axis lines and ticks.
matplotlib.rcParams.update({'font.size': 7, 'axes.linewidth': 1, 'xtick.major.width': 1, 'xtick.major.size': 5, 'ytick.major.width': 1, 'ytick.major.size': 5})
# Font type 42 embeds TrueType fonts, so text in exported PDF/PS files stays editable as text.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from matplotlib.backends.backend_pdf import PdfPages


In [None]:
# Input tables used by the rest of the notebook.
# monogenic_meta_df: one row per tested gene/mask; each row is passed to get_ancova_res, which reads its `ID` field.
monogenic_meta_df = pd.read_excel("./monogenic_meta.xlsx")
# gene_burden_df: variant-level table; create_gene_burden_table_helper reads its
# `gene`, `annotation`, `maf` and `samples` (comma-separated sample IDs) columns.
gene_burden_df = pd.read_csv("/mnt/project/notebooks/regenie/data/gene_burden.csv.gz")
# pheno_df: per-sample phenotypes; sample_names is forced to str so it can be compared with the
# string sample IDs split out of gene_burden_df.samples.
pheno_df = pd.read_csv("/mnt/project/notebooks/regenie/data/pheno.csv.gz", dtype={"sample_names": str})

In [None]:

def create_gene_burden_table_helper(burden_df, annotations, maf, lf_samples_df):
    """Collapse qualifying variants into one carrier set per gene.

    Parameters
    ----------
    burden_df : pandas.DataFrame
        Variant-level table with ``gene``, ``annotation``, ``maf`` and
        ``samples`` (comma-separated sample IDs) columns.
    annotations : list of str
        Annotation terms a variant must carry to be included.
    maf : float
        Inclusive upper bound on ``burden_df.maf``.
    lf_samples_df : pandas.DataFrame
        Extra rows appended unchanged below the per-gene rows (may be empty).

    Returns
    -------
    pandas.DataFrame
        One row per gene with ``gene`` and ``samples`` (a ``set`` of sample
        IDs), followed by the rows of ``lf_samples_df``.
    """

    def _union_of_sample_lists(sample_strings):
        # Every cell is "id1,id2,..."; joining first lets a single split
        # yield the union of carriers across all variants of the gene.
        return set(",".join(sample_strings).split(","))

    keep_rows = burden_df.annotation.isin(annotations) & (burden_df.maf <= maf)
    qualifying_variants = burden_df.loc[keep_rows]
    per_gene_carriers = (
        qualifying_variants
        .groupby("gene")
        .agg({"samples": _union_of_sample_lists})
        .reset_index()
    )
    return pd.concat([per_gene_carriers, lf_samples_df])

def create_gene_burden_tables(burden_df, maf, lf_samples_df):
    """Build the per-gene carrier table for each of the three variant masks.

    Parameters
    ----------
    burden_df : pandas.DataFrame
        Variant-level burden table (see ``create_gene_burden_table_helper``).
    maf : float
        Inclusive minor-allele-frequency ceiling applied to every mask.
    lf_samples_df : pandas.DataFrame
        Rows appended to every mask's table.

    Returns
    -------
    dict
        Maps mask name (``"PTV"``, ``"PTV_Missense_strict"``,
        ``"PTV_Missense_lenient"``) to that mask's carrier table.
    """
    # Each mask is a superset of the previous one's annotation terms.
    annotations_by_mask = {
        "PTV": ["lof"],
        "PTV_Missense_strict": ["lof", "missense_strict"],
        "PTV_Missense_lenient": ["lof", "missense_strict", "missense_lenient"],
    }
    return {
        mask_name: create_gene_burden_table_helper(burden_df, terms, maf, lf_samples_df)
        for mask_name, terms in annotations_by_mask.items()
    }

In [None]:
# Carrier tables per mask at MAF <= 0.001, with no extra rows appended (empty DataFrame).
# The 0.001 here has to agree with the ".0.001" suffix hard-coded in the ID regex of get_samples.
gene_burden_dict = create_gene_burden_tables(gene_burden_df, 0.001, pd.DataFrame())


In [None]:



def get_samples_helper(combos, genotype_df, cohort_samples):
    """Return the cohort samples that carry every entry of ``combos``.

    Parameters
    ----------
    combos : list of str
        Values to look up in ``genotype_df.gene``.
    genotype_df : pandas.DataFrame
        Table with a ``gene`` column and a ``samples`` column holding a
        collection of sample IDs per row.
    cohort_samples : set
        Sample IDs the result is restricted to.

    Returns
    -------
    set
        Samples present in every matching row and in ``cohort_samples``.
        Empty when any entry of ``combos`` has no row in ``genotype_df``.
    """
    # Subset test instead of comparing against len(combos): a repeated entry
    # in combos no longer makes the lookup look like it failed.
    if not set(combos).issubset(set(genotype_df.gene.values)):
        # Same type as the success path, so callers always receive a set.
        return set()

    matching_rows = genotype_df.loc[genotype_df.gene.isin(combos)]
    # Coerce every row to a set up front so a single matching row is handled
    # exactly like several.
    carrier_sets = [set(row_samples) for row_samples in matching_rows.samples.values]
    shared_carriers = set.intersection(*carrier_sets)
    return cohort_samples.intersection(shared_carriers)


def get_samples(ser, gene_burden_dict, pop_samples):
    """Resolve a burden-test ID to its gene, mask and carrier samples.

    Parameters
    ----------
    ser : pandas.Series
        Row with an ``ID`` of the form ``<gene>.<PTV mask>.0.001`` and,
        optionally, an ``lf`` entry that must also be carried.
    gene_burden_dict : dict
        Maps mask name to a carrier table (``gene`` / ``samples`` columns).
    pop_samples : set
        Sample IDs the carriers are restricted to.

    Returns
    -------
    tuple
        ``(gene, mask, samples)`` where ``samples`` is whatever
        ``get_samples_helper`` returns for the gene (plus ``lf`` if present).

    Raises
    ------
    ValueError
        If ``ser.ID`` does not have the expected form.
    KeyError
        If the parsed mask is not a key of ``gene_burden_dict``.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence that
    # newer Python versions warn about. The compiled pattern is unchanged.
    pattern = re.compile(r"(.+)\.(PTV.*)\.0\.001")
    m = pattern.match(ser.ID)
    if m is None:
        # Fail with the offending ID rather than an AttributeError on None.
        raise ValueError(
            f"Cannot parse gene and mask from ID {ser.ID!r}; "
            "expected '<gene>.<PTV mask>.0.001'"
        )
    gene, mask = m.group(1), m.group(2)
    gene_samples_df = gene_burden_dict[mask]

    combos = [gene]
    if "lf" in ser.index:
        combos.append(ser.lf)

    samples = get_samples_helper(combos, gene_samples_df, pop_samples)
    return gene, mask, samples


def get_ancova_res(ser, gene_burden_dict, pheno_df, lifestyles, covariates):
    """Test gene-by-factor interactions on BMI with a factorial ANCOVA.

    Fits ``bmi ~ C(gene) + sum_lf [C(lf) + C(gene):C(lf)] + covariates`` by
    OLS, where ``gene`` is a 0/1 carrier indicator for the gene/mask encoded
    in ``ser.ID``, and reads the interaction rows of the Type II ANOVA table.

    Parameters
    ----------
    ser : pandas.Series
        Row with an ``ID`` parseable by ``get_samples``.
    gene_burden_dict : dict
        Maps mask name to carrier table; passed through to ``get_samples``.
    pheno_df : pandas.DataFrame
        Must contain ``sample_names``, ``bmi`` and every column named in
        ``lifestyles`` and ``covariates``. It is copied, not modified.
    lifestyles : list of str
        Columns entered as categorical factors, each with a gene interaction.
    covariates : list of str
        Columns entered as additive terms.

    Returns
    -------
    pandas.Series
        ``ID`` plus ``<lf>_F`` and ``<lf>_pvalue`` of the ``C(gene):C(<lf>)``
        term for every entry of ``lifestyles``.
    """
    pop_samples = set(pheno_df.sample_names.astype(str))
    _, _, sample_names = get_samples(ser, gene_burden_dict, pop_samples)

    gene_pheno_df = pheno_df.copy()
    # The indicator column is literally named "gene" so the formula can stay generic.
    gene_pheno_df["gene"] = gene_pheno_df.sample_names.isin(sample_names).astype(int)

    # Joining one flat list of terms yields the same formula string as before
    # and avoids a dangling " + " when a term list is empty.
    lifestyle_terms = [f"C({lf}) + C(gene):C({lf})" for lf in lifestyles]
    covariate_terms = [f"{cv}" for cv in covariates]
    model_equation = "bmi ~ " + " + ".join(["C(gene)"] + lifestyle_terms + covariate_terms)

    model = ols(model_equation, data=gene_pheno_df).fit()
    # The keyword is `typ`. The previous `type=2` was silently ignored by
    # anova_lm, so a sequential Type I table was produced instead of Type II.
    # NOTE(review): result files cached from earlier runs were presumably
    # computed with that Type I default - rerun to refresh them.
    result = sm.stats.anova_lm(model, typ=2)

    data_dict = {"ID": ser.ID}
    for lf in lifestyles:
        interaction = f"C(gene):C({lf})"
        data_dict[f"{lf}_F"] = result.loc[interaction, "F"]
        data_dict[f"{lf}_pvalue"] = result.loc[interaction, "PR(>F)"]

    return pd.Series(data_dict)

def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder, then delete the local copy.

    Parameters
    ----------
    filename : str
        Path of the local file to upload; removed once the upload call returns.
    proj_dir : str
        Destination folder, passed as ``folder`` with ``parents=True``.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    print(f"*********{filename} uploaded!!*********")
    # The local copy is only removed once the upload call has returned.
    os.unlink(filename)


In [None]:
# Factors tested for interaction with carrier status, and additive covariates of the ANCOVA model.
lifestyles = ['pa', 'alcohol', 'smoke', 'sleep', 'sedentary', 'diet', 'genetic_sex']
covariates = ["bmi_prs", "age"] + [f"genetic_pca{i}" for i in range(1, 11)] 
# Set to True to recompute the ANCOVA for every row of monogenic_meta_df and re-upload the result;
# False loads the previously uploaded table instead.
RERUN=False

if RERUN:
    # NOTE(review): this branch mutates pheno_df in place. Running it twice maps the already
    # recoded 0/1 values to NaN (Series.map with a dict yields NaN for unmapped keys).
    # NOTE(review): the recoding and scaling only happen when RERUN is True, so pheno_df differs
    # downstream depending on this flag - confirm the later plotting cells do not rely on it.
    pheno_df["genetic_sex"] = pheno_df.genetic_sex.map({"Male": 0, "Female": 1})
    scaler = StandardScaler()
    pheno_df["age"] = scaler.fit_transform(pheno_df.loc[:, ["age"]])
    # One ANCOVA per row; each call returns a Series that becomes one row of df.
    df = monogenic_meta_df.apply(get_ancova_res, axis=1, args=(gene_burden_dict, pheno_df, lifestyles, covariates))
    df.to_csv("./ukb_sex_lf_bias.csv.gz", index=False)
    proj_dir = "/notebooks/regenie/data/ancova/"
    # upload_file_to_project deletes the local file after uploading it.
    upload_file_to_project("ukb_sex_lf_bias.csv.gz", proj_dir)

else:
    df = pd.read_csv("/mnt/project/notebooks/regenie/data/ancova/ukb_sex_lf_bias.csv.gz")

In [None]:
# Table of hits to plot; rows are passed to create_violin, which reads `ID` and `lifestyle`.
# NOTE(review): this local file is not written anywhere in this notebook - confirm where it comes from.
meta_df = pd.read_csv("./sex_lf_bias.csv.gz")

In [None]:
# Display the table of hits for inspection.
# NOTE(review): this shows the whole frame; consider meta_df.head() if the table is large.
meta_df

In [None]:
# Bin the BMI polygenic score into quintiles; labels=False stores the integer bin index (0-4).
# create_violin uses prs_cat as one of the strata when matching non-carriers to carriers.
pheno_df["prs_cat"] = pd.qcut(pheno_df.bmi_prs, 5, labels=False)

In [None]:


def create_violin(ser, gene_burden_dict, pheno_df, order, annot_y=70, show_legend=True):
    """Draw split violins of BMI for carriers vs. matched non-carriers.

    Carriers of the gene/mask encoded in ``ser.ID`` are compared with a random
    sample of non-carriers (100 per carrier, fixed seed) drawn so that their
    joint distribution over PRS category and the factor ``ser.lifestyle``
    follows that of the carriers. Each factor level in ``order`` is annotated
    with a two-sided t-test of carrier vs. non-carrier BMI.

    Parameters
    ----------
    ser : pandas.Series
        Row providing ``ID`` (parsed by ``get_samples``) and ``lifestyle``
        (name of the ``pheno_df`` column shown on the x-axis).
    gene_burden_dict : dict
        Maps mask name to carrier table; passed through to ``get_samples``.
    pheno_df : pandas.DataFrame
        Must contain ``sample_names``, ``bmi``, ``prs_cat`` and the factor
        column. It is copied, not modified.
    order : list
        Factor levels in plotting order.
    annot_y : float, default 70
        y position of the significance bracket; the label sits one unit above.
    show_legend : bool, default True
        Keep a frameless legend (True) or remove the legend (False).

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.

    Raises
    ------
    ValueError
        If no carrier has non-missing ``prs_cat`` and factor values. pandas
        also raises it when fewer eligible non-carriers exist than requested.
    """
    pop_samples = set(pheno_df.sample_names.astype(str))
    gene, _, sample_names = get_samples(ser, gene_burden_dict, pop_samples)
    gene_pheno_df = pheno_df.copy()
    # The carrier indicator column is named after the gene so the legend shows it.
    gene_pheno_df[gene] = gene_pheno_df.sample_names.isin(sample_names).astype(int)

    factor = ser.lifestyle  # the factor (e.g., lifestyle variable)
    strata = ["prs_cat", factor]

    gene_carrier_df = gene_pheno_df.loc[gene_pheno_df[gene] == 1]
    # Explicit copy: a column is added below, which on a bare slice triggers
    # pandas' SettingWithCopyWarning.
    gene_noncarrier_df = gene_pheno_df.loc[gene_pheno_df[gene] == 0].copy()

    # Share of carriers falling in each (prs_cat, factor) stratum.
    carrier_counts = gene_carrier_df.groupby(strata).size()
    total_carrier_count = carrier_counts.sum()
    if total_carrier_count == 0:
        raise ValueError(f"No carriers with known prs_cat and {factor!r} for {ser.ID!r}")
    carrier_share = (carrier_counts / total_carrier_count).rename("carrier_share")

    # DataFrame.sample treats weights as per-row probabilities, so the chance
    # of drawing from a stratum is (row weight) x (rows in the stratum).
    # Dividing the carrier share by the stratum's non-carrier count makes the
    # stratum total equal the carrier share; using the share itself as row
    # weight (as before) over-samples strata that are common among non-carriers.
    stratum_size = gene_noncarrier_df.groupby(strata).size().rename("stratum_size")
    lookup = (
        gene_noncarrier_df[strata]
        .join(carrier_share, on=strata)
        .join(stratum_size, on=strata)
    )
    # Strata without carriers, and rows with missing stratum values, get weight 0
    # and are therefore never sampled.
    gene_noncarrier_df["weight"] = (lookup["carrier_share"] / lookup["stratum_size"]).fillna(0)

    # Sampling is without replacement, so the match is approximate when a
    # stratum holds few non-carriers relative to the number requested.
    gene_noncarrier_sampled_df = gene_noncarrier_df.sample(
        n=100 * len(gene_carrier_df), weights=gene_noncarrier_df["weight"], random_state=42
    )

    # Combine the carrier and sampled non-carrier groups
    plot_df = pd.concat([gene_carrier_df, gene_noncarrier_sampled_df])

    fig, ax = plt.subplots(1, 1, figsize=(1.75, 1.5))
    sns.violinplot(
        data=plot_df, x=factor, y="bmi", hue=gene, ax=ax, order=order, palette="Set2", split=True, gap=0.1,
        inner_kws=dict(box_width=5, whis_width=1)
    )

    # Categorical x positions are 0, 1, ... in the order given by `order`.
    for x_pos, level in enumerate(order):
        level_df = plot_df.loc[plot_df[factor] == level]
        ttest_res = ttest_ind(
            level_df.loc[level_df[gene] == 1, "bmi"].dropna(),
            level_df.loc[level_df[gene] == 0, "bmi"].dropna(),
            alternative="two-sided",
        )
        ttest_pval = ttest_res.pvalue
        if ttest_pval < 0.01:
            pval_text = "P<0.01"
        elif ttest_pval < 0.05:
            pval_text = f"P={round(ttest_pval, 2)}"
        else:
            # Also reached when the p-value is NaN (e.g. an empty group).
            pval_text = "ns"

        ax.annotate(
            pval_text, xy=(x_pos, annot_y), xytext=(x_pos, annot_y + 1), ha="center", va="bottom", fontsize=5,
            arrowprops=dict(arrowstyle='-[, widthB=0.75, lengthB=0.25', lw=0.5, color='k')
        )

    ax.spines[["right", "top"]].set_visible(False)
    if show_legend:
        ax.legend(frameon=False)
    else:
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    return fig, ax


In [None]:
def save_pdf(save_file, fig):
    """Write a figure to a single-page PDF, creating the target folder if needed.

    Parameters
    ----------
    save_file : str
        Output path of the PDF.
    fig : matplotlib.figure.Figure
        Figure to save (tight bounding box, 300 dpi).
    """
    out_dir = os.path.dirname(save_file)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create a folder when there is one to create.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # The context manager closes the PDF even if savefig raises.
    with PdfPages(save_file) as pdf:
        pdf.savefig(fig, bbox_inches='tight', dpi=300)

In [None]:
# Violin plot for the first row of meta_df, with factor levels 0 and 1 on the x-axis.
# NOTE(review): assumes the factor named in that row's `lifestyle` is coded 0/1 in pheno_df - confirm.
fig, ax = create_violin(meta_df.iloc[0], gene_burden_dict, pheno_df, order=[0, 1])

In [None]:
# Save the figure created in the previous cell.
# NOTE(review): the filename suggests row 0 of meta_df is the MACROD1 hit - confirm.
save_pdf("./macrod1_bias_ukb.pdf", fig)

In [None]:


def create_violin(ser, gene_burden_dict, pheno_df, order, annot_y=60, show_legend=False):
    """Draw split violins of BMI for carriers vs. matched non-carriers.

    Carriers of the gene/mask encoded in ``ser.ID`` are compared with a random
    sample of non-carriers (100 per carrier, fixed seed) drawn so that their
    joint distribution over PRS category and the factor ``ser.lifestyle``
    follows that of the carriers. Each factor level in ``order`` is annotated
    with a two-sided t-test of carrier vs. non-carrier BMI.

    Parameters
    ----------
    ser : pandas.Series
        Row providing ``ID`` (parsed by ``get_samples``) and ``lifestyle``
        (name of the ``pheno_df`` column shown on the x-axis).
    gene_burden_dict : dict
        Maps mask name to carrier table; passed through to ``get_samples``.
    pheno_df : pandas.DataFrame
        Must contain ``sample_names``, ``bmi``, ``prs_cat`` and the factor
        column. It is copied, not modified.
    order : list
        Factor levels in plotting order.
    annot_y : float, default 60
        y position of the significance bracket; the label sits one unit above.
    show_legend : bool, default False
        Keep a frameless legend (True) or remove the legend (False).

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.

    Raises
    ------
    ValueError
        If no carrier has non-missing ``prs_cat`` and factor values. pandas
        also raises it when fewer eligible non-carriers exist than requested.
    """
    pop_samples = set(pheno_df.sample_names.astype(str))
    gene, _, sample_names = get_samples(ser, gene_burden_dict, pop_samples)
    gene_pheno_df = pheno_df.copy()
    # The carrier indicator column is named after the gene.
    gene_pheno_df[gene] = gene_pheno_df.sample_names.isin(sample_names).astype(int)

    factor = ser.lifestyle  # the factor (e.g., lifestyle variable)
    strata = ["prs_cat", factor]

    gene_carrier_df = gene_pheno_df.loc[gene_pheno_df[gene] == 1]
    # Explicit copy: a column is added below, which on a bare slice triggers
    # pandas' SettingWithCopyWarning.
    gene_noncarrier_df = gene_pheno_df.loc[gene_pheno_df[gene] == 0].copy()

    # Share of carriers falling in each (prs_cat, factor) stratum.
    carrier_counts = gene_carrier_df.groupby(strata).size()
    total_carrier_count = carrier_counts.sum()
    if total_carrier_count == 0:
        raise ValueError(f"No carriers with known prs_cat and {factor!r} for {ser.ID!r}")
    carrier_share = (carrier_counts / total_carrier_count).rename("carrier_share")

    # DataFrame.sample treats weights as per-row probabilities, so the chance
    # of drawing from a stratum is (row weight) x (rows in the stratum).
    # Dividing the carrier share by the stratum's non-carrier count makes the
    # stratum total equal the carrier share; using the share itself as row
    # weight (as before) over-samples strata that are common among non-carriers.
    stratum_size = gene_noncarrier_df.groupby(strata).size().rename("stratum_size")
    lookup = (
        gene_noncarrier_df[strata]
        .join(carrier_share, on=strata)
        .join(stratum_size, on=strata)
    )
    # Strata without carriers, and rows with missing stratum values, get weight 0
    # and are therefore never sampled.
    gene_noncarrier_df["weight"] = (lookup["carrier_share"] / lookup["stratum_size"]).fillna(0)

    # Sampling is without replacement, so the match is approximate when a
    # stratum holds few non-carriers relative to the number requested.
    gene_noncarrier_sampled_df = gene_noncarrier_df.sample(
        n=100 * len(gene_carrier_df), weights=gene_noncarrier_df["weight"], random_state=42
    )

    # Combine the carrier and sampled non-carrier groups
    plot_df = pd.concat([gene_carrier_df, gene_noncarrier_sampled_df])

    fig, ax = plt.subplots(1, 1, figsize=(1.75, 1.5))
    sns.violinplot(
        data=plot_df, x=factor, y="bmi", hue=gene, ax=ax, order=order, palette="Set2", split=True, gap=0.1,
        inner_kws=dict(box_width=5, whis_width=1)
    )

    # Categorical x positions are 0, 1, ... in the order given by `order`.
    for x_pos, level in enumerate(order):
        level_df = plot_df.loc[plot_df[factor] == level]
        ttest_res = ttest_ind(
            level_df.loc[level_df[gene] == 1, "bmi"].dropna(),
            level_df.loc[level_df[gene] == 0, "bmi"].dropna(),
            alternative="two-sided",
        )
        ttest_pval = ttest_res.pvalue
        if ttest_pval < 0.01:
            pval_text = "P<0.01"
        elif ttest_pval < 0.05:
            pval_text = f"P={round(ttest_pval, 2)}"
        else:
            # Also reached when the p-value is NaN (e.g. an empty group).
            pval_text = "ns"

        ax.annotate(
            pval_text, xy=(x_pos, annot_y), xytext=(x_pos, annot_y + 1), ha="center", va="bottom", fontsize=5,
            arrowprops=dict(arrowstyle='-[, widthB=0.75, lengthB=0.25', lw=0.5, color='k')
        )

    ax.spines[["right", "top"]].set_visible(False)
    if show_legend:
        ax.legend(frameon=False)
    else:
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    return fig, ax


In [None]:
# Violin plot for the second row of meta_df, with factor levels 0 and 1 on the x-axis.
# This call resolves to the most recently executed create_violin definition.
fig, ax = create_violin(meta_df.iloc[1], gene_burden_dict, pheno_df, order=[0, 1])

In [None]:
# Save the figure created in the previous cell.
# NOTE(review): the filename suggests row 1 of meta_df is the VIRMA hit - confirm.
save_pdf("./virma_bias_ukb.pdf", fig)