In [None]:
# !pip install statsmodels
# !pip install scikit-learn
# !pip install openpyxl

In [None]:
import os
import numpy as np
import pandas as pd
import json
from scipy.stats import pearsonr
import re
from functools import reduce
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from patsy import dmatrices
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 7, 'axes.linewidth': 1, 'xtick.major.width': 1, 'xtick.major.size': 5, 'ytick.major.width': 1, 'ytick.major.size': 5})
from matplotlib.backends.backend_pdf import PdfPages


In [None]:
def save_pdf(save_file, fig):
    """Write a matplotlib figure to a PDF file.

    Parameters
    ----------
    save_file : str or path-like
        Destination PDF path. Missing parent directories are created.
    fig : matplotlib.figure.Figure
        Figure to write.

    Returns
    -------
    None
    """
    parent_dir = os.path.dirname(save_file)
    # os.makedirs("") raises FileNotFoundError, so skip directory creation for
    # bare filenames such as "figure.pdf" that have no directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The context manager closes the PDF even when savefig raises.
    with PdfPages(save_file) as pdf:
        pdf.savefig(fig, bbox_inches='tight', dpi=300)
    return

In [None]:
def prepare_npx_data(df0, df1, df2):
    """Average each protein column across three per-sample tables.

    Parameters
    ----------
    df0, df1, df2 : pandas.DataFrame
        Tables with a ``sample_names`` column plus one column per protein.
        The tables do not need to contain the same proteins or samples.
        None of the inputs is modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sample_names`` (cast to ``str``), one column per protein
        name, holding the mean of that protein over the tables in which the
        sample has a non-missing value.
    """
    tagged = []
    for table_no, frame in enumerate((df0, df1, df2)):
        # Tag every protein column with its source table. rename() returns a
        # new frame, so the caller's DataFrame keeps its original column names
        # (the previous in-place assignment to df2.columns did not).
        tagged.append(
            frame.rename(
                columns={c: f"{c}|{table_no}" for c in frame.columns if c != "sample_names"}
            )
        )

    merged = tagged[0].merge(tagged[1], on="sample_names", how="outer")
    merged = merged.merge(tagged[2], on="sample_names", how="outer")
    merged = merged.set_index("sample_names")
    merged.index = merged.index.astype(str)

    # Group on the protein name in front of the "|<table>" tag. Because every
    # column was tagged explicitly, proteins present in only some tables are
    # grouped correctly instead of failing the prefix lookup.
    protein_of_column = [c.rsplit("|", 1)[0] for c in merged.columns]
    # groupby(axis=1) is deprecated in pandas; group the transposed frame.
    df_mean = merged.T.groupby(protein_of_column).mean().T
    return df_mean

def create_gene_burden_table_helper(burden_df, annotations, maf, lf_samples_df):
    """Build a per-gene carrier table for one variant mask.

    Rows of ``burden_df`` whose ``annotation`` is in ``annotations`` and whose
    ``maf`` is at most ``maf`` are kept. Their comma-separated ``samples``
    strings are pooled per gene into one set of sample identifiers. The rows
    of ``lf_samples_df`` are then appended unchanged.

    Returns a DataFrame with ``gene`` and ``samples`` columns (plus whatever
    columns ``lf_samples_df`` contributes).
    """
    def _pool_samples(sample_strings):
        # Join the per-variant "a,b,c" strings and split again to get the
        # unique sample identifiers across all variants of the gene.
        return set(",".join(sample_strings).split(","))

    passes_mask = (burden_df.annotation.isin(annotations)) & (burden_df.maf <= maf)
    per_gene = burden_df.loc[passes_mask].groupby("gene")
    per_gene = per_gene.agg({"samples": _pool_samples}).reset_index()
    return pd.concat([per_gene, lf_samples_df])

def create_gene_burden_tables(burden_df, maf, lf_samples_df):
    """Build the per-gene carrier table for each of the three variant masks.

    Returns a dict keyed by mask name ("PTV", "PTV_Missense_strict",
    "PTV_Missense_lenient"). Each value is the table produced by
    ``create_gene_burden_table_helper`` for that mask's annotation terms.
    """
    # Each mask is a cumulative set of annotation terms.
    annotations_by_mask = {
        "PTV": ["lof"],
        "PTV_Missense_strict": ["lof", "missense_strict"],
        "PTV_Missense_lenient": ["lof", "missense_strict", "missense_lenient"],
    }
    return {
        mask: create_gene_burden_table_helper(burden_df, terms, maf, lf_samples_df)
        for mask, terms in annotations_by_mask.items()
    }


def get_samples_helper(combos, genotype_df, cohort_samples):
    """Return the cohort samples that carry every gene in ``combos``.

    Parameters
    ----------
    combos : sequence of str
        Gene names that must all be carried by a sample.
    genotype_df : pandas.DataFrame
        Table with a ``gene`` column and a ``samples`` column holding a
        collection of sample identifiers per row.
    cohort_samples : set
        Samples to restrict the result to.

    Returns
    -------
    set
        Samples in ``cohort_samples`` listed for every gene in ``combos``.
        An empty set is returned when ``combos`` is empty or when any
        requested gene is absent from ``genotype_df`` (previously an empty
        list, which made the return type depend on the branch taken).
    """
    wanted = set(combos)
    # Compare as sets: the earlier length comparison wrongly rejected combos
    # in which the same gene appears more than once.
    if not wanted or not wanted.issubset(genotype_df.gene.values):
        return set()
    samples_per_gene = genotype_df.loc[genotype_df.gene.isin(list(wanted))].samples.values
    return cohort_samples.intersection(*(set(s) for s in samples_per_gene))


def get_samples(ser, gene_burden_dict, pop_samples):
    """Look up the carrier samples for one burden-test result row.

    Parameters
    ----------
    ser : pandas.Series
        Row with an ``ID`` of the form ``<gene>.<mask>.0.001`` where the mask
        starts with ``PTV``, and optionally an ``lf`` entry naming a second
        factor that carriers must also have.
    gene_burden_dict : dict
        Maps mask name to a table with ``gene`` and ``samples`` columns.
    pop_samples : set
        Samples to restrict the result to.

    Returns
    -------
    The samples returned by ``get_samples_helper`` for the gene (and ``lf``).

    Raises
    ------
    ValueError
        If ``ser.ID`` does not match the expected pattern. Previously the ID
        was printed and the function then failed with an AttributeError on
        the missing match object.
    """
    # Raw string: "\." inside a normal string literal is an invalid escape.
    pattern = re.compile(r"(.+)\.(PTV.*)\.0\.001")
    m = pattern.match(ser.ID)
    if not m:
        raise ValueError(f"Cannot parse gene and mask from burden ID: {ser.ID!r}")
    gene, mask = m.group(1), m.group(2)
    gene_samples_df = gene_burden_dict[mask]

    combos = [gene]
    if "lf" in ser.index:
        combos.append(ser.lf)

    return get_samples_helper(combos, gene_samples_df, pop_samples)

In [None]:
# --- Proteomics: one table per file, averaged per protein ---
protein_files = [
    "/mnt/project/fields/data/proteomics/npx_inst0.csv.gz",
    "/mnt/project//fields/data/proteomics/npx_inst2.csv.gz",
    "/mnt/project//fields/data/proteomics/npx_inst3.csv.gz"
]
protein_dfs = list(map(pd.read_csv, protein_files))
protein_df = prepare_npx_data(*protein_dfs)

# --- Protein field names ---
protein_file = "/mnt/project/fields/field_names.txt"
proteins = pd.read_csv(protein_file)

# --- Gene -> protein mapping ---
g2p_dict_file = "./gene_protein.json"
with open(g2p_dict_file, "r") as json_file:
    g2p_dict = json.load(json_file)

# --- Gene metadata, variant burden table and phenotypes ---
monogenic_meta_df = pd.read_excel("./monogenic_meta.xlsx")
gene_burden_df = pd.read_csv("/mnt/project/notebooks/regenie/data/gene_burden.csv.gz")
pheno_df = pd.read_csv(
    "/mnt/project/notebooks/regenie/data/pheno.csv.gz",
    dtype={"sample_names": str},
)

# Age is replaced by its standardised value, so every later use of
# pheno_df["age"] sees the scaled column.
scaler = StandardScaler()
pheno_df["age"] = scaler.fit_transform(pheno_df[["age"]])

# Carrier tables per mask at MAF <= 0.001; the empty frame means no extra rows are appended.
gene_burden_dict = create_gene_burden_tables(gene_burden_df, 0.001, pd.DataFrame())
pop_samples = set(pheno_df["sample_names"].astype(str))


In [None]:
def train_model_sm(X, y, gene):
    """Fit an OLS regression of ``y`` on ``X`` and report one term's statistics.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix; must contain a column named ``gene``.
    y : array-like
        Response values.
    gene : str
        Name of the design-matrix column whose statistics are returned.

    Returns
    -------
    tuple
        ``(coef, se, conf, p_val, stat, nobs)``: the coefficient, its standard
        error, the two-element confidence interval from ``conf_int()`` with
        its default settings, the p-value, the t statistic, and the number of
        observations used in the fit.
    """
    # The unused R-squared lookup from the earlier version was dropped.
    results = sm.OLS(y, X).fit()
    coef = results.params.loc[gene]
    se = results.bse.loc[gene]
    stat = results.tvalues.loc[gene]
    conf = results.conf_int().loc[gene].values
    p_val = results.pvalues.loc[gene]
    return coef, se, conf, p_val, stat, results.nobs

def get_protein_coef(gene, pheno_df, protein_df):
    """Regress BMI on one protein's level and return that protein's statistics.

    The model is ``bmi ~ age + genetic_sex + genetic_pca1..10 + protein +
    bmi_prs + protein:bmi_prs``, fitted by OLS.

    Parameters
    ----------
    gene : str
        Column of ``protein_df`` to use as the protein level.
    pheno_df : pandas.DataFrame
        Must provide ``sample_names``, ``bmi``, ``age``, ``genetic_sex``,
        ``genetic_pca1`` .. ``genetic_pca10`` and ``bmi_prs``. Not modified.
    protein_df : pandas.DataFrame
        Protein levels indexed by sample name.

    Returns
    -------
    pandas.Series
        ``gene``, ``coef``, ``se``, ``obs``, ``t_stat``, ``p_value``,
        ``ci_low`` and ``ci_high`` for the protein's main-effect term.
    """
    # The protein enters the formula under a fixed, formula-safe column name.
    # Interpolating `gene` itself fails for names that are not valid Python
    # identifiers (e.g. containing "-" or starting with a digit), because
    # patsy parses the formula text as an expression.
    protein_term = "protein_level"
    gene_pheno_df = pheno_df.copy()
    gene_pheno_df[protein_term] = pheno_df.sample_names.map(protein_df[gene].to_dict())

    covariates = ["age", "genetic_sex"] + [f"genetic_pca{i}" for i in range(1, 11)]
    equation = (
        "bmi ~ " + " + ".join(covariates)
        + f" + {protein_term} + bmi_prs + {protein_term}*bmi_prs"
    )
    y, X = dmatrices(equation, data=gene_pheno_df, return_type='dataframe')

    # NOTE(review): these values were previously held in variables named
    # int_*, which suggests the interaction term, but the term looked up is
    # the protein's main effect. Confirm which one is intended.
    coef, se, conf, p_val, stat, nobs = train_model_sm(X, y, protein_term)
    return pd.Series({"gene": gene, "coef": coef, "se": se, "obs": int(nobs), "t_stat": stat, "p_value": p_val, "ci_low": conf[0], "ci_high": conf[1]})

In [None]:
# Upper-cased names from the protein field list, and the proteins that the
# gene -> protein mapping points to.
plasma_proteins = set(proteins.eid.str.upper())
monogenic_proteins = set(g2p_dict.values())
# sorted() rather than list(): iteration order of a set of strings changes
# between interpreter sessions, which made the row order of the coefficient
# table and the column order of downstream frames differ from run to run.
proteins_present = sorted(plasma_proteins.intersection(monogenic_proteins))

In [None]:
# Fit the BMI model once per protein; each fit is a Series that becomes one row of `df`.
protein_fits = [get_protein_coef(p, pheno_df, protein_df) for p in proteins_present]
df = pd.concat(protein_fits, axis=1).T

In [None]:
# Display the per-protein model statistics (one row per protein).
df

In [None]:
# Persist the per-protein model statistics; the positional index is not written.
df.to_csv("./protein_model_coefs.csv", index=False)

In [None]:
# Per-sample lookups from the phenotype table, keyed by sample name.
bmi_dict = dict(zip(pheno_df.sample_names, pheno_df.bmi))
bmi_pgs_dict = dict(zip(pheno_df.sample_names, pheno_df.bmi_prs))
age_dict = dict(zip(pheno_df.sample_names, pheno_df.age))
sex_dict = dict(zip(pheno_df.sample_names, pheno_df.genetic_sex))

# .copy() makes this an independent frame, so the column assignments below
# never write into (or warn about writing into) a slice of protein_df.
mono_protein_df = protein_df.loc[:, list(proteins_present)].copy()
sample_ids = mono_protein_df.index.astype(str)
mono_protein_df["bmi"] = sample_ids.map(bmi_dict)
mono_protein_df["bmi_prs"] = sample_ids.map(bmi_pgs_dict)
mono_protein_df["age"] = sample_ids.map(age_dict)
# The "sex" column is filled from the phenotype table's genetic_sex.
mono_protein_df["sex"] = sample_ids.map(sex_dict)

In [None]:
# Flag, per gene, which samples are listed as carriers in the PTV table.
ptv_burden = gene_burden_dict["PTV"]
for gene in ["GLOD4", "DNER", "ROBO1", "MMP3", "CD27", "FGF2"]:
    carrier_sets = ptv_burden.loc[ptv_burden.gene==gene, "samples"].values
    # A gene without a row in the PTV table has no carriers; the earlier
    # unconditional `.values[0]` raised IndexError in that case.
    carriers = carrier_sets[0] if len(carrier_sets) else set()
    mono_protein_df[f"{gene}_carrier"] = mono_protein_df.index.isin(carriers)
   

In [None]:
fig, ax = plt.subplots(2,3,figsize=(6,4),sharey=True)

# One panel per gene, filled row by row. Every sample is drawn as a small grey
# rasterized point with a fitted regression line; carriers are overlaid in red.
# The loop variable `gene` is left bound to the last panel's gene ("MMP3"),
# exactly as the earlier copy-pasted version left it.
panel_genes = ["DNER", "FGF2", "GLOD4", "CD27", "ROBO1", "MMP3"]
for gene, panel in zip(panel_genes, ax.flat):
    sns.regplot(data=mono_protein_df, x=gene, y="bmi", ax=panel, scatter_kws={"s":0.15, "alpha":0.5, "color": "lightgrey", "rasterized": True}, line_kws={"color": "k", "lw": 1})
    sns.scatterplot(data=mono_protein_df.loc[mono_protein_df[f"{gene}_carrier"]==True], x=gene, y="bmi", color="red", marker="o", s=20, ax=panel)

# The y axis is shared, so setting the limits on one panel applies to all.
ax[0][0].set_ylim(10, 70)

for panel in ax.flat:
    panel.spines["top"].set_visible(False)
    panel.spines["right"].set_visible(False)

fig.tight_layout()

In [None]:
# NOTE(review): save_pdf is already defined in an earlier cell; this
# redefinition shadows it and should be dropped once that cell is confirmed
# to run first.
def save_pdf(save_file, fig):
    """Write a matplotlib figure to a PDF file.

    Parameters
    ----------
    save_file : str or path-like
        Destination PDF path. Missing parent directories are created.
    fig : matplotlib.figure.Figure
        Figure to write.

    Returns
    -------
    None
    """
    parent_dir = os.path.dirname(save_file)
    # os.makedirs("") raises FileNotFoundError, so skip directory creation for
    # bare filenames such as "figure.pdf" that have no directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The context manager closes the PDF even when savefig raises.
    with PdfPages(save_file) as pdf:
        pdf.savefig(fig, bbox_inches='tight', dpi=300)
    return

In [None]:
# Save the figure currently bound to `fig` as a PDF.
save_pdf("./proteomics.pdf", fig)

In [None]:
# Save the same figure as SVG as well.
fig.savefig("./proteomics.svg", bbox_inches='tight', dpi=300)

In [None]:
# Pin the gene explicitly. This cell used to read whatever value `gene` was
# left with by an earlier cell, which is "MMP3" when the notebook is run top
# to bottom but anything else after out-of-order execution.
gene = "MMP3"

fig, ax = plt.subplots(1,1,figsize=(3,2))

is_carrier = mono_protein_df[f"{gene}_carrier"]==True
# Non-carriers as small grey dots, carriers as larger red diamonds on top.
sns.scatterplot(data=mono_protein_df.loc[~is_carrier], x=gene, y="bmi", alpha=0.75, color="lightgrey", marker="o", s=2.5, ax=ax)
sns.scatterplot(data=mono_protein_df.loc[is_carrier], x=gene, y="bmi", color="red", marker="d", s=20, ax=ax)

ax.spines[["top", "right"]].set_visible(False)
ax.set_ylim(10, 70)

In [None]:
fig, ax = plt.subplots(2,2,figsize=(6,4),sharey=True)

# One regression panel per protein, filled row by row.
for protein, panel in zip(["DNER", "FGF2", "CD27", "ROBO1"], ax.flat):
    sns.regplot(data=mono_protein_df, x=protein, y="bmi", ax=panel, scatter_kws={"s":0.15, "alpha":0.5})
# The y axis is shared, so setting the limits on one panel applies to all.
ax[0][0].set_ylim(10, 70)

for panel in ax.flat:
    panel.spines["top"].set_visible(False)
    panel.spines["right"].set_visible(False)

fig.tight_layout()

In [None]:
# This figure gets its own file. "./proteomics.pdf" is already written earlier
# in the notebook for a different figure, and saving to the same path here
# silently replaced that file on every top-to-bottom run.
# NOTE(review): confirm the new file name suits downstream use.
save_pdf("./proteomics_overview.pdf", fig)