In [1]:
# !pip install statsmodels
# !pip install scikit-learn

In [3]:
import os
import dxpy
import numpy as np
import pandas as pd
import json
from functools import reduce
import re
import numpy as np
import pandas as pd
from scipy import stats
import tqdm

import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from patsy import dmatrices

In [3]:
def normalize_covariates(pheno_df, covariates):
    """Return a copy of ``pheno_df`` with the given columns standardised.

    Each column named in ``covariates`` is passed on its own through a fresh
    ``StandardScaler`` and replaced by the scaler's output. The input frame is
    not modified.

    Args:
        pheno_df: DataFrame containing every column listed in ``covariates``.
        covariates: Iterable of column names to standardise.

    Returns:
        A new DataFrame with the same columns as ``pheno_df``.
    """
    scaled_df = pheno_df.copy()
    for column_name in covariates:
        # The scaler expects 2-D input, hence the single-column frame.
        single_column = scaled_df.loc[:, [column_name]]
        scaled_df[column_name] = StandardScaler().fit_transform(single_column)
    return scaled_df

def create_gene_burden_table_helper(burden_df, annotations, maf, lf_samples_df, hgnc_dict):
    """Build a gene -> carrier-sample-set table for one variant mask.

    Variants are kept when their ``annotation`` is in ``annotations`` and their
    ``maf_max`` is at most ``maf``. Gene identifiers are translated through
    ``hgnc_dict`` and the comma-separated ``samples`` strings of all retained
    variants of a gene are merged into one set. Rows of ``lf_samples_df`` are
    appended unchanged.

    Args:
        burden_df: Variant-level DataFrame with columns ``gene``,
            ``annotation``, ``maf_max`` and ``samples`` (comma-separated IDs).
            It is NOT modified.
        annotations: Annotation labels that belong to the mask.
        maf: Upper bound (inclusive) on ``maf_max``.
        lf_samples_df: Extra rows to append to the result; may be empty.
        hgnc_dict: Mapping from the identifiers in ``burden_df.gene`` to the
            gene names used in the output. Genes absent from the mapping map
            to NaN and are therefore dropped by the groupby.

    Returns:
        DataFrame with a ``gene`` column and a ``samples`` column of sets,
        followed by the rows of ``lf_samples_df``.
    """
    keep = burden_df.annotation.isin(annotations) & (burden_df.maf_max <= maf)
    # Translate gene names on a private copy. Writing the mapped names back
    # into burden_df would make every later call (one per mask) map the
    # already-translated names a second time.
    selected = burden_df.loc[keep].copy()
    selected["gene"] = selected.gene.map(hgnc_dict)
    masked_burden_df = (
        selected.groupby("gene")
        .agg({"samples": lambda x: set(",".join(x).split(","))})
        .reset_index()
    )
    return pd.concat([masked_burden_df, lf_samples_df])

def create_gene_burden_tables(burden_df, maf, lf_samples_df, hgnc_dict):
    """Build the gene burden table for each of the three variant masks.

    The masks are cumulative: ``pLoF`` uses only ``lof`` variants,
    ``Missense_strict`` adds ``missense_strict`` and ``Missense_lenient`` adds
    ``missense_lenient`` on top of that.

    Args:
        burden_df: Variant-level burden DataFrame.
        maf: Upper bound on ``maf_max`` forwarded to the helper.
        lf_samples_df: Extra rows forwarded to the helper.
        hgnc_dict: Gene-name mapping forwarded to the helper.

    Returns:
        Dict keyed by mask name whose values are the tables returned by
        ``create_gene_burden_table_helper``.
    """
    annotations_by_mask = {
        "pLoF": ["lof"],
        "Missense_strict": ["lof", "missense_strict"],
        "Missense_lenient": ["lof", "missense_strict", "missense_lenient"],
    }
    return {
        mask_name: create_gene_burden_table_helper(burden_df, terms, maf, lf_samples_df, hgnc_dict)
        for mask_name, terms in annotations_by_mask.items()
    }

def get_samples_helper(combos, genotype_df, cohort_samples):
    """Return the cohort samples that carry every entry of ``combos``.

    Args:
        combos: List of values to look up in ``genotype_df.gene``.
        genotype_df: DataFrame with a ``gene`` column and a ``samples`` column
            holding an iterable of sample IDs per row.
        cohort_samples: Iterable of sample IDs the result is restricted to.

    Returns:
        A ``set`` of sample IDs. The set is empty when ``combos`` is empty or
        when not every entry of ``combos`` is found in ``genotype_df.gene``.
    """
    # Nothing to intersect; reduce() on an empty sequence would raise.
    if len(combos) == 0:
        return set()

    present = set(combos).intersection(set(genotype_df.gene.values))
    if len(present) != len(combos):
        # Always hand back a set so callers see one return type on every path.
        return set()

    samples_per_gene = genotype_df.loc[genotype_df.gene.isin(combos)].samples.values
    shared = reduce(lambda left, right: left & right, (set(s) for s in samples_per_gene))
    return set(cohort_samples) & shared


def get_samples(ser, gene_burden_dict, pop_samples):
    """Look up the samples matching one row of the gene meta table.

    Args:
        ser: Row (pandas Series) with ``gene`` and ``gene_mask`` entries and,
            optionally, an ``lf`` entry.
        gene_burden_dict: Dict of burden tables keyed by mask name.
        pop_samples: Sample IDs the result is restricted to.

    Returns:
        Whatever ``get_samples_helper`` returns for the gene (and ``lf`` value,
        when the row has one) in the table selected by ``ser.gene_mask``.
    """
    gene_name = ser.gene
    mask_name = ser.gene_mask
    burden_table = gene_burden_dict[mask_name]

    # The "lf" entry, when present, is looked up in the same table as the gene.
    lookup_keys = [gene_name, ser.lf] if "lf" in ser.index else [gene_name]
    return get_samples_helper(lookup_keys, burden_table, pop_samples)


def train_model_sm(X, y):
    """Fit an ordinary least squares model and summarise the ``gene`` term.

    Args:
        X: Design matrix passed to ``sm.OLS`` as the exogenous variables; it
            must contain a column labelled ``"gene"``.
        y: Response passed to ``sm.OLS`` as the endogenous variable.

    Returns:
        Tuple ``(coef, se, ci_low, ci_high, p_value, t_stat, nobs)``. All but
        ``nobs`` refer to the ``"gene"`` term; the interval bounds are the two
        values of ``results.conf_int()`` (default settings) for that term.
    """
    results = sm.OLS(y, X).fit()
    ci_low, ci_high = results.conf_int().loc["gene"].values
    return (
        results.params.loc["gene"],
        results.bse.loc["gene"],
        ci_low,
        ci_high,
        results.pvalues.loc["gene"],
        results.tvalues.loc["gene"],
        results.nobs,
    )


def create_feature_label(gene_pheno_df, label, numerical_covariates, categorical_covariates):
    """Build the design matrix and response for one regression.

    The formula is ``<label> ~ gene + <numerical...> + C(<categorical>)...``.
    Either covariate list may be empty; its terms are then simply left out
    rather than leaving a dangling ``+`` in the formula.

    Args:
        gene_pheno_df: DataFrame holding ``label``, ``gene`` and all covariates.
        label: Name of the response column.
        numerical_covariates: Column names entered as-is.
        categorical_covariates: Column names wrapped in ``C(...)``.

    Returns:
        Tuple ``(X, y)`` of DataFrames as produced by ``patsy.dmatrices`` --
        note the order is design matrix first, response second.
    """
    # NOTE(review): names are pasted into the formula verbatim, so they must be
    # parseable by patsy as written (e.g. no "-" in a column name) -- confirm.
    terms = ["gene"]
    terms.extend(numerical_covariates)
    terms.extend(f"C({cc})" for cc in categorical_covariates)
    equation = f"{label} ~ " + " + ".join(terms)
    y, X = dmatrices(equation, data=gene_pheno_df, return_type='dataframe')
    return X, y


def train(gene_pheno_df, protein, numerical_covariates, categorical_covariates):
    """Regress one protein column on ``gene`` plus covariates.

    Builds the design matrix and response with ``create_feature_label`` (using
    ``protein`` as the response column) and fits them with ``train_model_sm``.

    Returns:
        The seven-element tuple ``(coef, se, ci_low, ci_high, p_val, stat,
        nobs)`` produced by ``train_model_sm``.
    """
    design_matrix, response = create_feature_label(
        gene_pheno_df, protein, numerical_covariates, categorical_covariates
    )
    # Unpacking keeps the seven-field contract explicit at this level.
    coef, se, ci_low, ci_high, p_val, stat, nobs = train_model_sm(design_matrix, response)
    return (coef, se, ci_low, ci_high, p_val, stat, nobs)


def get_protein_models(ser, gene_burden_dict, pop_samples, pheno_df, numerical_covariates, categorical_covariates, all_proteins):
    """Fit one regression per protein for the carriers described by ``ser``.

    A 0/1 ``gene`` column marking the rows whose ``sample_names`` is among the
    samples returned by ``get_samples`` is added to a copy of ``pheno_df``;
    each protein in ``all_proteins`` is then modelled with ``train``.

    Args:
        ser: Row of the gene meta table (forwarded to ``get_samples``).
        gene_burden_dict: Dict of burden tables keyed by mask name.
        pop_samples: Sample IDs the carrier lookup is restricted to.
        pheno_df: DataFrame with ``sample_names``, covariates and protein
            columns. It is not modified.
        numerical_covariates: Column names entered as numerical terms.
        categorical_covariates: Column names entered as categorical terms.
        all_proteins: Iterable of protein column names to model.

    Returns:
        DataFrame with one row per protein and the columns ``protein``,
        ``coef``, ``se``, ``ci_low``, ``ci_high``, ``p_value``, ``stat``,
        ``nobs`` and ``ncarrier`` (the number of rows with ``gene == 1``,
        identical on every row).
    """
    carrier_ids = get_samples(ser, gene_burden_dict, pop_samples)
    model_df = pheno_df.copy()
    model_df["gene"] = pheno_df.sample_names.isin(carrier_ids).astype(int)
    n_carriers = int(model_df["gene"].sum())

    # Same order as the tuple returned by train().
    stat_fields = ("coef", "se", "ci_low", "ci_high", "p_value", "stat", "nobs")
    columns = {name: [] for name in ("protein", *stat_fields, "ncarrier")}

    for protein in all_proteins:
        fit_summary = train(model_df, protein, numerical_covariates, categorical_covariates)
        columns["protein"].append(protein)
        for field, value in zip(stat_fields, fit_summary):
            columns[field].append(value)
        columns["ncarrier"].append(n_carriers)

    return pd.DataFrame(columns)


def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder, then delete the local copy.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder, passed to ``dxpy.upload_local_file`` as
            ``folder`` with ``parents=True``.

    Returns:
        None. The local file is removed only after the upload call returns;
        if the upload raises, the file is left in place.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    banner = "*********"
    print(f"{banner}{filename} uploaded!!{banner}")
    os.remove(filename)


In [4]:
# get monogenic meta file
most_del_sig_meta_df = pd.read_csv("/mnt/project/notebooks/bmi/data/monogenic_meta_most_deleterious.tsv", sep="\t")

# get gene burden file
gene_burden_df = pd.read_csv("/mnt/project/notebooks/wes/burden_preparation/data/ukb_burden.tsv.gz", sep="\t")
gnomad_df = pd.read_csv(
    "/mnt/project/notebooks/wes/burden_preparation/data/gnomad_annot.tsv.gz", sep="\t",
    usecols=["locus", "alleles", "maf_gnomad_popmax"]
)
# merge() defaults to an inner join: variants without a gnomad_df row are dropped here.
gene_burden_df = gene_burden_df.merge(gnomad_df, on=["locus", "alleles"])
# Larger of the two frequencies per variant, computed on whole columns instead
# of a Python-level row-by-row apply. np.where(b > a, b, a) reproduces the
# builtin max(a, b) exactly, including for NaN: a missing gnomAD value falls
# back to maf, while a missing maf stays NaN.
gene_burden_df["maf_max"] = np.where(
    gene_burden_df["maf_gnomad_popmax"] > gene_burden_df["maf"],
    gene_burden_df["maf_gnomad_popmax"],
    gene_burden_df["maf"],
)

# get pheno file (sample_names is read as str so IDs compare as strings)
pheno_df = pd.read_csv("/mnt/project/notebooks/bmi/data/pheno.csv.gz", dtype={"sample_names": str})

# prepare hgnc dict
with open('/mnt/project/notebooks/bmi/data/hgnc_gene_map.json', 'r') as f:
    # Load the JSON data into a Python dictionary
    hgnc_dict = json.load(f)


In [5]:
# One gene -> carrier-sample table per variant mask, keeping variants with maf_max <= 0.001.
# An empty DataFrame is passed as lf_samples_df, so no extra rows are appended to the tables.
gene_burden_dict = create_gene_burden_tables(gene_burden_df, 0.001, pd.DataFrame(), hgnc_dict)
# Sample IDs of the phenotype table as a set of strings.
pop_samples = set(pheno_df.sample_names.astype(str))


In [6]:
# get pheno file
# NOTE(review): this reads the same path that an earlier cell already loaded into pheno_df and
# rebuilds pop_samples with the same expression; the cell looks redundant -- confirm before removing.
pheno_df = pd.read_csv("/mnt/project/notebooks/bmi/data/pheno.csv.gz", dtype={"sample_names": str})

pop_samples = set(pheno_df.sample_names.astype(str))

# Read and parse protein df

In [7]:
# Metadata table for the proteomics analysis (the covariate columns used below are read from it).
# sample_names is read as str so it can be matched against the string IDs in pop_samples.
protein_meta_df = pd.read_csv("/mnt/project/notebooks/proteomics/data/protein_metadata_processed.csv.gz", dtype={"sample_names": str})


In [8]:
# Derived covariates: age squared, age x genetic_sex and age squared x genetic_sex.
# The columns are added to protein_meta_df in place.
# NOTE(review): genetic_sex is used as a numeric multiplier -- presumably 0/1 coded; confirm.
protein_meta_df["age_2"] = protein_meta_df.age**2
protein_meta_df["age_sex"] = protein_meta_df.age* protein_meta_df.genetic_sex
protein_meta_df["age_2_sex"] = protein_meta_df.age_2* protein_meta_df.genetic_sex


In [9]:
# normalized pheno df
# Numerical covariates are standardised column by column via normalize_covariates; the categorical
# ones are left untouched here and enter the model formula as C(...) terms in create_feature_label.
# NOTE(review): "age" and "genetic_sex" themselves are in neither list, so the regressions include
# their squared/interaction terms without the main effects -- confirm this is intended.
numerical_covariates = ["age_2", "age_2_sex", "age_sex", "num_proteins"] + [f"genetic_pca{i}" for i in range(1, 21)]
categorical_covariates = ["batch", "genetic_array"]

norm_protein_meta_df = normalize_covariates(protein_meta_df, numerical_covariates)


In [10]:
# Keep only rows whose sample_names is in pop_samples. This runs after normalize_covariates, so the
# scaling above was fitted on all rows of protein_meta_df, not only on the rows retained here.
norm_protein_meta_df = norm_protein_meta_df.loc[norm_protein_meta_df.sample_names.isin(pop_samples)]

In [11]:
# Processed NPX table: every column other than sample_names is later treated as one protein.
# sample_names is read as str so it matches the key type used in the metadata table.
npx_df = pd.read_csv(
    "/mnt/project/notebooks/proteomics/data/npx_processed.csv.gz", 
    dtype={"sample_names": str},
)


In [12]:
def normalize(ser):
    """Standardise a Series with ``StandardScaler``.

    Args:
        ser: pandas Series of numeric values.

    Returns:
        A new Series holding the scaler's output, with the same name and the
        same index as ``ser``.
    """
    scaler = StandardScaler()
    norm_ser = scaler.fit_transform(ser.to_frame())
    # Re-attach the caller's index: a default 0..n-1 index would break label
    # alignment when the result is assigned back or returned from DataFrame.apply.
    return pd.Series(norm_ser.flatten(), index=ser.index, name=ser.name)

def rint_normalization(ser):
    """Rank-based inverse normal transform of a Series.

    Each value's rank ``r`` (ties get the average rank, missing values keep no
    rank) is turned into the quantile ``(r - 0.5) / n``, where ``n`` is the
    number of non-missing values, and mapped through the standard normal
    inverse CDF.

    Args:
        ser: pandas Series of numeric values; may contain NaN.

    Returns:
        The transformed values as returned by ``stats.norm.ppf``, in the
        original order; positions that were NaN stay NaN.
    """
    value_ranks = ser.rank()
    n_ranked = value_ranks.notna().sum()
    quantiles = (value_ranks - 0.5) / n_ranked
    return stats.norm.ppf(quantiles)


In [13]:
# Rank-based inverse normal transform of each column independently (DataFrame.apply works
# column-wise by default), with sample_names moved to the index so it is not transformed.
norm_npx_df = npx_df.set_index("sample_names").apply(rint_normalization)

In [14]:
# Turn sample_names back into a regular column so it can serve as the merge key below.
norm_npx_df = norm_npx_df.reset_index()

In [None]:
# Attach the transformed protein values to the covariates. merge() defaults to an inner join,
# so only sample_names present in both frames are kept.
protein_df = norm_protein_meta_df.merge(norm_npx_df, on="sample_names")

In [None]:
# Every npx_df column except the sample identifier is treated as a protein to model.
all_proteins = [c for c in npx_df.columns if c!="sample_names"]

In [None]:
# Number of proteins that will be modelled for each row of the gene meta table.
len(all_proteins)

# Run model

In [23]:
# Project folder that receives the per-gene result files.
proj_dir = "/notebooks/bmi/data/downstream/proteomics/"

# For every row of the gene meta table: fit one regression per protein, write the results to a
# local CSV (compressed, because to_csv infers compression from the ".gz" suffix), upload it to
# proj_dir, and let upload_file_to_project delete the local copy.
for i in tqdm.tqdm(range(len(most_del_sig_meta_df))):
    ser = most_del_sig_meta_df.iloc[i]
    ser_df = get_protein_models(ser, gene_burden_dict, pop_samples, protein_df, numerical_covariates, categorical_covariates, all_proteins)
    # NOTE(review): the file name depends only on ser.gene. If the meta table can hold several
    # rows for one gene (e.g. different masks), they are all written under the same name -- confirm
    # that rows are unique per gene.
    filename = f"{ser.gene}_protein_assoc.csv.gz"
    ser_df.to_csv(filename, index=False)
    upload_file_to_project(filename, proj_dir)


# References

1. https://github.com/dnanexus/UKB_RAP/blob/main/proteomics/protein_DE_analysis/1_preprocess_explore_data.ipynb
2. https://www.nature.com/articles/s41588-024-01694-x#Sec2
