In [1]:
# !pip install statsmodels
# !pip install scikit-learn


In [2]:
import os
import dxpy
import numpy as np
import pandas as pd
import json
from scipy.stats import pearsonr, norm
import re
from functools import reduce

import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from patsy import dmatrices

# Get associated proteins

In [3]:
# Collect, per gene, the carrier-vs-protein associations that pass the
# significance and sample-size filters, tagging each row with its gene.
genes = ["MC4R", "SLTM", "APBA1", "UBR3",  "BSN", "RIF1", "YLPM1", "PCSK1", "UBR2", "GIGYF1", "SLC5A3", "BLTP1", "GRM7"]

# NOTE(review): 0.05/2923 looks like a Bonferroni threshold over 2923 tested
# proteins -- confirm the denominator against the upstream analysis.
p_value_threshold = 0.05/2923
min_nobs = 40000

gene_frames = []
for gene in genes:
    gene_df = pd.read_csv(
        f"/mnt/project/notebooks/bmi/data/downstream/proteomics/{gene}_protein_assoc.csv.gz"
    )
    passes_filters = (gene_df.p_value < p_value_threshold) & (gene_df.nobs > min_nobs)
    # `.assign` returns a new frame, so the gene label is never written onto a
    # slice of `gene_df` (the original chained assignment raised
    # SettingWithCopyWarning).
    gene_frames.append(gene_df.loc[passes_filters].assign(gene=gene))

# Concatenate once instead of re-concatenating inside the loop; row labels
# from the individual files are kept, exactly as before.
assoc_prot_df = pd.concat(gene_frames)


In [6]:
# Reload the complete PCSK1 association table (no significance or sample-size
# filter applied) so it can be inspected on its own.
# The path contains no placeholders, so a plain string literal is used.
gene_df = pd.read_csv(
    "/mnt/project/notebooks/bmi/data/downstream/proteomics/PCSK1_protein_assoc.csv.gz"
)

In [8]:
# Show the PCSK1 associations ordered from smallest to largest p-value.
gene_df.sort_values(by="p_value")

Unnamed: 0,protein,coef,se,ci_low,ci_high,p_value,stat,nobs,ncarrier
2213,PYY,1.071546,0.281939,0.518938,1.624153,0.000145,3.800624,40833.0,15
1690,MDGA1,0.904417,0.257264,0.400177,1.408657,0.000439,3.515525,48497.0,15
2676,THY1,0.778538,0.251126,0.286327,1.270748,0.001935,3.100184,49284.0,15
1165,GIP,0.802068,0.264361,0.283915,1.320221,0.002415,3.033990,41096.0,15
2818,UBQLN3,0.875519,0.288940,0.309190,1.441849,0.002446,3.030103,40833.0,15
...,...,...,...,...,...,...,...,...,...
1856,NFX1,0.000501,0.286943,-0.561914,0.562916,0.998607,0.001746,40833.0,15
327,CA3,0.000447,0.257952,-0.505142,0.506035,0.998619,0.001731,49377.0,15
580,CLEC12A,-0.000435,0.288318,-0.565545,0.564675,0.998797,-0.001508,40977.0,15
2359,SCGN,0.000139,0.263062,-0.515466,0.515744,0.999579,0.000528,47979.0,15


In [4]:
# Discard the row labels inherited from the per-gene tables and renumber the
# rows 0..n-1 (drop=True keeps the old labels from becoming a column).
assoc_prot_df = assoc_prot_df.reset_index(drop=True)


In [5]:
# Display the combined gene-protein association table.
assoc_prot_df

Unnamed: 0,protein,coef,se,ci_low,ci_high,p_value,stat,nobs,ncarrier,gene
0,LECT2,1.266139,0.292291,0.693242,1.839036,1.5e-05,4.331769,42214.0,10,SLTM
1,CD276,2.211144,0.481721,1.266966,3.155322,4e-06,4.590097,48847.0,4,BSN
2,NCAN,-0.668971,0.154042,-0.970895,-0.367047,1.4e-05,-4.342786,49284.0,37,GIGYF1
3,ODAM,-0.755516,0.171796,-1.092239,-0.418794,1.1e-05,-4.397751,47679.0,37,GIGYF1
4,CD164,0.514831,0.10881,0.301562,0.728099,2e-06,4.73147,48877.0,85,BLTP1
5,TNFSF12,0.482567,0.106399,0.274024,0.691111,6e-06,4.535443,49397.0,85,BLTP1


In [9]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder and delete the local copy.

    Parameters
    ----------
    filename : str
        Path of the local file to upload; it is removed from local disk once
        the upload call has returned.
    proj_dir : str
        Destination folder, passed to ``dxpy.upload_local_file`` as ``folder``.
        ``parents=True`` is passed as well -- presumably so missing folders are
        created; confirm against the dxpy documentation.

    Returns
    -------
    None
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    banner = f"*********{filename} uploaded!!*********"
    print(banner)
    # Only reached when the upload call did not raise, so a failed upload
    # leaves the local file in place.
    os.remove(filename)


In [10]:
# Write the association table to the working directory, then push it to the
# project folder (the upload helper removes the local file afterwards).
proj_dir = "/notebooks/bmi/data/downstream/proteomics/"
assoc_prot_df.to_csv("./assoc_protein_gene_carrier.csv.gz", index=False)
upload_file_to_project("assoc_protein_gene_carrier.csv.gz", proj_dir)


*********assoc_protein_gene_carrier.csv.gz uploaded!!*********


# Read and parse NPX df

In [11]:
# Load only the sample identifier plus the proteins that appear in the
# association table; sample identifiers are kept as strings.
npx_columns = ["sample_names", *assoc_prot_df.protein.values]
npx_df = pd.read_csv(
    "/mnt/project/notebooks/proteomics/data/npx_processed.csv.gz",
    usecols=npx_columns,
    dtype={"sample_names": str},
)


In [12]:
def normalize(ser):
    """Standardise one column with ``StandardScaler``.

    Parameters
    ----------
    ser : pandas.Series
        Values to scale.

    Returns
    -------
    pandas.Series
        Scaled values carrying the name of ``ser``. The index of ``ser`` is
        NOT preserved: the result gets a fresh default integer index, so any
        alignment done by callers is positional.
    """
    scaled = StandardScaler().fit_transform(ser.to_frame())
    # fit_transform returns a 2-D single-column array; collapse it to 1-D.
    return pd.Series(scaled.ravel(), name=ser.name)


In [13]:
# Scale every protein column, then re-attach the sample identifiers.
# `normalize` returns Series with a default integer index, so the identifiers
# from `npx_df` are lined up by row position.
norm_npx_df = (
    npx_df
    .set_index("sample_names")
    .apply(normalize)
    .assign(sample_names=npx_df.sample_names)
)


# Read and process the pheno file

In [14]:
# Load the phenotype table, keeping sample identifiers as strings.
pheno_df = pd.read_csv(
    "/mnt/project/notebooks/bmi/data/pheno.csv.gz",
    dtype={"sample_names": str},
)


In [15]:
def normalize_covariates(pheno_df, covariates, exclude=set(["genetic_sex"])):
    """Return a copy of ``pheno_df`` with the selected covariates standardised.

    Parameters
    ----------
    pheno_df : pandas.DataFrame
        Phenotype table; it is copied, not modified.
    covariates : iterable of str
        Column names to scale with ``StandardScaler``.
    exclude : set of str, optional
        Column names that are left untouched even when listed in
        ``covariates``. The default is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        Copy of ``pheno_df`` in which each non-excluded covariate column has
        been replaced by its scaled values.
    """
    norm_pheno_df = pheno_df.copy()
    columns_to_scale = [cov for cov in covariates if cov not in exclude]
    for column in columns_to_scale:
        # A fresh scaler per column: each covariate is fitted independently.
        norm_pheno_df[column] = StandardScaler().fit_transform(norm_pheno_df.loc[:, [column]])
    return norm_pheno_df

def rint_normalization(ser):
    """Rank-based inverse normal transform of a Series.

    Each value is ranked (pandas default ranking, so ties share the average
    rank and missing values stay missing), mapped to the quantile
    ``(rank - 0.5) / n`` where ``n`` is the number of non-missing values, and
    passed through the standard normal inverse CDF.

    Parameters
    ----------
    ser : pandas.Series
        Values to transform.

    Returns
    -------
    Array-like of floats in the same order as ``ser`` (the direct result of
    ``norm.ppf``); positions that were missing in ``ser`` are NaN.
    """
    rank_positions = ser.rank()
    n_observed = rank_positions.notna().sum()
    quantiles = (rank_positions - 0.5) / n_observed
    return norm.ppf(quantiles)

In [16]:
# Rank-based inverse normal transform of BMI, computed separately within each
# (ancestry_pred, sex) group.
pheno_df["bmi_rint"] = (
    pheno_df
    .groupby(["ancestry_pred", "sex"])["bmi"]
    .transform(rint_normalization)
)

# Recode the two string-valued columns as 0/1 indicators, in place.
# NOTE: this cell is not idempotent -- running it a second time compares the
# already-recoded integers against the string labels and yields all zeros.
pheno_df["genetic_sex"] = pheno_df["genetic_sex"].eq("Female").astype(int)

# Derived age terms; `age_sex` uses the 0/1 indicator assigned just above.
pheno_df["age_2"] = pheno_df["age"].pow(2)
pheno_df["age_sex"] = pheno_df["age"].mul(pheno_df["genetic_sex"])

pheno_df["exome_release_batch"] = pheno_df["exome_release_batch"].eq("50K Release").astype(int)


In [17]:
# Covariates used in the protein-BMI regressions.
genetic_pcs = [f"genetic_pca{i}" for i in range(1, 11)]
numerical_covariates = ["age", "age_2", "age_sex", "bmi_prs", *genetic_pcs]
categorical_covariates = ["exome_release_batch", "genetic_sex"]

# Standardise the numerical covariates; the categorical ones are not passed in
# and therefore stay as they are.
norm_pheno_df = normalize_covariates(pheno_df, numerical_covariates)


In [18]:
# Join phenotypes and scaled protein levels on the sample identifier; with the
# default join type only samples present in both tables are kept.
protein_pheno_df = pd.merge(norm_pheno_df, norm_npx_df, on="sample_names")

In [19]:
def train_model_sm(X, y, protein):
    """Fit an OLS model and extract the statistics for one design column.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix; must contain a column labelled ``protein``.
    y : array-like
        Response values.
    protein : str
        Label of the design-matrix column whose statistics are reported.

    Returns
    -------
    tuple
        ``(coef, se, ci_low, ci_high, p_value, t_stat, r_squared, nobs)``
        where the first six entries refer to the ``protein`` column and the
        last two describe the fitted model as a whole.
    """
    fit = sm.OLS(y, X).fit()
    # conf_int() has one row per term with the lower and upper bound.
    ci_low, ci_high = fit.conf_int().loc[protein].values
    return (
        fit.params.loc[protein],
        fit.bse.loc[protein],
        ci_low,
        ci_high,
        fit.pvalues.loc[protein],
        fit.tvalues.loc[protein],
        fit.rsquared,
        fit.nobs,
    )


def create_feature_label(gene_pheno_df, protein, numerical_covariates, categorical_covariates):
    """Build the design matrix and response for ``bmi_rint ~ protein + covariates``.

    Parameters
    ----------
    gene_pheno_df : pandas.DataFrame
        Table holding ``bmi_rint``, the protein column and all covariates.
    protein : str
        Name of the protein column, inserted verbatim into the formula.
    numerical_covariates : list of str
        Terms added to the formula as-is.
    categorical_covariates : list of str
        Terms added to the formula wrapped in ``C(...)``.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        Design matrix and response as produced by ``patsy.dmatrices``.
    """
    # NOTE(review): names are interpolated unquoted, so a protein name that is
    # not a valid formula identifier (e.g. one containing "-") would presumably
    # be parsed as an expression -- confirm the NPX column names are safe.
    categorical_terms = [f"C({cc})" for cc in categorical_covariates]
    right_hand_side = (
        f"{protein} + "
        + " + ".join(numerical_covariates)
        + " + "
        + " + ".join(categorical_terms)
    )
    equation = "bmi_rint ~ " + right_hand_side
    response, design = dmatrices(equation, data=gene_pheno_df, return_type='dataframe')
    return design, response


def train(gene_pheno_df, protein, numerical_covariates, categorical_covariates):
    """Build the regression inputs for one protein and fit the OLS model.

    Parameters are forwarded to ``create_feature_label``; the resulting design
    matrix and response are passed to ``train_model_sm``.

    Returns
    -------
    tuple
        ``(coef, se, ci_low, ci_high, p_value, stat, r_squared, nobs)`` exactly
        as returned by ``train_model_sm``.
    """
    design, response = create_feature_label(
        gene_pheno_df, protein, numerical_covariates, categorical_covariates
    )
    return train_model_sm(design, response, protein)


def get_bmi_assoc(ser, pheno_df, numerical_covariates, categorical_covariates):
    """Regress ``bmi_rint`` on one protein and summarise the fit as a Series.

    Intended for row-wise use with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    ser : pandas.Series
        One row holding at least the ``gene`` and ``protein`` entries.
    pheno_df : pandas.DataFrame
        Table with ``bmi_rint``, the protein columns and the covariates. It is
        only read, so it is passed on directly rather than deep-copied for
        every row as before.
    numerical_covariates : list of str
        Numerical covariate terms for the model formula.
    categorical_covariates : list of str
        Categorical covariate terms for the model formula.

    Returns
    -------
    pandas.Series
        Entries ``gene``, ``protein``, ``coef``, ``se``, ``ci_low``,
        ``ci_high``, ``p_value``, ``stat``, ``r_squared`` and ``nobs``.
    """
    protein = ser["protein"]
    coef, se, ci_low, ci_high, p_val, stat, r2, nobs = train(
        pheno_df, protein, numerical_covariates, categorical_covariates
    )
    return pd.Series({
        "gene": ser["gene"], "protein": protein, "coef": coef, "se": se,
        "ci_low": ci_low, "ci_high": ci_high, "p_value": p_val, "stat": stat,
        "r_squared": r2, "nobs": nobs
    })


In [20]:
# Fit one BMI regression per (gene, protein) row of the association table.
bmi_assoc_df = assoc_prot_df.apply(
    lambda row: get_bmi_assoc(
        row, protein_pheno_df, numerical_covariates, categorical_covariates
    ),
    axis=1,
)

In [21]:
# Display the per-protein BMI regression results.
bmi_assoc_df

Unnamed: 0,gene,protein,coef,se,ci_low,ci_high,p_value,stat,r_squared,nobs
0,SLTM,LECT2,0.207184,0.005194,0.197004,0.217364,0.0,39.889819,0.112721,42460.0
1,BSN,CD276,0.012911,0.004492,0.004107,0.021716,0.004049423,2.874422,0.080847,48847.0
2,GIGYF1,NCAN,-0.285877,0.004444,-0.294588,-0.277165,0.0,-64.321624,0.152155,49284.0
3,GIGYF1,ODAM,-0.077381,0.004417,-0.086039,-0.068724,1.6687320000000001e-68,-17.519511,0.08596,47679.0
4,BLTP1,CD164,0.082257,0.004334,0.073764,0.090751,4.709386e-80,18.981564,0.087478,48877.0
5,BLTP1,TNFSF12,-0.06082,0.004395,-0.069434,-0.052205,1.818761e-43,-13.837756,0.084273,49397.0


In [22]:
# Write the BMI association results to the working directory, then push them
# to the project folder (the upload helper removes the local file afterwards).
proj_dir = "/notebooks/bmi/data/downstream/proteomics/"
bmi_assoc_df.to_csv("./protein_bmi_assoc.csv.gz", index=False)
upload_file_to_project("protein_bmi_assoc.csv.gz", proj_dir)


*********protein_bmi_assoc.csv.gz uploaded!!*********
