In [1]:
import pandas as pd
import numpy as np
import optuna
import plotnine as pn
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, explained_variance_score, average_precision_score, precision_recall_curve
import statsmodels.formula.api as smf
import statsmodels.api as sm

import wandb
import torch
import pyarrow.parquet as pq
from sklearn.linear_model import LinearRegression, ElasticNetCV
import wandb

# import dataloader
import dataloader_clean
import pickle
from tqdm import tqdm


In [2]:
TRAITS = list(pd.read_csv("phenotype_list_41.txt", header=None)[0].values)

with open('/s/project/geno2pheno/data/replication_sets/genebass_backman_gene_dict.pkl', 'rb') as f:
    replication_dict = pickle.load(f)
    
with open('/s/project/geno2pheno/data/trait_phenocode_dict.pkl', 'rb') as f:
    genebass_phenocode_dict = pickle.load(f)

genebass_phenocode_dict_flipped = {v: k for k, v in genebass_phenocode_dict.items()}

gb_rep = pd.read_parquet('/s/project/geno2pheno/data/replication_sets/genebass_backman_41traits.pq')

# LM - no pheno pred

In [6]:
TRAITS = ['LDL_direct', 'HDL_cholesterol', 'Triglycerides', 'glycated_haemoglobin_hba1c', 'Alanine_aminotransferase', 'Aspartate_aminotransferase', 'Apolipoprotein_A', 'Apolipoprotein_B', 'Albumin', 'Alkaline_phosphatase', 'Calcium', 'c_reactive_protein']
TRAITS

['LDL_direct',
 'HDL_cholesterol',
 'Triglycerides',
 'glycated_haemoglobin_hba1c',
 'Alanine_aminotransferase',
 'Aspartate_aminotransferase',
 'Apolipoprotein_A',
 'Apolipoprotein_B',
 'Albumin',
 'Alkaline_phosphatase',
 'Calcium',
 'c_reactive_protein']

In [9]:
# No pheno pred
genotype = 'deepRVAT'
version = "filteredv3"
test_size = 0
use_prs = True
normalize_covariates = True
only_test = False

covariates = True
shuffled_phenotype = False

genes_df_list = []

for trait in tqdm(TRAITS):
    phenocode = genebass_phenocode_dict[trait]
    
    # assoc_df = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/modelling/ols_{genotype}/{version}_at_{genotype}{'_cov' if covariates else ''}{'_shuffledpheno' if shuffled_phenotype else ''}{'_test' if only_test else ''}_CLEANsplit_{trait}.pq")
    assoc_df = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/modelling/ols_{genotype}/filteredv3_at_{genotype}_cov{f'_{test_size}' if test_size else ''}{'_test' if only_test else ''}_NEWsplit_{trait}.pq")
    if sum(assoc_df["neglog_pval"].isna())>0:
        print("NaN pvalues")
    
    assoc_df['phenocode'] = str(phenocode)
    genes_df_list.append(assoc_df)

ols_df = pd.concat(genes_df_list).reset_index(names='gene_id')
# ols_df["phenocode"] = str(phenocode)
ols_df["var_beta"] = ols_df["std_err"]**2
ols_df["neglog_pval"] = -np.log(ols_df["pval"])
ols_df["significant"] = ols_df["neglog_pval"] > -np.log(0.05/len(ols_df))
ols_df["replicated"] = ols_df.index.isin(replication_dict[phenocode])
ols_df["genotype"] = genotype
ols_df["model"] = ols_df["model"] + "_cov"

# Merge with gene names
gene_names = pd.read_csv('/s/project/geno2pheno/data/hgnc2ensg.tsv', sep='\t')[['Ensembl gene ID', 'Approved symbol']].drop_duplicates().rename(columns={'Ensembl gene ID':'gene_id', 'Approved symbol':'gene_name'})
ols_df = ols_df.merge(gene_names, on='gene_id')

# ols_df.to_parquet(f"/s/project/geno2pheno/predictions/bayesian/at_{version}_{genotype}{'_test' if only_test else ''}_genes_CLEANsplit.pq")

ols_df

100%|██████████| 12/12 [00:00<00:00, 17.09it/s]


KeyError: '30710'

In [16]:
ols_df

Unnamed: 0,gene_id,trait,model,beta,std_err,pval,neglog_pval,test_split_size,phenocode,var_beta,significant
0,ENSG00000000419,LDL_direct,ols_deepRVAT,-0.081114,0.115827,0.483742,0.726205,0.0,30780,0.013416,False
1,ENSG00000000457,LDL_direct,ols_deepRVAT,0.133896,0.106003,0.206543,1.577246,0.0,30780,0.011237,False
2,ENSG00000000460,LDL_direct,ols_deepRVAT,0.108301,0.073100,0.138462,1.977158,0.0,30780,0.005344,False
3,ENSG00000000938,LDL_direct,ols_deepRVAT,0.002078,0.092747,0.982122,0.018040,0.0,30780,0.008602,False
4,ENSG00000000971,LDL_direct,ols_deepRVAT,-0.032892,0.119871,0.783782,0.243625,0.0,30780,0.014369,False
...,...,...,...,...,...,...,...,...,...,...,...
210199,ENSG00000272636,c_reactive_protein,ols_deepRVAT,0.135226,0.098804,0.171117,1.765408,0.0,30710,0.009762,False
210200,ENSG00000273045,c_reactive_protein,ols_deepRVAT,0.402986,0.455183,0.375981,0.978215,0.0,30710,0.207192,False
210201,ENSG00000273079,c_reactive_protein,ols_deepRVAT,0.196242,0.119828,0.101487,2.287826,0.0,30710,0.014359,False
210202,ENSG00000273173,c_reactive_protein,ols_deepRVAT,0.245163,0.151688,0.106045,2.243893,0.0,30710,0.023009,False


In [15]:
# ols_df.to_parquet(f"/s/project/geno2pheno/predictions/bayesian/at_{version}_{genotype}{'_test' if only_test else ''}_genes_CLEANsplit.pq")
ols_df.to_parquet(f"/s/project/geno2pheno/predictions/bayesian/at_{version}_{genotype}{f'_{test_size}' if test_size else ''}{'_test' if only_test else ''}_genes_NEWsplit.pq")
# ols_df.to_parquet(f"/s/project/geno2pheno/predictions/bayesian/OLS_12traits_ukbb370k.pq")

# LM result consolidation and phenotype prediction

In [5]:

genotype = 'deepRVAT'
version = "filteredv3"
test_size = 0.15
use_prs = True
normalize_covariates = True
only_test = False

# ukb_version = "big_train"
covariates = True
shuffled_phenotype = False

genes_df_list = []
pred_df_list = []

for i, trait in enumerate(TRAITS):
    phenocode = genebass_phenocode_dict[trait]
    
    # assoc_df = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/modelling/ols_{genotype}/{version}_ols_{genotype}_cov{'_shuffledpheno' if shuffled_phenotype else ''}{'_test' if only_test else ''}{f'_{ukb_version}' if ukb_version else ''}_full_{trait}.pq")
    # assoc_df = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/modelling/ols_{genotype}/{version}_at_{genotype}{'_cov' if covariates else ''}{'_shuffledpheno' if shuffled_phenotype else ''}{'_test' if only_test else ''}_CLEANsplit_{trait}.pq")
    # assoc_df = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/modelling/ols_deepRVAT/filteredv3_at_deepRVAT_cov__{f'_{ukb_version}' if ukb_version else ''}_CLEANsplit_{trait}.pq")
    assoc_df = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/modelling/ols_{genotype}/filteredv3_at_{genotype}_cov{f'_{test_size}' if test_size else ''}{'_test' if only_test else ''}_NEWsplit_{trait}.pq")
    if sum(assoc_df["neglog_pval"].isna())>0:
        print("NaN pvalues")
    
    assoc_df['phenocode'] = str(phenocode)
    assoc_df["significant"] = assoc_df["pval"] < 0.05/len(assoc_df)
    genes_df_list.append(assoc_df)

    # (gt_train, gt_val, gt_test), (resid_train, resid_val, resid_test), _, gene_list, (id_train, id_val, id_test), (trait_measurement_train, trait_measurement_val, trait_measurement_test), (covariates_train, covariates_val, covariates_test) = dataloader.load_data(trait, embedding_type=None, extend_covariates=True, normalize=True, version=version, genotype=genotype, ukb_version=ukb_version)
    # gt_trainval = pd.DataFrame(np.concatenate([gt_train, gt_val]), columns=gene_list)[assoc_df.query("significant").index].values
    # gt_test = pd.DataFrame(gt_test, columns=gene_list)[assoc_df.query("significant").index].values
    
    (gt_train, gt_val, gt_test), (resid_train, resid_val, resid_test), _, gene_list, (id_train, id_val, id_test), (trait_measurement_train, trait_measurement_val, trait_measurement_test), (covariates_train, covariates_val, covariates_test) = dataloader_clean.load_data(trait, embedding_type=None, use_prs=use_prs, normalize_covariates=normalize_covariates, version=version, genotype=genotype, test_split_size=test_size, split_seed=0, small_gene_list=list(assoc_df.query("significant").index))
    gt_trainval = pd.DataFrame(np.concatenate([gt_train, gt_val]), columns=gene_list)
    
    covariates_trainval = np.concatenate([covariates_train, covariates_val])
    
    for sign_genes in [True, False]:
        print(trait, f"{i+1}/{len(TRAITS)}")

        # output_dir = f"/s/project/uk_biobank/processed/g2p/modelling/LM_significant_genes/{genotype}"
        # outfile_pred = f"{output_dir}/{trait}_LM{'_sign_genes' if sign_genes else ''}_{version}{'_shuffledpheno' if shuffled_phenotype else ''}{'_test' if only_test else ''}{f'_{ukb_version}' if ukb_version else ''}_CLEANsplit.pq"
        
        # if os.path.isfile(outfile_pred):
        #     continue
            
        if sign_genes:
            x_trainval = np.concatenate([covariates_trainval, gt_trainval], axis=1)
            x_test = np.concatenate([covariates_test, gt_test], axis=1)
        else:
            x_trainval = covariates_trainval
            x_test = covariates_test
        
        lm = LinearRegression().fit(x_trainval, np.concatenate([trait_measurement_train, trait_measurement_val]))

        # print("test score", lm.score(x_test, trait_measurement_test))
        print("test score trait_measurement", r2_score(trait_measurement_test , lm.predict(x_test)))

        pred_df_list.append(pd.DataFrame({"trait_measurement": trait_measurement_test, 
                                          "common_residual": resid_test, 
                                          "pred": lm.predict(x_test), 
                                          "trait": trait, 
                                          "phenocode": str(phenocode),
                                          "model": f"lm{f'_sign_genes_{genotype}' if sign_genes else '_cov'}"}, 
                                         index=id_test))#.to_parquet(outfile_pred)
    
    
    # lm_sign_genes_df["phenocode"] = str(phenocode)
    # lm_sign_genes_df["model"] = f"lm{'_sign_genes' if sign_genes else ''}_{genotype}"
    
lm_sign_genes_df = pd.concat(pred_df_list)    
lm_sign_genes_df["genotype"] = genotype
    

ols_df = pd.concat(genes_df_list).reset_index(names='gene_id')
# ols_df["phenocode"] = str(phenocode)
ols_df["var_beta"] = ols_df["std_err"]**2
ols_df["neglog_pval"] = -np.log(ols_df["pval"])
# ols_df["significant"] = ols_df["pval"] < 0.05/len(ols_df)
# ols_df["significant"] =  ols_df["neglog_pval"] > -np.log(0.05/len(ols_df))
ols_df["replicated"] = ols_df.index.isin(replication_dict[phenocode])
ols_df["genotype"] = genotype
ols_df["model"] = ols_df["model"] + "_cov"


LDL_direct 1/12


ValueError: Found array with 0 sample(s) (shape=(0, 40)) while a minimum of 1 is required by LinearRegression.

In [13]:
ols_df

Unnamed: 0,gene_id,trait,model,beta,std_err,pval,neglog_pval,test_split_size,phenocode,significant,var_beta,replicated,genotype
0,ENSG00000000419,Apolipoprotein_A,ols_pLoF_cov,-0.078403,0.059164,0.185114,1.686782,0.25,30630,False,0.003500,False,pLoF
1,ENSG00000000457,Apolipoprotein_A,ols_pLoF_cov,0.055636,0.091198,0.541827,0.612808,0.25,30630,False,0.008317,False,pLoF
2,ENSG00000000460,Apolipoprotein_A,ols_pLoF_cov,0.036318,0.035701,0.309020,1.174350,0.25,30630,False,0.001275,False,pLoF
3,ENSG00000000938,Apolipoprotein_A,ols_pLoF_cov,0.091917,0.173768,0.596832,0.516119,0.25,30630,False,0.030195,False,pLoF
4,ENSG00000000971,Apolipoprotein_A,ols_pLoF_cov,0.121119,0.115511,0.294384,1.222869,0.25,30630,False,0.013343,False,pLoF
...,...,...,...,...,...,...,...,...,...,...,...,...,...
714830,ENSG00000272636,BodyMassIndex,ols_pLoF_cov,0.181281,0.205666,0.378084,0.972639,0.25,21001,False,0.042299,False,pLoF
714831,ENSG00000273045,BodyMassIndex,ols_pLoF_cov,-0.226222,0.265513,0.394205,0.930885,0.25,21001,False,0.070497,False,pLoF
714832,ENSG00000273079,BodyMassIndex,ols_pLoF_cov,0.033304,0.563239,0.952848,0.048300,0.25,21001,False,0.317238,False,pLoF
714833,ENSG00000273173,BodyMassIndex,ols_pLoF_cov,-0.246167,0.796530,0.757284,0.278017,0.25,21001,False,0.634459,False,pLoF


In [14]:
lm_sign_genes_df.trait.unique()

array(['Apolipoprotein_A', 'Calcium', 'HDL_cholesterol', 'IGF1',
       'LDL_direct', 'Erythrocyte_count', 'standing_height',
       'Triglycerides', 'Apolipoprotein_B', 'Cholesterol',
       'Lymphocyte_percentage', 'Mean_corpuscular_volume',
       'Thrombocyte_volume', 'Mean_reticulocyte_volume',
       'Neutrophill_count', 'Platelet_count', 'Platelet_crit',
       'Platelet_distribution_width', 'SHBG', 'Total_bilirubin', 'Urate',
       'Glucose', 'Direct_bilirubin', 'Albumin', 'Vitamin_D',
       'systolic_blood_pressure', 'Urea', 'Phosphate',
       'Mean_corpuscular_haemoglobin', 'Erythrocyte_distribution_width',
       'Mean_sphered_cell_volume',
       'High_light_scatter_reticulocyte_count',
       'High_light_scatter_reticulocyte_percentage', 'Reticulocyte_count',
       'Alkaline_phosphatase', 'Reticulocyte_percentage',
       'glycated_haemoglobin_hba1c', 'Creatinine', 'Cystatin_C',
       'Gamma_glutamyltransferase', 'BodyMassIndex'], dtype=object)

In [15]:
# Write to file
# ols_df.to_parquet(f"/s/project/geno2pheno/predictions/bayesian/at_{version}_{genotype}{f'_{ukb_version}' if ukb_version else ''}_genes_CLEANsplit.pq")
# lm_sign_genes_df.reset_index().to_parquet(f"/s/project/geno2pheno/predictions/bayesian/lm_{version}_{genotype}{f'_{ukb_version}' if ukb_version else ''}_predictions_CLEANsplit.pq")

ols_df.to_parquet(f"/s/project/geno2pheno/predictions/bayesian/at_{version}_{genotype}{f'_{test_size}' if test_size else ''}{'_test' if only_test else ''}_genes_NEWsplit.pq")
lm_sign_genes_df.reset_index().to_parquet(f"/s/project/geno2pheno/predictions/bayesian/lm_{version}_{genotype}{f'_{test_size}' if test_size else ''}{'_test' if only_test else ''}_predictions_NEWsplit.pq")


# If load model pred

In [13]:
pd.read_parquet('/s/project/geno2pheno/predictions/load_model/v108cov_deepRVAT_omics_pops/Albumin_bayes_pred.pq')

Unnamed: 0_level_0,model,trait,trait_measurement,common_residual,pred
individual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000020,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.667584,0.483786,-1.117489
1000092,omics_pops_bayesian_v108cov_deepRVAT,Albumin,0.334431,-0.778526,1.209450
1000107,omics_pops_bayesian_v108cov_deepRVAT,Albumin,0.005263,-0.314928,0.269550
1000110,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-1.132168,-0.925599,-0.206483
1000135,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.687956,-0.650334,-0.047045
...,...,...,...,...,...
6026061,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.827109,-0.606971,-0.219054
6026118,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.224884,-0.644705,0.452796
6026137,omics_pops_bayesian_v108cov_deepRVAT,Albumin,1.295493,0.387148,0.980042
6026151,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.089690,0.027771,-0.041227


In [14]:
pd.read_parquet('/s/project/geno2pheno/predictions/load_model/v108cov_deepRVAT_omics_pops/Albumin_mean_betas.pq')

Unnamed: 0,gene_id,model,trait,mean_beta,std_err,fE,pd
0,ENSG00000000419,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.013280,0.036601,0.001408,0.641636
1,ENSG00000000457,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.002850,0.036541,0.001406,0.531089
2,ENSG00000000460,omics_pops_bayesian_v108cov_deepRVAT,Albumin,0.006236,0.037977,0.001534,0.565219
3,ENSG00000000938,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.011577,0.034260,0.001251,0.632286
4,ENSG00000000971,omics_pops_bayesian_v108cov_deepRVAT,Albumin,0.002200,0.053577,0.003138,0.516380
...,...,...,...,...,...,...,...
17512,ENSG00000272636,omics_pops_bayesian_v108cov_deepRVAT,Albumin,0.008468,0.035318,0.001308,0.594744
17513,ENSG00000273045,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.001590,0.036186,0.001312,0.517519
17514,ENSG00000273079,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.000756,0.037399,0.001454,0.508069
17515,ENSG00000273173,omics_pops_bayesian_v108cov_deepRVAT,Albumin,-0.004350,0.035994,0.001324,0.548093


In [15]:
version = 'v108cov' 
genotype = 'deepRVAT'
emb = 'omics_pops'

genes_dt_list = []
pheno_dt_list = []

for trait in tqdm(TRAITS):
    phenocode = genebass_phenocode_dict[trait]
    
    # Concatenate gene effect files across all traits
    mean_betas_df = pd.read_parquet(f'/s/project/geno2pheno/predictions/load_model/{version}_{genotype}_{emb}/{trait}_mean_betas.pq')
    mean_betas_df["phenocode"] = str(phenocode)
    mean_betas_df["replicated"] = mean_betas_df.index.isin(replication_dict[phenocode])
    genes_dt_list.append(mean_betas_df)
    
    # Concatenate phenotype prediction files across all traits
    bayes_pred_df = pd.read_parquet(f'/s/project/geno2pheno/predictions/load_model/{version}_{genotype}_{emb}/{trait}_bayes_pred.pq').reset_index()
    bayes_pred_df["phenocode"] = str(phenocode)
    bayes_pred_df["genotype"] = genotype
    pheno_dt_list.append(bayes_pred_df)

genes_dt = pd.concat(genes_dt_list)
genes_dt.to_parquet(f'/s/project/geno2pheno/predictions/bayesian/load_model/{version}_{genotype}_{emb}_genes_CLEANsplit.pq', index=False)

pheno_dt = pd.concat(pheno_dt_list)
pheno_dt.to_parquet(f'/s/project/geno2pheno/predictions/bayesian/load_model/{version}_{genotype}_{emb}_predictions_CLEANsplit.pq', index=False)


100%|██████████| 41/41 [00:12<00:00,  3.22it/s]


# If hyperopt was NOT done:

In [32]:
study_version = 'NEWsplit_randEmb' 
genotype = 'deepRVAT'
emb = 'omics_pops'
test_size = 0.25

genes_dt_list = []
pheno_dt_list = []
skip_list = []
for trait in tqdm(TRAITS):
    # if trait in ['Glucose']: #['IGF1', 'LDL_direct']:
    #     continue
    # else:
    try:
        phenocode = genebass_phenocode_dict[trait]
        
        # Concatenate gene effect files across all traits
        # mean_betas_df = pd.read_parquet(f'/s/project/uk_biobank/processed/g2p/modelling/hyperopt/fixed_arch/study_{study_version}_{genotype}_{emb}/{trait}_mean_betas.pq')
        mean_betas_df = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/modelling/hyperopt/fixed_arch/study_{study_version}_{genotype}{f'_testsplit{test_size}' if test_size else ''}_{emb}/{trait}_mean_betas.pq")
        mean_betas_df["phenocode"] = str(phenocode)
        mean_betas_df["replicated"] = mean_betas_df.index.isin(replication_dict[phenocode])
        genes_dt_list.append(mean_betas_df)
        
        # Concatenate phenotype prediction files across all traits
        # bayes_pred_df = pd.read_parquet(f'/s/project/uk_biobank/processed/g2p/modelling/hyperopt/fixed_arch/study_{study_version}_{genotype}_{emb}/{trait}_bayes_pred.pq').reset_index()
        bayes_pred_df = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/modelling/hyperopt/fixed_arch/study_{study_version}_{genotype}{f'_testsplit{test_size}' if test_size else ''}_{emb}/{trait}_bayes_pred.pq").reset_index()
        # bayes_pred_df['trait_measurement'] = bayes_pred_df["trait_measurement"]
        # pheno_dt_list.append(bayes_pred_df.drop(columns="measurement"))
        bayes_pred_df["phenocode"] = str(phenocode)
        bayes_pred_df["genotype"] = genotype
        pheno_dt_list.append(bayes_pred_df)
    except:
        skip_list.append(trait)
        continue

genes_dt = pd.concat(genes_dt_list)
# genes_dt.to_parquet(f'/s/project/geno2pheno/predictions/bayesian/fixed_arch/{study_version}_{genotype}_{emb}_genes_extended.pq', index=False)

pheno_dt = pd.concat(pheno_dt_list)
# pheno_dt.to_parquet(f'/s/project/geno2pheno/predictions/bayesian/fixed_arch/{study_version}_{genotype}_{emb}_predictions_extended.pq', index=False)
print(skip_list)

100%|██████████| 41/41 [00:08<00:00,  4.62it/s]


[]


In [34]:
# study_version = 'v1NEWsplit_randEmb'
# emb = 'noemb'
genes_dt.to_parquet(f"/s/project/geno2pheno/predictions/bayesian/fixed_arch/{study_version}_{genotype}{f'_testsplit{test_size}' if test_size else ''}_{emb}_genes_extended.pq", index=False)
pheno_dt.to_parquet(f"/s/project/geno2pheno/predictions/bayesian/fixed_arch/{study_version}_{genotype}{f'_testsplit{test_size}' if test_size else ''}_{emb}_predictions_extended.pq", index=False)


In [13]:
genes_dt[(genes_dt.trait=='Alkaline_phosphatase') & (genes_dt.significant==True)]

Unnamed: 0,gene_id,best_r2_mean_beta,best_r2_var_beta,best_r2_intercept,best_r2_base_var,best_r2_last_layer_bias,best_r2_epoch,best_r2_fE,best_loss_mean_beta,best_loss_var_beta,...,mean_beta,trait,pd,neglog_pval,significant,base_var_const,version,gene_name,phenocode,replicated
203,ENSG00000008710,-0.246869,0.001066,0.023837,2.8e-05,-7.667099,49,0.049253,-0.246869,0.001066,...,-0.246869,Alkaline_phosphatase,1.0,13.701737,True,0,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,PKD1,30610,False
1144,ENSG00000073734,0.074686,0.000315,0.023837,2.8e-05,-7.667099,49,0.000347,0.074686,0.000315,...,0.074686,Alkaline_phosphatase,0.999987,4.886918,True,0,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,ABCB11,30610,False
2246,ENSG00000100815,0.149451,0.002047,0.023837,2.8e-05,-7.667099,49,0.006721,0.149451,0.002047,...,0.149451,Alkaline_phosphatase,0.999523,3.321111,True,0,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,TRIP11,30610,False
3767,ENSG00000112293,-2.085201,0.002656,0.023837,2.8e-05,-7.667099,49,1.532765,-2.085201,0.002656,...,-2.085201,Alkaline_phosphatase,1.0,inf,True,0,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,GPLD1,30610,False
5170,ENSG00000124253,-0.1498,0.001702,0.023837,2.8e-05,-7.667099,49,0.00451,-0.1498,0.001702,...,-0.1498,Alkaline_phosphatase,0.999859,3.851074,True,0,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,PCK1,30610,False
7280,ENSG00000139044,0.507478,0.001729,0.023837,2.8e-05,-7.667099,49,0.131987,0.507478,0.001729,...,0.507478,Alkaline_phosphatase,1.0,inf,True,0,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,B4GALNT3,30610,False
7640,ENSG00000141505,2.548835,0.007076,0.023837,2.8e-05,-7.667099,49,2.076465,2.548835,0.007076,...,2.548835,Alkaline_phosphatase,1.0,inf,True,0,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,ASGR1,30610,False
10100,ENSG00000162551,-4.4833,0.002222,0.023837,2.8e-05,-7.667099,49,3.401659,-4.4833,0.002222,...,-4.4833,Alkaline_phosphatase,1.0,inf,True,0,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,ALPL,30610,False


In [14]:
pheno_dt

Unnamed: 0,individual,trait_measurement,common_residual,best_r2_pred,best_loss_pred,trait,model,version,genotype,phenocode
0,1000092,-0.687939,-0.332038,-0.308662,-0.303451,Apolipoprotein_A,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,30630
1,1000107,0.056486,0.041709,-0.037376,-0.031667,Apolipoprotein_A,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,30630
2,1000199,-0.305796,-0.707093,0.344192,0.360908,Apolipoprotein_A,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,30630
3,1000270,1.913722,0.537718,1.362688,1.347520,Apolipoprotein_A,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,30630
4,1000294,-0.036391,-0.213285,0.167707,0.157294,Apolipoprotein_A,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,30630
...,...,...,...,...,...,...,...,...,...,...
92996,6025996,-1.460899,-1.082295,-0.429345,-0.429345,BodyMassIndex,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,21001
92997,6026035,-0.225882,0.882343,-1.078775,-1.078775,BodyMassIndex,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,21001
92998,6026118,-0.552360,0.382219,-0.931331,-0.931331,BodyMassIndex,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,21001
92999,6026137,-0.767050,-0.468234,-0.270852,-0.270852,BodyMassIndex,omics_pops_bayesian_v1NEWsplit_shuffEmb_shuffl...,v1NEWsplit_shuffEmb_shuffledemb5_deepRVAT,deepRVAT,21001



# If hyperopt results need to be consolidated:


In [3]:

# Read optuna journal to get hyperopt data:

# Model gene effect variance
storage = optuna.storages.JournalStorage(
    optuna.storages.JournalFileStorage(f"/s/project/uk_biobank/processed/g2p/optuna/journal.log"),
)

# Model gene effect expectation
# storage = optuna.storages.JournalStorage(
#     optuna.storages.JournalFileStorage(f"/s/project/uk_biobank/processed/g2p/mean_model/optuna/journal.log"),
# )




In [4]:
best_trials = {}
mean_betas = []
ols_pLoF = []
eval_r2 = []
test_r2 = []
test_auPRC = []
predictions = []
shuffled_phenotype = False
study_versions =  ["v4bigsplitHO_deepRVAT"] 

for trait in TRAITS:
    phenocode = genebass_phenocode_dict[trait]
    best_trials[trait] = {}
    for study_version in study_versions:
        for embedding_type in ["omics_pops"]:
            study_name = f"study_{study_version}_{trait}_{embedding_type}"
            try:
                study = optuna.load_study(study_name=study_name, storage=storage)
            except KeyError:
                continue
            if len(study.trials_dataframe())<1 or len(study.trials_dataframe().query("state=='COMPLETE'"))<1:
                continue
            if study._is_multi_objective():
                best_trial = max(study.best_trials, key=lambda t: t.values[1])
            else:
                best_trial = study.best_trial
            best_trials[trait][embedding_type] = {
                "trial_number": best_trial.number,
                "r2": best_trial.values[0],
                "n_total_trials": len(study.trials_dataframe().query("state=='COMPLETE'"))
            }

            if "best_r2_list" in best_trial.user_attrs:
                best_repetition = best_trial.user_attrs['best_r2_list'].index(best_trial.values[0])
                best_repetition = f"{best_repetition}_"
            else:
                best_repetition = ""

            results_dir = f"/s/project/uk_biobank/processed/g2p/modelling/hyperopt/{study_name}"
            # results_dir = f"/s/project/uk_biobank/processed/g2p/mean_model/hyperopt/{study_name}"

            run_id = pd.read_parquet(f"{results_dir}/{best_trial.number}_{best_repetition}run_id.pq").values.item()
            best_epoch = pd.read_parquet(f"{results_dir}/{best_trial.number}_{best_repetition}mean_betas.pq")["best_r2_epoch"].iloc[0]
            
            mean_betas_df = pd.read_parquet(f"{results_dir}/{best_trial.number}_{best_repetition}mean_betas.pq", columns={"best_r2_mean_beta": "mean_beta", "best_r2_var_beta": "var_beta"}).rename(columns={"best_r2_mean_beta": "mean_beta", "best_r2_var_beta": "var_beta"})
            mean_betas_df["model"] = embedding_type + f"_bayesian_{study_version.rstrip(',')}"

            mean_betas_df["fE"] = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/wandb/{run_id}_gE.pq", columns=[str(best_epoch)]).rename(columns={best_epoch: "fE"})["fE"]
            # mean_betas_df["fE"] = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/mean_model/wandb/{run_id}_gE.pq", columns=[str(best_epoch)]).rename(columns={best_epoch: "fE"})["fE"]

            mean_betas_df["trait"] = trait
            mean_betas_df["phenocode"] = str(phenocode)

            mean_betas_df["std_err"] = np.sqrt(mean_betas_df["var_beta"])
            mean_betas_df["neglog_pval"] = -np.minimum(np.log(2) + scipy.stats.norm.logcdf(0, mean_betas_df["mean_beta"], mean_betas_df["std_err"]), np.log(2) + scipy.stats.norm.logsf(0, mean_betas_df["mean_beta"], mean_betas_df["std_err"]))
            
            mean_betas_df["pd"] = np.maximum(scipy.stats.norm.cdf(0, mean_betas_df["mean_beta"], mean_betas_df["std_err"]), scipy.stats.norm.sf(0, mean_betas_df["mean_beta"], mean_betas_df["std_err"]))
            
            mean_betas_df["significant"] = mean_betas_df["pd"] > 0.999
            mean_betas_df["replicated"] = mean_betas_df.index.isin(replication_dict[phenocode])
            mean_betas_df["version"] = study_version.rstrip(',')
            mean_betas_df["genotype"] = "deepRVAT" if "deepRVAT" in study_version else "pLoF"

            mean_betas.append(mean_betas_df.reset_index().rename(columns={"mean_beta": "beta", "index": "gene_id"})[["trait", "phenocode", "gene_id", "model", "genotype", "fE", "beta", "std_err", "neglog_pval", "pd", "significant", "replicated", "version"]])

            eval_r2_df = pd.DataFrame(best_trials[trait][embedding_type], index=[0])
            eval_r2_df["trait"] = trait
            eval_r2_df["model"] = embedding_type
            eval_r2.append(eval_r2_df)

            bayes_pred_df = pd.read_parquet(f"{results_dir}/{best_trial.number}_{best_repetition}bayes_pred.pq", columns=[f'{trait}_measurement', 'common_residual', 'best_r2_pred']).reset_index().rename(columns={f'{trait}_measurement': 'trait_measurement', 'best_r2_pred': 'pred'})
            bayes_pred_df["trait"] = trait
            bayes_pred_df["phenocode"] = str(phenocode)
            bayes_pred_df["model"] = embedding_type + f"_bayesian_{study_version.rstrip(',')}"
            bayes_pred_df["version"] = study_version.rstrip(',')
            bayes_pred_df["genotype"] = "deepRVAT" if "deepRVAT" in study_version else "pLoF"

            predictions.append(bayes_pred_df)


# Dataframe with posterior gene effects
mean_betas_df = pd.concat(mean_betas).reset_index(drop=True)

# Dataframe with phenotype prediction on test data
pred_df = pd.concat(predictions)[["trait", "phenocode", "model", "genotype", "individual", "trait_measurement", "common_residual", "pred", "version"]].reset_index(drop=True)

if pred_df[["trait", "model"]].drop_duplicates().groupby("trait").size().nunique() != 1:
    print("Different number of models per trait in pred_df")
    
if mean_betas_df[["trait", "model"]].drop_duplicates().groupby("trait").size().nunique() != 1:
    print("Different number of models per trait in mean_betas_df")
    
model_list = pred_df[["model", "genotype"]].drop_duplicates().sort_values(["genotype", "model"], ascending=[False, True])["model"].to_list()
pred_df["model"] = pd.Categorical(pred_df['model'], categories=model_list)


In [5]:
pred_df

Unnamed: 0,trait,phenocode,model,genotype,individual,trait_measurement,common_residual,pred,version
0,Apolipoprotein_A,30630,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,1000020,-1.900108,-1.275890,-0.663370,v4bigsplitHO_deepRVAT
1,Apolipoprotein_A,30630,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,1000092,-0.687259,-0.332038,-0.293450,v4bigsplitHO_deepRVAT
2,Apolipoprotein_A,30630,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,1000110,0.027349,-0.979018,1.000941,v4bigsplitHO_deepRVAT
3,Apolipoprotein_A,30630,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,1000355,-1.036394,-0.252077,-0.796524,v4bigsplitHO_deepRVAT
4,Apolipoprotein_A,30630,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,1000387,0.269973,-0.525341,0.774218,v4bigsplitHO_deepRVAT
...,...,...,...,...,...,...,...,...,...
3250547,BodyMassIndex,21001,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,6025867,0.456340,0.052595,0.388153,v4bigsplitHO_deepRVAT
3250548,BodyMassIndex,21001,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,6025889,1.651937,1.526797,0.156530,v4bigsplitHO_deepRVAT
3250549,BodyMassIndex,21001,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,6025890,-1.927886,-2.616609,0.662006,v4bigsplitHO_deepRVAT
3250550,BodyMassIndex,21001,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,6026056,-0.927659,-0.079545,-0.858121,v4bigsplitHO_deepRVAT


In [9]:
mean_betas_df

Unnamed: 0,trait,phenocode,gene_id,model,genotype,fE,beta,std_err,neglog_pval,pd,significant,replicated,version
0,Apolipoprotein_A,30630,ENSG00000000419,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000056,0.000390,0.007496,0.042327,0.520722,False,False,v4bigsplitHO_deepRVAT
1,Apolipoprotein_A,30630,ENSG00000000457,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000057,0.000115,0.007513,0.012339,0.506131,False,False,v4bigsplitHO_deepRVAT
2,Apolipoprotein_A,30630,ENSG00000000460,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000056,-0.000037,0.007446,0.003963,0.501977,False,False,v4bigsplitHO_deepRVAT
3,Apolipoprotein_A,30630,ENSG00000000938,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000062,-0.000460,0.007858,0.047830,0.523352,False,False,v4bigsplitHO_deepRVAT
4,Apolipoprotein_A,30630,ENSG00000000971,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000056,-0.000279,0.007444,0.030316,0.514931,False,False,v4bigsplitHO_deepRVAT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
718192,BodyMassIndex,21001,ENSG00000272636,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000068,0.000690,0.008241,0.069034,0.533353,False,False,v4bigsplitHO_deepRVAT
718193,BodyMassIndex,21001,ENSG00000273045,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000210,0.000153,0.014499,0.008449,0.504207,False,False,v4bigsplitHO_deepRVAT
718194,BodyMassIndex,21001,ENSG00000273079,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000071,-0.000329,0.008396,0.031769,0.515635,False,False,v4bigsplitHO_deepRVAT
718195,BodyMassIndex,21001,ENSG00000273173,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.000096,-0.000262,0.009768,0.021599,0.510684,False,False,v4bigsplitHO_deepRVAT


In [10]:
mean_betas_df[mean_betas_df.significant==True]

Unnamed: 0,trait,phenocode,gene_id,model,genotype,fE,beta,std_err,neglog_pval,pd,significant,replicated,version
124139,Triglycerides,30870,ENSG00000084674,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.007573,-0.174495,0.045317,9.045889,0.999941,True,True,v4bigsplitHO_deepRVAT
141656,Apolipoprotein_B,30640,ENSG00000084674,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.013845,-0.264521,0.05365,14.013942,1.0,True,True,v4bigsplitHO_deepRVAT
246116,Neutrophill_count,30140,ENSG00000065621,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.035248,0.250483,0.067436,8.498868,0.999898,True,False,v4bigsplitHO_deepRVAT
253308,Neutrophill_count,30140,ENSG00000144426,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.015162,-0.167322,0.047213,7.838905,0.999803,True,False,v4bigsplitHO_deepRVAT
555268,High_light_scatter_reticulocyte_count,30300,ENSG00000171490,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.067932,0.314743,0.100691,6.335072,0.999113,True,False,v4bigsplitHO_deepRVAT
621812,Reticulocyte_percentage,30240,ENSG00000150275,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.008523,-0.12725,0.037546,7.262859,0.999649,True,False,v4bigsplitHO_deepRVAT
627895,Reticulocyte_percentage,30240,ENSG00000186818,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.066318,0.245181,0.07458,6.896949,0.999495,True,False,v4bigsplitHO_deepRVAT
634126,glycated_haemoglobin_hba1c,30750,ENSG00000110446,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.025253,0.152387,0.048635,6.360496,0.999136,True,False,v4bigsplitHO_deepRVAT
640007,glycated_haemoglobin_hba1c,30750,ENSG00000156535,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.02544,-0.154384,0.044696,7.501767,0.999724,True,False,v4bigsplitHO_deepRVAT
640362,glycated_haemoglobin_hba1c,30750,ENSG00000159788,omics_pops_bayesian_v4bigsplitHO_deepRVAT,deepRVAT,0.018913,0.137165,0.043559,6.413901,0.999181,True,False,v4bigsplitHO_deepRVAT


In [11]:
study_version

'v4bigsplitHO_deepRVAT'

In [12]:
# Write to file

emb = 'omics_pops'

for study_version in study_versions:
    study_version = study_version.rstrip(',')
    pred_df.query(f"(version.isna()) or (version=='{study_version}')").reset_index(drop=True).to_parquet(f"/s/project/geno2pheno/predictions/bayesian/{study_version}_{emb}_predictions_CLEANsplit.pq")
    mean_betas_df.query(f"(version.isna()) or (version=='{study_version}')").reset_index(drop=True).to_parquet(f"/s/project/geno2pheno/predictions/bayesian/{study_version}_{emb}_genes_CLEANsplit.pq")
    
    # pred_df.query(f"(version.isna()) or (version=='{study_version}')").reset_index(drop=True).to_parquet(f"/s/project/geno2pheno/predictions/bayesian/best_model_pred/{study_version}_predictions_extended.pq")
    # mean_betas_df.query(f"(version.isna()) or (version=='{study_version}')").reset_index(drop=True).to_parquet(f"/s/project/geno2pheno/predictions/bayesian/best_model_pred/{study_version}_genes_extended.pq")
