In [2]:
import pandas as pd
import numpy as np
import optuna
import plotnine as pn
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, explained_variance_score, average_precision_score, precision_recall_curve
import statsmodels.formula.api as smf
import wandb
import torch
import statsmodels.api as sm
import pyarrow.parquet as pq
from sklearn.linear_model import LinearRegression, ElasticNetCV
import wandb
import dataloader
import pickle


In [3]:
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only use on trusted project files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Trait names are taken from the first column of a header-less text file.
phenotype_table = pd.read_csv("phenotype_list_41.txt", header=None)
TRAITS = list(phenotype_table[0].values)

# Looked up by phenocode further down to flag genes as "replicated".
replication_dict = _load_pickle('/s/project/geno2pheno/data/replication_sets/genebass_backman_gene_dict.pkl')

# Looked up by trait name further down to obtain the phenocode of each trait.
genebass_phenocode_dict = _load_pickle('/s/project/geno2pheno/data/trait_phenocode_dict.pkl')

gb_rep = pd.read_parquet('/s/project/geno2pheno/data/replication_sets/genebass_backman_41traits.pq')

# If hyperopt was NOT done:

In [8]:
# Fixed-architecture run: stack the per-trait result files into one gene-effect
# table and one phenotype-prediction table, then write both to disk.
study_version = 'v115arch'
emb = 'omics_pops'

genes_dt_list = []
pheno_dt_list = []

for trait in TRAITS:
    phenocode = genebass_phenocode_dict[trait]
    measurement_col = f"{trait}_measurement"

    # Gene effects of this trait, tagged with its phenocode and a flag telling
    # whether the row's index value is listed in the replication set.
    mean_betas_path = f'/s/project/uk_biobank/processed/g2p/modelling/hyperopt/fixed_arch/study_{study_version}_deepRVAT_{emb}/{trait}_mean_betas.pq'
    mean_betas_df = pd.read_parquet(mean_betas_path)
    mean_betas_df = mean_betas_df.assign(
        phenocode=str(phenocode),
        replicated=mean_betas_df.index.isin(replication_dict[phenocode]),
    )
    genes_dt_list.append(mean_betas_df)

    # Phenotype predictions of this trait; the trait-specific measurement column
    # is copied to the generic name 'trait_measurement' so that traits can be stacked.
    bayes_pred_path = f'/s/project/uk_biobank/processed/g2p/modelling/hyperopt/fixed_arch/study_{study_version}_deepRVAT_{emb}/{trait}_bayes_pred.pq'
    bayes_pred_df = pd.read_parquet(bayes_pred_path)
    bayes_pred_df = bayes_pred_df.assign(
        trait_measurement=bayes_pred_df[measurement_col],
        phenocode=str(phenocode),
    )
    pheno_dt_list.append(bayes_pred_df.drop(columns=measurement_col))


genes_dt = pd.concat(genes_dt_list)
genes_out_path = f'/s/project/geno2pheno/predictions/bayesian/fixed_arch/{study_version}_deepRVAT_{emb}_genes_extended.pq'
genes_dt.to_parquet(genes_out_path)

pheno_dt = pd.concat(pheno_dt_list)
pheno_out_path = f'/s/project/geno2pheno/predictions/bayesian/fixed_arch/{study_version}_deepRVAT_{emb}_predictions_extended.pq'
pheno_dt.to_parquet(pheno_out_path)



# If hyperopt results need to be consolidated:


In [None]:
# Open the optuna journal that holds the hyperopt studies.

# Model gene effect variance
journal_backend = optuna.storages.JournalFileStorage(
    f"/s/project/uk_biobank/processed/g2p/optuna/journal.log"
)
storage = optuna.storages.JournalStorage(journal_backend)

# Model gene effect expectation (alternative journal, kept for reference)
# storage = optuna.storages.JournalStorage(
#     optuna.storages.JournalFileStorage(f"/s/project/uk_biobank/processed/g2p/mean_model/optuna/journal.log"),
# )


In [None]:
# Consolidate hyperopt results: for every (trait, study version, embedding) pick
# the best optuna trial, load the files written for that trial and stack them into
#   mean_betas_df - posterior gene effects
#   pred_df       - phenotype predictions
best_trials = {}
mean_betas = []
ols_pLoF = []
eval_r2 = []
test_r2 = []
test_auPRC = []
predictions = []
shuffled_phenotype = False
study_versions =  ["v83cov_deepRVAT"] 

# Studies that could not be used (not in the journal, or no completed trial).
skipped_studies = []

for trait in TRAITS:
    phenocode = genebass_phenocode_dict[trait]
    best_trials[trait] = {}
    for study_version in study_versions:
        # Values that only depend on the study version are computed once per version.
        version = study_version.rstrip(',')
        genotype = "deepRVAT" if "deepRVAT" in study_version else "pLoF"
        for embedding_type in ["omics_pops"]:
            study_name = f"study_{study_version}_{trait}_{embedding_type}"
            try:
                study = optuna.load_study(study_name=study_name, storage=storage)
            except KeyError:
                # Study is not present in the journal: skip it, but keep track of it.
                skipped_studies.append(study_name)
                continue

            # Count completed trials once through the public API instead of
            # building the trials dataframe several times.
            n_complete_trials = len(study.get_trials(deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)))
            if n_complete_trials < 1:
                skipped_studies.append(study_name)
                continue

            # Public equivalent of the private study._is_multi_objective().
            if len(study.directions) > 1:
                # Multi-objective: take the Pareto-optimal trial with the highest second objective.
                best_trial = max(study.best_trials, key=lambda t: t.values[1])
            else:
                best_trial = study.best_trial
            best_trials[trait][embedding_type] = {
                "trial_number": best_trial.number,
                "r2": best_trial.values[0],
                "n_total_trials": n_complete_trials
            }

            # If the trial stored a list of r2 values, the position of the trial's
            # first objective value in that list becomes part of the file names.
            # list.index needs an exact match and raises ValueError otherwise.
            if "best_r2_list" in best_trial.user_attrs:
                best_repetition = best_trial.user_attrs['best_r2_list'].index(best_trial.values[0])
                best_repetition = f"{best_repetition}_"
            else:
                best_repetition = ""

            results_dir = f"/s/project/uk_biobank/processed/g2p/modelling/hyperopt/{study_name}"
            # results_dir = f"/s/project/uk_biobank/processed/g2p/mean_model/hyperopt/{study_name}"
            trial_prefix = f"{results_dir}/{best_trial.number}_{best_repetition}"

            run_id = pd.read_parquet(f"{trial_prefix}run_id.pq").values.item()

            # Read the gene-effect file once; `columns` must be a list of column names.
            best_r2_betas = pd.read_parquet(f"{trial_prefix}mean_betas.pq", columns=["best_r2_epoch", "best_r2_mean_beta", "best_r2_var_beta"])
            best_epoch = best_r2_betas["best_r2_epoch"].iloc[0]

            mean_betas_df = best_r2_betas[["best_r2_mean_beta", "best_r2_var_beta"]].rename(columns={"best_r2_mean_beta": "mean_beta", "best_r2_var_beta": "var_beta"})
            mean_betas_df["model"] = f"{embedding_type}_bayesian_{version}"

            # The column is requested by its string name, so it is also selected by
            # that same string (a non-string epoch would not match the column label).
            # The assignment aligns on the index.
            epoch_col = str(best_epoch)
            mean_betas_df["fE"] = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/wandb/{run_id}_gE.pq", columns=[epoch_col])[epoch_col]
            # mean_betas_df["fE"] = pd.read_parquet(f"/s/project/uk_biobank/processed/g2p/mean_model/wandb/{run_id}_gE.pq", columns=[epoch_col])[epoch_col]

            mean_betas_df["trait"] = trait
            mean_betas_df["phenocode"] = str(phenocode)

            mean_betas_df["std_err"] = np.sqrt(mean_betas_df["var_beta"])
            # Negative natural log of twice the smaller tail mass at 0 of
            # Normal(mean_beta, std_err), i.e. a two-sided p-value on the log scale.
            mean_betas_df["neglog_pval"] = -np.minimum(np.log(2) + scipy.stats.norm.logcdf(0, mean_betas_df["mean_beta"], mean_betas_df["std_err"]), np.log(2) + scipy.stats.norm.logsf(0, mean_betas_df["mean_beta"], mean_betas_df["std_err"]))

            # Larger of the two tail masses at 0 of the same normal distribution.
            mean_betas_df["pd"] = np.maximum(scipy.stats.norm.cdf(0, mean_betas_df["mean_beta"], mean_betas_df["std_err"]), scipy.stats.norm.sf(0, mean_betas_df["mean_beta"], mean_betas_df["std_err"]))

            mean_betas_df["significant"] = mean_betas_df["pd"] > 0.999
            mean_betas_df["replicated"] = mean_betas_df.index.isin(replication_dict[phenocode])
            mean_betas_df["version"] = version
            mean_betas_df["genotype"] = genotype

            mean_betas.append(mean_betas_df.reset_index().rename(columns={"mean_beta": "beta", "index": "gene_id"})[["trait", "phenocode", "gene_id", "model", "genotype", "fE", "beta", "std_err", "neglog_pval", "pd", "significant", "replicated", "version"]])

            eval_r2_df = pd.DataFrame(best_trials[trait][embedding_type], index=[0])
            eval_r2_df["trait"] = trait
            eval_r2_df["model"] = embedding_type
            eval_r2.append(eval_r2_df)

            bayes_pred_df = pd.read_parquet(f"{trial_prefix}bayes_pred.pq", columns=[f'{trait}_measurement', 'common_residual', 'best_r2_pred']).reset_index().rename(columns={f'{trait}_measurement': 'trait_measurement', 'best_r2_pred': 'pred'})
            bayes_pred_df["trait"] = trait
            bayes_pred_df["phenocode"] = str(phenocode)
            bayes_pred_df["model"] = f"{embedding_type}_bayesian_{version}"
            bayes_pred_df["version"] = version
            bayes_pred_df["genotype"] = genotype

            predictions.append(bayes_pred_df)


if skipped_studies:
    print(f"Skipped {len(skipped_studies)} studies that are missing or have no completed trial: {skipped_studies}")

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
if not mean_betas or not predictions:
    raise ValueError("No usable optuna study was found for any trait; nothing to consolidate.")

# Dataframe with posterior gene effects
mean_betas_df = pd.concat(mean_betas).reset_index(drop=True)

# Dataframe with phenotype predictions
pred_df = pd.concat(predictions)[["trait", "phenocode", "model", "genotype", "individual", "trait_measurement", "common_residual", "pred", "version"]].reset_index(drop=True)

# Sanity check: every trait should come with the same number of models.
if pred_df[["trait", "model"]].drop_duplicates().groupby("trait").size().nunique() != 1:
    print("Different number of models per trait in pred_df")
    
if mean_betas_df[["trait", "model"]].drop_duplicates().groupby("trait").size().nunique() != 1:
    print("Different number of models per trait in mean_betas_df")
    
# Fix the category order of the models: genotype descending, then model name ascending.
model_list = pred_df[["model", "genotype"]].drop_duplicates().sort_values(["genotype", "model"], ascending=[False, True])["model"].to_list()
pred_df["model"] = pd.Categorical(pred_df['model'], categories=model_list)


In [None]:
# Write one predictions file and one gene-effect file per study version.
# Rows are selected with a boolean mask rather than a string-built query
# expression, so the filter does not depend on the query engine or on the
# characters contained in the version string.
for study_version in study_versions:
    study_version = study_version.rstrip(',')

    # Keep rows without a version as well as rows of the current version.
    pred_rows = pred_df["version"].isna() | (pred_df["version"] == study_version)
    gene_rows = mean_betas_df["version"].isna() | (mean_betas_df["version"] == study_version)

    pred_df[pred_rows].reset_index(drop=True).to_parquet(f"/s/project/geno2pheno/predictions/bayesian/{study_version}_predictions_extended.pq")
    mean_betas_df[gene_rows].reset_index(drop=True).to_parquet(f"/s/project/geno2pheno/predictions/bayesian/{study_version}_genes_extended.pq")
    
    # Alternative output location (best_model_pred), kept for reference:
    # pred_df[pred_rows].reset_index(drop=True).to_parquet(f"/s/project/geno2pheno/predictions/bayesian/best_model_pred/{study_version}_predictions_extended.pq")
    # mean_betas_df[gene_rows].reset_index(drop=True).to_parquet(f"/s/project/geno2pheno/predictions/bayesian/best_model_pred/{study_version}_genes_extended.pq")
