# Description

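Generates reference data for the GLS unit tests: it fits generalized least squares (GLS) models with `statsmodels` between MultiPLIER latent variables (LVs) and MultiXcan gene associations, using the gene-gene correlation matrix as the error covariance, prints the full-precision estimates, and saves the phenotypes used into `tests/data/gls`.

**TODO:** update this description.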

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import numpy as np
from scipy.spatial.distance import squareform
import pandas as pd
from scipy import stats

# import matplotlib.pyplot as plt
# import seaborn as sns

import conf
import utils
from entity import Gene

# Settings

In [3]:
# a cohort name (it could be something like UK_BIOBANK, etc)
COHORT_NAME = "1000G_EUR"

# reference panel such as 1000G or GTEX_V8
REFERENCE_PANEL = "1000G"

# predictions models such as MASHR or ELASTIC_NET
EQTL_MODEL = "MASHR"

In [4]:
OUTPUT_DIR_BASE = (
    conf.RESULTS["GLS"]
    / "gene_corrs"
    / "cohorts"
    / COHORT_NAME.lower()
    / REFERENCE_PANEL.lower()
    / EQTL_MODEL.lower()
)
display(OUTPUT_DIR_BASE)
assert OUTPUT_DIR_BASE.exists()

# OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/results/gls/gene_corrs/cohorts/1000g_eur/1000g/mashr')

In [5]:
OUTPUT_DIR = utils.get_git_repository_path() / "tests" / "data" / "gls"
display(OUTPUT_DIR)
assert OUTPUT_DIR.exists()

PosixPath('/opt/code/tests/data/gls')

# Load data

## MultiPLIER Z

In [6]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [7]:
multiplier_z_genes = multiplier_z.index.tolist()

In [8]:
len(multiplier_z_genes)

6750

In [9]:
multiplier_z_genes[:10]

['GAS6',
 'MMP14',
 'DSP',
 'MARCKSL1',
 'SPARC',
 'CTSD',
 'EPAS1',
 'PALLD',
 'PHC2',
 'LGALS3BP']

## Function to load MultiXcan's results on random phenotypes

In [10]:
def load_multixcan_random_phenotype(phenotype_code):
    """
    Read the MultiXcan results file of one random phenotype.

    Args:
        phenotype_code: identifier of the random phenotype; it is interpolated
            into the file name (``random.pheno{phenotype_code}-...``).

    Returns:
        A DataFrame read from the tab-separated results file, indexed by the
        ``gene_name`` column.
    """
    results_file = (
        conf.RESULTS["GLS_NULL_SIMS"]
        / "twas"
        / "smultixcan"
        / f"random.pheno{phenotype_code}-gtex_v8-mashr-smultixcan.txt"
    )

    return pd.read_csv(results_file, sep="\t", index_col="gene_name")

In [11]:
load_multixcan_random_phenotype(0).head()

Unnamed: 0_level_0,gene,pvalue,n,n_indep,p_i_best,t_i_best,p_i_worst,t_i_worst,eigen_max,eigen_min,eigen_min_kept,z_min,z_max,z_mean,z_sd,tmi,status
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
RHPN2,ENSG00000131941.7,4e-05,48.0,3.0,0.000213947,Artery_Tibial,0.990132,Brain_Nucleus_accumbens_basal_ganglia,36.556432,7.692089e-16,2.519701,-2.721185,3.701952,1.283152,1.825567,3.0,0
GPATCH1,ENSG00000076650.6,7.8e-05,40.0,3.0,0.000453439,Brain_Cerebellum,0.817384,Brain_Frontal_Cortex_BA9,29.990208,2.086487e-15,1.815203,-3.506853,2.383485,-2.016745,1.715495,3.0,0
NFKBIA,ENSG00000100906.10,9.6e-05,1.0,1.0,9.591208e-05,Brain_Frontal_Cortex_BA9,9.6e-05,Brain_Frontal_Cortex_BA9,1.0,1.0,1.0,-3.900707,-3.900707,-3.900707,,1.0,0
TTC5,ENSG00000136319.11,0.000109,47.0,5.0,0.001402826,Brain_Hippocampus,0.961887,Colon_Sigmoid,21.272442,8.142339e-16,0.732606,-3.194069,1.397514,-0.916662,1.068989,5.0,0
ADGRA3,ENSG00000152990.13,0.000135,41.0,12.0,3.211289e-07,Heart_Atrial_Appendage,0.653657,Whole_Blood,12.988248,3.499412e-16,0.444682,-5.110605,3.59941,-0.464735,2.316607,12.0,0


## MultiXcan real results (PhenomeXcan)

In [12]:
multixcan_real_results = pd.read_pickle(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
).rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [13]:
multixcan_real_results = multixcan_real_results[
    ~multixcan_real_results.index.duplicated(keep="first")
].dropna(how="all", axis=0)

In [14]:
multixcan_real_results.shape

(22508, 3752)

In [15]:
multixcan_real_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


In [16]:
# `axis` must be given by keyword: newer pandas versions only accept keyword
# arguments in DataFrame.any; axis=None reduces over both axes to one bool
assert not multixcan_real_results.isna().any(axis=None)

## Load full correlation matrix

In [17]:
orig_corr_mat = pd.read_pickle(OUTPUT_DIR / "corr_mat.pkl.xz")

In [18]:
orig_corr_mat.shape

(6442, 6442)

In [19]:
orig_corr_mat.head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.115011,0.173138,0.056096,0.008032,0.008727,0.006797,0.004533,0.00735,0.010391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.115011,1.0,0.681368,0.360588,0.011545,0.010729,0.003577,0.01023,0.010747,0.008769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.173138,0.681368,1.0,0.381394,0.011774,0.012527,0.003754,0.012096,0.012679,0.010442,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.056096,0.360588,0.381394,1.0,0.013005,0.015775,0.006184,0.006813,0.010775,0.009189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.008032,0.011545,0.011774,0.013005,1.0,0.356676,0.45401,0.137643,0.20034,0.09321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load cohort metadata

In [20]:
gene_tissues_df = pd.read_pickle(
    utils.get_git_repository_path()
    / "tests"
    / "data"
    / "gls"
    / "cohort_1000g_eur_metadata"
    / "gene_tissues.pkl.gz"
).set_index("gene_name")

In [21]:
gene_tissues_df.shape

(22314, 6)

In [22]:
gene_tissues_df = gene_tissues_df.loc[~gene_tissues_df.index.duplicated(keep="first")]

In [23]:
gene_tissues_df.shape

(22308, 6)

In [24]:
assert gene_tissues_df.index.is_unique

In [25]:
gene_tissues_df.head()

Unnamed: 0_level_0,tissue,n_tissues,n_snps_used_sum,n_snps_in_model_sum,unique_n_snps_in_model,unique_n_snps_used
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DPM1,"(Brain_Substantia_nigra, Brain_Hypothalamus)",2,2,2,2,2
SCYL3,"(Brain_Hippocampus, Nerve_Tibial, Brain_Anteri...",48,88,90,18,17
C1orf112,"(Brain_Hippocampus, Nerve_Tibial, Brain_Substa...",39,62,64,24,22
FGR,"(Brain_Hippocampus, Nerve_Tibial, Brain_Anteri...",36,40,40,5,5
CFH,"(Brain_Hippocampus, Nerve_Tibial, Brain_Anteri...",34,44,44,12,12


# Functions

In [35]:
import statsmodels.api as sm
from sklearn.preprocessing import scale

In [205]:
def get_data(
    lv_code,
    random_phenotype_code=None,
    real_phenotype_code=None,
    add_covars=False,
    add_snplevel_covars=False,
    add_covars_logs=False,
):
    """
    Build the design matrix X and the response y for one LV and one phenotype.

    Only genes present in the correlation matrix (`orig_corr_mat`), in the
    phenotype results and in the MultiPLIER Z matrix are kept.

    Args:
        lv_code: column of `multiplier_z` to use as predictor (e.g. "LV7").
        random_phenotype_code: code of a random phenotype; its MultiXcan
            p-values are converted to absolute z-scores. Takes precedence over
            `real_phenotype_code` if both are given.
        real_phenotype_code: column of `multixcan_real_results` to use as y.
        add_covars: if True, adds `gene_size` (the `n_indep` column of the
            random phenotype results) and `gene_density` (`n_indep` / `n`).
        add_snplevel_covars: if True, adds `gene_n_snps_used` and
            `gene_n_snps_used_density` (`n_snps_used_sum` /
            `n_snps_in_model_sum`) taken from `gene_tissues_df`.
        add_covars_logs: if True, also adds the log-transformed version of each
            requested covariate (densities are negated so they are >= 0 when
            the density is <= 1). Ignored if no covariate is requested.

    Returns:
        A tuple (X, y): X is a DataFrame with a `const` column, the LV column
        and the requested covariates; y is a Series with the same index.

    Raises:
        ValueError: if no phenotype code is given, or if covariates are
            requested without `random_phenotype_code` (the gene index and the
            `n`/`n_indep` columns are read from the random phenotype results).
    """
    if random_phenotype_code is None and real_phenotype_code is None:
        raise ValueError(
            "Either random_phenotype_code or real_phenotype_code must be given"
        )

    needs_covars = add_covars or add_snplevel_covars
    if needs_covars and random_phenotype_code is None:
        raise ValueError(
            "Covariates are read from the random phenotype results, so "
            "random_phenotype_code must be given when add_covars or "
            "add_snplevel_covars is True"
        )

    # read the random phenotype results only once; they provide both the
    # p-values (response) and the n/n_indep columns (covariates)
    random_results = None
    if random_phenotype_code is not None:
        random_results = load_multixcan_random_phenotype(random_phenotype_code)

    if random_results is not None:
        pvalues = random_results["pvalue"]
        # two-sided p-value -> absolute z-score
        y = pd.Series(
            data=np.abs(stats.norm.ppf(pvalues.to_numpy() / 2)),
            index=pvalues.index.copy(),
        )
    else:
        y = multixcan_real_results[real_phenotype_code]

    y = y[~y.index.duplicated(keep="first")].dropna()

    lv_weights = multiplier_z[lv_code].copy()

    # keep the gene order of the correlation matrix
    common_genes = orig_corr_mat.index.intersection(y.index).intersection(
        lv_weights.index
    )
    y = y.loc[common_genes]
    X = sm.add_constant(lv_weights.loc[common_genes])

    if not needs_covars:
        return X, y

    covars = random_results[["n", "n_indep"]]
    covars = covars[~covars.index.duplicated(keep="first")]
    covars = covars.loc[X.index]

    if add_covars:
        covars = covars.rename(columns={"n_indep": "gene_size"})
        covars = covars.assign(gene_density=covars["gene_size"] / covars["n"])

        if add_covars_logs:
            covars["gene_size_log"] = np.log(covars["gene_size"])
            covars["gene_density_log"] = -np.log(covars["gene_density"])

    if add_snplevel_covars:
        # assign() aligns these Series to the genes in covars by index
        covars = covars.assign(
            gene_n_snps_used=gene_tissues_df["n_snps_used_sum"],
            gene_n_snps_used_density=(
                gene_tissues_df["n_snps_used_sum"]
                / gene_tissues_df["n_snps_in_model_sum"]
            ),
        )

        if add_covars_logs:
            covars["gene_n_snps_used_log"] = np.log(covars["gene_n_snps_used"])
            covars["gene_n_snps_used_density_log"] = -np.log(
                covars["gene_n_snps_used_density"]
            )

    # the raw columns were only needed to derive the covariates ("n_indep" is
    # still present when add_covars is False, since it was not renamed)
    covars = covars.drop(columns=["n", "n_indep"], errors="ignore")

    return X.join(covars), y

In [206]:
# testing: without covariates X only has the intercept and the LV
_X, _y = get_data("LV7", 10)
assert _X.shape[0] < 7000
assert _X.shape[1] == 2
assert "LV7" in _X.columns
assert "const" in _X.columns
# `axis` is passed by keyword (positional arguments are not accepted by
# DataFrame.any in newer pandas versions)
assert not _X.isna().any(axis=None)

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any()

In [207]:
_X.head()

Unnamed: 0,const,LV7
NOC2L,1.0,0.0
HES4,1.0,0.0
ISG15,1.0,0.0
AGRN,1.0,0.0
TNFRSF18,1.0,0.0


In [208]:
_y.head()

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
dtype: float64

In [209]:
# testing: gene-level covariates only
_X, _y = get_data("LV7", 10, add_covars=True)
assert _X.shape[0] < 7000
assert _X.shape[1] == 4
assert "LV7" in _X.columns
assert "const" in _X.columns
assert "gene_size" in _X.columns
assert "gene_density" in _X.columns
# `axis` is passed by keyword (positional arguments are not accepted by
# DataFrame.any in newer pandas versions)
assert not _X.isna().any(axis=None)

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any()

In [210]:
_X.head()

Unnamed: 0,const,LV7,gene_size,gene_density
NOC2L,1.0,0.0,5.0,0.106383
HES4,1.0,0.0,3.0,0.068182
ISG15,1.0,0.0,6.0,0.142857
AGRN,1.0,0.0,4.0,0.086957
TNFRSF18,1.0,0.0,4.0,0.088889


In [211]:
# load_multixcan_random_phenotype(10).loc["TNFRSF18"]

In [212]:
assert _X.loc["TNFRSF18", "gene_size"] == 4.0
assert _X.loc["TNFRSF18", "gene_density"] == 4 / 45.0

In [213]:
_y.head()

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
dtype: float64

In [214]:
# testing: SNP-level covariates only
_X, _y = get_data("LV7", 10, add_snplevel_covars=True)
assert _X.shape[0] < 7000
assert _X.shape[1] == 4
assert "LV7" in _X.columns
assert "const" in _X.columns
assert "gene_n_snps_used" in _X.columns
assert "gene_n_snps_used_density" in _X.columns
# `axis` is passed by keyword (positional arguments are not accepted by
# DataFrame.any in newer pandas versions)
assert not _X.isna().any(axis=None)

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any()

In [215]:
_X.head()

Unnamed: 0,const,LV7,gene_n_snps_used,gene_n_snps_used_density
NOC2L,1.0,0.0,103,0.980952
HES4,1.0,0.0,55,0.785714
ISG15,1.0,0.0,59,0.967213
AGRN,1.0,0.0,75,0.728155
TNFRSF18,1.0,0.0,65,0.984848


In [216]:
# gene_tissues_df.loc["AGRN"]

In [217]:
assert _X.loc["AGRN", "gene_n_snps_used"] == 75.0
assert _X.loc["AGRN", "gene_n_snps_used_density"] == 75 / 103.0

In [218]:
_y.head()

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
dtype: float64

In [219]:
# testing: gene-level and SNP-level covariates together
_X, _y = get_data("LV7", 10, add_covars=True, add_snplevel_covars=True)
assert _X.shape[0] < 7000
assert _X.shape[1] == 6
assert "LV7" in _X.columns
assert "const" in _X.columns
assert "gene_size" in _X.columns
assert "gene_density" in _X.columns
assert "gene_n_snps_used" in _X.columns
assert "gene_n_snps_used_density" in _X.columns
# `axis` is passed by keyword (positional arguments are not accepted by
# DataFrame.any in newer pandas versions)
assert not _X.isna().any(axis=None)

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any()

In [220]:
_X.head()

Unnamed: 0,const,LV7,gene_size,gene_density,gene_n_snps_used,gene_n_snps_used_density
NOC2L,1.0,0.0,5.0,0.106383,103,0.980952
HES4,1.0,0.0,3.0,0.068182,55,0.785714
ISG15,1.0,0.0,6.0,0.142857,59,0.967213
AGRN,1.0,0.0,4.0,0.086957,75,0.728155
TNFRSF18,1.0,0.0,4.0,0.088889,65,0.984848


In [221]:
# gene_tissues_df.loc["AGRN"]

In [222]:
assert _X.loc["AGRN", "gene_n_snps_used"] == 75.0
assert _X.loc["AGRN", "gene_n_snps_used_density"] == 75 / 103.0

In [223]:
_y.head()

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
dtype: float64

In [224]:
# testing: all covariates plus their log-transformed versions
_X, _y = get_data(
    "LV7", 10, add_covars=True, add_snplevel_covars=True, add_covars_logs=True
)
assert _X.shape[0] < 7000
assert _X.shape[1] == 10
assert "LV7" in _X.columns
assert "const" in _X.columns
assert "gene_size" in _X.columns
assert "gene_size_log" in _X.columns
assert "gene_density" in _X.columns
assert "gene_density_log" in _X.columns
assert "gene_n_snps_used" in _X.columns
assert "gene_n_snps_used_log" in _X.columns
assert "gene_n_snps_used_density" in _X.columns
assert "gene_n_snps_used_density_log" in _X.columns
# `axis` is passed by keyword (positional arguments are not accepted by
# DataFrame.any in newer pandas versions)
assert not _X.isna().any(axis=None)

# densities must be in (0, 1], so their negated logs are non-negative
assert _X["gene_density"].between(0.0, 1.0, inclusive="right").all()
assert _X["gene_density_log"].min() >= 0.0
assert _X["gene_size"].min() >= 0.0
assert _X["gene_size_log"].min() >= 0.0
assert _X["gene_n_snps_used"].min() >= 0.0
assert _X["gene_n_snps_used_log"].min() >= 0.0
assert _X["gene_n_snps_used_density"].between(0.0, 1.0, inclusive="right").all()
assert _X["gene_n_snps_used_density_log"].min() >= 0.0

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any()

In [225]:
_X["gene_size_log"].describe()

count    6442.000000
mean        1.328358
std         0.564777
min         0.000000
25%         1.098612
50%         1.386294
75%         1.791759
max         2.890372
Name: gene_size_log, dtype: float64

In [226]:
_X["gene_density_log"].describe()

count    6442.000000
mean        2.134227
std         0.801612
min        -0.000000
25%         1.658228
50%         2.219203
75%         2.708050
max         3.891820
Name: gene_density_log, dtype: float64

In [227]:
_X["gene_n_snps_used_log"].describe()

count    6442.000000
mean        3.816128
std         0.795153
min         0.000000
25%         3.555348
50%         4.007333
75%         4.343805
max         5.318120
Name: gene_n_snps_used_log, dtype: float64

In [228]:
_X["gene_n_snps_used_density_log"].describe()

count    6442.000000
mean        0.056216
std         0.125470
min        -0.000000
25%        -0.000000
50%        -0.000000
75%         0.039221
max         1.252763
Name: gene_n_snps_used_density_log, dtype: float64

In [229]:
_X.head()

Unnamed: 0,const,LV7,gene_size,gene_density,gene_size_log,gene_density_log,gene_n_snps_used,gene_n_snps_used_density,gene_n_snps_used_log,gene_n_snps_used_density_log
NOC2L,1.0,0.0,5.0,0.106383,1.609438,2.24071,103,0.980952,4.634729,0.019231
HES4,1.0,0.0,3.0,0.068182,1.098612,2.685577,55,0.785714,4.007333,0.241162
ISG15,1.0,0.0,6.0,0.142857,1.791759,1.94591,59,0.967213,4.077537,0.033336
AGRN,1.0,0.0,4.0,0.086957,1.386294,2.442347,75,0.728155,4.317488,0.317241
TNFRSF18,1.0,0.0,4.0,0.088889,1.386294,2.420368,65,0.984848,4.174387,0.015267


In [230]:
_y.head()

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
dtype: float64

In [231]:
def standardize_data(X, y):
    """
    Return standardized copies of X and y (inputs are not modified).

    Every column of X except "const" is centered on its mean and divided by
    its standard deviation (pandas default, i.e. sample standard deviation);
    the same transformation is applied to y.

    Args:
        X: DataFrame with the predictors, optionally including a "const" column.
        y: Series with the response.

    Returns:
        A tuple (X_standardized, y_standardized).
    """
    X_std = X.copy()
    y_std = y.copy()

    # the intercept column is left untouched
    non_const_cols = [col for col in X_std.columns if col != "const"]
    predictors = X_std[non_const_cols]
    X_std[non_const_cols] = (predictors - predictors.mean()) / predictors.std()

    y_std = (y_std - y_std.mean()) / y_std.std()

    return X_std, y_std

In [232]:
def get_aligned_corr_mat(X, perc=1.0):
    """
    Return the gene correlation matrix aligned to the genes (rows) of X.

    Args:
        X: design matrix indexed by gene; its second column (position 1) is
            taken as the LV weights.
        perc: None returns the aligned matrix without subsetting. Otherwise,
            correlations are kept only among genes whose LV weight is >= the
            (1 - perc) quantile of the strictly positive weights (so 1.0 keeps
            all genes with non-zero weight); all other genes get identity
            rows/columns.

    Returns:
        A square DataFrame with X.index as both index and columns.
    """
    aligned_corrs = orig_corr_mat.loc[X.index, X.index]

    if perc is None:
        return aligned_corrs

    lv_weights = X.iloc[:, 1]
    weight_threshold = lv_weights[lv_weights > 0].quantile(1 - perc)
    selected_genes = lv_weights[lv_weights >= weight_threshold].index
    selected_genes = selected_genes.intersection(aligned_corrs.index)

    # start from the identity (no correlations) and copy only the selected block
    sub_corrs = pd.DataFrame(
        np.identity(aligned_corrs.shape[0]),
        index=aligned_corrs.index.copy(),
        columns=aligned_corrs.columns.copy(),
    )
    sub_corrs.loc[selected_genes, selected_genes] = aligned_corrs.loc[
        selected_genes, selected_genes
    ]

    return sub_corrs

In [233]:
# testing
_X_test = pd.DataFrame(
    {
        "const": 1.0,
        "LV1": [1.0, 0.4, 0.0],  # the last gene has zero weight
    },
    index=[
        "PSMB10",  # the first two genes have a high sum of correlations, to make sure the sum is not close to 1.0
        "SLC12A4",
        "ACD",
    ],
)

# np.testing.assert_array_equal is used instead of a bare assert so that a
# failure reports which values differ

# do not subset
_tmp_corr = get_aligned_corr_mat(_X_test, perc=None)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
np.testing.assert_array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.77, 0.73],
            [0.77, 1.0, 0.63],
            [0.73, 0.63, 1.00],
        ]
    ),
)

# do subset: include all non-zero LV genes
_tmp_corr = get_aligned_corr_mat(_X_test, perc=1.0)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
np.testing.assert_array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.77, 0.00],
            [0.77, 1.0, 0.00],
            [0.00, 0.00, 1.00],
        ]
    ),
)

# do subset: keep the top 99% of the non-zero LV genes, i.e. those with
# weight >= the 1st percentile of the non-zero weights (about 0.406 here);
# only PSMB10 passes, so no off-diagonal correlation is kept
_tmp_corr = get_aligned_corr_mat(_X_test, perc=0.99)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
np.testing.assert_array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.00, 0.00],
            [0.00, 1.0, 0.00],
            [0.00, 0.00, 1.00],
        ]
    ),
)

In [234]:
def train_statsmodels_gls(X, y, corr_mat):
    """
    Fit a statsmodels GLS model of y on X.

    Args:
        X: design matrix (exogenous variables).
        y: response (endogenous variable).
        corr_mat: matrix passed as `sigma` to `sm.GLS`.

    Returns:
        The fitted results object returned by `sm.GLS(...).fit()`.
    """
    return sm.GLS(y, X, sigma=corr_mat).fit()

# [full corr matrix] GLS on randomly generated phenotypes

In [32]:
PERC_NONZERO_GENES = None

## Random phenotype 6 / LV45

In [33]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [34]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [35]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1459
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.702
Time:                        15:42:55   Log-Likelihood:                -8695.5
No. Observations:                6442   AIC:                         1.739e+04
Df Residuals:                    6440   BIC:                         1.741e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0278      0.024      1.152      0.2

In [36]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.429
Date:                Mon, 25 Jul 2022   Prob (F-statistic):             0.0641
Time:                        15:42:55   Log-Likelihood:                -5683.6
No. Observations:                6442   AIC:                         1.137e+04
Df Residuals:                    6440   BIC:                         1.138e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7650      0.007    104.807      0.0

In [37]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

-0.003281047962518868
0.008589327735370812
-0.3819912411779944
0.702480465360728
0.6487597673196361


In [38]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45
HIST1H2BO,1.0,8.480948
HIST1H2BF,1.0,8.426226
HIST1H2BK,1.0,8.245903
HIST1H2BD,1.0,8.119013
HIST1H2BC,1.0,7.744137
...,...,...
TREM1,1.0,0.000000
TREML2,1.0,0.000000
TREM2,1.0,0.000000
NFYA,1.0,0.000000


In [39]:
Xs.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45
HIST1H2BO,1.0,27.865226
HIST1H2BF,1.0,27.685041
HIST1H2BK,1.0,27.091293
HIST1H2BD,1.0,26.673482
HIST1H2BC,1.0,25.439130
...,...,...
TREM1,1.0,-0.059957
TREML2,1.0,-0.059957
TREM2,1.0,-0.059957
NFYA,1.0,-0.059957


In [40]:
y.sort_values(ascending=False)

CHPF2     4.039680
PRR5      3.726033
MMP12     3.686147
RBM38     3.581041
SOS1      3.528183
            ...   
GPX3      0.000826
SUOX      0.000686
SPRED2    0.000364
DEGS1     0.000152
SAFB      0.000072
Length: 6442, dtype: float64

In [41]:
ys.sort_values(ascending=False)

CHPF2     5.600117
PRR5      5.063874
MMP12     4.995681
RBM38     4.815982
SOS1      4.725610
            ...   
GPX3     -1.305113
SUOX     -1.305353
SPRED2   -1.305903
DEGS1    -1.306266
SAFB     -1.306403
Length: 6442, dtype: float64

In [42]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 6 / LV455

In [43]:
lv_code = "LV455"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [44]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [45]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                   0.02003
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.887
Time:                        15:43:18   Log-Likelihood:                -8695.5
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6440   BIC:                         1.741e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0280      0.024      1.158      0.2

In [46]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2956
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.587
Time:                        15:43:18   Log-Likelihood:                -5685.2
No. Observations:                6442   AIC:                         1.137e+04
Df Residuals:                    6440   BIC:                         1.139e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7639      0.007    104.494      0.0

In [47]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.0015604132867500819
0.011025668844737165
0.14152549915326973
0.8874592440993045
0.4437296220496523


In [48]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV455
CACNA1A,1.0,8.294351
ZNF26,1.0,7.956442
ARHGAP42,1.0,5.592084
UBE2B,1.0,5.379685
GAB2,1.0,3.946462
...,...,...
ZNF655,1.0,0.000000
ZKSCAN5,1.0,0.000000
ZNF394,1.0,0.000000
ATP5J2,1.0,0.000000


In [49]:
y.sort_values(ascending=False)

CHPF2     4.039680
PRR5      3.726033
MMP12     3.686147
RBM38     3.581041
SOS1      3.528183
            ...   
GPX3      0.000826
SUOX      0.000686
SPRED2    0.000364
DEGS1     0.000152
SAFB      0.000072
Length: 6442, dtype: float64

In [50]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 0 / LV801

In [59]:
lv_code = "LV801"
phenotype_code = 0

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [60]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [61]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5235
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.469
Time:                        15:47:30   Log-Likelihood:                -8680.1
No. Observations:                6442   AIC:                         1.736e+04
Df Residuals:                    6440   BIC:                         1.738e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0218      0.024     -0.905      0.3

In [62]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.082
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.149
Time:                        15:47:50   Log-Likelihood:                -5887.0
No. Observations:                6442   AIC:                         1.178e+04
Df Residuals:                    6440   BIC:                         1.179e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7704      0.008    101.372      0.0

In [63]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.007927779814996835
0.010957049553795768
0.7235323502074036
0.4693791652354944
0.2346895826177472


In [64]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV801
SPATS2,1.0,7.137544
SSR3,1.0,6.096376
H3F3C,1.0,4.471034
COLEC12,1.0,4.057876
AP4M1,1.0,2.904084


In [65]:
y.sort_values(ascending=False).head()

GPATCH1    3.951082
NFKBIA     3.900707
TTC5       3.870256
ZNF17      3.807992
ZNF563     3.747230
dtype: float64

In [66]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

# [sub corr matrix] GLS on randomly generated phenotypes

In [67]:
PERC_NONZERO_GENES = 1.00

## Random phenotype 6 / LV45

In [68]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [69]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [70]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5651
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.452
Time:                        15:53:29   Log-Likelihood:                -9137.9
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.829e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0003      0.013      0.027      0.9

In [71]:
# for debugging purposes I print the OLS results also
# Note: this OLS fit uses the raw y and X, whereas the GLS above was fitted on
# the standardized ys and Xs, so the two sets of coefficients are not on the
# same scale.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.429
Date:                Mon, 25 Jul 2022   Prob (F-statistic):             0.0641
Time:                        15:53:55   Log-Likelihood:                -5683.6
No. Observations:                6442   AIC:                         1.137e+04
Df Residuals:                    6440   BIC:                         1.138e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7650      0.007    104.807      0.0

In [72]:
# print full numbers
# Index 1 selects the LV term (index 0 is the intercept "const").
# Printed in order: coefficient, standard error, t-statistic, the p-value
# reported in the summary's P>|t| column, and finally the upper-tail
# (one-sided) p-value P(T > t) computed with df_resid degrees of freedom.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

-0.007758262616074735
0.010320432763493127
-0.7517381096186531
0.45223603891122643
0.7738819805443868


In [73]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV45
HIST1H2BO,1.0,8.480948
HIST1H2BF,1.0,8.426226
HIST1H2BK,1.0,8.245903
HIST1H2BD,1.0,8.119013
HIST1H2BC,1.0,7.744137


In [74]:
y.sort_values(ascending=False).head()

CHPF2    4.039680
PRR5     3.726033
MMP12    3.686147
RBM38    3.581041
SOS1     3.528183
dtype: float64

In [75]:
# save phenotype
# Writes the raw `y` (not the standardized `ys`) as a pickle into OUTPUT_DIR;
# pandas infers xz compression from the ".xz" suffix of the file name.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 6 / LV455

In [76]:
# LV column and random phenotype used by the cells of this section
lv_code = "LV455"
phenotype_code = 6

# file name (without extension) under which `y` is saved at the end of the
# section; it depends only on the phenotype code, not on the LV
phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [77]:
# X is the design matrix (intercept "const" + the LV column), y the response
# for the selected random phenotype.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
# NOTE(review): presumably the gene correlation matrix aligned to the genes in
# X; the helper is defined outside this view -- confirm.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

# the GLS model is fitted on the standardized copies (Xs, ys), not on X and y
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [78]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2917
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.589
Time:                        15:56:28   Log-Likelihood:                -9088.5
No. Observations:                6442   AIC:                         1.818e+04
Df Residuals:                    6440   BIC:                         1.819e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0047      0.014      0.336      0.7

In [79]:
# for debugging purposes I print the OLS results also
# Note: this OLS fit uses the raw y and X, whereas the GLS above was fitted on
# the standardized ys and Xs, so the two sets of coefficients are not on the
# same scale.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2956
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.587
Time:                        15:56:43   Log-Likelihood:                -5685.2
No. Observations:                6442   AIC:                         1.137e+04
Df Residuals:                    6440   BIC:                         1.139e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7639      0.007    104.494      0.0

In [80]:
# print full numbers
# Index 1 selects the LV term (index 0 is the intercept "const").
# Printed in order: coefficient, standard error, t-statistic, the p-value
# reported in the summary's P>|t| column, and finally the upper-tail
# (one-sided) p-value P(T > t) computed with df_resid degrees of freedom.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.006302906163160309
0.011669857436439947
0.5401013849131561
0.5891457914757958
0.2945728957378979


In [81]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV455
CACNA1A,1.0,8.294351
ZNF26,1.0,7.956442
ARHGAP42,1.0,5.592084
UBE2B,1.0,5.379685
GAB2,1.0,3.946462


In [82]:
y.sort_values(ascending=False).head()

CHPF2    4.039680
PRR5     3.726033
MMP12    3.686147
RBM38    3.581041
SOS1     3.528183
dtype: float64

In [83]:
# save phenotype
# Writes the raw `y` (not the standardized `ys`) as an xz-compressed pickle.
# phenotype_name is the same as in the "Random phenotype 6 / LV45" section, so
# this rewrites that same file.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 10 / LV100

In [84]:
# LV column and random phenotype used by the cells of this section
lv_code = "LV100"
phenotype_code = 10

# file name (without extension) under which `y` is saved at the end of the section
phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype10-pvalues'

In [85]:
# X is the design matrix (intercept "const" + the LV column), y the response
# for the selected random phenotype.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
# NOTE(review): presumably the gene correlation matrix aligned to the genes in
# X; the helper is defined outside this view -- confirm.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

# the GLS model is fitted on the standardized copies (Xs, ys), not on X and y
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [86]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                  0.009376
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.923
Time:                        15:58:34   Log-Likelihood:                -8978.3
No. Observations:                6442   AIC:                         1.796e+04
Df Residuals:                    6440   BIC:                         1.797e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0153      0.015      0.999      0.3

In [87]:
# for debugging purposes I print the OLS results also
# Note: this OLS fit uses the raw y and X, whereas the GLS above was fitted on
# the standardized ys and Xs, so the two sets of coefficients are not on the
# same scale.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5978
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.439
Time:                        15:58:40   Log-Likelihood:                -5800.3
No. Observations:                6442   AIC:                         1.160e+04
Df Residuals:                    6440   BIC:                         1.162e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7726      0.008    102.837      0.0

In [88]:
# print full numbers
# Index 1 selects the LV term (index 0 is the intercept "const").
# Printed in order: coefficient, standard error, t-statistic, the p-value
# reported in the summary's P>|t| column, and finally the upper-tail
# (one-sided) p-value P(T > t) computed with df_resid degrees of freedom.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.0004030553781634736
0.004162484428442315
0.0968304831146972
0.9228640287589255
0.46143201437946274


In [89]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV100
STX4,1.0,6.438276
RAB11B,1.0,6.34148
MED11,1.0,5.910379
NDUFB7,1.0,4.358643
MRPL34,1.0,3.903225


In [90]:
y.sort_values(ascending=False).head()

RPL15     3.852128
VAMP4     3.543077
HMGCS1    3.404068
MED9      3.360888
ABCB10    3.335043
dtype: float64

In [91]:
# save phenotype
# Writes the raw `y` (not the standardized `ys`) as a pickle into OUTPUT_DIR;
# pandas infers xz compression from the ".xz" suffix of the file name.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 0 / LV800

In [92]:
# LV column and random phenotype used by the cells of this section
lv_code = "LV800"
phenotype_code = 0

# file name (without extension) under which `y` is saved at the end of the section
phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [93]:
# X is the design matrix (intercept "const" + the LV column), y the response
# for the selected random phenotype.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
# NOTE(review): presumably the gene correlation matrix aligned to the genes in
# X; the helper is defined outside this view -- confirm.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

# the GLS model is fitted on the standardized copies (Xs, ys), not on X and y
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [94]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                   0.08856
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.766
Time:                        16:00:19   Log-Likelihood:                -9056.1
No. Observations:                6442   AIC:                         1.812e+04
Df Residuals:                    6440   BIC:                         1.813e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0021      0.014      0.152      0.8

In [95]:
# for debugging purposes I print the OLS results also
# Note: this OLS fit uses the raw y and X, whereas the GLS above was fitted on
# the standardized ys and Xs, so the two sets of coefficients are not on the
# same scale.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4172
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.518
Time:                        16:00:29   Log-Likelihood:                -5887.9
No. Observations:                6442   AIC:                         1.178e+04
Df Residuals:                    6440   BIC:                         1.179e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7706      0.008     98.662      0.0

In [96]:
# print full numbers
# Index 1 selects the LV term (index 0 is the intercept "const").
# Printed in order: coefficient, standard error, t-statistic, the p-value
# reported in the summary's P>|t| column, and finally the upper-tail
# (one-sided) p-value P(T > t) computed with df_resid degrees of freedom.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.004170133621705775
0.014013329781567654
0.2975833500465346
0.7660307826493885
0.38301539132469425


In [97]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV800
ZNF20,1.0,1.9807
ZNF606,1.0,1.931659
ZNF551,1.0,1.832833
ZNF543,1.0,1.701683
ZNF14,1.0,1.640789


In [98]:
y.sort_values(ascending=False).head()

GPATCH1    3.951082
NFKBIA     3.900707
TTC5       3.870256
ZNF17      3.807992
ZNF563     3.747230
dtype: float64

In [99]:
# save phenotype
# Writes the raw `y` (not the standardized `ys`) as a pickle into OUTPUT_DIR;
# pandas infers xz compression from the ".xz" suffix of the file name.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

# GLS on real phenotypes

In [100]:
# Value passed as `perc` to get_aligned_corr_mat() by the real-phenotype runs
# below (same value as in the sub corr matrix section on random phenotypes).
# NOTE(review): the exact meaning of `perc` is defined by that helper, which is
# outside this view -- confirm before changing the value.
PERC_NONZERO_GENES = 1.00

In [101]:
multixcan_real_results.columns

Index(['100001_raw-Food_weight', '100002_raw-Energy', '100003_raw-Protein',
       '100004_raw-Fat', '100005_raw-Carbohydrate', '100006_raw-Saturated_fat',
       '100007_raw-Polyunsaturated_fat', '100008_raw-Total_sugars',
       '100009_raw-Englyst_dietary_fibre', '100010-Portion_size',
       ...
       'visual impairment', 'vitiligo', 'vitreous body disease',
       'vocal cord polyp', 'voice disorders',
       'wellbeing measurement AND family relationship', 'wheezing',
       'whooping cough', 'worry measurement', 'wrist fracture'],
      dtype='object', length=3752)

## whooping cough / LV570

In [102]:
# LV column and real phenotype used by the cells of this section; the phenotype
# code is one of the column names of multixcan_real_results shown above.
lv_code = "LV570"
phenotype_code = "whooping cough"

# file name (without extension) for saving `y`; spaces in the phenotype code
# are replaced by underscores
phenotype_name = f"multixcan-phenomexcan-{phenotype_code.replace(' ', '_')}-pvalues"
display(phenotype_name)

'multixcan-phenomexcan-whooping_cough-pvalues'

In [103]:
# X is the design matrix (intercept "const" + the LV column), y the response
# for the selected real phenotype.
X, y = get_data(lv_code, real_phenotype_code=phenotype_code)
# NOTE(review): presumably the gene correlation matrix aligned to the genes in
# X; the helper is defined outside this view -- confirm.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

# the GLS model is fitted on the standardized copies (Xs, ys), not on X and y
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [104]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:         whooping cough   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2320
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.630
Time:                        16:04:43   Log-Likelihood:                -9096.4
No. Observations:                6442   AIC:                         1.820e+04
Df Residuals:                    6440   BIC:                         1.821e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0017      0.014     -0.119      0.9

In [105]:
# for debugging purposes I print the OLS results also
# Note: this OLS fit uses the raw y and X, whereas the GLS above was fitted on
# the standardized ys and Xs, so the two sets of coefficients are not on the
# same scale.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:         whooping cough   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.665
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.197
Time:                        16:04:53   Log-Likelihood:                -5875.5
No. Observations:                6442   AIC:                         1.176e+04
Df Residuals:                    6440   BIC:                         1.177e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7908      0.008    103.754      0.0

In [106]:
# print full numbers
# Index 1 selects the LV term (index 0 is the intercept "const").
# Printed in order: coefficient, standard error, t-statistic, the p-value
# reported in the summary's P>|t| column, and finally the upper-tail
# (one-sided) p-value P(T > t) computed with df_resid degrees of freedom.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.0049802005250721
0.010338597588777377
0.4817094854797467
0.6300287052648046
0.3150143526324023


In [107]:
# save phenotype
# Writes the raw `y` (not the standardized `ys`) as a pickle into OUTPUT_DIR;
# pandas infers xz compression from the ".xz" suffix of the file name.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [108]:
y

NOC2L       1.301498
HES4        0.491604
ISG15       0.300490
AGRN        1.595415
TNFRSF18    0.362178
              ...   
CPT1B       0.283934
CHKB        0.621814
MAPK8IP2    0.479153
ARSA        0.274866
SHANK3      1.125992
Name: whooping cough, Length: 6442, dtype: float64

## wheezing / LV400

In [109]:
# LV column and real phenotype used by the cells of this section; the phenotype
# code is one of the column names of multixcan_real_results shown above.
lv_code = "LV400"
phenotype_code = "wheezing"

# file name (without extension) for saving `y`; spaces in the phenotype code
# are replaced by underscores
phenotype_name = f"multixcan-phenomexcan-{phenotype_code.replace(' ', '_')}-pvalues"
display(phenotype_name)

'multixcan-phenomexcan-wheezing-pvalues'

In [110]:
# X is the design matrix (intercept "const" + the LV column), y the response
# for the selected real phenotype.
X, y = get_data(lv_code, real_phenotype_code=phenotype_code)
# NOTE(review): presumably the gene correlation matrix aligned to the genes in
# X; the helper is defined outside this view -- confirm.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

# the GLS model is fitted on the standardized copies (Xs, ys), not on X and y
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [111]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:               wheezing   R-squared:                       0.030
Model:                            GLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     200.6
Date:                Mon, 25 Jul 2022   Prob (F-statistic):           7.17e-45
Time:                        16:06:01   Log-Likelihood:                -10037.
No. Observations:                6442   AIC:                         2.008e+04
Df Residuals:                    6440   BIC:                         2.009e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0370      0.017     -2.199      0.0

In [112]:
# for debugging purposes I print the OLS results also
# Note: this OLS fit uses the raw y and X, whereas the GLS above was fitted on
# the standardized ys and Xs, so the two sets of coefficients are not on the
# same scale.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:               wheezing   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.553
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.213
Time:                        16:06:27   Log-Likelihood:                -8840.7
No. Observations:                6442   AIC:                         1.769e+04
Df Residuals:                    6440   BIC:                         1.770e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.1791      0.012     95.965      0.0

In [113]:
# print full numbers
# Index 1 selects the LV term (index 0 is the intercept "const").
# Printed in order: coefficient, standard error, t-statistic, the p-value
# reported in the summary's P>|t| column, and finally the upper-tail
# (one-sided) p-value P(T > t) computed with df_resid degrees of freedom.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

-0.1787005272685236
0.012616403393185567
-14.164141847671436
7.167212012881414e-45
1.0


In [114]:
# save phenotype
# Writes the raw `y` (not the standardized `ys`) as a pickle into OUTPUT_DIR;
# pandas infers xz compression from the ".xz" suffix of the file name.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [115]:
y

NOC2L       2.745848
HES4        0.208291
ISG15       0.136734
AGRN        0.229924
TNFRSF18    0.408283
              ...   
CPT1B       0.483466
CHKB        0.264246
MAPK8IP2    0.709193
ARSA        0.786883
SHANK3      0.404021
Name: wheezing, Length: 6442, dtype: float64

# [full corr matrix] GLS on randomly generated phenotypes using gene-level covariates

In [186]:
# Value passed as `perc` to get_aligned_corr_mat() by the runs in this section.
# NOTE(review): given the "[full corr matrix]" section title, None presumably
# makes the helper use the full correlation matrix -- confirm against the
# helper's definition, which is outside this view.
PERC_NONZERO_GENES = None

## Random phenotype 6 / LV45

In [187]:
# LV column and random phenotype used by the cells of this section
lv_code = "LV45"
phenotype_code = 6

# common prefix for the files written by this section: "<base>-pvalues" for
# the phenotype and "<base>-covars" for the gene-level covariates
phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [188]:
# With add_covars=True, X carries the gene-level covariate columns gene_size
# and gene_density in addition to "const" and the LV column.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_covars=True)
# NOTE(review): presumably the gene correlation matrix aligned to the genes in
# X; the helper is defined outside this view -- confirm.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [189]:
X.head()

Unnamed: 0,const,LV45,gene_size,gene_density
NOC2L,1.0,0.0,5.0,0.106383
HES4,1.0,0.0,3.0,0.068182
ISG15,1.0,0.0,6.0,0.142857
AGRN,1.0,0.0,4.0,0.086957
TNFRSF18,1.0,0.0,4.0,0.088889


In [190]:
y.head()

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
dtype: float64

In [202]:
Xs, ys = standardize_data(X, y)

In [208]:
# Sanity-check the standardization: every non-constant column of Xs must have
# (numerically) zero mean and unit standard deviation.
_Xs_desc = Xs[[lv_code, "gene_size", "gene_density"]].describe()
display(_Xs_desc)
# Compare the absolute mean with the tolerance: a plain `mean < 1e-10` is
# one-sided and would also accept an arbitrarily large negative mean.
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV45,gene_size,gene_density
count,6442.0,6442.0,6442.0
mean,1.764776e-17,1.985372e-17,6.13811e-15
std,1.0,1.0,1.0
min,-0.05995686,-1.451864,-0.844314
25%,-0.05995686,-0.5898049,-0.5795094
50%,-0.05995686,-0.1587756,-0.3389165
75%,-0.05995686,0.703283,0.1292323
max,27.86523,5.875635,4.763313


In [192]:
Xs.head()

Unnamed: 0,const,LV45,gene_size,gene_density
NOC2L,1.0,-0.059957,0.272254,-0.352155
HES4,1.0,-0.059957,-0.589805,-0.570836
ISG15,1.0,-0.059957,0.703283,-0.143361
AGRN,1.0,-0.059957,-0.158776,-0.463361
TNFRSF18,1.0,-0.059957,-0.158776,-0.452299


In [193]:
ys.head()

NOC2L      -0.749961
HES4       -0.837645
ISG15       0.726586
AGRN        1.372890
TNFRSF18    0.250993
dtype: float64

In [194]:
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [195]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1685
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.918
Time:                        18:37:33   Log-Likelihood:                -8695.3
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6438   BIC:                         1.743e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0239      0.025      0.956   

In [209]:
_gls_results.params

const           0.023948
LV45           -0.003233
gene_size      -0.007980
gene_density    0.000162
dtype: float64

In [197]:
# print full numbers
# Index 1 selects the LV term (index 0 is the intercept "const"; the
# covariates come after the LV).
# Printed in order: coefficient, standard error, t-statistic, the p-value
# reported in the summary's P>|t| column, and finally the upper-tail
# (one-sided) p-value P(T > t) computed with df_resid degrees of freedom.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

-0.0032326620431897984
0.008590802143381404
-0.37629338788582534
0.7067111941216788
0.6466444029391605


In [198]:
# save phenotype
# Writes the raw `y` (not the standardized `ys`) as a pickle into OUTPUT_DIR;
# pandas infers xz compression from the ".xz" suffix of the file name.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [199]:
# save covariates
# file name (without extension) for the covariates, built from the same base
# name as the phenotype file
phenotype_covars_name = f"{phenotype_name_base}-covars"
display(phenotype_covars_name)

'multixcan-random_phenotype6-covars'

In [200]:
# Covariates = every column of the raw (unstandardized) X except the intercept
# and the LV itself.
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis=None` reduces over both axes to a single bool. It is passed by keyword
# because `axis` is keyword-only in pandas >= 2.0, where the positional form
# `.any(None)` raises a TypeError.
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_size,gene_density
NOC2L,5.0,0.106383
HES4,3.0,0.068182
ISG15,6.0,0.142857
AGRN,4.0,0.086957
TNFRSF18,4.0,0.088889


In [201]:
# save covariates
# Writes the raw covariate columns as a pickle into OUTPUT_DIR; pandas infers
# xz compression from the ".xz" suffix of the file name.
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [196]:
# for debugging purposes I print the OLS results also
# This OLS fit uses the same standardized data (ys, Xs) as the GLS fit.
# NOTE(review): the stored output of this cell (const ~ 0.76, log-likelihood
# ~ -5683) looks like it came from an earlier run on the unstandardized y / X;
# its execution count (196) also predates the standardization cell (202).
# Re-run the notebook top to bottom to refresh it.
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.210
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.304
Time:                        18:37:33   Log-Likelihood:                -5683.5
No. Observations:                6442   AIC:                         1.138e+04
Df Residuals:                    6438   BIC:                         1.140e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.7647      0.016     48.535   

## Random phenotype 6 / LV455

In [210]:
# LV column and random phenotype used by the cells of this section
lv_code = "LV455"
phenotype_code = 6

# common prefix for the files written by this section: "<base>-pvalues" for
# the phenotype and "<base>-covars" for the gene-level covariates
phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [211]:
# With add_covars=True, X carries the gene-level covariate columns gene_size
# and gene_density in addition to "const" and the LV column.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_covars=True)
# NOTE(review): presumably the gene correlation matrix aligned to the genes in
# X; the helper is defined outside this view -- confirm.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [212]:
X.head()

Unnamed: 0,const,LV455,gene_size,gene_density
NOC2L,1.0,0.005223,5.0,0.106383
HES4,1.0,0.0,3.0,0.068182
ISG15,1.0,0.0,6.0,0.142857
AGRN,1.0,0.002763,4.0,0.086957
TNFRSF18,1.0,0.003323,4.0,0.088889


In [213]:
y.head()

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
dtype: float64

In [214]:
Xs, ys = standardize_data(X, y)

In [215]:
# Sanity-check the standardization: every non-constant column of Xs must have
# (numerically) zero mean and unit standard deviation.
_Xs_desc = Xs[[lv_code, "gene_size", "gene_density"]].describe()
display(_Xs_desc)
# Compare the absolute mean with the tolerance: a plain `mean < 1e-10` is
# one-sided and would also accept an arbitrarily large negative mean.
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV455,gene_size,gene_density
count,6442.0,6442.0,6442.0
mean,9.485668000000001e-17,1.985372e-17,6.13811e-15
std,1.0,1.0,1.0
min,-0.07851697,-1.451864,-0.844314
25%,-0.07851697,-0.5898049,-0.5795094
50%,-0.07851697,-0.1587756,-0.3389165
75%,-0.04700189,0.703283,0.1292323
max,41.51043,5.875635,4.763313


In [216]:
Xs.head()

Unnamed: 0,const,LV455,gene_size,gene_density
NOC2L,1.0,-0.052328,0.272254,-0.352155
HES4,1.0,-0.078517,-0.589805,-0.570836
ISG15,1.0,-0.078517,0.703283,-0.143361
AGRN,1.0,-0.064664,-0.158776,-0.463361
TNFRSF18,1.0,-0.061858,-0.158776,-0.452299


In [217]:
ys.head()

NOC2L      -0.749961
HES4       -0.837645
ISG15       0.726586
AGRN        1.372890
TNFRSF18    0.250993
dtype: float64

In [218]:
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [219]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1268
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.944
Time:                        19:30:27   Log-Likelihood:                -8695.4
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6438   BIC:                         1.743e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0241      0.025      0.961   

In [220]:
_gls_results.params

const           0.024066
LV455           0.001452
gene_size      -0.008025
gene_density    0.000116
dtype: float64

In [221]:
# print full numbers
# Index 1 selects the LV term (index 0 is the intercept "const"; the
# covariates come after the LV).
# Printed in order: coefficient, standard error, t-statistic, the p-value
# reported in the summary's P>|t| column, and finally the upper-tail
# (one-sided) p-value P(T > t) computed with df_resid degrees of freedom.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.0014516113831524813
0.011295356092071307
0.12851399914442976
0.8977462345701058
0.4488731172850529


In [222]:
# save phenotype
# Writes the raw `y` (not the standardized `ys`) as an xz-compressed pickle.
# phenotype_name is the same as in the "Random phenotype 6 / LV45" section, so
# this rewrites that same file.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [223]:
# save covariates
# file name (without extension) for the covariates, built from the same base
# name as the phenotype file
phenotype_covars_name = f"{phenotype_name_base}-covars"
display(phenotype_covars_name)

'multixcan-random_phenotype6-covars'

In [224]:
# Covariates = every column of the raw (unstandardized) X except the intercept
# and the LV itself.
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis=None` reduces over both axes to a single bool. It is passed by keyword
# because `axis` is keyword-only in pandas >= 2.0, where the positional form
# `.any(None)` raises a TypeError.
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_size,gene_density
NOC2L,5.0,0.106383
HES4,3.0,0.068182
ISG15,6.0,0.142857
AGRN,4.0,0.086957
TNFRSF18,4.0,0.088889


In [225]:
# save covariates
# Writes the raw covariate columns as an xz-compressed pickle into OUTPUT_DIR.
# phenotype_covars_name is the same as in the "Random phenotype 6 / LV45"
# section, so this rewrites that same file.
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [226]:
# for debugging purposes I print the OLS results also
# This OLS fit uses the same standardized data (ys, Xs) as the GLS fit, so the
# intercept is numerically zero.
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1623
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.922
Time:                        19:31:29   Log-Likelihood:                -9140.1
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6438   BIC:                         1.832e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         8.844e-17      0.012    7.1e-15   

## Random phenotype 0 / LV801 (using logarithms)

In [233]:
# Scenario for this section: which latent variable is tested against which
# random phenotype.
lv_code, phenotype_code = "LV801", 0

# Base name shared by all files of this phenotype; the "-pvalues" suffix
# identifies the phenotype file itself.
phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"

display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [234]:
# Load the design matrix and phenotype for this scenario. With add_covars and
# add_covars_logs the returned X carries gene_size, gene_density and their
# *_log counterparts next to the intercept and the LV column (see X.head()).
X, y = get_data(
    lv_code, random_phenotype_code=phenotype_code, add_covars=True, add_covars_logs=True
)
# Gene correlation matrix matched to X.
# NOTE(review): the meaning of `perc` is defined in get_aligned_corr_mat -- confirm.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [235]:
# preview the unstandardized design matrix (intercept, LV, covariates)
X.head()

Unnamed: 0,const,LV801,gene_size,gene_density,gene_size_log,gene_density_log
NOC2L,1.0,0.063959,5.0,0.106383,1.609438,2.24071
HES4,1.0,0.006702,3.0,0.068182,1.098612,2.685577
ISG15,1.0,0.0,6.0,0.142857,1.791759,1.94591
AGRN,1.0,0.005386,4.0,0.086957,1.386294,2.442347
TNFRSF18,1.0,0.0,4.0,0.088889,1.386294,2.420368


In [236]:
# preview the phenotype values (indexed by gene) before standardization
y.head()

NOC2L       1.192661
HES4        0.615835
ISG15       1.388647
AGRN        0.098554
TNFRSF18    0.085000
dtype: float64

In [237]:
# Standardize predictors and phenotype; Xs/ys feed the GLS and OLS fits below.
# NOTE(review): `const` stays at 1.0 in Xs, so it appears to be excluded from
# scaling -- confirm in standardize_data.
Xs, ys = standardize_data(X, y)

In [240]:
# preview the standardized design matrix
Xs.head()

Unnamed: 0,const,LV801,gene_size,gene_density,gene_size_log,gene_density_log
NOC2L,1.0,0.229344,0.273065,-0.352578,0.498865,0.132924
HES4,1.0,-0.106576,-0.589866,-0.571477,-0.406298,0.687919
ISG15,1.0,-0.145894,0.704531,-0.143574,0.821931,-0.234854
AGRN,1.0,-0.114292,-0.158401,-0.463895,0.103463,0.384477
TNFRSF18,1.0,-0.145894,-0.158401,-0.452822,0.103463,0.357057


In [241]:
# preview the standardized phenotype
ys.head()

NOC2L       0.697014
HES4       -0.258629
ISG15       1.021711
AGRN       -1.115624
TNFRSF18   -1.138078
dtype: float64

In [242]:
# Summary statistics of every standardized predictor (the constant intercept
# column is left out of the check).
_Xs_desc = Xs[
    [lv_code, "gene_size", "gene_density", "gene_size_log", "gene_density_log"]
].describe()
display(_Xs_desc)
# Means must be ~0 in absolute value: without abs() any negative mean, however
# large, would pass the check.
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
# Standard deviations must be ~1.
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV801,gene_size,gene_density,gene_size_log,gene_density_log
count,6442.0,6442.0,6442.0,6442.0,6442.0
mean,-2.15082e-17,1.036806e-16,6.203186e-15,-5.531027e-14,-3.746067e-14
std,1.0,1.0,1.0,1.0,1.0
min,-0.1458936,-1.452798,-0.8452287,-2.352994,-2.662478
25%,-0.1458936,-0.5898664,-0.5801592,-0.4062975,-0.5937521
50%,-0.1458936,-0.1584006,-0.3393255,0.1034634,0.1060938
75%,-0.03586973,0.7045311,0.1292916,0.821931,0.7159554
max,41.72866,5.882121,4.768008,2.768627,2.19277


In [243]:
# Fit the reference GLS model on the standardized data.
# NOTE(review): corr_mat is presumably used as the error covariance structure
# -- confirm in train_statsmodels_gls.
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [244]:
# full regression table of the GLS fit, kept in the notebook for reference
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4873
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.786
Time:                        19:42:12   Log-Likelihood:                -8679.2
No. Observations:                6442   AIC:                         1.737e+04
Df Residuals:                    6436   BIC:                         1.741e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -0.0153      0.025  

In [245]:
# fitted coefficients; the second entry (position 1) is the LV term
_gls_results.params

const              -0.015310
LV801               0.008381
gene_size           0.017351
gene_density        0.009620
gene_size_log      -0.000114
gene_density_log    0.010104
dtype: float64

In [246]:
# Print the statistics of the LV term at full precision, one value per line.
# Position 1 is the LV: the design matrix columns are const, LV, covariates.
# NOTE(review): np.printoptions governs array formatting; these values are NumPy
# scalars, whose printed form may not depend on it -- confirm.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        _gls_results.params.to_numpy()[1],  # coefficient
        _gls_results.bse.to_numpy()[1],  # standard error
        _gls_results.tvalues.to_numpy()[1],  # t statistic
        _gls_results.pvalues.to_numpy()[1],  # two-sided p-value
        # one-sided (upper-tail) p-value
        stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid),
        sep="\n",
    )

0.008380935035632255
0.010981760912289579
0.7631685940506349
0.4453908279763241
0.22269541398816206


In [247]:
# save phenotype
# (the unstandardized series `y` is written to OUTPUT_DIR as a test fixture; the
# ".xz" suffix lets pandas pick xz compression from the file extension)
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [248]:
# build the file name (without extension) for this phenotype's covariates;
# nothing is written in this cell, the actual save happens further below
phenotype_covars_name = f"{phenotype_name_base}-covars"
display(phenotype_covars_name)

'multixcan-random_phenotype0-covars'

In [249]:
# Covariates only: every column of X except the intercept and the LV under test.
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis` must be passed by keyword: pandas >= 2.0 rejects positional arguments
# to DataFrame.any(); axis=None reduces over both axes to a single bool.
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_size,gene_density,gene_size_log,gene_density_log
NOC2L,5.0,0.106383,1.609438,2.24071
HES4,3.0,0.068182,1.098612,2.685577
ISG15,6.0,0.142857,1.791759,1.94591
AGRN,4.0,0.086957,1.386294,2.442347
TNFRSF18,4.0,0.088889,1.386294,2.420368


In [250]:
# save covariates
# (raw, unstandardized covariate columns, including the *_log ones, stored next
# to the phenotype fixture)
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [251]:
# for debugging purposes I print the OLS results also
# (same standardized inputs as the GLS fit, but without the gene correlation
# matrix, so the two sets of estimates can be compared)
# NOTE(review): `sm` is not imported in the notebook's top import cell;
# presumably `statsmodels.api` is imported elsewhere -- confirm.
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.166
Date:                Mon, 25 Jul 2022   Prob (F-statistic):              0.323
Time:                        19:42:40   Log-Likelihood:                -9137.4
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6436   BIC:                         1.833e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             5.551e-15      0.012  

# [full corr matrix] GLS on randomly generated phenotypes using SNP-level covariates

In [235]:
# Passed as `perc` to get_aligned_corr_mat in every scenario of this section.
# NOTE(review): None presumably selects the full correlation matrix, as the
# section title says -- confirm in get_aligned_corr_mat.
PERC_NONZERO_GENES = None

## Random phenotype 6 / LV45

In [236]:
# Scenario for this section: which latent variable is tested against which
# random phenotype.
lv_code, phenotype_code = "LV45", 6

# Base name shared by all files of this phenotype; the "-pvalues" suffix
# identifies the phenotype file itself.
phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"

display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [237]:
# Load the design matrix and phenotype for this scenario. With
# add_snplevel_covars the returned X carries gene_n_snps_used and
# gene_n_snps_used_density next to the intercept and the LV column (see X.head()).
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_snplevel_covars=True)
# Gene correlation matrix matched to X; PERC_NONZERO_GENES is None in this section.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [238]:
# preview the unstandardized design matrix (intercept, LV, covariates)
X.head()

Unnamed: 0,const,LV45,gene_n_snps_used,gene_n_snps_used_density
NOC2L,1.0,0.0,103,0.980952
HES4,1.0,0.0,55,0.785714
ISG15,1.0,0.0,59,0.967213
AGRN,1.0,0.0,75,0.728155
TNFRSF18,1.0,0.0,65,0.984848


In [239]:
# preview the phenotype values (indexed by gene) before standardization
y.head()

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
dtype: float64

In [240]:
# Standardize predictors and phenotype; Xs/ys feed the GLS and OLS fits below.
# NOTE(review): `const` stays at 1.0 in Xs, so it appears to be excluded from
# scaling -- confirm in standardize_data.
Xs, ys = standardize_data(X, y)

In [241]:
# Summary statistics of every standardized predictor (the constant intercept
# column is left out of the check).
_Xs_desc = Xs[[lv_code, "gene_n_snps_used", "gene_n_snps_used_density"]].describe()
display(_Xs_desc)
# Means must be ~0 in absolute value: without abs() any negative mean, however
# large, would pass the check.
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
# Standard deviations must be ~1.
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV45,gene_n_snps_used,gene_n_snps_used_density
count,6442.0,6442.0,6442.0
mean,1.3235820000000001e-17,-1.3235820000000001e-17,-4.720774e-16
std,1.0,1.0,1.0
min,-0.05995686,-1.842751,-6.719716
25%,-0.05995686,-0.7128726,0.09723914
50%,-0.05995686,-0.0482381,0.485196
75%,-0.05995686,0.6828598,0.485196
max,27.86523,4.903289,0.485196


In [242]:
# preview the standardized design matrix
Xs.head()

Unnamed: 0,const,LV45,gene_n_snps_used,gene_n_snps_used_density
NOC2L,1.0,-0.059957,1.546885,0.293065
HES4,1.0,-0.059957,-0.048238,-1.676278
ISG15,1.0,-0.059957,0.084689,0.154479
AGRN,1.0,-0.059957,0.616396,-2.256868
TNFRSF18,1.0,-0.059957,0.284079,0.332364


In [243]:
# preview the standardized phenotype
ys.head()

NOC2L      -0.749961
HES4       -0.837645
ISG15       0.726586
AGRN        1.372890
TNFRSF18    0.250993
dtype: float64

In [244]:
# Fit the reference GLS model on the standardized data.
# NOTE(review): corr_mat is presumably used as the error covariance structure
# -- confirm in train_statsmodels_gls.
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [245]:
# full regression table of the GLS fit, kept in the notebook for reference
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2187
Date:                Wed, 27 Jul 2022   Prob (F-statistic):              0.883
Time:                        19:01:06   Log-Likelihood:                -8695.2
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6438   BIC:                         1.743e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [246]:
# fitted coefficients; the second entry (position 1) is the LV term
_gls_results.params

const                       0.027752
LV45                       -0.003228
gene_n_snps_used           -0.006508
gene_n_snps_used_density   -0.000690
dtype: float64

In [247]:
# Print the statistics of the LV term at full precision, one value per line.
# Position 1 is the LV: the design matrix columns are const, LV, covariates.
# NOTE(review): np.printoptions governs array formatting; these values are NumPy
# scalars, whose printed form may not depend on it -- confirm.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        _gls_results.params.to_numpy()[1],  # coefficient
        _gls_results.bse.to_numpy()[1],  # standard error
        _gls_results.tvalues.to_numpy()[1],  # t statistic
        _gls_results.pvalues.to_numpy()[1],  # two-sided p-value
        # one-sided (upper-tail) p-value
        stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid),
        sep="\n",
    )

-0.003227698021391237
0.00859069240125604
-0.37572035764187267
0.7071371805607842
0.646431409719608


In [248]:
# save phenotype
# (the unstandardized series `y` is written to OUTPUT_DIR as a test fixture; the
# ".xz" suffix lets pandas pick xz compression from the file extension)
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [249]:
# build the file name (without extension) for this phenotype's SNP-level
# covariates; nothing is written in this cell, the actual save happens further below
phenotype_covars_name = f"{phenotype_name_base}-snplevel_covars"
display(phenotype_covars_name)

'multixcan-random_phenotype6-snplevel_covars'

In [250]:
# Covariates only: every column of X except the intercept and the LV under test.
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis` must be passed by keyword: pandas >= 2.0 rejects positional arguments
# to DataFrame.any(); axis=None reduces over both axes to a single bool.
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_n_snps_used,gene_n_snps_used_density
NOC2L,103,0.980952
HES4,55,0.785714
ISG15,59,0.967213
AGRN,75,0.728155
TNFRSF18,65,0.984848


In [251]:
# save covariates
# (raw, unstandardized SNP-level covariate columns, stored next to the
# phenotype fixture)
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [252]:
# for debugging purposes I print the OLS results also
# (same standardized inputs as the GLS fit, but without the gene correlation
# matrix, so the two sets of estimates can be compared)
# NOTE(review): `sm` is not imported in the notebook's top import cell;
# presumably `statsmodels.api` is imported elsewhere -- confirm.
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.301
Date:                Wed, 27 Jul 2022   Prob (F-statistic):              0.272
Time:                        19:01:23   Log-Likelihood:                -9138.4
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6438   BIC:                         1.831e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

## Random phenotype 6 / LV455

In [259]:
# Scenario for this section: which latent variable is tested against which
# random phenotype.
lv_code, phenotype_code = "LV455", 6

# Base name shared by all files of this phenotype; the "-pvalues" suffix
# identifies the phenotype file itself.
phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"

display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [260]:
# Load the design matrix and phenotype for this scenario. With
# add_snplevel_covars the returned X carries gene_n_snps_used and
# gene_n_snps_used_density next to the intercept and the LV column (see X.head()).
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_snplevel_covars=True)
# Gene correlation matrix matched to X; PERC_NONZERO_GENES is None in this section.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [261]:
# preview the unstandardized design matrix (intercept, LV, covariates)
X.head()

Unnamed: 0,const,LV455,gene_n_snps_used,gene_n_snps_used_density
NOC2L,1.0,0.005223,103,0.980952
HES4,1.0,0.0,55,0.785714
ISG15,1.0,0.0,59,0.967213
AGRN,1.0,0.002763,75,0.728155
TNFRSF18,1.0,0.003323,65,0.984848


In [262]:
# preview the phenotype values (indexed by gene) before standardization
y.head()

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
dtype: float64

In [263]:
# Standardize predictors and phenotype; Xs/ys feed the GLS and OLS fits below.
# NOTE(review): `const` stays at 1.0 in Xs, so it appears to be excluded from
# scaling -- confirm in standardize_data.
Xs, ys = standardize_data(X, y)

In [264]:
# Summary statistics of every standardized predictor (the constant intercept
# column is left out of the check).
_Xs_desc = Xs[[lv_code, "gene_n_snps_used", "gene_n_snps_used_density"]].describe()
display(_Xs_desc)
# Means must be ~0 in absolute value: without abs() any negative mean, however
# large, would pass the check.
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
# Standard deviations must be ~1.
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV455,gene_n_snps_used,gene_n_snps_used_density
count,6442.0,6442.0,6442.0
mean,-4.411939e-18,-1.3235820000000001e-17,-4.720774e-16
std,1.0,1.0,1.0
min,-0.07851697,-1.842751,-6.719716
25%,-0.07851697,-0.7128726,0.09723914
50%,-0.07851697,-0.0482381,0.485196
75%,-0.04700189,0.6828598,0.485196
max,41.51043,4.903289,0.485196


In [265]:
# preview the standardized design matrix
Xs.head()

Unnamed: 0,const,LV455,gene_n_snps_used,gene_n_snps_used_density
NOC2L,1.0,-0.052328,1.546885,0.293065
HES4,1.0,-0.078517,-0.048238,-1.676278
ISG15,1.0,-0.078517,0.084689,0.154479
AGRN,1.0,-0.064664,0.616396,-2.256868
TNFRSF18,1.0,-0.061858,0.284079,0.332364


In [266]:
# preview the standardized phenotype
ys.head()

NOC2L      -0.749961
HES4       -0.837645
ISG15       0.726586
AGRN        1.372890
TNFRSF18    0.250993
dtype: float64

In [267]:
# Fit the reference GLS model on the standardized data.
# NOTE(review): corr_mat is presumably used as the error covariance structure
# -- confirm in train_statsmodels_gls.
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [268]:
# full regression table of the GLS fit, kept in the notebook for reference
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1802
Date:                Wed, 27 Jul 2022   Prob (F-statistic):              0.910
Time:                        19:04:12   Log-Likelihood:                -8695.3
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6438   BIC:                         1.743e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [269]:
# fitted coefficients; the second entry (position 1) is the LV term
_gls_results.params

const                       0.027903
LV455                       0.001828
gene_n_snps_used           -0.006525
gene_n_snps_used_density   -0.000758
dtype: float64

In [270]:
# Print the statistics of the LV term at full precision, one value per line.
# Position 1 is the LV: the design matrix columns are const, LV, covariates.
# NOTE(review): np.printoptions governs array formatting; these values are NumPy
# scalars, whose printed form may not depend on it -- confirm.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        _gls_results.params.to_numpy()[1],  # coefficient
        _gls_results.bse.to_numpy()[1],  # standard error
        _gls_results.tvalues.to_numpy()[1],  # t statistic
        _gls_results.pvalues.to_numpy()[1],  # two-sided p-value
        # one-sided (upper-tail) p-value
        stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid),
        sep="\n",
    )

0.0018276464593873108
0.01139385148318154
0.16040637900933666
0.8725659975131534
0.4362829987565767


In [271]:
# save phenotype
# (the unstandardized series `y` is written to OUTPUT_DIR as a test fixture; the
# file name depends only on the phenotype code, so this rewrites the file already
# saved for random phenotype 6 in the LV45 section)
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [272]:
# build the file name (without extension) for this phenotype's SNP-level
# covariates; nothing is written in this cell, the actual save happens further below
phenotype_covars_name = f"{phenotype_name_base}-snplevel_covars"
display(phenotype_covars_name)

'multixcan-random_phenotype6-snplevel_covars'

In [273]:
# Covariates only: every column of X except the intercept and the LV under test.
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis` must be passed by keyword: pandas >= 2.0 rejects positional arguments
# to DataFrame.any(); axis=None reduces over both axes to a single bool.
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_n_snps_used,gene_n_snps_used_density
NOC2L,103,0.980952
HES4,55,0.785714
ISG15,59,0.967213
AGRN,75,0.728155
TNFRSF18,65,0.984848


In [274]:
# save covariates
# (raw, unstandardized SNP-level covariate columns; same file name as the one
# written for random phenotype 6 in the LV45 section)
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [275]:
# for debugging purposes I print the OLS results also
# (same standardized inputs as the GLS fit, but without the gene correlation
# matrix, so the two sets of estimates can be compared)
# NOTE(review): `sm` is not imported in the notebook's top import cell;
# presumably `statsmodels.api` is imported elsewhere -- confirm.
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2819
Date:                Wed, 27 Jul 2022   Prob (F-statistic):              0.838
Time:                        19:04:34   Log-Likelihood:                -9139.9
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6438   BIC:                         1.831e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

## Random phenotype 0 / LV801 (using logarithms)

In [281]:
# Scenario for this section: which latent variable is tested against which
# random phenotype.
lv_code, phenotype_code = "LV801", 0

# Base name shared by all files of this phenotype; the "-pvalues" suffix
# identifies the phenotype file itself.
phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"

display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [282]:
# Load the design matrix and phenotype for this scenario. With
# add_snplevel_covars and add_covars_logs the returned X carries
# gene_n_snps_used, gene_n_snps_used_density and their *_log counterparts next
# to the intercept and the LV column (see X.head()).
X, y = get_data(
    lv_code,
    random_phenotype_code=phenotype_code,
    add_snplevel_covars=True,
    add_covars_logs=True,
)
# Gene correlation matrix matched to X; PERC_NONZERO_GENES is None in this section.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [283]:
# preview the unstandardized design matrix (intercept, LV, covariates)
X.head()

Unnamed: 0,const,LV801,gene_n_snps_used,gene_n_snps_used_density,gene_n_snps_used_log,gene_n_snps_used_density_log
NOC2L,1.0,0.063959,103,0.980952,4.634729,0.019231
HES4,1.0,0.006702,55,0.785714,4.007333,0.241162
ISG15,1.0,0.0,59,0.967213,4.077537,0.033336
AGRN,1.0,0.005386,75,0.728155,4.317488,0.317241
TNFRSF18,1.0,0.0,65,0.984848,4.174387,0.015267


In [284]:
# preview the phenotype values (indexed by gene) before standardization
y.head()

NOC2L       1.192661
HES4        0.615835
ISG15       1.388647
AGRN        0.098554
TNFRSF18    0.085000
dtype: float64

In [285]:
# Standardize predictors and phenotype; Xs/ys feed the GLS and OLS fits below.
# NOTE(review): `const` stays at 1.0 in Xs, so it appears to be excluded from
# scaling -- confirm in standardize_data.
Xs, ys = standardize_data(X, y)

In [286]:
# Summary statistics of every standardized predictor (the constant intercept
# column is left out of the check).
_Xs_desc = Xs[
    [
        lv_code,
        "gene_n_snps_used",
        "gene_n_snps_used_log",
        "gene_n_snps_used_density",
        "gene_n_snps_used_density_log",
    ]
].describe()
display(_Xs_desc)
# Means must be ~0 in absolute value: without abs() any negative mean, however
# large, would pass the check.
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
# Standard deviations must be ~1.
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV801,gene_n_snps_used,gene_n_snps_used_log,gene_n_snps_used_density,gene_n_snps_used_density_log
count,6442.0,6442.0,6442.0,6442.0,6442.0
mean,1.8750740000000002e-17,-1.3235820000000001e-17,-5.404625e-17,-4.720774e-16,-2.2059690000000002e-17
std,1.0,1.0,1.0,1.0,1.0
min,-0.1458936,-1.842751,-4.79924,-6.719716,-0.4480473
25%,-0.1458936,-0.7128726,-0.3279627,0.09723914,-0.4480473
50%,-0.1458936,-0.0482381,0.2404629,0.485196,-0.4480473
75%,-0.03586973,0.6828598,0.6636172,0.485196,-0.1354569
max,41.72866,4.903289,1.888935,0.485196,9.536515


In [287]:
# preview the standardized design matrix
Xs.head()

Unnamed: 0,const,LV801,gene_n_snps_used,gene_n_snps_used_density,gene_n_snps_used_log,gene_n_snps_used_density_log
NOC2L,1.0,0.229344,1.546885,0.293065,1.029488,-0.294773
HES4,1.0,-0.106576,-0.048238,-1.676278,0.240463,1.474022
ISG15,1.0,-0.145894,0.084689,0.154479,0.328753,-0.182355
AGRN,1.0,-0.114292,0.616396,-2.256868,0.63052,2.080373
TNFRSF18,1.0,-0.145894,0.284079,0.332364,0.450553,-0.326365


In [288]:
# preview the standardized phenotype
ys.head()

NOC2L       0.697014
HES4       -0.258629
ISG15       1.021711
AGRN       -1.115624
TNFRSF18   -1.138078
dtype: float64

In [289]:
# Fit the reference GLS model on the standardized data.
# NOTE(review): corr_mat is presumably used as the error covariance structure
# -- confirm in train_statsmodels_gls.
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [290]:
# full regression table of the GLS fit, kept in the notebook for reference
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9619
Date:                Wed, 27 Jul 2022   Prob (F-statistic):              0.440
Time:                        19:06:45   Log-Likelihood:                -8678.0
No. Observations:                6442   AIC:                         1.737e+04
Df Residuals:                    6436   BIC:                         1.741e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

In [291]:
# fitted coefficients; the second entry (position 1) is the LV term
_gls_results.params

const                          -0.021673
LV801                           0.008384
gene_n_snps_used                0.026364
gene_n_snps_used_density        0.027107
gene_n_snps_used_log           -0.003973
gene_n_snps_used_density_log    0.029269
dtype: float64

In [292]:
# Print the statistics of the LV term at full precision, one value per line.
# Position 1 is the LV: the design matrix columns are const, LV, covariates.
# NOTE(review): np.printoptions governs array formatting; these values are NumPy
# scalars, whose printed form may not depend on it -- confirm.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        _gls_results.params.to_numpy()[1],  # coefficient
        _gls_results.bse.to_numpy()[1],  # standard error
        _gls_results.tvalues.to_numpy()[1],  # t statistic
        _gls_results.pvalues.to_numpy()[1],  # two-sided p-value
        # one-sided (upper-tail) p-value
        stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid),
        sep="\n",
    )

0.008384051885022486
0.010977363554150646
0.7637582415544938
0.44503932280432557
0.22251966140216278


In [293]:
# save phenotype
# (the unstandardized series `y` is written to OUTPUT_DIR as a test fixture; the
# file name depends only on the phenotype code, so this rewrites the file saved
# for random phenotype 0 in the earlier LV801 section)
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [294]:
# build the file name (without extension) for this phenotype's SNP-level
# covariates; nothing is written in this cell, the actual save happens further below
phenotype_covars_name = f"{phenotype_name_base}-snplevel_covars"
display(phenotype_covars_name)

'multixcan-random_phenotype0-snplevel_covars'

In [295]:
# Covariates only: every column of X except the intercept and the LV under test.
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis` must be passed by keyword: pandas >= 2.0 rejects positional arguments
# to DataFrame.any(); axis=None reduces over both axes to a single bool.
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_n_snps_used,gene_n_snps_used_density,gene_n_snps_used_log,gene_n_snps_used_density_log
NOC2L,103,0.980952,4.634729,0.019231
HES4,55,0.785714,4.007333,0.241162
ISG15,59,0.967213,4.077537,0.033336
AGRN,75,0.728155,4.317488,0.317241
TNFRSF18,65,0.984848,4.174387,0.015267


In [296]:
# save covariates
# (raw, unstandardized SNP-level covariate columns, including the *_log ones,
# stored next to the phenotype fixture)
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [297]:
# for debugging purposes I print the OLS results also
# (same standardized inputs as the GLS fit, but without the gene correlation
# matrix, so the two sets of estimates can be compared)
# NOTE(review): `sm` is not imported in the notebook's top import cell;
# presumably `statsmodels.api` is imported elsewhere -- confirm.
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.7497
Date:                Wed, 27 Jul 2022   Prob (F-statistic):              0.586
Time:                        19:07:02   Log-Likelihood:                -9138.4
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6436   BIC:                         1.833e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           