# Description

It generates test cases for the GLS model.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm

import conf
import utils
from entity import Gene

# Settings

In [3]:
# a cohort name (it could be something like UK_BIOBANK, etc)
COHORT_NAME = "1000G_EUR"

# reference panel such as 1000G or GTEX_V8
REFERENCE_PANEL = "1000G"

# predictions models such as MASHR or ELASTIC_NET
EQTL_MODEL = "MASHR"

In [5]:
OUTPUT_DIR = utils.get_git_repository_path() / "tests" / "data" / "gls"
display(OUTPUT_DIR)
assert OUTPUT_DIR.exists()

PosixPath('/opt/code/tests/data/gls')

# Load data

## MultiPLIER Z

In [6]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [7]:
multiplier_z_genes = multiplier_z.index.tolist()

In [8]:
len(multiplier_z_genes)

6750

In [9]:
multiplier_z_genes[:10]

['GAS6',
 'MMP14',
 'DSP',
 'MARCKSL1',
 'SPARC',
 'CTSD',
 'EPAS1',
 'PALLD',
 'PHC2',
 'LGALS3BP']

## Function to load MultiXcan's results on random phenotypes

In [10]:
def load_multixcan_random_phenotype(phenotype_code):
    """
    Read the S-MultiXcan results file of one random phenotype.

    Args:
        phenotype_code: value interpolated into the file name
            ("random.pheno{phenotype_code}-gtex_v8-mashr-smultixcan.txt").

    Returns:
        A DataFrame read from the tab-separated results file, indexed by
        the "gene_name" column.
    """
    results_file = (
        conf.RESULTS["GLS_NULL_SIMS"]
        / "twas"
        / "smultixcan"
        / f"random.pheno{phenotype_code}-gtex_v8-mashr-smultixcan.txt"
    )

    return pd.read_csv(results_file, sep="\t", index_col="gene_name")

In [11]:
load_multixcan_random_phenotype(0).head()

Unnamed: 0_level_0,gene,pvalue,n,n_indep,p_i_best,t_i_best,p_i_worst,t_i_worst,eigen_max,eigen_min,eigen_min_kept,z_min,z_max,z_mean,z_sd,tmi,status
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
RHPN2,ENSG00000131941.7,4e-05,48.0,3.0,0.000213947,Artery_Tibial,0.990132,Brain_Nucleus_accumbens_basal_ganglia,36.556432,7.692089e-16,2.519701,-2.721185,3.701952,1.283152,1.825567,3.0,0
GPATCH1,ENSG00000076650.6,7.8e-05,40.0,3.0,0.000453439,Brain_Cerebellum,0.817384,Brain_Frontal_Cortex_BA9,29.990208,2.086487e-15,1.815203,-3.506853,2.383485,-2.016745,1.715495,3.0,0
NFKBIA,ENSG00000100906.10,9.6e-05,1.0,1.0,9.591208e-05,Brain_Frontal_Cortex_BA9,9.6e-05,Brain_Frontal_Cortex_BA9,1.0,1.0,1.0,-3.900707,-3.900707,-3.900707,,1.0,0
TTC5,ENSG00000136319.11,0.000109,47.0,5.0,0.001402826,Brain_Hippocampus,0.961887,Colon_Sigmoid,21.272442,8.142339e-16,0.732606,-3.194069,1.397514,-0.916662,1.068989,5.0,0
ADGRA3,ENSG00000152990.13,0.000135,41.0,12.0,3.211289e-07,Heart_Atrial_Appendage,0.653657,Whole_Blood,12.988248,3.499412e-16,0.444682,-5.110605,3.59941,-0.464735,2.316607,12.0,0


## MultiXcan real results (PhenomeXcan)

In [12]:
multixcan_real_results = pd.read_pickle(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
).rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [13]:
multixcan_real_results = multixcan_real_results[
    ~multixcan_real_results.index.duplicated(keep="first")
].dropna(how="all", axis=0)

In [14]:
multixcan_real_results.shape

(22508, 3752)

In [15]:
multixcan_real_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


In [16]:
# `axis` must be given by keyword: newer pandas versions reject positional
# arguments to DataFrame.any; axis=None reduces over both axes to one bool
assert not multixcan_real_results.isna().any(axis=None)

## Load full correlation matrix

In [17]:
orig_corr_mat = pd.read_pickle(OUTPUT_DIR / "corr_mat.pkl.xz")

In [18]:
orig_corr_mat.shape

(6442, 6442)

In [19]:
orig_corr_mat.head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.115011,0.173138,0.056096,0.008032,0.008727,0.006797,0.004533,0.00735,0.010391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.115011,1.0,0.681368,0.360588,0.011545,0.010729,0.003577,0.01023,0.010747,0.008769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.173138,0.681368,1.0,0.381394,0.011774,0.012527,0.003754,0.012096,0.012679,0.010442,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.056096,0.360588,0.381394,1.0,0.013005,0.015775,0.006184,0.006813,0.010775,0.009189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.008032,0.011545,0.011774,0.013005,1.0,0.356676,0.45401,0.137643,0.20034,0.09321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load cohort metadata

In [20]:
gene_tissues_df = pd.read_pickle(
    OUTPUT_DIR / "cohort_1000g_eur_metadata" / "gene_tissues.pkl.gz"
).set_index("gene_name")

In [21]:
gene_tissues_df.shape

(6444, 9)

In [22]:
gene_tissues_df = gene_tissues_df.loc[~gene_tissues_df.index.duplicated(keep="first")]

In [23]:
gene_tissues_df.shape

(6444, 9)

In [24]:
assert gene_tissues_df.index.is_unique

In [25]:
gene_tissues_df.head()

Unnamed: 0_level_0,tissue,n_tissues,tissues_pc_variances,tissues_pc_variances_cov,tissues_variances,n_snps_used_sum,n_snps_in_model_sum,unique_n_snps_in_model,unique_n_snps_used
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,"[1.0372585612589562, 0.9627414387410438]","[0.008284978865240098, 0.000732674839341259]",{'Brain_Substantia_nigra': 0.00073379232755585...,2,2,2,2
FGR,"(Testis, Brain_Frontal_Cortex_BA9, Pancreas, E...",36,"[30.57880701512375, 2.029732645600111, 1.40179...","[0.22478471249969337, 0.00881387381303179]","{'Artery_Aorta': 0.004641097161388297, 'Colon_...",40,40,5,5
CFH,"(Testis, Artery_Coronary, Heart_Atrial_Appenda...",34,"[20.77260827663933, 8.177158142324839, 1.86464...","[0.19699534279580105, 0.05731825316349168, 0.0...",{'Small_Intestine_Terminal_Ileum': 0.000166052...,44,44,12,12
GCLC,"(Testis, Artery_Coronary, Heart_Atrial_Appenda...",32,"[20.976449940269113, 4.772354749306764, 2.2362...","[0.3657809638925397, 0.12645413710424536, 0.01...",{'Small_Intestine_Terminal_Ileum': 0.006074884...,46,46,23,23
NFYA,"(Testis, Artery_Coronary, Heart_Atrial_Appenda...",40,[37.63836978240499],"[1.478714450892399, 0.04969061838773913]",{'Small_Intestine_Terminal_Ileum': 0.033165962...,47,48,14,13


# Functions

In [26]:
def get_data(
    lv_code,
    random_phenotype_code=None,
    real_phenotype_code=None,
    add_covars=False,
    add_covars_logs=False,
):
    """
    Build the design matrix X and the response y for one LV/phenotype pair.

    Args:
        lv_code: column of `multiplier_z` to use as predictor (e.g. "LV7").
        random_phenotype_code: code of a random phenotype; its S-MultiXcan
            p-values are converted to |z| with abs(norm.ppf(p / 2)). Takes
            precedence over `real_phenotype_code` if both are given.
        real_phenotype_code: column of `multixcan_real_results` to use as y.
        add_covars: if True, append gene-level covariates (gene_size,
            gene_density) taken from the random phenotype results, and
            SNP-level covariates (gene_n_snps_used, gene_n_snps_used_sharing)
            taken from `gene_tissues_df`. Requires `random_phenotype_code`.
        add_covars_logs: if True (and `add_covars` is True), also append the
            log-transformed version of each covariate.

    Returns:
        Tuple (X, y): X is a DataFrame with a "const" column, the LV column
        and, optionally, the covariates; y is a Series. Both are restricted
        to the genes shared by `orig_corr_mat`, y and `multiplier_z`.

    Raises:
        ValueError: if no phenotype code is given, or if covariates are
            requested without a random phenotype code.
    """
    if random_phenotype_code is None and real_phenotype_code is None:
        raise ValueError(
            "Either random_phenotype_code or real_phenotype_code must be given"
        )

    if add_covars and random_phenotype_code is None:
        # covariates "n" and "n_indep" are read from the random phenotype file
        raise ValueError("add_covars=True requires random_phenotype_code")

    random_results = None
    if random_phenotype_code is not None:
        # load the results file once; it is reused below for the covariates
        random_results = load_multixcan_random_phenotype(random_phenotype_code)
        target_data = random_results["pvalue"]
        y = pd.Series(
            data=np.abs(stats.norm.ppf(target_data.to_numpy() / 2)),
            index=target_data.index.copy(),
        )
    else:
        y = multixcan_real_results[real_phenotype_code]

    y = y[~y.index.duplicated(keep="first")]
    y = y.dropna()

    X = multiplier_z[lv_code].copy()

    common_genes = orig_corr_mat.index.intersection(y.index).intersection(X.index)
    y = y.loc[common_genes]

    X = X.loc[common_genes]
    X = sm.add_constant(X)

    if add_covars:
        covars = random_results[["n", "n_indep"]]
        covars = covars[~covars.index.duplicated(keep="first")]
        covars = covars.loc[X.index]

        # gene-level covariates (S-MultiXcan)
        covars = covars.rename(columns={"n_indep": "gene_size"})
        covars = covars.assign(gene_density=covars["gene_size"] / covars["n"])

        if add_covars_logs:
            covars["gene_size_log"] = np.log(covars["gene_size"])
            # negated so that the value is non-negative when density <= 1
            covars["gene_density_log"] = -np.log(covars["gene_density"])

        # snp-level covariates (S-PrediXcan); assign() aligns on the gene index
        covars = covars.assign(gene_n_snps_used=gene_tissues_df["n_snps_used_sum"])
        covars = covars.assign(
            gene_n_snps_used_sharing=(
                gene_tissues_df["n_snps_used_sum"]
                / gene_tissues_df["unique_n_snps_used"]
            )
        )

        if add_covars_logs:
            covars["gene_n_snps_used_log"] = np.log(covars["gene_n_snps_used"])
            covars["gene_n_snps_used_sharing_log"] = np.log(
                covars["gene_n_snps_used_sharing"]
            )

        # "n" was only needed to derive gene_density
        covars = covars.drop(columns="n")

        X = X.join(covars)

    return X, y

In [27]:
# testing: no covariates, so X only has the intercept and the LV column
_X, _y = get_data("LV7", 10)
assert _X.shape[0] < 7000
assert _X.shape[1] == 2
assert "LV7" in _X.columns
assert "const" in _X.columns
# `axis` is passed by keyword (positional use is rejected by newer pandas)
assert not _X.isna().any(axis=None)

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any()

In [28]:
_X.head()

Unnamed: 0,const,LV7
NOC2L,1.0,0.0
HES4,1.0,0.0
ISG15,1.0,0.0
AGRN,1.0,0.0
TNFRSF18,1.0,0.0


In [29]:
_y.head()

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
dtype: float64

In [30]:
# testing: with covariates, four extra columns are appended to X
_X, _y = get_data("LV7", 10, add_covars=True)
assert _X.shape[0] < 7000
assert _X.shape[1] == 2 + 4
assert "LV7" in _X.columns
assert "const" in _X.columns
assert "gene_size" in _X.columns
assert "gene_density" in _X.columns
assert "gene_n_snps_used" in _X.columns
assert "gene_n_snps_used_sharing" in _X.columns
# `axis` is passed by keyword (positional use is rejected by newer pandas)
assert not _X.isna().any(axis=None)

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any()

In [31]:
_X.head()

Unnamed: 0,const,LV7,gene_size,gene_density,gene_n_snps_used,gene_n_snps_used_sharing
NOC2L,1.0,0.0,5.0,0.106383,103,4.478261
HES4,1.0,0.0,3.0,0.068182,55,6.875
ISG15,1.0,0.0,6.0,0.142857,59,2.565217
AGRN,1.0,0.0,4.0,0.086957,75,5.0
TNFRSF18,1.0,0.0,4.0,0.088889,65,2.407407


In [32]:
# load_multixcan_random_phenotype(10).loc["TNFRSF18"]
gene_tissues_df.loc["TNFRSF18"]

tissue                      (Testis, Artery_Coronary, Heart_Atrial_Appenda...
n_tissues                                                                  45
tissues_pc_variances        [25.078112154400927, 11.018170291707419, 5.583...
tissues_pc_variances_cov    [0.6646273863834019, 0.30611303601683504, 0.05...
tissues_variances           {'Small_Intestine_Terminal_Ileum': 0.003919989...
n_snps_used_sum                                                            65
n_snps_in_model_sum                                                        66
unique_n_snps_in_model                                                     28
unique_n_snps_used                                                         27
Name: TNFRSF18, dtype: object

In [33]:
assert _X.loc["TNFRSF18", "gene_size"] == 4.0
assert _X.loc["TNFRSF18", "gene_density"] == 4 / 45.0
assert _X.loc["TNFRSF18", "gene_n_snps_used"] == 65
assert _X.loc["TNFRSF18", "gene_n_snps_used_sharing"] == 65 / 27.0

In [34]:
_y.head()

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
dtype: float64

In [35]:
# testing: with covariates and their logs, eight extra columns are appended
_X, _y = get_data("LV7", 10, add_covars=True, add_covars_logs=True)
assert _X.shape[0] < 7000
assert _X.shape[1] == 2 + 4 + 4
assert "LV7" in _X.columns
assert "const" in _X.columns
assert "gene_size" in _X.columns
assert "gene_size_log" in _X.columns
assert "gene_density" in _X.columns
assert "gene_density_log" in _X.columns
assert "gene_n_snps_used" in _X.columns
assert "gene_n_snps_used_log" in _X.columns
assert "gene_n_snps_used_sharing" in _X.columns
assert "gene_n_snps_used_sharing_log" in _X.columns
# `axis` is passed by keyword (positional use is rejected by newer pandas)
assert not _X.isna().any(axis=None)

# value ranges: density is in (0, 1] and every log covariate is non-negative
assert _X["gene_density"].between(0.0, 1.0, inclusive="right").all()
assert _X["gene_density_log"].min() >= 0.0
assert _X["gene_size"].min() >= 0.0
assert _X["gene_size_log"].min() >= 0.0
assert _X["gene_n_snps_used"].min() >= 0.0
assert _X["gene_n_snps_used_log"].min() >= 0.0
assert _X["gene_n_snps_used_sharing"].min() >= 0.0
assert _X["gene_n_snps_used_sharing_log"].min() >= 0.0

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any()

In [36]:
_X.head()

Unnamed: 0,const,LV7,gene_size,gene_density,gene_size_log,gene_density_log,gene_n_snps_used,gene_n_snps_used_sharing,gene_n_snps_used_log,gene_n_snps_used_sharing_log
NOC2L,1.0,0.0,5.0,0.106383,1.609438,2.24071,103,4.478261,4.634729,1.499235
HES4,1.0,0.0,3.0,0.068182,1.098612,2.685577,55,6.875,4.007333,1.927892
ISG15,1.0,0.0,6.0,0.142857,1.791759,1.94591,59,2.565217,4.077537,0.942043
AGRN,1.0,0.0,4.0,0.086957,1.386294,2.442347,75,5.0,4.317488,1.609438
TNFRSF18,1.0,0.0,4.0,0.088889,1.386294,2.420368,65,2.407407,4.174387,0.87855


In [37]:
_X.describe()

Unnamed: 0,const,LV7,gene_size,gene_density,gene_size_log,gene_density_log,gene_n_snps_used,gene_n_snps_used_sharing,gene_n_snps_used_log,gene_n_snps_used_sharing_log
count,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0
mean,1.0,0.089368,4.365414,0.168023,1.327457,2.133689,56.40593,5.5407,3.814766,1.501012
std,0.0,0.346195,2.317113,0.174679,0.564483,0.801807,30.092311,4.376822,0.796356,0.636414
min,1.0,0.0,1.0,0.020408,0.0,-0.0,1.0,1.0,0.0,0.0
25%,1.0,0.0,3.0,0.066667,1.098612,1.658228,35.0,3.0,3.555348,1.098612
50%,1.0,0.0,4.0,0.108696,1.386294,2.219203,55.0,4.478261,4.007333,1.499235
75%,1.0,0.012172,6.0,0.190476,1.791759,2.70805,77.0,6.75,4.343805,1.909543
max,1.0,5.125221,18.0,1.0,2.890372,3.89182,204.0,49.0,5.31812,3.89182


In [38]:
_y.head()

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
dtype: float64

In [39]:
def standardize_data(X, y):
    """
    Return z-scored copies of X and y; the inputs are not modified.

    Every column of X except "const" is centered on its mean and divided by
    its standard deviation (pandas default, i.e. sample std); y is
    standardized the same way.

    Returns:
        Tuple (X_scaled, y_scaled).
    """
    feature_cols = [name for name in X.columns if name != "const"]

    X_scaled = X.copy()
    features = X_scaled[feature_cols]
    X_scaled[feature_cols] = (features - features.mean()) / features.std()

    y_centered = y - y.mean()
    return X_scaled, y_centered / y.std()

In [40]:
def get_aligned_corr_mat(X, perc=1.0):
    """
    Return the gene correlation matrix aligned to the genes (rows) of X.

    Args:
        X: design matrix indexed by gene; its second column (position 1) is
            read as the LV weights.
        perc: if None, the aligned correlation matrix is returned without
            subsetting. Otherwise, the threshold is the (1 - perc) quantile
            of the strictly positive weights, and only genes with weight at
            or above it keep their correlations (perc=1.0 keeps all genes
            with non-zero weight).

    Returns:
        A square DataFrame indexed by X.index on both axes. When subsetting,
        it is an identity matrix whose block for the selected genes is
        filled with the original correlations.
    """
    aligned = orig_corr_mat.loc[X.index, X.index]
    if perc is None:
        return aligned

    lv_weights = X.iloc[:, 1]
    threshold = lv_weights[lv_weights > 0].quantile(1 - perc)
    selected = lv_weights[lv_weights >= threshold].index
    selected = selected.intersection(aligned.index)

    # start from the identity so that non-selected genes are uncorrelated
    sub_corr = pd.DataFrame(
        np.identity(aligned.shape[0]),
        index=aligned.index.copy(),
        columns=aligned.columns.copy(),
    )
    sub_corr.loc[selected, selected] = aligned.loc[selected, selected]

    return sub_corr

In [41]:
# testing get_aligned_corr_mat with three genes: two with non-zero LV weight
# and one with zero weight
_X_test = pd.DataFrame(
    {
        "const": 1.0,
        "LV1": [1.0, 0.4, 0.0],  # the last gene has zero weight
    },
    index=[
        "PSMB10",  # the first two genes have a high sum of correlations, to make sure the sum is not close to 1.0
        "SLC12A4",
        "ACD",
    ],
)

# do not subset: the original correlations among the three genes are returned
_tmp_corr = get_aligned_corr_mat(_X_test, perc=None)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
assert np.array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.77, 0.73],
            [0.77, 1.0, 0.63],
            [0.73, 0.63, 1.00],
        ]
    ),
)

# do subset: include all non-zero LV genes
# (the zero-weight gene ACD gets zero correlation with the other two)
_tmp_corr = get_aligned_corr_mat(_X_test, perc=1.0)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
assert np.array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.77, 0.00],
            [0.77, 1.0, 0.00],
            [0.00, 0.00, 1.00],
        ]
    ),
)

# do subset: include only non-zero LV genes with weight at or above the 1st
# percentile of the non-zero weights (perc=0.99 keeps the top 99%); here the
# threshold is 0.406, so only PSMB10 is selected and the result is the identity
_tmp_corr = get_aligned_corr_mat(_X_test, perc=0.99)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
assert np.array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.00, 0.00],
            [0.00, 1.0, 0.00],
            [0.00, 0.00, 1.00],
        ]
    ),
)

In [42]:
def train_statsmodels_gls(X, y, corr_mat):
    """
    Fit a statsmodels GLS model of y on X, passing corr_mat as `sigma`,
    and return the fitted results object.
    """
    return sm.GLS(y, X, sigma=corr_mat).fit()

# [full corr matrix] GLS on randomly generated phenotypes

In [43]:
PERC_NONZERO_GENES = None

## Random phenotype 6 / LV45

In [44]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [45]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [46]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1459
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.702
Time:                        13:36:40   Log-Likelihood:                -8696.5
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6440   BIC:                         1.741e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0271      0.024      1.121      0.2

In [59]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = -0.0032814567822982274
exp_coef_se = 0.008590718523010138
exp_tvalue = -0.38197698754869974
exp_pval_twosided = 0.7024910374237221
exp_pval_onesided = 0.648754481288139
    


In [48]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45
HIST1H2BO,1.0,8.480948
HIST1H2BF,1.0,8.426226
HIST1H2BK,1.0,8.245903
HIST1H2BD,1.0,8.119013
HIST1H2BC,1.0,7.744137
...,...,...
TREM1,1.0,0.000000
TREML2,1.0,0.000000
TREM2,1.0,0.000000
NFYA,1.0,0.000000


In [49]:
Xs.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45
HIST1H2BO,1.0,27.865226
HIST1H2BF,1.0,27.685041
HIST1H2BK,1.0,27.091293
HIST1H2BD,1.0,26.673482
HIST1H2BC,1.0,25.439130
...,...,...
TREM1,1.0,-0.059957
TREML2,1.0,-0.059957
TREM2,1.0,-0.059957
NFYA,1.0,-0.059957


In [50]:
y.sort_values(ascending=False)

CHPF2     4.039680
PRR5      3.726033
MMP12     3.686147
RBM38     3.581041
SOS1      3.528183
            ...   
GPX3      0.000826
SUOX      0.000686
SPRED2    0.000364
DEGS1     0.000152
SAFB      0.000072
Length: 6442, dtype: float64

In [51]:
ys.sort_values(ascending=False)

CHPF2     5.607324
PRR5      5.070443
MMP12     5.002169
RBM38     4.822255
SOS1      4.731776
            ...   
GPX3     -1.306133
SUOX     -1.306373
SPRED2   -1.306924
DEGS1    -1.307288
SAFB     -1.307425
Length: 6442, dtype: float64

In [52]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [53]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.428
Date:                Tue, 02 Aug 2022   Prob (F-statistic):             0.0642
Time:                        13:44:55   Log-Likelihood:                -9138.6
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.829e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.505e-16      0.012  -1.21e-14      1.0

## Random phenotype 6 / LV455

In [60]:
lv_code = "LV455"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [61]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [62]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                   0.02033
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.887
Time:                        13:53:21   Log-Likelihood:                -8696.6
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6440   BIC:                         1.741e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0272      0.024      1.127      0.2

In [63]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.0015724547818453105
exp_coef_se = 0.011027453856403382
exp_tvalue = 0.1425945465128583
exp_pval_twosided = 0.8866148655455224
exp_pval_onesided = 0.4433074327727612
    


In [64]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV455
CACNA1A,1.0,8.294351
ZNF26,1.0,7.956442
ARHGAP42,1.0,5.592084
UBE2B,1.0,5.379685
GAB2,1.0,3.946462
...,...,...
ZNF655,1.0,0.000000
ZKSCAN5,1.0,0.000000
ZNF394,1.0,0.000000
ATP5J2,1.0,0.000000


In [65]:
y.sort_values(ascending=False)

CHPF2     4.039680
PRR5      3.726033
MMP12     3.686147
RBM38     3.581041
SOS1      3.528183
            ...   
GPX3      0.000826
SUOX      0.000686
SPRED2    0.000364
DEGS1     0.000152
SAFB      0.000072
Length: 6442, dtype: float64

In [66]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [67]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2985
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.585
Time:                        13:53:21   Log-Likelihood:                -5677.5
No. Observations:                6442   AIC:                         1.136e+04
Df Residuals:                    6440   BIC:                         1.137e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7636      0.007    104.576      0.0

## Random phenotype 0 / LV801

In [68]:
lv_code = "LV801"
phenotype_code = 0

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [69]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [70]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5098
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.475
Time:                        13:54:08   Log-Likelihood:                -8681.3
No. Observations:                6442   AIC:                         1.737e+04
Df Residuals:                    6440   BIC:                         1.738e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0215      0.024     -0.892      0.3

In [71]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.007824760447674654
exp_coef_se = 0.010959087123929062
exp_tvalue = 0.7139974670508243
exp_pval_twosided = 0.47525462160232723
exp_pval_onesided = 0.23762731080116362
    


In [72]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV801
SPATS2,1.0,7.137544
SSR3,1.0,6.096376
H3F3C,1.0,4.471034
COLEC12,1.0,4.057876
AP4M1,1.0,2.904084


In [73]:
y.sort_values(ascending=False).head()

GPATCH1    3.951082
NFKBIA     3.900707
TTC5       3.870256
ZNF17      3.807992
ZNF563     3.747230
dtype: float64

In [74]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [75]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.062
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.151
Time:                        13:54:08   Log-Likelihood:                -5895.9
No. Observations:                6442   AIC:                         1.180e+04
Df Residuals:                    6440   BIC:                         1.181e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7707      0.008    101.281      0.0

# [sub corr matrix] GLS on randomly generated phenotypes

In [None]:
PERC_NONZERO_GENES = 1.00

## Random phenotype 6 / LV45

In [None]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

In [None]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [None]:
print(_gls_results.summary())

In [None]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

In [None]:
# print full numbers
# Values are labeled the same way as in the other sections of this notebook,
# so the output is self-describing (presumably pasted into the unit tests as
# expected values -- confirm).
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )

In [None]:
X.sort_values(lv_code, ascending=False).head()

In [None]:
y.sort_values(ascending=False).head()

In [None]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 6 / LV455

In [None]:
lv_code = "LV455"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

In [None]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [None]:
print(_gls_results.summary())

In [None]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

In [None]:
# print full numbers
# Values are labeled the same way as in the other sections of this notebook,
# so the output is self-describing (presumably pasted into the unit tests as
# expected values -- confirm).
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )

In [None]:
X.sort_values(lv_code, ascending=False).head()

In [None]:
y.sort_values(ascending=False).head()

In [None]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 10 / LV100

In [None]:
lv_code = "LV100"
phenotype_code = 10

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

In [None]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [None]:
print(_gls_results.summary())

In [None]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

In [None]:
# print full numbers
# Values are labeled the same way as in the other sections of this notebook,
# so the output is self-describing (presumably pasted into the unit tests as
# expected values -- confirm).
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )

In [None]:
X.sort_values(lv_code, ascending=False).head()

In [None]:
y.sort_values(ascending=False).head()

In [None]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 0 / LV800

In [None]:
lv_code = "LV800"
phenotype_code = 0

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

In [None]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [None]:
print(_gls_results.summary())

In [None]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

In [None]:
# print full numbers
# Values are labeled the same way as in the other sections of this notebook,
# so the output is self-describing (presumably pasted into the unit tests as
# expected values -- confirm).
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )

In [None]:
X.sort_values(lv_code, ascending=False).head()

In [None]:
y.sort_values(ascending=False).head()

In [None]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

# GLS on real phenotypes

In [200]:
PERC_NONZERO_GENES = None

In [201]:
multixcan_real_results.columns

Index(['100001_raw-Food_weight', '100002_raw-Energy', '100003_raw-Protein',
       '100004_raw-Fat', '100005_raw-Carbohydrate', '100006_raw-Saturated_fat',
       '100007_raw-Polyunsaturated_fat', '100008_raw-Total_sugars',
       '100009_raw-Englyst_dietary_fibre', '100010-Portion_size',
       ...
       'visual impairment', 'vitiligo', 'vitreous body disease',
       'vocal cord polyp', 'voice disorders',
       'wellbeing measurement AND family relationship', 'wheezing',
       'whooping cough', 'worry measurement', 'wrist fracture'],
      dtype='object', length=3752)

## visual impairment / LV570

In [257]:
lv_code = "LV570"
phenotype_code = "visual impairment"

phenotype_name = f"multixcan-phenomexcan-{phenotype_code.replace(' ', '_')}-pvalues"
display(phenotype_name)

'multixcan-phenomexcan-visual_impairment-pvalues'

In [258]:
X, y = get_data(lv_code, real_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [259]:
X.head()

Unnamed: 0,const,LV570
NOC2L,1.0,0.0
HES4,1.0,0.032252
ISG15,1.0,0.0
AGRN,1.0,0.0
TNFRSF18,1.0,0.0


In [260]:
y.head()

NOC2L       0.911665
HES4        0.296922
ISG15       0.145821
AGRN        0.866756
TNFRSF18    0.208452
Name: visual impairment, dtype: float64

In [261]:
Xs, ys = standardize_data(X, y)

In [262]:
# Sanity check on the standardized predictor: mean ~0 and std ~1.
_Xs_desc = Xs[[lv_code]].describe()
display(_Xs_desc)
# compare the absolute mean: a one-sided `< 1e-10` check would also accept
# any large negative mean
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV570
count,6442.0
mean,-3.9155960000000004e-17
std,1.0
min,-0.1764047
25%,-0.1764047
50%,-0.1764047
75%,-0.07132475
max,29.93952


In [263]:
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [264]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:      visual impairment   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1129
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.737
Time:                        14:29:12   Log-Likelihood:                -14094.
No. Observations:                6442   AIC:                         2.819e+04
Df Residuals:                    6440   BIC:                         2.821e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0062      0.056     -0.111      0.9

In [265]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = -0.006677552728904192
exp_coef_se = 0.019869632911790076
exp_tvalue = -0.3360682483943587
exp_pval_twosided = 0.7368303500487827
exp_pval_onesided = 0.6315848249756086
    


In [212]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [213]:
y

NOC2L       1.301498
HES4        0.491604
ISG15       0.300490
AGRN        1.595415
TNFRSF18    0.362178
              ...   
CPT1B       0.283934
CHKB        0.621814
MAPK8IP2    0.479153
ARSA        0.274866
SHANK3      1.125992
Name: whooping cough, Length: 6442, dtype: float64

In [214]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:         whooping cough   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.665
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.197
Time:                        14:22:55   Log-Likelihood:                -9139.5
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.830e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.003e-16      0.012   8.05e-15      1.0

# Test different covariates

In [76]:
# Build the gene-level covariates table from the MultiXcan results of random
# phenotype 0 ("n" and "n_indep" columns) joined with gene_tissues_df.
covars = load_multixcan_random_phenotype(0)[["n", "n_indep"]]
# keep only the first row for each repeated gene name
covars = covars[~covars.index.duplicated(keep="first")]
covars = covars.dropna()
# inner join: keep only genes present in both tables
covars = covars.join(gene_tissues_df, how="inner")
# `axis` must be passed by keyword: positional arguments to DataFrame.any
# are rejected by pandas >= 2.0
assert not covars.isna().any(axis=None)

In [77]:
# discard every column whose name contains "model"
_model_columns = [col for col in covars.columns if "model" in col]
covars = covars.drop(columns=_model_columns)

In [78]:
covars.shape

(6444, 9)

In [79]:
covars.head()

Unnamed: 0_level_0,n,n_indep,tissue,n_tissues,tissues_pc_variances,tissues_pc_variances_cov,tissues_variances,n_snps_used_sum,unique_n_snps_used
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GPATCH1,40.0,3.0,"(Testis, Artery_Coronary, Heart_Atrial_Appenda...",40,"[29.235812681654803, 7.331117237863964, 1.8666...","[1.4457016202816222, 0.15754336632735133, 0.07...","{'Artery_Aorta': 0.006815900132465873, 'Colon_...",50,19
NFKBIA,1.0,1.0,(Brain_Frontal_Cortex_BA9),1,[1.0],[3.4801284748968485e-05],{'Brain_Frontal_Cortex_BA9': 3.480128474896848...,1,1
TTC5,47.0,5.0,"(Testis, Artery_Coronary, Heart_Atrial_Appenda...",47,"[20.3843508249499, 12.558841649027991, 9.28359...","[0.31473546115929124, 0.0965740452605278, 0.03...",{'Small_Intestine_Terminal_Ileum': 0.010800462...,84,12
ZNF17,49.0,5.0,"(Testis, Artery_Coronary, Heart_Atrial_Appenda...",49,"[28.646268534806357, 9.073735811917347, 7.5861...","[1.9513603925601095, 0.17418261811465457, 0.10...",{'Small_Intestine_Terminal_Ileum': 0.003260710...,99,14
ZNF563,49.0,2.0,"(Testis, Artery_Coronary, Heart_Atrial_Appenda...",49,"[46.8735432455952, 2.0233826275424898]","[0.09856714803229967, 0.009511702571331633]",{'Small_Intestine_Terminal_Ileum': 0.002128108...,54,6


In [80]:
# gene_size and gene_density
# gene_size is the renamed "n_indep" column; gene_density is gene_size / n.
# NOTE: this cell is not idempotent ("n" is dropped at the end), so it must be
# run exactly once after the covars table is built.
covars = covars.rename(columns={"n_indep": "gene_size"})
# vectorized column division instead of a row-wise apply (same values, faster)
covars = covars.assign(gene_density=covars["gene_size"] / covars["n"])
covars = covars.drop(columns=["n"])

In [81]:
# gene_n_snps_used and gene_n_snps_used_sharing
# Both columns are computed on gene_tissues_df and aligned to covars by index
# when assigned.
covars = covars.assign(gene_n_snps_used=gene_tissues_df["n_snps_used_sum"])

# sharing = n_snps_used_sum / unique_n_snps_used; vectorized column division
# instead of a row-wise apply (same values, faster)
covars = covars.assign(
    gene_n_snps_used_sharing=(
        gene_tissues_df["n_snps_used_sum"] / gene_tissues_df["unique_n_snps_used"]
    )
)

In [82]:
_final_covars = [
    "gene_size",
    "gene_density",
    "gene_n_snps_used",
    "gene_n_snps_used_sharing",
]

In [83]:
covars[_final_covars].head()

Unnamed: 0_level_0,gene_size,gene_density,gene_n_snps_used,gene_n_snps_used_sharing
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GPATCH1,3.0,0.075,50,2.631579
NFKBIA,1.0,1.0,1,1.0
TTC5,5.0,0.106383,84,7.0
ZNF17,5.0,0.102041,99,7.071429
ZNF563,2.0,0.040816,54,9.0


In [84]:
covars[_final_covars].describe()

Unnamed: 0,gene_size,gene_density,gene_n_snps_used,gene_n_snps_used_sharing
count,6444.0,6444.0,6444.0,6444.0
mean,4.366387,0.168031,56.404873,5.539761
std,2.317428,0.174653,30.087886,4.376469
min,1.0,0.020408,1.0,1.0
25%,3.0,0.066667,35.0,3.0
50%,4.0,0.108696,55.0,4.475973
75%,6.0,0.190476,77.0,6.75
max,18.0,1.0,204.0,49.0


In [85]:
# add a natural-log version of every final covariate and summarize the new columns
_log_columns = {f"{name}_log": np.log(covars[name]) for name in _final_covars}
_tmp = covars.assign(**_log_columns)
display(_tmp[[col for col in _tmp.columns if "_log" in col]].describe())

Unnamed: 0,gene_size_log,gene_density_log,gene_n_snps_used_log,gene_n_snps_used_sharing_log
count,6444.0,6444.0,6444.0,6444.0
mean,1.32767,-2.133538,3.814812,1.500832
std,0.564526,0.801732,0.796239,0.636401
min,0.0,-3.89182,0.0,0.0
25%,1.098612,-2.70805,3.555348,1.098612
50%,1.386294,-2.219203,4.007333,1.498724
75%,1.791759,-1.658228,4.343805,1.909543
max,2.890372,0.0,5.31812,3.89182


In [86]:
covars[_final_covars].corr()

Unnamed: 0,gene_size,gene_density,gene_n_snps_used,gene_n_snps_used_sharing
gene_size,1.0,0.33839,0.034188,-0.419472
gene_density,0.33839,1.0,-0.555306,-0.44463
gene_n_snps_used,0.034188,-0.555306,1.0,0.227687
gene_n_snps_used_sharing,-0.419472,-0.44463,0.227687,1.0


In [87]:
_tmp[[c for c in _tmp.columns if "_log" in c]].corr()

Unnamed: 0,gene_size_log,gene_density_log,gene_n_snps_used_log,gene_n_snps_used_sharing_log
gene_size_log,1.0,0.613765,0.161196,-0.46165
gene_density_log,0.613765,1.0,-0.65128,-0.795831
gene_n_snps_used_log,0.161196,-0.65128,1.0,0.528185
gene_n_snps_used_sharing_log,-0.46165,-0.795831,0.528185,1.0


# [full corr matrix] GLS on randomly generated phenotypes using gene-level covariates

In [88]:
PERC_NONZERO_GENES = None

## Random phenotype 6 / LV45

In [89]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [92]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_covars=True)
# keep only covars that we are testing
X = X.drop(columns=["gene_n_snps_used", "gene_n_snps_used_sharing"])
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [93]:
X.head()

Unnamed: 0,const,LV45,gene_size,gene_density
NOC2L,1.0,0.0,5.0,0.106383
HES4,1.0,0.0,3.0,0.068182
ISG15,1.0,0.0,6.0,0.142857
AGRN,1.0,0.0,4.0,0.086957
TNFRSF18,1.0,0.0,4.0,0.088889


In [94]:
y.head()

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
dtype: float64

In [95]:
Xs, ys = standardize_data(X, y)

In [96]:
# Sanity check on the standardized predictors: mean ~0 and std ~1.
_Xs_desc = Xs[[lv_code, "gene_size", "gene_density"]].describe()
display(_Xs_desc)
# compare the absolute mean: a one-sided `< 1e-10` check would also accept
# any large negative mean
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV45,gene_size,gene_density
count,6442.0,6442.0,6442.0
mean,1.764776e-17,1.985372e-17,6.153552e-15
std,1.0,1.0,1.0
min,-0.05995686,-1.453777,-0.8450385
25%,-0.05995686,-0.589507,-0.5802019
50%,-0.05995686,-0.1573718,-0.3395799
75%,-0.05995686,0.7068986,0.1286255
max,27.86523,5.892521,4.763266


In [97]:
Xs.head()

Unnamed: 0,const,LV45,gene_size,gene_density
NOC2L,1.0,-0.059957,0.274763,-0.35282
HES4,1.0,-0.059957,-0.589507,-0.571527
ISG15,1.0,-0.059957,0.706899,-0.144
AGRN,1.0,-0.059957,-0.157372,-0.46404
TNFRSF18,1.0,-0.059957,-0.157372,-0.452976


In [98]:
ys.head()

NOC2L      -0.750320
HES4       -0.838108
ISG15       0.727987
AGRN        1.375061
TNFRSF18    0.251828
dtype: float64

In [99]:
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [100]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1625
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.922
Time:                        13:59:04   Log-Likelihood:                -8696.3
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6438   BIC:                         1.743e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0233      0.025      0.929   

In [101]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = -0.0032341246762881432
exp_coef_se = 0.00859220860058952
exp_tvalue = -0.3764020203217886
exp_pval_twosided = 0.7066304479272314
exp_pval_onesided = 0.6466847760363843
    


In [102]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [103]:
# save covariates
phenotype_covars_name = f"{phenotype_name_base}-covars"
display(phenotype_covars_name)

'multixcan-random_phenotype6-covars'

In [104]:
# covariates to save: every column of X except the intercept and the LV itself
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis` must be passed by keyword: positional arguments to DataFrame.any
# are rejected by pandas >= 2.0
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_size,gene_density
NOC2L,5.0,0.106383
HES4,3.0,0.068182
ISG15,6.0,0.142857
AGRN,4.0,0.086957
TNFRSF18,4.0,0.088889


In [105]:
# save covariates
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [106]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.192
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.311
Time:                        13:59:05   Log-Likelihood:                -9138.5
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6438   BIC:                         1.831e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const        -1.626e-16      0.012  -1.31e-14   

## Random phenotype 6 / LV455

In [107]:
lv_code = "LV455"
phenotype_code = 6

phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [108]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_covars=True)
X = X.drop(columns=["gene_n_snps_used", "gene_n_snps_used_sharing"])
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [109]:
X.head()

Unnamed: 0,const,LV455,gene_size,gene_density
NOC2L,1.0,0.005223,5.0,0.106383
HES4,1.0,0.0,3.0,0.068182
ISG15,1.0,0.0,6.0,0.142857
AGRN,1.0,0.002763,4.0,0.086957
TNFRSF18,1.0,0.003323,4.0,0.088889


In [110]:
y.head()

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
dtype: float64

In [111]:
Xs, ys = standardize_data(X, y)

In [112]:
# Sanity check on the standardized predictors: mean ~0 and std ~1.
_Xs_desc = Xs[[lv_code, "gene_size", "gene_density"]].describe()
display(_Xs_desc)
# compare the absolute mean: a one-sided `< 1e-10` check would also accept
# any large negative mean
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV455,gene_size,gene_density
count,6442.0,6442.0,6442.0
mean,9.485668000000001e-17,1.985372e-17,6.153552e-15
std,1.0,1.0,1.0
min,-0.07851697,-1.453777,-0.8450385
25%,-0.07851697,-0.589507,-0.5802019
50%,-0.07851697,-0.1573718,-0.3395799
75%,-0.04700189,0.7068986,0.1286255
max,41.51043,5.892521,4.763266


In [113]:
Xs.head()

Unnamed: 0,const,LV455,gene_size,gene_density
NOC2L,1.0,-0.052328,0.274763,-0.35282
HES4,1.0,-0.078517,-0.589507,-0.571527
ISG15,1.0,-0.078517,0.706899,-0.144
AGRN,1.0,-0.064664,-0.157372,-0.46404
TNFRSF18,1.0,-0.061858,-0.157372,-0.452976


In [114]:
ys.head()

NOC2L      -0.750320
HES4       -0.838108
ISG15       0.727987
AGRN        1.375061
TNFRSF18    0.251828
dtype: float64

In [115]:
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [116]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1210
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.948
Time:                        14:01:10   Log-Likelihood:                -8696.4
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6438   BIC:                         1.743e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0234      0.025      0.934   

In [117]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.0014838894195672275
exp_coef_se = 0.011297197897620825
exp_tvalue = 0.13135021914414127
exp_pval_twosided = 0.895502370998393
exp_pval_onesided = 0.4477511854991965
    


In [118]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [119]:
# save covariates
phenotype_covars_name = f"{phenotype_name_base}-covars"
display(phenotype_covars_name)

'multixcan-random_phenotype6-covars'

In [120]:
# covariates to save: every column of X except the intercept and the LV itself
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis` must be passed by keyword: positional arguments to DataFrame.any
# are rejected by pandas >= 2.0
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_size,gene_density
NOC2L,5.0,0.106383
HES4,3.0,0.068182
ISG15,6.0,0.142857
AGRN,4.0,0.086957
TNFRSF18,4.0,0.088889


In [121]:
# save covariates
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [122]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1465
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.932
Time:                        14:01:10   Log-Likelihood:                -9140.1
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6438   BIC:                         1.832e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const        -1.626e-16      0.012  -1.31e-14   

## Random phenotype 0 / LV801 (using logarithms)

In [123]:
lv_code = "LV801"
phenotype_code = 0

phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [129]:
X, y = get_data(
    lv_code, random_phenotype_code=phenotype_code, add_covars=True, add_covars_logs=True
)
X = X.drop(
    columns=[
        "gene_n_snps_used",
        "gene_n_snps_used_sharing",
        "gene_n_snps_used_log",
        "gene_n_snps_used_sharing_log",
    ]
)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [130]:
X.head()

Unnamed: 0,const,LV801,gene_size,gene_density,gene_size_log,gene_density_log
NOC2L,1.0,0.063959,5.0,0.106383,1.609438,2.24071
HES4,1.0,0.006702,3.0,0.068182,1.098612,2.685577
ISG15,1.0,0.0,6.0,0.142857,1.791759,1.94591
AGRN,1.0,0.005386,4.0,0.086957,1.386294,2.442347
TNFRSF18,1.0,0.0,4.0,0.088889,1.386294,2.420368


In [131]:
y.head()

NOC2L       1.192661
HES4        0.615835
ISG15       1.388647
AGRN        0.098554
TNFRSF18    0.085000
dtype: float64

In [132]:
Xs, ys = standardize_data(X, y)

In [133]:
Xs.head()

Unnamed: 0,const,LV801,gene_size,gene_density,gene_size_log,gene_density_log
NOC2L,1.0,0.229344,0.273869,-0.352878,0.499538,0.133474
HES4,1.0,-0.106576,-0.589274,-0.571571,-0.405406,0.688305
ISG15,1.0,-0.145894,0.705441,-0.144071,0.822526,-0.234195
AGRN,1.0,-0.114292,-0.157702,-0.46409,0.104232,0.384953
TNFRSF18,1.0,-0.145894,-0.157702,-0.453027,0.104232,0.357541


In [134]:
ys.head()

NOC2L       0.695458
HES4       -0.258870
ISG15       1.019708
AGRN       -1.114686
TNFRSF18   -1.137110
dtype: float64

In [135]:
# Sanity check on the standardized predictors (including the log covariates):
# mean ~0 and std ~1.
_Xs_desc = Xs[
    [lv_code, "gene_size", "gene_density", "gene_size_log", "gene_density_log"]
].describe()
display(_Xs_desc)
# compare the absolute mean: a one-sided `< 1e-10` check would also accept
# any large negative mean
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV801,gene_size,gene_density,gene_size_log,gene_density_log
count,6442.0,6442.0,6442.0,6442.0,6442.0
mean,-2.15082e-17,-5.5149230000000003e-17,6.216422e-15,-5.459995e-14,-3.78059e-14
std,1.0,1.0,1.0,1.0,1.0
min,-0.1458936,-1.452417,-0.8450651,-2.351632,-2.6611
25%,-0.1458936,-0.589274,-0.580245,-0.4054058,-0.5929869
50%,-0.1458936,-0.1577025,-0.3396379,0.104232,0.106652
75%,-0.03586973,0.7054405,0.1285383,0.8225262,0.7163332
max,41.72866,5.884299,4.76289,2.768753,2.192711


In [136]:
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [137]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4878
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.786
Time:                        14:06:30   Log-Likelihood:                -8680.4
No. Observations:                6442   AIC:                         1.737e+04
Df Residuals:                    6436   BIC:                         1.741e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -0.0146      0.025  

In [138]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.008257877072158701
exp_coef_se = 0.01098379035278686
exp_tvalue = 0.7518239885253702
exp_pval_twosided = 0.4521844060369917
exp_pval_onesided = 0.22609220301849586
    


In [139]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [140]:
# save covariates
phenotype_covars_name = f"{phenotype_name_base}-covars"
display(phenotype_covars_name)

'multixcan-random_phenotype0-covars'

In [141]:
# covariates to save: every column of X except the intercept and the LV itself
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# `axis` must be passed by keyword: positional arguments to DataFrame.any
# are rejected by pandas >= 2.0
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_size,gene_density,gene_size_log,gene_density_log
NOC2L,5.0,0.106383,1.609438,2.24071
HES4,3.0,0.068182,1.098612,2.685577
ISG15,6.0,0.142857,1.791759,1.94591
AGRN,4.0,0.086957,1.386294,2.442347
TNFRSF18,4.0,0.088889,1.386294,2.420368


In [142]:
# save covariates
y_covars.to_pickle(OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz")

In [143]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.154
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.329
Time:                        14:06:30   Log-Likelihood:                -9137.4
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6436   BIC:                         1.833e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             5.838e-15      0.012  

# [full corr matrix] GLS on randomly generated phenotypes using SNP-level covariates

In [144]:
PERC_NONZERO_GENES = None

## Random phenotype 6 / LV45

In [145]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [149]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_covars=True)
X = X.drop(columns=["gene_size", "gene_density"])
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [150]:
X.head()

Unnamed: 0,const,LV45,gene_n_snps_used,gene_n_snps_used_sharing
NOC2L,1.0,0.0,103,4.478261
HES4,1.0,0.0,55,6.875
ISG15,1.0,0.0,59,2.565217
AGRN,1.0,0.0,75,5.0
TNFRSF18,1.0,0.0,65,2.407407


In [151]:
y.head()

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
dtype: float64

In [152]:
Xs, ys = standardize_data(X, y)

In [154]:
# Sanity check on the standardized predictors: mean ~0 and std ~1.
_Xs_desc = Xs[[lv_code, "gene_n_snps_used", "gene_n_snps_used_sharing"]].describe()
display(_Xs_desc)
# compare the absolute mean: a one-sided `< 1e-10` check would also accept
# any large negative mean
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV45,gene_n_snps_used,gene_n_snps_used_sharing
count,6442.0,6442.0,6442.0
mean,1.3235820000000001e-17,0.0,7.059102e-17
std,1.0,1.0,1.0
min,-0.05995686,-1.841199,-1.037442
25%,-0.05995686,-0.711342,-0.5804895
50%,-0.05995686,-0.046721,-0.242742
75%,-0.05995686,0.684363,0.2762964
max,27.86523,4.90471,9.929418


In [155]:
Xs.head()

Unnamed: 0,const,LV45,gene_n_snps_used,gene_n_snps_used_sharing
NOC2L,1.0,-0.059957,1.548371,-0.242742
HES4,1.0,-0.059957,-0.046721,0.304856
ISG15,1.0,-0.059957,0.086204,-0.679827
AGRN,1.0,-0.059957,0.617901,-0.123537
TNFRSF18,1.0,-0.059957,0.28559,-0.715883


In [156]:
ys.head()

NOC2L      -0.750320
HES4       -0.838108
ISG15       0.727987
AGRN        1.375061
TNFRSF18    0.251828
dtype: float64

In [157]:
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [158]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.3128
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.816
Time:                        14:10:21   Log-Likelihood:                -8696.1
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6438   BIC:                         1.743e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [159]:
# Print the expected values for the LV term so they can be copied into the
# unit tests. The LV is the column at position 1 (right after "const").
_lv_pos = 1
_lv_tvalue = _gls_results.tvalues.to_numpy()[_lv_pos]
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[_lv_pos]}
exp_coef_se = {_gls_results.bse.to_numpy()[_lv_pos]}
exp_tvalue = {_lv_tvalue}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[_lv_pos]}
exp_pval_onesided = {stats.t.sf(_lv_tvalue, _gls_results.df_resid)}
    """
    )


exp_coef = -0.0032535573271900795
exp_coef_se = 0.008591651358118488
exp_tvalue = -0.3786882394984177
exp_pval_twosided = 0.7049318746949735
exp_pval_onesided = 0.6475340626525132
    


In [160]:
# write the (non-standardized) phenotype values y to the tests data folder
_phenotype_path = OUTPUT_DIR / f"{phenotype_name}.pkl.xz"
y.to_pickle(_phenotype_path)

In [161]:
# build (and show) the base file name for the covariates; the file itself is
# written in a later cell
phenotype_covars_name = f"{phenotype_name_base}-snplevel_covars"
display(phenotype_covars_name)

'multixcan-random_phenotype6-snplevel_covars'

In [162]:
# Covariates to save: every column of X except the intercept and the LV itself.
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# axis=None reduces over the whole frame to a single bool; it must be passed by
# keyword because `axis` is keyword-only in pandas >= 2.0
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_n_snps_used,gene_n_snps_used_sharing
NOC2L,103,4.478261
HES4,55,6.875
ISG15,59,2.565217
AGRN,75,5.0
TNFRSF18,65,2.407407


In [163]:
# write the covariates data frame to the tests data folder
_covars_path = OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz"
y_covars.to_pickle(_covars_path)

In [164]:
# Debugging aid only: fit a plain OLS on the same data so its summary can be
# compared with the GLS one above.
_tmp_model = sm.OLS(endog=ys, exog=Xs)
_tmp_results = _tmp_model.fit()
_tmp_summary = _tmp_results.summary()
print(_tmp_summary)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.260
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.286
Time:                        14:10:21   Log-Likelihood:                -9138.4
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6438   BIC:                         1.831e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

## Random phenotype 6 / LV455

In [165]:
# parameters of this section: the LV under test and the random phenotype code
lv_code, phenotype_code = "LV455", 6

# file-name stems derived from the phenotype code
phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [166]:
# load predictors (with covariates) and the random phenotype for this LV
X, y = get_data(
    lv_code,
    random_phenotype_code=phenotype_code,
    add_covars=True,
)

# only the SNP-level covariates are kept in this test case
_unused_covars = ["gene_size", "gene_density"]
X = X.drop(columns=_unused_covars)

# gene correlation matrix aligned to X
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [167]:
# preview the predictors kept after dropping the unused covariates
X.head()

Unnamed: 0,const,LV455,gene_n_snps_used,gene_n_snps_used_sharing
NOC2L,1.0,0.005223,103,4.478261
HES4,1.0,0.0,55,6.875
ISG15,1.0,0.0,59,2.565217
AGRN,1.0,0.002763,75,5.0
TNFRSF18,1.0,0.003323,65,2.407407


In [168]:
# preview the phenotype values returned by get_data
y.head()

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
dtype: float64

In [169]:
# standardize predictors and response with the standardize_data helper (defined
# outside this view); the next cell asserts ~0 mean / ~1 std on the predictors
Xs, ys = standardize_data(X, y)

In [170]:
# Sanity-check the standardization of the non-constant predictors:
# every column must have (numerically) zero mean and unit standard deviation.
_Xs_desc = Xs[[lv_code, "gene_n_snps_used", "gene_n_snps_used_sharing"]].describe()
display(_Xs_desc)
# compare the absolute mean; a plain `< 1e-10` would also accept large negative means
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV455,gene_n_snps_used,gene_n_snps_used_sharing
count,6442.0,6442.0,6442.0
mean,-4.411939e-18,0.0,7.059102e-17
std,1.0,1.0,1.0
min,-0.07851697,-1.841199,-1.037442
25%,-0.07851697,-0.711342,-0.5804895
50%,-0.07851697,-0.046721,-0.242742
75%,-0.04700189,0.684363,0.2762964
max,41.51043,4.90471,9.929418


In [171]:
# preview the standardized predictors returned by standardize_data
Xs.head()

Unnamed: 0,const,LV455,gene_n_snps_used,gene_n_snps_used_sharing
NOC2L,1.0,-0.052328,1.548371,-0.242742
HES4,1.0,-0.078517,-0.046721,0.304856
ISG15,1.0,-0.078517,0.086204,-0.679827
AGRN,1.0,-0.064664,0.617901,-0.123537
TNFRSF18,1.0,-0.061858,0.28559,-0.715883


In [172]:
# preview the standardized response returned by standardize_data
ys.head()

NOC2L      -0.750320
HES4       -0.838108
ISG15       0.727987
AGRN        1.375061
TNFRSF18    0.251828
dtype: float64

In [173]:
# fit the GLS model through the train_statsmodels_gls helper (defined outside
# this view), using the correlation matrix aligned to X above
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [174]:
# show the summary table of the GLS fit
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2686
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.848
Time:                        14:13:20   Log-Likelihood:                -8696.2
No. Observations:                6442   AIC:                         1.740e+04
Df Residuals:                    6438   BIC:                         1.743e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [175]:
# Print the expected values for the LV term so they can be copied into the
# unit tests. The LV is the column at position 1 (right after "const").
_lv_pos = 1
_lv_tvalue = _gls_results.tvalues.to_numpy()[_lv_pos]
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[_lv_pos]}
exp_coef_se = {_gls_results.bse.to_numpy()[_lv_pos]}
exp_tvalue = {_lv_tvalue}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[_lv_pos]}
exp_pval_onesided = {stats.t.sf(_lv_tvalue, _gls_results.df_resid)}
    """
    )


exp_coef = 0.0011536250507955133
exp_coef_se = 0.011122650387146025
exp_tvalue = 0.10371853925470038
exp_pval_twosided = 0.9173959444428127
exp_pval_onesided = 0.45869797222140635
    


In [176]:
# write the (non-standardized) phenotype values y to the tests data folder
# NOTE(review): phenotype 6 is also used by the LV45 section, which writes the
# same file name; presumably the content is identical, confirm
_phenotype_path = OUTPUT_DIR / f"{phenotype_name}.pkl.xz"
y.to_pickle(_phenotype_path)

In [177]:
# build (and show) the base file name for the covariates; the file itself is
# written in a later cell
phenotype_covars_name = f"{phenotype_name_base}-snplevel_covars"
display(phenotype_covars_name)

'multixcan-random_phenotype6-snplevel_covars'

In [178]:
# Covariates to save: every column of X except the intercept and the LV itself.
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# axis=None reduces over the whole frame to a single bool; it must be passed by
# keyword because `axis` is keyword-only in pandas >= 2.0
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_n_snps_used,gene_n_snps_used_sharing
NOC2L,103,4.478261
HES4,55,6.875
ISG15,59,2.565217
AGRN,75,5.0
TNFRSF18,65,2.407407


In [179]:
# write the covariates data frame to the tests data folder
_covars_path = OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz"
y_covars.to_pickle(_covars_path)

In [180]:
# Debugging aid only: fit a plain OLS on the same data so its summary can be
# compared with the GLS one above.
_tmp_model = sm.OLS(endog=ys, exog=Xs)
_tmp_results = _tmp_model.fit()
_tmp_summary = _tmp_results.summary()
print(_tmp_summary)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2653
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.850
Time:                        14:13:20   Log-Likelihood:                -9139.9
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6438   BIC:                         1.831e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

## Random phenotype 0 / LV801 (using logarithms)

In [181]:
# parameters of this section: the LV under test and the random phenotype code
lv_code, phenotype_code = "LV801", 0

# file-name stems derived from the phenotype code
phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [185]:
# load predictors and the random phenotype for this LV, requesting both the
# covariates and their logarithm versions
X, y = get_data(
    lv_code,
    random_phenotype_code=phenotype_code,
    add_covars=True,
    add_covars_logs=True,
)

# only the SNP-level covariates (raw and log) are kept in this test case
_unused_covars = [
    "gene_size",
    "gene_density",
    "gene_size_log",
    "gene_density_log",
]
X = X.drop(columns=_unused_covars)

# gene correlation matrix aligned to X
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [186]:
# preview the predictors kept after dropping the unused covariates
X.head()

Unnamed: 0,const,LV801,gene_n_snps_used,gene_n_snps_used_sharing,gene_n_snps_used_log,gene_n_snps_used_sharing_log
NOC2L,1.0,0.063959,103,4.478261,4.634729,1.499235
HES4,1.0,0.006702,55,6.875,4.007333,1.927892
ISG15,1.0,0.0,59,2.565217,4.077537,0.942043
AGRN,1.0,0.005386,75,5.0,4.317488,1.609438
TNFRSF18,1.0,0.0,65,2.407407,4.174387,0.87855


In [187]:
# preview the phenotype values returned by get_data
y.head()

NOC2L       1.192661
HES4        0.615835
ISG15       1.388647
AGRN        0.098554
TNFRSF18    0.085000
dtype: float64

In [188]:
# standardize predictors and response with the standardize_data helper (defined
# outside this view); the next cell asserts ~0 mean / ~1 std on the predictors
Xs, ys = standardize_data(X, y)

In [189]:
# Sanity-check the standardization of the non-constant predictors (including
# the log covariates): every column must have (numerically) zero mean and unit
# standard deviation.
_Xs_desc = Xs[
    [
        lv_code,
        "gene_n_snps_used",
        "gene_n_snps_used_log",
        "gene_n_snps_used_sharing",
        "gene_n_snps_used_sharing_log",
    ]
].describe()
display(_Xs_desc)
# compare the absolute mean; a plain `< 1e-10` would also accept large negative means
assert (_Xs_desc.loc["mean"].abs() < 1e-10).all()
assert (_Xs_desc.loc["std"].between(0.9999, 1.00001)).all()

Unnamed: 0,LV801,gene_n_snps_used,gene_n_snps_used_log,gene_n_snps_used_sharing,gene_n_snps_used_sharing_log
count,6442.0,6442.0,6442.0,6442.0,6442.0
mean,1.8750740000000002e-17,0.0,-1.2132830000000001e-17,7.059102e-17,-2.19494e-16
std,1.0,1.0,1.0,1.0,1.0
min,-0.1458936,-1.841199,-4.790275,-1.037442,-2.358546
25%,-0.1458936,-0.711342,-0.3257555,-0.5804895,-0.6322927
50%,-0.1458936,-0.046721,0.241811,-0.242742,-0.002793177
75%,-0.03586973,0.684363,0.6643257,0.2762964,0.6419248
max,41.72866,4.90471,1.887791,9.929418,3.756685


In [190]:
# preview the standardized predictors returned by standardize_data
Xs.head()

Unnamed: 0,const,LV801,gene_n_snps_used,gene_n_snps_used_sharing,gene_n_snps_used_log,gene_n_snps_used_sharing_log
NOC2L,1.0,0.229344,1.548371,-0.242742,1.029644,-0.002793
HES4,1.0,-0.106576,-0.046721,0.304856,0.241811,0.670757
ISG15,1.0,-0.145894,0.086204,-0.679827,0.329968,-0.87831
AGRN,1.0,-0.114292,0.617901,-0.123537,0.631279,0.170369
TNFRSF18,1.0,-0.145894,0.28559,-0.715883,0.451584,-0.978077


In [191]:
# preview the standardized response returned by standardize_data
ys.head()

NOC2L       0.695458
HES4       -0.258870
ISG15       1.019708
AGRN       -1.114686
TNFRSF18   -1.137110
dtype: float64

In [192]:
# fit the GLS model through the train_statsmodels_gls helper (defined outside
# this view), using the correlation matrix aligned to X above
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [193]:
# show the summary table of the GLS fit
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            GLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.193
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.310
Time:                        14:16:46   Log-Likelihood:                -8678.6
No. Observations:                6442   AIC:                         1.737e+04
Df Residuals:                    6436   BIC:                         1.741e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

In [194]:
# Print the expected values for the LV term so they can be copied into the
# unit tests. The LV is the column at position 1 (right after "const").
_lv_pos = 1
_lv_tvalue = _gls_results.tvalues.to_numpy()[_lv_pos]
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[_lv_pos]}
exp_coef_se = {_gls_results.bse.to_numpy()[_lv_pos]}
exp_tvalue = {_lv_tvalue}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[_lv_pos]}
exp_pval_onesided = {stats.t.sf(_lv_tvalue, _gls_results.df_resid)}
    """
    )


exp_coef = 0.011057584219855475
exp_coef_se = 0.01127211381890071
exp_tvalue = 0.980968112769983
exp_pval_twosided = 0.32664533937794404
exp_pval_onesided = 0.16332266968897202
    


In [195]:
# write the (non-standardized) phenotype values y to the tests data folder
_phenotype_path = OUTPUT_DIR / f"{phenotype_name}.pkl.xz"
y.to_pickle(_phenotype_path)

In [196]:
# build (and show) the base file name for the covariates; the file itself is
# written in a later cell
phenotype_covars_name = f"{phenotype_name_base}-snplevel_covars"
display(phenotype_covars_name)

'multixcan-random_phenotype0-snplevel_covars'

In [197]:
# Covariates to save: every column of X except the intercept and the LV itself
# (here this includes the log versions of the SNP-level covariates).
y_covars = X[[c for c in X.columns if c not in ("const", lv_code)]]
display(y_covars.head())
# axis=None reduces over the whole frame to a single bool; it must be passed by
# keyword because `axis` is keyword-only in pandas >= 2.0
assert not y_covars.isna().any(axis=None)

Unnamed: 0,gene_n_snps_used,gene_n_snps_used_sharing,gene_n_snps_used_log,gene_n_snps_used_sharing_log
NOC2L,103,4.478261,4.634729,1.499235
HES4,55,6.875,4.007333,1.927892
ISG15,59,2.565217,4.077537,0.942043
AGRN,75,5.0,4.317488,1.609438
TNFRSF18,65,2.407407,4.174387,0.87855


In [198]:
# write the covariates data frame to the tests data folder
_covars_path = OUTPUT_DIR / f"{phenotype_covars_name}.pkl.xz"
y_covars.to_pickle(_covars_path)

In [199]:
# Debugging aid only: fit a plain OLS on the same data so its summary can be
# compared with the GLS one above.
_tmp_model = sm.OLS(endog=ys, exog=Xs)
_tmp_results = _tmp_model.fit()
_tmp_summary = _tmp_results.summary()
print(_tmp_summary)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.050
Date:                Tue, 02 Aug 2022   Prob (F-statistic):              0.386
Time:                        14:16:46   Log-Likelihood:                -9137.7
No. Observations:                6442   AIC:                         1.829e+04
Df Residuals:                    6436   BIC:                         1.833e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           