# Description

(Please take a look at the README.md file in this directory for instructions on how to run this notebook.)

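This notebook loads the full gene correlation matrix (a single big matrix with all genes across all chromosomes, previously assembled) and uses it to fit GLS models (with `statsmodels`) between MultiPLIER latent variables (LVs) and MultiXcan results on randomly generated phenotypes. The correlation matrix and the phenotypes used are saved under `tests/data/gls`.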

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import numpy as np
from scipy.spatial.distance import squareform
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import conf
import utils
from entity import Gene

# Settings

In [3]:
# reference panel
REFERENCE_PANEL = "GTEX_V8"
# REFERENCE_PANEL = "1000G"

# prediction models
## mashr
EQTL_MODEL = "MASHR"
EQTL_MODEL_FILES_PREFIX = "mashr_"

# ## elastic net
# EQTL_MODEL = "ELASTIC_NET"
# EQTL_MODEL_FILES_PREFIX = "en_"

# make it read the prefix from conf.py
EQTL_MODEL_FILES_PREFIX = None

In [4]:
# Parameters
REFERENCE_PANEL = "1000G"
EQTL_MODEL = "MASHR"

In [5]:
if EQTL_MODEL_FILES_PREFIX is None:
    EQTL_MODEL_FILES_PREFIX = conf.PHENOMEXCAN["PREDICTION_MODELS"][
        f"{EQTL_MODEL}_PREFIX"
    ]

In [6]:
display(f"Using eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'Using eQTL model: MASHR / mashr_'

In [7]:
REFERENCE_PANEL_DIR = conf.PHENOMEXCAN["LD_BLOCKS"][f"{REFERENCE_PANEL}_GENOTYPE_DIR"]

In [8]:
display(f"Using reference panel folder: {str(REFERENCE_PANEL_DIR)}")

'Using reference panel folder: /opt/data/data/phenomexcan/ld_blocks/reference_panel_1000G'

In [9]:
# Folder holding the gene correlations for the selected reference panel and
# eQTL model (the key is a plain string: no f-string formatting is needed).
OUTPUT_DIR_BASE = (
    conf.PHENOMEXCAN["LD_BLOCKS"]["GENE_CORRS_DIR"]
    / REFERENCE_PANEL.lower()
    / EQTL_MODEL.lower()
)
display(OUTPUT_DIR_BASE)
# Create the folder (and any missing parents) if it does not exist yet.
OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/gene_corrs/1000g/mashr')

In [10]:
display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/data/phenomexcan/ld_blocks/gene_corrs/1000g/mashr'

In [11]:
OUTPUT_DIR = utils.get_git_repository_path() / "tests" / "data" / "gls"
display(OUTPUT_DIR)
assert OUTPUT_DIR.exists()

PosixPath('/opt/code/tests/data/gls')

# Load data

## MultiPLIER Z

In [12]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [13]:
multiplier_z_genes = multiplier_z.index.tolist()

In [14]:
len(multiplier_z_genes)

6750

In [15]:
multiplier_z_genes[:10]

['GAS6',
 'MMP14',
 'DSP',
 'MARCKSL1',
 'SPARC',
 'CTSD',
 'EPAS1',
 'PALLD',
 'PHC2',
 'LGALS3BP']

## Function to load MultiXcan's results on random phenotypes

In [16]:
def load_multixcan_random_phenotype(phenotype_code):
    """
    Read the MultiXcan results computed on one random phenotype.

    Args:
        phenotype_code: code of the random phenotype; it is interpolated into
            the results file name (`random.pheno{phenotype_code}-...`).

    Returns:
        A data frame read from the tab-separated results file, indexed by the
        `gene_name` column.
    """
    # NOTE(review): the file name hard-codes "gtex_v8" and "mashr" regardless of
    # the REFERENCE_PANEL / EQTL_MODEL settings of this notebook -- confirm this
    # is intended.
    results_file = (
        conf.RESULTS["GLS_NULL_SIMS"]
        / "twas"
        / "smultixcan"
        / f"random.pheno{phenotype_code}-gtex_v8-mashr-smultixcan.txt"
    )

    return pd.read_csv(results_file, sep="\t", index_col="gene_name")

## MultiXcan real results (PhenomeXcan)

In [17]:
multixcan_real_results = pd.read_pickle(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
).rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [18]:
multixcan_real_results = multixcan_real_results[
    ~multixcan_real_results.index.duplicated(keep="first")
].dropna(how="all", axis=0)

In [19]:
multixcan_real_results.shape

(22508, 3752)

In [20]:
multixcan_real_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


In [21]:
# Make sure there are no missing values left anywhere in the matrix.
# `axis=None` reduces over both axes to a single boolean; it is passed by
# keyword because newer pandas versions only accept it as a keyword argument.
assert not multixcan_real_results.isna().any(axis=None)

## Load full correlation matrix

In [22]:
output_file_name_template = conf.PHENOMEXCAN["LD_BLOCKS"][
    "GENE_CORRS_FILE_NAME_TEMPLATES"
]["GENE_CORR_AVG"]

output_file = OUTPUT_DIR_BASE / "multiplier_genes-gene_correlations-gene_symbols.pkl"

# # FIXME testing remove
# output_file = (
#     OUTPUT_DIR_BASE
#     / "_gene_corrs_using_use_within_distance_False"
#     / "multiplier_genes-gene_correlations-gene_symbols-fixed.pkl"
# )

display(output_file)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/gene_corrs/1000g/mashr/multiplier_genes-gene_correlations-gene_symbols.pkl')

In [23]:
full_corr_matrix_gene_symbols = pd.read_pickle(output_file)

In [24]:
full_corr_matrix_gene_symbols.shape

(6452, 6452)

In [25]:
full_corr_matrix_gene_symbols.head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.115011,0.173138,0.052445,0.008032,0.008727,0.006797,0.004533,0.00735,0.010391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.115011,1.0,0.681368,0.347023,0.011545,0.010729,0.003577,0.01023,0.010747,0.008769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.173138,0.681368,1.0,0.351822,0.011774,0.012527,0.003754,0.012096,0.012679,0.010442,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.052445,0.347023,0.351822,1.0,0.014103,0.013988,0.006056,0.006296,0.005363,0.0083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.008032,0.011545,0.011774,0.014103,1.0,0.356676,0.45401,0.137643,0.20034,0.09321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Make matrix compatible with GLS

In [26]:
# Count/show negative eigenvalues (a valid correlation matrix has none).
# The matrix is expected to be symmetric, so the symmetric/Hermitian solver is
# used: it always returns real eigenvalues (the general solver may return
# complex values with round-off imaginary parts, for which `< 0` is unreliable).
_eigvals = np.linalg.eigvalsh(full_corr_matrix_gene_symbols.to_numpy())
_negative_eigvals = _eigvals[_eigvals < 0]
display(_negative_eigvals.shape[0])
display(_negative_eigvals)

0

array([], dtype=float64)

In [27]:
# The Cholesky decomposition only succeeds for positive definite matrices, so
# it is used here as a check of whether the matrix needs to be fixed.
# Only the decomposition failure is caught; any other error is a real problem
# and is allowed to propagate.
try:
    np.linalg.cholesky(full_corr_matrix_gene_symbols)
    print("No need to fix")
except np.linalg.LinAlgError as e:
    print(f"Failed with:\n {str(e)}")

No need to fix


In [28]:
orig_corr_mat = full_corr_matrix_gene_symbols

In [29]:
orig_corr_mat.to_pickle(OUTPUT_DIR / "corr_mat.pkl.xz")

In [30]:
# %load_ext rpy2.ipython

In [31]:
# corr_mat_r = full_corr_matrix_gene_symbols.to_numpy()

In [32]:
# %Rpush corr_mat_r

In [33]:
# %%R
# # taken from https://www.r-bloggers.com/2013/08/correcting-a-pseudo-correlation-matrix-to-be-positive-semidefinite/
# CorrectCM <- function(CM, p = 0) {
#   n <- dim(CM)[1L]
#   E <- eigen(CM)
#   CM1 <- E$vectors %*% tcrossprod(diag(pmax(E$values, p), n), E$vectors)
#   Balance <- diag(1 / sqrt(diag(CM1)))
#   CM2 <- Balance %*% CM1 %*% Balance
#   return(CM2)
# }

In [34]:
# %%R -o corr_mat_r_fixed
# corr_mat_r_fixed <- CorrectCM(corr_mat_r, 1e-5)

In [35]:
# corr_mat_r_fixed

In [36]:
# corr_mat_r_fixed = pd.DataFrame(
#     corr_mat_r_fixed,
#     index=full_corr_matrix_gene_symbols.index.tolist(),
#     columns=full_corr_matrix_gene_symbols.columns.tolist()
# )

In [37]:
# corr_mat_r_fixed.shape

In [38]:
# corr_mat_r_fixed.head()

In [39]:
# corr_mat_r_fixed.equals(full_corr_matrix_gene_symbols)

In [40]:
# del full_corr_matrix_gene_symbols

In [41]:
# orig_corr_mat = corr_mat_r_fixed

In [42]:
# orig_corr_mat.to_pickle(OUTPUT_DIR / "corr_mat_fixed.pkl.xz")

# Functions

In [43]:
import statsmodels.api as sm
from sklearn.preprocessing import scale

In [44]:
def get_data(lv_code, random_phenotype_code=None, real_phenotype_code=None):
    """
    Build the predictor matrix (X) and the response (y) for a GLS/OLS fit.

    Args:
        lv_code: column of `multiplier_z` (for instance "LV45") used as the
            predictor.
        random_phenotype_code: code of a random phenotype. If given, its
            MultiXcan p-values are loaded and converted to absolute z-scores.
            It takes precedence over `real_phenotype_code` if both are given.
        real_phenotype_code: column of `multixcan_real_results` to use as the
            response; only used when `random_phenotype_code` is None.

    Returns:
        A tuple (X, y): X is a data frame with a constant column plus the LV
        column, and y is a series; both are restricted to the genes present in
        the correlation matrix, the phenotype and the LV.

    Raises:
        ValueError: if neither `random_phenotype_code` nor
            `real_phenotype_code` is provided.
    """
    if random_phenotype_code is None and real_phenotype_code is None:
        raise ValueError(
            "Either random_phenotype_code or real_phenotype_code must be provided"
        )

    # "is not None" is needed here: 0 is a valid random phenotype code
    if random_phenotype_code is not None:
        target_data = load_multixcan_random_phenotype(random_phenotype_code)["pvalue"]
        # convert the (two-sided) p-values to absolute z-scores
        y = pd.Series(
            data=np.abs(stats.norm.ppf(target_data.to_numpy() / 2)),
            index=target_data.index.copy(),
        )
    else:
        y = multixcan_real_results[real_phenotype_code]

    # keep one entry per gene and remove genes with no value
    y = y[~y.index.duplicated(keep="first")]
    y = y.dropna()

    X = multiplier_z[lv_code].copy()

    # only genes present in the correlation matrix, the phenotype and the LV
    common_genes = orig_corr_mat.index.intersection(y.index).intersection(X.index)
    y = y.loc[common_genes]

    X = X.loc[common_genes]
    X = sm.add_constant(X)

    return X, y

In [45]:
def standardize_data(X, y):
    """
    Standardize the predictors and the response (zero mean, unit std).

    The "const" column of X, if present, is left untouched. The inputs are not
    modified: copies are standardized and returned.

    Args:
        X: data frame with the predictors.
        y: series with the response.

    Returns:
        A tuple (X, y) with the standardized copies.
    """
    X_std = X.copy()
    y_std = y.copy()

    # all columns but the intercept
    predictors = [col for col in X_std.columns if col != "const"]
    features = X_std[predictors]
    X_std[predictors] = (features - features.mean()) / features.std()

    y_std = (y_std - y_std.mean()) / y_std.std()

    return X_std, y_std

In [46]:
def get_aligned_corr_mat(X, perc=1.0):
    """
    Return the gene correlation matrix aligned to the genes (rows) of X.

    Args:
        X: data frame indexed by gene; its second column (position 1) holds
            the LV weights used to select genes.
        perc: if None, the correlation matrix is not subset: the submatrix of
            `orig_corr_mat` for the genes in X is returned as is. Otherwise,
            correlations are kept only among the genes whose weight is greater
            than or equal to the (1 - perc) quantile of the non-zero (positive)
            weights; every other entry is taken from the identity matrix.
            perc == 1.0 selects all genes with non-zero weight.

    Returns:
        A square data frame with the genes of X in rows and columns.
    """
    aligned_corrs = orig_corr_mat.loc[X.index, X.index]

    if perc is None:
        return aligned_corrs

    # weights of the LV (the first column is the constant)
    lv_weights = X.iloc[:, 1]

    # the threshold is computed on the non-zero weights only
    weight_threshold = lv_weights[lv_weights > 0].quantile(1 - perc)
    selected_genes = lv_weights[lv_weights >= weight_threshold].index.intersection(
        aligned_corrs.index
    )

    # start from the identity and copy the correlations among selected genes
    masked_corrs = pd.DataFrame(
        np.identity(aligned_corrs.shape[0]),
        index=aligned_corrs.index.copy(),
        columns=aligned_corrs.columns.copy(),
    )
    masked_corrs.loc[selected_genes, selected_genes] = aligned_corrs.loc[
        selected_genes, selected_genes
    ]

    return masked_corrs

In [47]:
# testing: sanity checks for get_aligned_corr_mat on three known genes
_X_test = pd.DataFrame(
    {
        "const": 1.0,
        "LV1": [1.0, 0.4, 0.0],  # the last gene has zero weight
    },
    index=[
        # the first two genes have a high sum of correlations, to make sure
        # the sum is not close to 1.0
        "PSMB10",
        "SLC12A4",
        "ACD",
    ],
)

# pairs of (perc, expected correlation matrix rounded to two decimals)
_test_cases = [
    # do not subset
    (
        None,
        [
            [1.0, 0.77, 0.73],
            [0.77, 1.0, 0.63],
            [0.73, 0.63, 1.00],
        ],
    ),
    # do subset: include all non-zero LV genes
    (
        1.0,
        [
            [1.0, 0.77, 0.00],
            [0.77, 1.0, 0.00],
            [0.00, 0.00, 1.00],
        ],
    ),
    # do subset: include non-zero LV genes with weight >= the 1% quantile of
    # the non-zero weights (0.406 here), so only the first gene is selected
    (
        0.99,
        [
            [1.0, 0.00, 0.00],
            [0.00, 1.0, 0.00],
            [0.00, 0.00, 1.00],
        ],
    ),
]

for _perc, _expected in _test_cases:
    _tmp_corr = get_aligned_corr_mat(_X_test, perc=_perc)
    assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
    assert np.array_equal(_tmp_corr.round(2).to_numpy(), np.array(_expected))

In [48]:
def train_statsmodels_gls(X, y, corr_mat):
    """
    Fit a statsmodels GLS model of y on X and return the fitted results.

    Args:
        X: predictors (exogenous variables).
        y: response (endogenous variable).
        corr_mat: matrix passed to GLS as `sigma`.

    Returns:
        The results object returned by the model's `fit()` method.
    """
    return sm.GLS(y, X, sigma=corr_mat).fit()

# [full corr matrix] GLS on randomly generated phenotypes

In [49]:
PERC_NONZERO_GENES = None

## Random phenotype 6 / LV45

In [50]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [51]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [52]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                   0.09643
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.756
Time:                        20:18:41   Log-Likelihood:                -10482.
No. Observations:                6442   AIC:                         2.097e+04
Df Residuals:                    6440   BIC:                         2.098e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.1039      0.002   -561.718      0.0

In [53]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.429
Date:                Mon, 18 Jul 2022   Prob (F-statistic):             0.0641
Time:                        20:18:48   Log-Likelihood:                -5683.6
No. Observations:                6442   AIC:                         1.137e+04
Df Residuals:                    6440   BIC:                         1.138e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7650      0.007    104.807      0.0

In [54]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.003810795472142111
0.012271545160174927
0.310539171913685
0.7561610253800751
0.37808051269003756


In [55]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45
HIST1H2BO,1.0,8.480948
HIST1H2BF,1.0,8.426226
HIST1H2BK,1.0,8.245903
HIST1H2BD,1.0,8.119013
HIST1H2BC,1.0,7.744137
...,...,...
TREM1,1.0,0.000000
TREML2,1.0,0.000000
TREM2,1.0,0.000000
NFYA,1.0,0.000000


In [56]:
Xs.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45
HIST1H2BO,1.0,27.865226
HIST1H2BF,1.0,27.685041
HIST1H2BK,1.0,27.091293
HIST1H2BD,1.0,26.673482
HIST1H2BC,1.0,25.439130
...,...,...
TREM1,1.0,-0.059957
TREML2,1.0,-0.059957
TREM2,1.0,-0.059957
NFYA,1.0,-0.059957


In [57]:
y.sort_values(ascending=False)

CHPF2     4.039680
PRR5      3.726033
MMP12     3.686147
RBM38     3.581041
SOS1      3.528183
            ...   
GPX3      0.000826
SUOX      0.000686
SPRED2    0.000364
DEGS1     0.000152
SAFB      0.000072
Length: 6442, dtype: float64

In [58]:
ys.sort_values(ascending=False)

CHPF2     5.600117
PRR5      5.063874
MMP12     4.995681
RBM38     4.815982
SOS1      4.725610
            ...   
GPX3     -1.305113
SUOX     -1.305353
SPRED2   -1.305903
DEGS1    -1.306266
SAFB     -1.306403
Length: 6442, dtype: float64

In [59]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 6 / LV455

In [68]:
lv_code = "LV455"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [69]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [70]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2301
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.631
Time:                        20:21:53   Log-Likelihood:                -10481.
No. Observations:                6442   AIC:                         2.097e+04
Df Residuals:                    6440   BIC:                         2.098e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.1041      0.002   -603.202      0.0

In [71]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2956
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.587
Time:                        20:21:53   Log-Likelihood:                -5685.2
No. Observations:                6442   AIC:                         1.137e+04
Df Residuals:                    6440   BIC:                         1.139e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7639      0.007    104.494      0.0

In [72]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

8.650711873537164e-05
0.00018035107257825515
0.47965957451035285
0.6314857662460573
0.3157428831230287


In [73]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV455
CACNA1A,1.0,8.294351
ZNF26,1.0,7.956442
ARHGAP42,1.0,5.592084
UBE2B,1.0,5.379685
GAB2,1.0,3.946462
...,...,...
ZNF655,1.0,0.000000
ZKSCAN5,1.0,0.000000
ZNF394,1.0,0.000000
ATP5J2,1.0,0.000000


In [74]:
y.sort_values(ascending=False)

CHPF2     4.039680
PRR5      3.726033
MMP12     3.686147
RBM38     3.581041
SOS1      3.528183
            ...   
GPX3      0.000826
SUOX      0.000686
SPRED2    0.000364
DEGS1     0.000152
SAFB      0.000072
Length: 6442, dtype: float64

In [75]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 0 / LV800

In [76]:
lv_code = "LV800"
phenotype_code = 0

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [77]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [78]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            GLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     7.383
Date:                Mon, 18 Jul 2022   Prob (F-statistic):            0.00660
Time:                        20:25:33   Log-Likelihood:                -11319.
No. Observations:                6442   AIC:                         2.264e+04
Df Residuals:                    6440   BIC:                         2.266e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.4327      0.006   -248.297      0.0

In [79]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4172
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.518
Time:                        20:25:33   Log-Likelihood:                -5887.9
No. Observations:                6442   AIC:                         1.178e+04
Df Residuals:                    6440   BIC:                         1.179e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7706      0.008     98.662      0.0

In [80]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

-0.0008674165392909389
0.0003192417956200798
-2.717114585845851
0.0066029845360324755
0.9966985077319838


In [81]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV800
ZNF20,1.0,1.9807
ZNF606,1.0,1.931659
ZNF551,1.0,1.832833
ZNF543,1.0,1.701683
ZNF14,1.0,1.640789


In [82]:
y.sort_values(ascending=False).head()

GPATCH1    3.951082
NFKBIA     3.900707
TTC5       3.870256
ZNF17      3.807992
ZNF563     3.747230
dtype: float64

In [83]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

# [sub corr matrix] GLS on randomly generated phenotypes

In [144]:
PERC_NONZERO_GENES = 1.00

## Random phenotype 6 / LV45

In [145]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [146]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [147]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.249
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.264
Time:                        20:47:39   Log-Likelihood:                -9137.4
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.829e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0002      0.013      0.017      0.9

In [148]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.429
Date:                Mon, 18 Jul 2022   Prob (F-statistic):             0.0641
Time:                        20:47:39   Log-Likelihood:                -5683.6
No. Observations:                6442   AIC:                         1.137e+04
Df Residuals:                    6440   BIC:                         1.138e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7650      0.007    104.807      0.0

In [149]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

-0.012985100862501646
0.011620815913625014
-1.117400099873973
0.2638649762970155
0.8680675118514922


In [150]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV45
HIST1H2BO,1.0,8.480948
HIST1H2BF,1.0,8.426226
HIST1H2BK,1.0,8.245903
HIST1H2BD,1.0,8.119013
HIST1H2BC,1.0,7.744137


In [151]:
y.sort_values(ascending=False).head()

CHPF2    4.039680
PRR5     3.726033
MMP12    3.686147
RBM38    3.581041
SOS1     3.528183
dtype: float64

In [152]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 6 / LV455

In [153]:
lv_code = "LV455"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [154]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [155]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2274
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.633
Time:                        20:48:27   Log-Likelihood:                -9085.7
No. Observations:                6442   AIC:                         1.818e+04
Df Residuals:                    6440   BIC:                         1.819e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0007      0.013      0.050      0.9

In [156]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2956
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.587
Time:                        20:48:27   Log-Likelihood:                -5685.2
No. Observations:                6442   AIC:                         1.137e+04
Df Residuals:                    6440   BIC:                         1.139e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7639      0.007    104.494      0.0

In [157]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(_gls_results.params.to_numpy()[1])
    print(_gls_results.bse.to_numpy()[1])
    print(_gls_results.tvalues.to_numpy()[1])
    print(_gls_results.pvalues.to_numpy()[1])
    print(stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid))

0.005579886461567623
0.011701985846626066
0.4768324397842631
0.6334976228046719
0.31674881140233596


In [158]:
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV455
CACNA1A,1.0,8.294351
ZNF26,1.0,7.956442
ARHGAP42,1.0,5.592084
UBE2B,1.0,5.379685
GAB2,1.0,3.946462


In [159]:
y.sort_values(ascending=False).head()

CHPF2    4.039680
PRR5     3.726033
MMP12    3.686147
RBM38    3.581041
SOS1     3.528183
dtype: float64

In [160]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 10 / LV100

In [112]:
lv_code = "LV100"
phenotype_code = 10

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype10-pvalues'

In [113]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [114]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                 4.829e-06
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.998
Time:                        20:32:45   Log-Likelihood:                -8974.5
No. Observations:                6442   AIC:                         1.795e+04
Df Residuals:                    6440   BIC:                         1.797e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0092      0.014      0.676      0.4

In [115]:
# for debugging purposes I print the OLS results also
# Note: this OLS model is fitted on the raw X and y, whereas the GLS model above
# was fitted on Xs and ys, so the two coefficient tables are on different inputs.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5978
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.439
Time:                        20:32:46   Log-Likelihood:                -5800.3
No. Observations:                6442   AIC:                         1.160e+04
Df Residuals:                    6440   BIC:                         1.162e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7726      0.008    102.837      0.0

In [116]:
# Print, one per line, the GLS statistics of the second model term (index 1;
# index 0 is `const`): coefficient, standard error, t-value, two-sided p-value
# and the one-sided (upper-tail) p-value derived from the t-value.
# NOTE(review): the printed values look like NumPy scalars, which may not be
# affected by these print options — confirm the context manager is needed.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        _gls_results.params.to_numpy()[1],
        _gls_results.bse.to_numpy()[1],
        _gls_results.tvalues.to_numpy()[1],
        _gls_results.pvalues.to_numpy()[1],
        stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid),
        sep="\n",
    )

6.008877267380782e-08
2.7345042158271415e-05
0.0021974284159453007
0.9982467752659725
0.49912338763298625


In [117]:
# Five rows of X with the largest values in the `lv_code` column (largest first).
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV100
STX4,1.0,6.438276
RAB11B,1.0,6.34148
MED11,1.0,5.910379
NDUFB7,1.0,4.358643
MRPL34,1.0,3.903225


In [118]:
# Five largest values of y (largest first), as a quick look at the phenotype.
y.sort_values(ascending=False).head()

RPL15     3.852128
VAMP4     3.543077
HMGCS1    3.404068
MED9      3.360888
ABCB10    3.335043
dtype: float64

In [119]:
# save phenotype
# The series y is written under OUTPUT_DIR as "<phenotype_name>.pkl.xz".
# NOTE(review): relies on pandas inferring xz compression from the suffix — confirm.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 0 / LV800

In [120]:
# Parameters for this section: `lv_code` is the column of X that is sorted and
# inspected below; `phenotype_code` selects the random phenotype in get_data.
lv_code = "LV800"
phenotype_code = 0

# Base name of the pickle file written by the "save phenotype" cell of this section.
phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [121]:
# Load X (columns `const` and the LV) and y for this LV / random phenotype.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
# NOTE(review): presumably the gene correlation matrix restricted to the rows of
# X; what `perc` selects is defined in get_aligned_corr_mat — confirm there.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

# The GLS model is fitted on the outputs of standardize_data (Xs, ys), not on
# the raw X and y.
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [122]:
# Summary table of the GLS fit stored in _gls_results.
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1969
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.657
Time:                        20:33:21   Log-Likelihood:                -9062.0
No. Observations:                6442   AIC:                         1.813e+04
Df Residuals:                    6440   BIC:                         1.814e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0013      0.013      0.102      0.9

In [123]:
# for debugging purposes I print the OLS results also
# Note: this OLS model is fitted on the raw X and y, whereas the GLS model above
# was fitted on Xs and ys, so the two coefficient tables are on different inputs.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4172
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.518
Time:                        20:33:21   Log-Likelihood:                -5887.9
No. Observations:                6442   AIC:                         1.178e+04
Df Residuals:                    6440   BIC:                         1.179e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7706      0.008     98.662      0.0

In [124]:
# Print, one per line, the GLS statistics of the second model term (index 1;
# index 0 is `const`): coefficient, standard error, t-value, two-sided p-value
# and the one-sided (upper-tail) p-value derived from the t-value.
# NOTE(review): the printed values look like NumPy scalars, which may not be
# affected by these print options — confirm the context manager is needed.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        _gls_results.params.to_numpy()[1],
        _gls_results.bse.to_numpy()[1],
        _gls_results.tvalues.to_numpy()[1],
        _gls_results.pvalues.to_numpy()[1],
        stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid),
        sep="\n",
    )

0.006103958765364128
0.013754152651896586
0.4437902442883271
0.6572091521023612
0.3286045760511806


In [125]:
# Five rows of X with the largest values in the `lv_code` column (largest first).
X.sort_values(lv_code, ascending=False).head()

Unnamed: 0,const,LV800
ZNF20,1.0,1.9807
ZNF606,1.0,1.931659
ZNF551,1.0,1.832833
ZNF543,1.0,1.701683
ZNF14,1.0,1.640789


In [126]:
# Five largest values of y (largest first), as a quick look at the phenotype.
y.sort_values(ascending=False).head()

GPATCH1    3.951082
NFKBIA     3.900707
TTC5       3.870256
ZNF17      3.807992
ZNF563     3.747230
dtype: float64

In [127]:
# save phenotype
# The series y is written under OUTPUT_DIR as "<phenotype_name>.pkl.xz".
# NOTE(review): relies on pandas inferring xz compression from the suffix — confirm.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

# GLS on real phenotypes

In [128]:
# Value passed as `perc` to get_aligned_corr_mat in the real-phenotype cells below.
# NOTE(review): this reassigns a setting that the random-phenotype cells above
# already read; re-running those cells after this one would use this value.
PERC_NONZERO_GENES = 1.00

In [129]:
# List the available column labels; the real phenotypes analysed below
# ("whooping cough", "wheezing") appear among them.
multixcan_real_results.columns

Index(['100001_raw-Food_weight', '100002_raw-Energy', '100003_raw-Protein',
       '100004_raw-Fat', '100005_raw-Carbohydrate', '100006_raw-Saturated_fat',
       '100007_raw-Polyunsaturated_fat', '100008_raw-Total_sugars',
       '100009_raw-Englyst_dietary_fibre', '100010-Portion_size',
       ...
       'visual impairment', 'vitiligo', 'vitreous body disease',
       'vocal cord polyp', 'voice disorders',
       'wellbeing measurement AND family relationship', 'wheezing',
       'whooping cough', 'worry measurement', 'wrist fracture'],
      dtype='object', length=3752)

## whooping cough / LV570

In [130]:
# Parameters for this section: `lv_code` is the LV passed to get_data;
# `phenotype_code` is the real phenotype label passed as `real_phenotype_code`.
lv_code = "LV570"
phenotype_code = "whooping cough"

# Base name of the pickle file written by the "save phenotype" cell of this
# section; spaces in the phenotype label are replaced with underscores.
phenotype_name = f"multixcan-phenomexcan-{phenotype_code.replace(' ', '_')}-pvalues"
display(phenotype_name)

'multixcan-phenomexcan-whooping_cough-pvalues'

In [131]:
# Load X (columns `const` and the LV) and y for this LV / real phenotype.
X, y = get_data(lv_code, real_phenotype_code=phenotype_code)
# NOTE(review): presumably the gene correlation matrix restricted to the rows of
# X; what `perc` selects is defined in get_aligned_corr_mat — confirm there.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

# The GLS model is fitted on the outputs of standardize_data (Xs, ys), not on
# the raw X and y.
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [132]:
# Summary table of the GLS fit stored in _gls_results.
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:         whooping cough   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9445
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.331
Time:                        20:34:38   Log-Likelihood:                -9095.1
No. Observations:                6450   AIC:                         1.819e+04
Df Residuals:                    6448   BIC:                         1.821e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0036      0.013     -0.273      0.7

In [133]:
# for debugging purposes I print the OLS results also
# Note: this OLS model is fitted on the raw X and y, whereas the GLS model above
# was fitted on Xs and ys, so the two coefficient tables are on different inputs.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:         whooping cough   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.685
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.194
Time:                        20:34:42   Log-Likelihood:                -5888.1
No. Observations:                6450   AIC:                         1.178e+04
Df Residuals:                    6448   BIC:                         1.179e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7911      0.008    103.763      0.0

In [134]:
# Print, one per line, the GLS statistics of the second model term (index 1;
# index 0 is `const`): coefficient, standard error, t-value, two-sided p-value
# and the one-sided (upper-tail) p-value derived from the t-value.
# NOTE(review): the printed values look like NumPy scalars, which may not be
# affected by these print options — confirm the context manager is needed.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        _gls_results.params.to_numpy()[1],
        _gls_results.bse.to_numpy()[1],
        _gls_results.tvalues.to_numpy()[1],
        _gls_results.pvalues.to_numpy()[1],
        stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid),
        sep="\n",
    )

0.010052199503207914
0.010343425212644251
0.971844364565006
0.3311644354099884
0.1655822177049942


In [135]:
# save phenotype
# The series y is written under OUTPUT_DIR as "<phenotype_name>.pkl.xz".
# NOTE(review): relies on pandas inferring xz compression from the suffix — confirm.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [136]:
# Display the phenotype series that was just saved (pandas truncates the repr).
y

NOC2L       1.301498
HES4        0.491604
ISG15       0.300490
AGRN        1.595415
TNFRSF18    0.362178
              ...   
CPT1B       0.283934
CHKB        0.621814
MAPK8IP2    0.479153
ARSA        0.274866
SHANK3      1.125992
Name: whooping cough, Length: 6450, dtype: float64

## wheezing / LV400

In [137]:
# Parameters for this section: `lv_code` is the LV passed to get_data;
# `phenotype_code` is the real phenotype label passed as `real_phenotype_code`.
lv_code = "LV400"
phenotype_code = "wheezing"

# Base name of the pickle file written by the "save phenotype" cell of this
# section; spaces in the phenotype label are replaced with underscores.
phenotype_name = f"multixcan-phenomexcan-{phenotype_code.replace(' ', '_')}-pvalues"
display(phenotype_name)

'multixcan-phenomexcan-wheezing-pvalues'

In [138]:
# Load X (columns `const` and the LV) and y for this LV / real phenotype.
X, y = get_data(lv_code, real_phenotype_code=phenotype_code)
# NOTE(review): presumably the gene correlation matrix restricted to the rows of
# X; what `perc` selects is defined in get_aligned_corr_mat — confirm there.
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

# The GLS model is fitted on the outputs of standardize_data (Xs, ys), not on
# the raw X and y.
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [139]:
# Summary table of the GLS fit stored in _gls_results.
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:               wheezing   R-squared:                       0.010
Model:                            GLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     62.81
Date:                Mon, 18 Jul 2022   Prob (F-statistic):           2.67e-15
Time:                        20:35:17   Log-Likelihood:                -9568.4
No. Observations:                6450   AIC:                         1.914e+04
Df Residuals:                    6448   BIC:                         1.915e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0234      0.014     -1.625      0.1

In [140]:
# for debugging purposes I print the OLS results also
# Note: this OLS model is fitted on the raw X and y, whereas the GLS model above
# was fitted on Xs and ys, so the two coefficient tables are on different inputs.
_tmp_model = sm.OLS(y, X)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:               wheezing   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.672
Date:                Mon, 18 Jul 2022   Prob (F-statistic):              0.196
Time:                        20:35:17   Log-Likelihood:                -8849.6
No. Observations:                6450   AIC:                         1.770e+04
Df Residuals:                    6448   BIC:                         1.772e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.1786      0.012     96.100      0.0

In [141]:
# Print, one per line, the GLS statistics of the second model term (index 1;
# index 0 is `const`): coefficient, standard error, t-value, two-sided p-value
# and the one-sided (upper-tail) p-value derived from the t-value.
# NOTE(review): the printed values look like NumPy scalars, which may not be
# affected by these print options — confirm the context manager is needed.
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        _gls_results.params.to_numpy()[1],
        _gls_results.bse.to_numpy()[1],
        _gls_results.tvalues.to_numpy()[1],
        _gls_results.pvalues.to_numpy()[1],
        stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid),
        sep="\n",
    )

-0.09501682279807443
0.01198940412636518
-7.925066316609398
2.6672597928016686e-15
0.9999999999999987


In [142]:
# save phenotype
# The series y is written under OUTPUT_DIR as "<phenotype_name>.pkl.xz".
# NOTE(review): relies on pandas inferring xz compression from the suffix — confirm.
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [143]:
# Display the phenotype series that was just saved (pandas truncates the repr).
y

NOC2L       2.745848
HES4        0.208291
ISG15       0.136734
AGRN        0.229924
TNFRSF18    0.408283
              ...   
CPT1B       0.483466
CHKB        0.264246
MAPK8IP2    0.709193
ARSA        0.786883
SHANK3      0.404021
Name: wheezing, Length: 6450, dtype: float64