# Description

(Please, take a look at the README.md file in this directory for instructions on how to run this notebook)

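This notebook loads the full gene correlation matrix (a single matrix with all genes, presumably assembled beforehand from the gene correlations across all chromosomes) and uses it to fit GLS models with statsmodels on MultiXcan results from random and real phenotypes. The correlation matrix and the phenotypes are saved under `tests/data/gls`, and the printed GLS statistics can be used as reference values.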

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import numpy as np
from scipy.spatial.distance import squareform
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

import conf
import utils
from entity import Gene

# Settings

In [3]:
# Default settings. NOTE(review): the "# Parameters" cell that follows
# reassigns REFERENCE_PANEL and EQTL_MODEL, so the values set here only
# apply when that cell is absent.

# reference panel
REFERENCE_PANEL = "GTEX_V8"
# REFERENCE_PANEL = "1000G"

# prediction models
## mashr
EQTL_MODEL = "MASHR"
EQTL_MODEL_FILES_PREFIX = "mashr_"

# ## elastic net
# EQTL_MODEL = "ELASTIC_NET"
# EQTL_MODEL_FILES_PREFIX = "en_"

# Setting the prefix to None discards the value assigned above, so that it is
# looked up in conf.py from EQTL_MODEL (see the `is None` check further below).
EQTL_MODEL_FILES_PREFIX = None

In [4]:
# Parameters
REFERENCE_PANEL = "1000G"
EQTL_MODEL = "MASHR"

In [5]:
# No explicit prefix given: use the one configured in conf.py for the
# selected eQTL model (key "<EQTL_MODEL>_PREFIX").
if EQTL_MODEL_FILES_PREFIX is None:
    EQTL_MODEL_FILES_PREFIX = conf.PHENOMEXCAN["PREDICTION_MODELS"][
        EQTL_MODEL + "_PREFIX"
    ]

In [6]:
display(f"Using eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'Using eQTL model: MASHR / mashr_'

In [7]:
REFERENCE_PANEL_DIR = conf.PHENOMEXCAN["LD_BLOCKS"][f"{REFERENCE_PANEL}_GENOTYPE_DIR"]

In [8]:
display(f"Using reference panel folder: {str(REFERENCE_PANEL_DIR)}")

'Using reference panel folder: /opt/data/data/phenomexcan/ld_blocks/reference_panel_1000G'

In [9]:
# Base folder with the gene correlation files for the selected reference
# panel / eQTL model combination; it is created if it does not exist.
OUTPUT_DIR_BASE = (
    conf.PHENOMEXCAN["LD_BLOCKS"]["GENE_CORRS_DIR"]
    / REFERENCE_PANEL.lower()
    / EQTL_MODEL.lower()
)
display(OUTPUT_DIR_BASE)
OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/gene_corrs/1000g/mashr')

In [10]:
display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/data/phenomexcan/ld_blocks/gene_corrs/1000g/mashr'

In [11]:
# Folder where the files generated by this notebook are written
# ("tests/data/gls" under the path returned by utils.get_git_repository_path(),
# presumably the repository root). It must already exist.
OUTPUT_DIR = utils.get_git_repository_path() / "tests" / "data" / "gls"
assert OUTPUT_DIR.exists()

# Load data

## MultiPLIER Z

In [12]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [13]:
multiplier_z_genes = multiplier_z.index.tolist()

In [14]:
len(multiplier_z_genes)

6750

In [15]:
multiplier_z_genes[:10]

['GAS6',
 'MMP14',
 'DSP',
 'MARCKSL1',
 'SPARC',
 'CTSD',
 'EPAS1',
 'PALLD',
 'PHC2',
 'LGALS3BP']

## Function to load MultiXcan's results on random phenotypes

In [16]:
def load_multixcan_random_phenotype(phenotype_code):
    """
    Read the S-MultiXcan results file of one random phenotype.

    The file is looked up in conf.RESULTS["GLS_NULL_SIMS"] / "twas" /
    "smultixcan". Note that its name always contains "gtex_v8-mashr",
    regardless of the REFERENCE_PANEL / EQTL_MODEL settings of this notebook.

    Args:
        phenotype_code: identifier of the random phenotype; it is inserted
            into the file name right after "random.pheno".

    Returns:
        A pandas DataFrame read from the tab-separated file, indexed by the
        "gene_name" column.
    """
    results_dir = conf.RESULTS["GLS_NULL_SIMS"] / "twas" / "smultixcan"
    results_file = (
        results_dir / f"random.pheno{phenotype_code}-gtex_v8-mashr-smultixcan.txt"
    )

    return pd.read_csv(results_file, sep="\t", index_col="gene_name")

## MultiXcan real results (PhenomeXcan)

In [17]:
# Load the S-MultiXcan results on real traits from the file configured as
# SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE, and relabel the index with
# Gene.GENE_ID_TO_NAME_MAP (presumably gene IDs -> gene symbols; confirm).
multixcan_real_results = pd.read_pickle(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
).rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [18]:
# Keep only the first row for each repeated index label, then drop the rows
# (genes) where every column (trait) is missing.
multixcan_real_results = multixcan_real_results[
    ~multixcan_real_results.index.duplicated(keep="first")
].dropna(how="all", axis=0)

In [19]:
multixcan_real_results.shape

(22508, 3752)

In [20]:
multixcan_real_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


In [21]:
# There must be no missing value left anywhere in the table. `axis=None`
# reduces over both axes to a single boolean; it is passed by keyword because
# `DataFrame.any` does not accept positional arguments in recent pandas.
assert not multixcan_real_results.isna().any(axis=None)

## Load full correlation matrix

In [22]:
# NOTE(review): this template is not referenced anywhere else in the visible
# part of the notebook; the file name used below is hard-coded instead.
output_file_name_template = conf.PHENOMEXCAN["LD_BLOCKS"][
    "GENE_CORRS_FILE_NAME_TEMPLATES"
]["GENE_CORR_AVG"]

# Despite its name, `output_file` is an input here: it is the pickled gene
# correlation matrix that is read in the next cell.
# output_file = OUTPUT_DIR_BASE / "multiplier_genes-gene_correlations-gene_symbols.pkl"
output_file = (
    OUTPUT_DIR_BASE
    / "multiplier_genes-gene_correlations_within_distance-gene_symbols.pkl"
)
display(output_file)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/gene_corrs/1000g/mashr/multiplier_genes-gene_correlations_within_distance-gene_symbols.pkl')

In [23]:
full_corr_matrix_gene_symbols = pd.read_pickle(output_file)

In [24]:
full_corr_matrix_gene_symbols.shape

(6452, 6452)

In [25]:
full_corr_matrix_gene_symbols.head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.115011,0.173138,0.052445,0.008032,0.008727,0.006797,0.004533,0.00735,0.010391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.115011,1.0,0.681368,0.347023,0.011545,0.010729,0.003577,0.01023,0.010747,0.008769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.173138,0.681368,1.0,0.351822,0.011774,0.012527,0.003754,0.012096,0.012679,0.010442,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.052445,0.347023,0.351822,1.0,0.014103,0.013988,0.006056,0.006296,0.005363,0.0083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.008032,0.011545,0.011774,0.014103,1.0,0.356676,0.45401,0.137643,0.20034,0.09321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Make matrix compatible with GLS

In [26]:
# Count and list the negative eigenvalues of the correlation matrix
# (none are expected if the matrix is positive semidefinite).
_eigvals = np.linalg.eigvals(full_corr_matrix_gene_symbols)
_negative_eigvals = _eigvals[_eigvals < 0]
display(len(_negative_eigvals))
display(_negative_eigvals)

0

array([], dtype=float64)

In [27]:
# The Cholesky decomposition only succeeds for positive-definite matrices, so
# it is used here as a check that the correlation matrix needs no fixing.
# Only the failure of the decomposition itself (LinAlgError) is reported; any
# other error (e.g. an undefined variable) is a real problem and must surface.
try:
    np.linalg.cholesky(full_corr_matrix_gene_symbols)
    print("No need to fix")
except np.linalg.LinAlgError as e:
    print(f"Failed with:\n {str(e)}")

No need to fix


In [28]:
orig_corr_mat = full_corr_matrix_gene_symbols

In [86]:
orig_corr_mat.to_pickle(OUTPUT_DIR / "corr_mat.pkl.xz")

In [29]:
# %load_ext rpy2.ipython

In [30]:
# corr_mat_r = full_corr_matrix_gene_symbols.to_numpy()

In [31]:
# %Rpush corr_mat_r

In [32]:
# %%R
# # taken from https://www.r-bloggers.com/2013/08/correcting-a-pseudo-correlation-matrix-to-be-positive-semidefinite/
# CorrectCM <- function(CM, p = 0) {
#   n <- dim(CM)[1L]
#   E <- eigen(CM)
#   CM1 <- E$vectors %*% tcrossprod(diag(pmax(E$values, p), n), E$vectors)
#   Balance <- diag(1 / sqrt(diag(CM1)))
#   CM2 <- Balance %*% CM1 %*% Balance
#   return(CM2)
# }

In [33]:
# %%R -o corr_mat_r_fixed
# corr_mat_r_fixed <- CorrectCM(corr_mat_r, 1e-5)

In [34]:
# corr_mat_r_fixed

In [35]:
# corr_mat_r_fixed = pd.DataFrame(
#     corr_mat_r_fixed,
#     index=full_corr_matrix_gene_symbols.index.tolist(),
#     columns=full_corr_matrix_gene_symbols.columns.tolist()
# )

In [36]:
# corr_mat_r_fixed.shape

In [37]:
# corr_mat_r_fixed.head()

In [38]:
# corr_mat_r_fixed.equals(full_corr_matrix_gene_symbols)

In [39]:
# del full_corr_matrix_gene_symbols

In [40]:
# orig_corr_mat = corr_mat_r_fixed

In [41]:
# orig_corr_mat.to_pickle(OUTPUT_DIR / "corr_mat_fixed.pkl.xz")

# Functions

In [42]:
import statsmodels.api as sm
from sklearn.preprocessing import scale

In [43]:
def get_data(lv_code, random_phenotype_code=None, real_phenotype_code=None):
    """
    Build the predictor matrix and the response vector for one LV/phenotype pair.

    Exactly one of `random_phenotype_code` / `real_phenotype_code` is expected;
    if both are given, the random phenotype takes precedence.

    Args:
        lv_code: column of `multiplier_z` used as predictor (e.g. "LV45").
        random_phenotype_code: code of a random phenotype, loaded with
            `load_multixcan_random_phenotype`. Its "pvalue" column is converted
            into absolute z-scores with abs(norm.ppf(pvalue / 2)).
        real_phenotype_code: column of `multixcan_real_results`, used as is.

    Returns:
        A tuple (X, y). `y` is a pandas Series restricted to the genes that
        are also present in `orig_corr_mat` (after removing duplicated genes
        and missing values); `X` holds the `lv_code` column of `multiplier_z`
        for those genes plus the constant column added by `sm.add_constant`.

    Raises:
        ValueError: if neither a random nor a real phenotype code is given.
    """
    if random_phenotype_code is None and real_phenotype_code is None:
        raise ValueError(
            "Either random_phenotype_code or real_phenotype_code must be given"
        )

    if random_phenotype_code is not None:
        target_data = load_multixcan_random_phenotype(random_phenotype_code)["pvalue"]
        # two-sided p-values -> absolute z-scores
        y = pd.Series(
            data=np.abs(stats.norm.ppf(target_data.to_numpy() / 2)),
            index=target_data.index.copy(),
        )
    else:
        y = multixcan_real_results[real_phenotype_code]

    y = y[~y.index.duplicated(keep="first")]
    y = y.dropna()

    # keep only the genes that are also in the gene correlation matrix
    common_genes = orig_corr_mat.index.intersection(y.index)
    y = y.loc[common_genes]

    X = multiplier_z[lv_code].copy()
    X = X.loc[common_genes]
    # no standardization here: it is done separately by `standardize_data`
    X = sm.add_constant(X)

    return X, y

In [44]:
def standardize_data(X, y):
    """
    Return standardized copies of the predictors and the response.

    Every column of `X` except "const" is centered on its mean and divided by
    its standard deviation (pandas' `.std()`); the same is done with `y`.
    The inputs are not modified.

    Args:
        X: pandas DataFrame with the predictors (it may contain a "const"
            column, which is left as is).
        y: pandas Series with the response.

    Returns:
        A tuple (X_standardized, y_standardized).
    """
    X_std = X.copy()
    y_std = y.copy()

    predictor_cols = [col for col in X_std.columns if col != "const"]
    predictors = X_std[predictor_cols]
    X_std[predictor_cols] = (predictors - predictors.mean()) / predictors.std()

    y_std = (y_std - y_std.mean()) / y_std.std()

    return X_std, y_std

In [45]:
def get_aligned_corr_mat(X, perc=1.0):
    """
    Build the gene correlation matrix that matches the genes (rows) of `X`.

    The result starts as an identity matrix; then, only for the genes selected
    by their LV weight, the correlations are copied from `orig_corr_mat`.

    Args:
        X: pandas DataFrame indexed by gene; the LV weights are read from its
            second column (position 1).
        perc: controls the selection threshold, which is the (1 - perc)
            quantile of the positive LV weights; genes with a weight greater
            than or equal to it are selected. With 1.0 the threshold is the
            smallest positive weight, so all genes with a positive weight
            are selected.

    Returns:
        A pandas DataFrame with the genes of `X` as rows and columns.
    """
    aligned_corrs = orig_corr_mat.loc[X.index, X.index]

    # select genes by LV weight
    lv_weights = X.iloc[:, 1]
    threshold = lv_weights[lv_weights > 0].quantile(1 - perc)
    selected_genes = lv_weights.loc[lv_weights.ge(threshold)].index
    selected_genes = selected_genes.intersection(aligned_corrs.index)

    # identity everywhere, real correlations among the selected genes only
    result = pd.DataFrame(
        np.eye(len(aligned_corrs)),
        index=aligned_corrs.index.copy(),
        columns=aligned_corrs.columns.copy(),
    )
    result.loc[selected_genes, selected_genes] = aligned_corrs.loc[
        selected_genes, selected_genes
    ]

    return result

In [46]:
# testing

In [47]:
def train_statsmodels_gls(X, y, corr_mat):
    """
    Fit a statsmodels GLS model of `y` on `X`, using `corr_mat` as `sigma`.

    Returns:
        The results object returned by the model's `fit()` method.
    """
    return sm.GLS(y, X, sigma=corr_mat).fit()

# Make sure statsmodels (Python) and gls from R give the same results

## Random phenotype 0

In [48]:
# lv_code = "LV1"

In [49]:
# X, y = get_data(lv_code, random_phenotype_code=1, transformation="log10")

In [50]:
# X.shape

In [51]:
# y.shape

In [52]:
# corr_mat = get_aligned_corr_mat(X)

In [53]:
# corr_mat.shape

## statsmodels.GLS

In [54]:
# _gls_results = train_statsmodels_gls(X, y, corr_mat)

In [55]:
# print(_gls_results.summary())

In [56]:
# # print full numbers
# with np.printoptions(threshold=sys.maxsize, precision=20):
#     print(_gls_results.params.to_numpy())
#     print(_gls_results.bse.to_numpy())
#     print(_gls_results.tvalues.to_numpy())
#     print(_gls_results.pvalues.to_numpy())

## R gls

In [57]:
# training_data = pd.concat([X, y], axis=1)

In [58]:
# training_data

In [59]:
# %load_ext rpy2.ipython

In [60]:
# corr_mat_r = corr_mat.to_numpy()

In [61]:
# %Rpush corr_mat_r

In [62]:
# %%R -i training_data
# library(nlme)

# C <- corSymm(corr_mat_r[lower.tri(corr_mat_r)], fixed = T)

# g <- gls(pvalue ~ LV1, correlation=C, data=training_data)

In [63]:
# %%R
# summary(g)$tTable

In [64]:
# %%R -o r_gls_results
# r_gls_results <- summary(g)$tTable

In [65]:
# r_gls_results_df = pd.DataFrame(r_gls_results, index=["(Intercept)", lv_code], columns=["Value", "Std.Error", "t-value", "p-value"])

In [66]:
# r_gls_results_df

In [67]:
# assert np.allclose(r_gls_results_df["Value"].to_numpy().flatten(), _gls_results.params.values, atol=0.0, rtol=1e-5)

In [68]:
# assert np.allclose(r_gls_results_df["Std.Error"].to_numpy().flatten(), _gls_results.bse.values, atol=0.0, rtol=1e-5)

In [69]:
# assert np.allclose(r_gls_results_df["t-value"].to_numpy().flatten(), _gls_results.tvalues, atol=0.0, rtol=1e-5)

In [70]:
# assert np.allclose(r_gls_results_df["p-value"].to_numpy().flatten(), _gls_results.pvalues, atol=0.0, rtol=1e-5)

# GLS on randomly generated phenotypes

In [78]:
# Value passed as `perc` to `get_aligned_corr_mat`: 1.0 keeps the correlations
# among all genes with a positive LV weight.
PERC_NONZERO_GENES = 1.00

## Random phenotype 6 / LV45

In [79]:
# LV (column of `multiplier_z`) and random phenotype used in this section
lv_code = "LV45"
phenotype_code = 6

# Base name of the file where `y` is saved below. NOTE: despite the "pvalues"
# suffix, `get_data` converts the MultiXcan p-values into absolute z-scores.
phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [80]:
# Predictors/response for this LV-phenotype pair, and the gene correlation
# matrix aligned to the genes in X.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, PERC_NONZERO_GENES)

# The model is fitted on standardized copies; `y` itself is left untouched
# (it is the object saved to disk below).
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [81]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.249
Date:                Thu, 14 Jul 2022   Prob (F-statistic):              0.264
Time:                        14:06:52   Log-Likelihood:                -9137.4
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.829e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0002      0.013      0.017      0.9

In [82]:
# print full numbers for the LV term (position 1; position 0 is the constant):
# coefficient, standard error, t-value and p-value, in that order
with np.printoptions(threshold=sys.maxsize, precision=20):
    for _attr in ("params", "bse", "tvalues", "pvalues"):
        print(getattr(_gls_results, _attr).to_numpy()[1])

-0.012985100862501646
0.011620815913625014
-1.117400099873973
0.2638649762970155


In [83]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [84]:
y

NOC2L       0.325533
HES4        0.274247
ISG15       1.189163
AGRN        1.567185
TNFRSF18    0.910990
              ...   
CPT1B       1.060302
CHKB        1.820202
MAPK8IP2    0.175087
ARSA        0.432990
SHANK3      0.315227
Length: 6442, dtype: float64

## Random phenotype 10 / LV100

In [101]:
# LV (column of `multiplier_z`) and random phenotype used in this section
lv_code = "LV100"
phenotype_code = 10

# Base name of the file where `y` is saved below. NOTE: despite the "pvalues"
# suffix, `get_data` converts the MultiXcan p-values into absolute z-scores.
phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype10-pvalues'

In [102]:
# Predictors/response for this LV-phenotype pair, and the gene correlation
# matrix aligned to the genes in X.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, PERC_NONZERO_GENES)

# The model is fitted on standardized copies; `y` itself is left untouched
# (it is the object saved to disk below).
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [103]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                 4.829e-06
Date:                Thu, 14 Jul 2022   Prob (F-statistic):              0.998
Time:                        14:18:25   Log-Likelihood:                -8974.5
No. Observations:                6442   AIC:                         1.795e+04
Df Residuals:                    6440   BIC:                         1.797e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0092      0.014      0.676      0.4

In [104]:
# print full numbers for the LV term (position 1; position 0 is the constant):
# coefficient, standard error, t-value and p-value, in that order
with np.printoptions(threshold=sys.maxsize, precision=20):
    for _attr in ("params", "bse", "tvalues", "pvalues"):
        print(getattr(_gls_results, _attr).to_numpy()[1])

6.008877267380782e-08
2.7345042158271415e-05
0.0021974284159453007
0.9982467752659725


In [105]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [106]:
y

NOC2L       0.679536
HES4        2.495365
ISG15       1.892361
AGRN        1.428397
TNFRSF18    0.390039
              ...   
CPT1B       0.388832
CHKB        0.377164
MAPK8IP2    0.466494
ARSA        0.379426
SHANK3      0.516225
Length: 6442, dtype: float64

## Random phenotype 0 / LV800

In [119]:
# LV (column of `multiplier_z`) and random phenotype used in this section
lv_code = "LV800"
phenotype_code = 0

# Base name of the file where `y` is saved below. NOTE: despite the "pvalues"
# suffix, `get_data` converts the MultiXcan p-values into absolute z-scores.
phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [120]:
# Predictors/response for this LV-phenotype pair, and the gene correlation
# matrix aligned to the genes in X.
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, PERC_NONZERO_GENES)

# The model is fitted on standardized copies; `y` itself is left untouched
# (it is the object saved to disk below).
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [121]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1969
Date:                Thu, 14 Jul 2022   Prob (F-statistic):              0.657
Time:                        14:22:16   Log-Likelihood:                -9062.0
No. Observations:                6442   AIC:                         1.813e+04
Df Residuals:                    6440   BIC:                         1.814e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0013      0.013      0.102      0.9

In [122]:
# print full numbers for the LV term (position 1; position 0 is the constant):
# coefficient, standard error, t-value and p-value, in that order
with np.printoptions(threshold=sys.maxsize, precision=20):
    for _attr in ("params", "bse", "tvalues", "pvalues"):
        print(getattr(_gls_results, _attr).to_numpy()[1])

0.006103958765364128
0.013754152651896586
0.4437902442883271
0.6572091521023612


In [123]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [124]:
y

NOC2L       1.192661
HES4        0.615835
ISG15       1.388647
AGRN        0.098554
TNFRSF18    0.085000
              ...   
CPT1B       0.060585
CHKB        0.265594
MAPK8IP2    0.940106
ARSA        1.460604
SHANK3      0.015885
Length: 6442, dtype: float64

# GLS on real phenotypes

In [125]:
multixcan_real_results.columns

Index(['100001_raw-Food_weight', '100002_raw-Energy', '100003_raw-Protein',
       '100004_raw-Fat', '100005_raw-Carbohydrate', '100006_raw-Saturated_fat',
       '100007_raw-Polyunsaturated_fat', '100008_raw-Total_sugars',
       '100009_raw-Englyst_dietary_fibre', '100010-Portion_size',
       ...
       'visual impairment', 'vitiligo', 'vitreous body disease',
       'vocal cord polyp', 'voice disorders',
       'wellbeing measurement AND family relationship', 'wheezing',
       'whooping cough', 'worry measurement', 'wrist fracture'],
      dtype='object', length=3752)

## whooping cough / LV570

In [140]:
# LV (column of `multiplier_z`) and trait (column of `multixcan_real_results`)
# used in this section
lv_code = "LV570"
phenotype_code = "whooping cough"

# Base name of the file where `y` is saved below (spaces replaced by "_").
# NOTE(review): the suffix says "pvalues", but the values come from the file
# configured as SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE.
phenotype_name = f"multixcan-phenomexcan-{phenotype_code.replace(' ', '_')}-pvalues"
display(phenotype_name)

'multixcan-phenomexcan-whooping_cough-pvalues'

In [141]:
# Predictors/response for this LV-trait pair, and the gene correlation
# matrix aligned to the genes in X.
X, y = get_data(lv_code, real_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, PERC_NONZERO_GENES)

# The model is fitted on standardized copies; `y` itself is left untouched
# (it is the object saved to disk below).
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [142]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:         whooping cough   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9445
Date:                Thu, 14 Jul 2022   Prob (F-statistic):              0.331
Time:                        14:27:06   Log-Likelihood:                -9095.1
No. Observations:                6450   AIC:                         1.819e+04
Df Residuals:                    6448   BIC:                         1.821e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0036      0.013     -0.273      0.7

In [143]:
# print full numbers for the LV term (position 1; position 0 is the constant):
# coefficient, standard error, t-value and p-value, in that order
with np.printoptions(threshold=sys.maxsize, precision=20):
    for _attr in ("params", "bse", "tvalues", "pvalues"):
        print(getattr(_gls_results, _attr).to_numpy()[1])

0.010052199503207914
0.010343425212644251
0.971844364565006
0.3311644354099884


In [144]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [145]:
y

NOC2L       1.301498
HES4        0.491604
ISG15       0.300490
AGRN        1.595415
TNFRSF18    0.362178
              ...   
CPT1B       0.283934
CHKB        0.621814
MAPK8IP2    0.479153
ARSA        0.274866
SHANK3      1.125992
Name: whooping cough, Length: 6450, dtype: float64

## wheezing / LV400

In [164]:
# LV (column of `multiplier_z`) and trait (column of `multixcan_real_results`)
# used in this section
lv_code = "LV400"
phenotype_code = "wheezing"

# Base name of the file where `y` is saved below (spaces replaced by "_").
# NOTE(review): the suffix says "pvalues", but the values come from the file
# configured as SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE.
phenotype_name = f"multixcan-phenomexcan-{phenotype_code.replace(' ', '_')}-pvalues"
display(phenotype_name)

'multixcan-phenomexcan-wheezing-pvalues'

In [165]:
# Predictors/response for this LV-trait pair, and the gene correlation
# matrix aligned to the genes in X.
X, y = get_data(lv_code, real_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, PERC_NONZERO_GENES)

# The model is fitted on standardized copies; `y` itself is left untouched
# (it is the object saved to disk below).
Xs, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(Xs, ys, corr_mat)

In [166]:
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:               wheezing   R-squared:                       0.010
Model:                            GLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     62.81
Date:                Thu, 14 Jul 2022   Prob (F-statistic):           2.67e-15
Time:                        14:34:48   Log-Likelihood:                -9568.4
No. Observations:                6450   AIC:                         1.914e+04
Df Residuals:                    6448   BIC:                         1.915e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0234      0.014     -1.625      0.1

In [167]:
# print full numbers for the LV term (position 1; position 0 is the constant):
# coefficient, standard error, t-value and p-value, in that order
with np.printoptions(threshold=sys.maxsize, precision=20):
    for _attr in ("params", "bse", "tvalues", "pvalues"):
        print(getattr(_gls_results, _attr).to_numpy()[1])

-0.09501682279807443
0.01198940412636518
-7.925066316609398
2.6672597928016686e-15


In [168]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [169]:
y

NOC2L       2.745848
HES4        0.208291
ISG15       0.136734
AGRN        0.229924
TNFRSF18    0.408283
              ...   
CPT1B       0.483466
CHKB        0.264246
MAPK8IP2    0.709193
ARSA        0.786883
SHANK3      0.404021
Name: wheezing, Length: 6450, dtype: float64