# Description

It generates tests cases for the GLS model.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm

import conf
import utils
from entity import Gene

# Settings

In [3]:
# a cohort name (it could be something like UK_BIOBANK, etc)
COHORT_NAME = "1000G_EUR"

# reference panel such as 1000G or GTEX_V8
REFERENCE_PANEL = "1000G"

# predictions models such as MASHR or ELASTIC_NET
EQTL_MODEL = "MASHR"

In [4]:
OUTPUT_DIR = utils.get_git_repository_path() / "tests" / "data" / "gls"
display(OUTPUT_DIR)
assert OUTPUT_DIR.exists()

PosixPath('/opt/code/tests/data/gls')

# Load data

## MultiPLIER Z

In [5]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [6]:
multiplier_z_genes = multiplier_z.index.tolist()

In [7]:
len(multiplier_z_genes)

6750

In [8]:
multiplier_z_genes[:10]

['GAS6',
 'MMP14',
 'DSP',
 'MARCKSL1',
 'SPARC',
 'CTSD',
 'EPAS1',
 'PALLD',
 'PHC2',
 'LGALS3BP']

## Function to load MultiXcan's results on random phenotypes

In [9]:
def load_multixcan_random_phenotype(phenotype_code):
    multixcan_random_results = pd.read_csv(
        conf.RESULTS["GLS_NULL_SIMS"]
        / "twas"
        / "smultixcan"
        / f"random.pheno{phenotype_code}-gtex_v8-mashr-smultixcan.txt",
        sep="\t",
        index_col="gene_name",
    )

    return multixcan_random_results

In [10]:
load_multixcan_random_phenotype(0).head()

Unnamed: 0_level_0,gene,pvalue,n,n_indep,p_i_best,t_i_best,p_i_worst,t_i_worst,eigen_max,eigen_min,eigen_min_kept,z_min,z_max,z_mean,z_sd,tmi,status
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
RHPN2,ENSG00000131941.7,4e-05,48.0,3.0,0.000213947,Artery_Tibial,0.990132,Brain_Nucleus_accumbens_basal_ganglia,36.556432,7.692089e-16,2.519701,-2.721185,3.701952,1.283152,1.825567,3.0,0
GPATCH1,ENSG00000076650.6,7.8e-05,40.0,3.0,0.000453439,Brain_Cerebellum,0.817384,Brain_Frontal_Cortex_BA9,29.990208,2.086487e-15,1.815203,-3.506853,2.383485,-2.016745,1.715495,3.0,0
NFKBIA,ENSG00000100906.10,9.6e-05,1.0,1.0,9.591208e-05,Brain_Frontal_Cortex_BA9,9.6e-05,Brain_Frontal_Cortex_BA9,1.0,1.0,1.0,-3.900707,-3.900707,-3.900707,,1.0,0
TTC5,ENSG00000136319.11,0.000109,47.0,5.0,0.001402826,Brain_Hippocampus,0.961887,Colon_Sigmoid,21.272442,8.142339e-16,0.732606,-3.194069,1.397514,-0.916662,1.068989,5.0,0
ADGRA3,ENSG00000152990.13,0.000135,41.0,12.0,3.211289e-07,Heart_Atrial_Appendage,0.653657,Whole_Blood,12.988248,3.499412e-16,0.444682,-5.110605,3.59941,-0.464735,2.316607,12.0,0


## MultiXcan real results (PhenomeXcan)

In [11]:
multixcan_real_results = pd.read_pickle(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
).rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [12]:
multixcan_real_results = multixcan_real_results[
    ~multixcan_real_results.index.duplicated(keep="first")
].dropna(how="all", axis=0)

In [13]:
multixcan_real_results.shape

(22508, 3752)

In [14]:
multixcan_real_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


In [15]:
assert not multixcan_real_results.isna().any(None)

## Load full correlation matrix

In [16]:
orig_corr_mat = pd.read_pickle(OUTPUT_DIR / "corr_mat.pkl.xz")

In [17]:
orig_corr_mat.shape

(6442, 6442)

In [18]:
orig_corr_mat.head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.115011,0.173138,0.056096,0.008032,0.008727,0.006797,0.004533,0.00735,0.010391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.115011,1.0,0.681368,0.360588,0.011545,0.010729,0.003577,0.01023,0.010747,0.008769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.173138,0.681368,1.0,0.381394,0.011774,0.012527,0.003754,0.012096,0.012679,0.010442,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.056096,0.360588,0.381394,1.0,0.013005,0.015775,0.006184,0.006813,0.010775,0.009189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.008032,0.011545,0.011774,0.013005,1.0,0.356676,0.45401,0.137643,0.20034,0.09321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load cohort metadata

In [19]:
gene_tissues_df = pd.read_pickle(
    OUTPUT_DIR / "cohort_1000g_eur_metadata" / "gene_tissues.pkl.gz"
).set_index("gene_name")

In [20]:
gene_tissues_df.shape

(22314, 5)

In [21]:
gene_tissues_df = gene_tissues_df.loc[~gene_tissues_df.index.duplicated(keep="first")]

In [22]:
gene_tissues_df.shape

(22308, 5)

In [23]:
assert gene_tissues_df.index.is_unique

In [24]:
gene_tissues_df.head()

Unnamed: 0_level_0,tissue,n_snps_used_sum,n_snps_in_model_sum,unique_n_snps_in_model,unique_n_snps_used
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,2,2,2
SCYL3,"(Colon_Transverse, Brain_Cerebellum, Brain_Hip...",88,90,18,17
C1orf112,"(Colon_Transverse, Brain_Cerebellum, Brain_Hip...",62,64,24,22
FGR,"(Colon_Transverse, Brain_Cerebellum, Brain_Hip...",40,40,5,5
CFH,"(Colon_Transverse, Brain_Cerebellum, Brain_Hip...",44,44,12,12


# Functions

In [175]:
def get_data(
    lv_code,
    random_phenotype_code,
    add_covars=False,
    add_covars_logs=False,
):
    target_data = load_multixcan_random_phenotype(random_phenotype_code)["pvalue"]
    y = pd.Series(
        data=-np.log10(target_data.to_numpy()),
        index=target_data.index.copy(),
    )

    y = y[~y.index.duplicated(keep="first")]
    y = y.dropna()

    X = multiplier_z[lv_code].copy()

    common_genes = orig_corr_mat.index.intersection(y.index).intersection(X.index)
    y = y.loc[common_genes]
    X = X.loc[common_genes]

    # binarize
    x_perc = 0.01
    x_q = X.quantile(1.0 - x_perc)
    x_binarized = X.copy()
    # make sure top genes have nonzero weights
    x_cond = (x_binarized > 0.0) & (x_binarized >= x_q)
    x_binarized[x_cond] = 1.0
    x_binarized[~x_cond] = 0.0
    X = x_binarized

    X = sm.add_constant(X)

    if add_covars:
        covars = load_multixcan_random_phenotype(random_phenotype_code)[
            ["n", "n_indep"]
        ]
        covars = covars[~covars.index.duplicated(keep="first")]
        covars = covars.loc[X.index]

        # gene-level covariates (S-MultiXcan)
        covars = covars.rename(
            columns={
                "n_indep": "gene_size",
            }
        )
        covars = covars.assign(
            gene_density=covars.apply(lambda x: x["gene_size"] / x["n"], axis=1)
        )

        if add_covars_logs:
            covars["gene_size_log"] = np.log(covars["gene_size"])
            covars["gene_density_log"] = -np.log(covars["gene_density"])

        # if add_covars:
        covars = covars.drop(columns=[c for c in covars.columns if c in ("n",)])

        X = X.join(covars)

    return X, y

In [26]:
# testing
_X, _y = get_data("LV7", 10)
assert _X.shape[0] < 7000
assert _X.shape[1] == 2
assert "LV7" in _X.columns
assert "const" in _X.columns
assert not _X.isna().any(None)

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any(None)

x_summary = _X["LV7"].value_counts()
assert x_summary.shape[0] == 2, "Wrong binarization"
n_pos = int(x_summary.loc[1.0])
n_neg = int(x_summary.loc[0.0])
assert n_pos > 10
assert n_neg > 10
assert n_pos < n_neg

In [27]:
_X.head()

Unnamed: 0,const,LV7
NOC2L,1.0,0.0
HES4,1.0,0.0
ISG15,1.0,0.0
AGRN,1.0,0.0
TNFRSF18,1.0,0.0


In [28]:
_y.head()

NOC2L       0.303820
HES4        1.900225
ISG15       1.233268
AGRN        0.814805
TNFRSF18    0.157074
dtype: float64

In [29]:
# testing
_X, _y = get_data("LV7", 10, add_covars=True)
assert _X.shape[0] < 7000
assert _X.shape[1] == 2 + 4
assert "LV7" in _X.columns
assert "const" in _X.columns
assert "gene_size" in _X.columns
assert "gene_density" in _X.columns
assert "gene_n_snps_used" in _X.columns
assert "gene_n_snps_used_sharing" in _X.columns
assert not _X.isna().any(None)

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any(None)

In [30]:
_X.head()

Unnamed: 0,const,LV7,gene_size,gene_density,gene_n_snps_used,gene_n_snps_used_sharing
NOC2L,1.0,0.0,5.0,0.106383,103,4.478261
HES4,1.0,0.0,3.0,0.068182,55,6.875
ISG15,1.0,0.0,6.0,0.142857,59,2.565217
AGRN,1.0,0.0,4.0,0.086957,75,5.0
TNFRSF18,1.0,0.0,4.0,0.088889,65,2.407407


In [31]:
# load_multixcan_random_phenotype(10).loc["TNFRSF18"]
gene_tissues_df.loc["TNFRSF18"]

tissue                    (Colon_Transverse, Brain_Hippocampus, Prostate...
n_snps_used_sum                                                          65
n_snps_in_model_sum                                                      66
unique_n_snps_in_model                                                   28
unique_n_snps_used                                                       27
Name: TNFRSF18, dtype: object

In [32]:
assert _X.loc["TNFRSF18", "gene_size"] == 4.0
assert _X.loc["TNFRSF18", "gene_density"] == 4 / 45.0
assert _X.loc["TNFRSF18", "gene_n_snps_used"] == 65
assert _X.loc["TNFRSF18", "gene_n_snps_used_sharing"] == 65 / 27.0

In [33]:
_y.head()

NOC2L       0.303820
HES4        1.900225
ISG15       1.233268
AGRN        0.814805
TNFRSF18    0.157074
dtype: float64

In [34]:
# testing
_X, _y = get_data("LV7", 10, add_covars=True, add_covars_logs=True)
assert _X.shape[0] < 7000
assert _X.shape[1] == 2 + 4 + 4
assert "LV7" in _X.columns
assert "const" in _X.columns
assert "gene_size" in _X.columns
assert "gene_size_log" in _X.columns
assert "gene_density" in _X.columns
assert "gene_density_log" in _X.columns
assert "gene_n_snps_used" in _X.columns
assert "gene_n_snps_used_log" in _X.columns
assert "gene_n_snps_used_sharing" in _X.columns
assert "gene_n_snps_used_sharing_log" in _X.columns
assert not _X.isna().any(None)

assert _X["gene_density"].between(0.0, 1.0, inclusive="right").all()
assert _X["gene_density_log"].min() >= 0.0
assert _X["gene_size"].min() >= 0.0
assert _X["gene_size_log"].min() >= 0.0
assert _X["gene_n_snps_used"].min() >= 0.0
assert _X["gene_n_snps_used_log"].min() >= 0.0
assert _X["gene_n_snps_used_sharing"].min() >= 0.0
assert _X["gene_n_snps_used_sharing_log"].min() >= 0.0

assert _y.shape[0] == _X.shape[0]
assert not _y.isna().any(None)

In [35]:
_X.head()

Unnamed: 0,const,LV7,gene_size,gene_density,gene_size_log,gene_density_log,gene_n_snps_used,gene_n_snps_used_sharing,gene_n_snps_used_log,gene_n_snps_used_sharing_log
NOC2L,1.0,0.0,5.0,0.106383,1.609438,2.24071,103,4.478261,4.634729,1.499235
HES4,1.0,0.0,3.0,0.068182,1.098612,2.685577,55,6.875,4.007333,1.927892
ISG15,1.0,0.0,6.0,0.142857,1.791759,1.94591,59,2.565217,4.077537,0.942043
AGRN,1.0,0.0,4.0,0.086957,1.386294,2.442347,75,5.0,4.317488,1.609438
TNFRSF18,1.0,0.0,4.0,0.088889,1.386294,2.420368,65,2.407407,4.174387,0.87855


In [36]:
_X.describe()

Unnamed: 0,const,LV7,gene_size,gene_density,gene_size_log,gene_density_log,gene_n_snps_used,gene_n_snps_used_sharing,gene_n_snps_used_log,gene_n_snps_used_sharing_log
count,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0,6442.0
mean,1.0,0.01009,4.365414,0.168023,1.327457,2.133689,56.451568,5.540752,3.816128,1.500996
std,0.0,0.099949,2.317113,0.174679,0.564483,0.801807,30.091728,4.376953,0.795153,0.636456
min,1.0,0.0,1.0,0.020408,0.0,-0.0,1.0,1.0,0.0,0.0
25%,1.0,0.0,3.0,0.066667,1.098612,1.658228,35.0,3.0,3.555348,1.098612
50%,1.0,0.0,4.0,0.108696,1.386294,2.219203,55.0,4.48913,4.007333,1.501656
75%,1.0,0.0,6.0,0.190476,1.791759,2.70805,77.0,6.75,4.343805,1.909543
max,1.0,1.0,18.0,1.0,2.890372,3.89182,204.0,49.0,5.31812,3.89182


In [37]:
_y.head()

NOC2L       0.303820
HES4        1.900225
ISG15       1.233268
AGRN        0.814805
TNFRSF18    0.157074
dtype: float64

In [38]:
def standardize_data(X, y):
    X = X.copy()
    y = y.copy()

    c = [c for c in X.columns if c != "const"]
    X[c] = (X[c] - X[c].mean()) / X[c].std()

    return X, (y - y.mean()) / y.std()

In [207]:
def get_aligned_corr_mat(X, perc=0.01):
    # perc == 1.0 means select all genes
    # perc == 0.01 means select top 1% of genes
    # perc = None means do not subset the correlation matrix
    gene_corrs = orig_corr_mat.loc[X.index, X.index]

    if perc is None:
        return gene_corrs

    corr_mat_sub = pd.DataFrame(
        np.eye(gene_corrs.shape[0]),
        index=gene_corrs.index.copy(),
        columns=gene_corrs.columns.copy(),
    )

    lv_col = X.columns[1]
    assert lv_col.startswith("LV")
    X = X = multiplier_z[lv_col].copy()

    # X_non_zero = X[X > 0]
    X_thres = X.quantile(1.0 - perc)
    lv_nonzero_genes = X[X >= X_thres].index

    lv_nonzero_genes = lv_nonzero_genes.intersection(gene_corrs.index)
    corr_mat_sub.loc[lv_nonzero_genes, lv_nonzero_genes] = gene_corrs.loc[
        lv_nonzero_genes, lv_nonzero_genes
    ]

    return corr_mat_sub

In [191]:
# testing
_X_test = pd.DataFrame(
    {
        "const": 1.0,
        "LV1": [1.0, 0.4, 0.0],  # the last gene has zero weight
    },
    index=[
        "PSMB10",  # the first two genes have a high sum of correlations, to make sure the sum is not close to 1.0
        "SLC12A4",
        "ACD",
    ],
)

# do not subset
_tmp_corr = get_aligned_corr_mat(_X_test, perc=None)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
assert np.array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.77, 0.73],
            [0.77, 1.0, 0.63],
            [0.73, 0.63, 1.00],
        ]
    ),
)

# do subset: include all non-zero LV genes
_tmp_corr = get_aligned_corr_mat(_X_test, perc=1.0)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
assert np.array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.77, 0.73],
            [0.77, 1.0, 0.63],
            [0.73, 0.63, 1.00],
        ]
    ),
)

# do subset: but perc is so low that it doesn't select  any genes
_tmp_corr = get_aligned_corr_mat(_X_test, perc=0.001)
assert _tmp_corr.shape == (_X_test.shape[0], _X_test.shape[0])
assert np.array_equal(
    _tmp_corr.round(2).to_numpy(),
    np.array(
        [
            [1.0, 0.00, 0.00],
            [0.00, 1.0, 0.00],
            [0.00, 0.00, 1.00],
        ]
    ),
)

In [192]:
def train_statsmodels_gls(X, y, corr_mat):
    gls_model = sm.GLS(y, X, sigma=corr_mat)
    gls_results = gls_model.fit()
    return gls_results

# [full corr matrix] GLS on randomly generated phenotypes

In [112]:
PERC_NONZERO_GENES = None

## Random phenotype 6 / LV45

In [113]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [114]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [115]:
y

NOC2L       0.127973
HES4        0.105742
ISG15       0.630088
AGRN        0.931549
TNFRSF18    0.440931
              ...   
CPT1B       0.539091
CHKB        1.162865
MAPK8IP2    0.064991
ARSA        0.177164
SHANK3      0.123442
Length: 6442, dtype: float64

In [116]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.161
Date:                Fri, 02 Sep 2022   Prob (F-statistic):              0.281
Time:                        23:25:17   Log-Likelihood:                -8670.9
No. Observations:                6442   AIC:                         1.735e+04
Df Residuals:                    6440   BIC:                         1.736e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0267      0.024      1.109      0.2

In [117]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = -0.10018201664770203
exp_coef_se = 0.09298021617384379
exp_tvalue = -1.0774551917624404
exp_pval_twosided = 0.28131731604765614
exp_pval_onesided = 0.859341341976172
    


In [118]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45
CENPH,1.0,1.0
HIST1H2AD,1.0,1.0
HIST1H4D,1.0,1.0
HIST1H2BE,1.0,1.0
HIST1H2BD,1.0,1.0
...,...,...
AIF1,1.0,0.0
NCR3,1.0,0.0
LST1,1.0,0.0
LTB,1.0,0.0


In [120]:
y.sort_values(ascending=False)

CHPF2     4.271450
PRR5      3.711042
MMP12     3.642685
RBM38     3.465685
SOS1      3.378385
            ...   
GPX3      0.000286
SUOX      0.000238
SPRED2    0.000126
DEGS1     0.000053
SAFB      0.000025
Length: 6442, dtype: float64

In [121]:
ys.sort_values(ascending=False)

CHPF2     9.182597
PRR5      7.849822
MMP12     7.687254
RBM38     7.266307
SOS1      7.058689
            ...   
GPX3     -0.975186
SUOX     -0.975301
SPRED2   -0.975567
DEGS1    -0.975742
SAFB     -0.975808
Length: 6442, dtype: float64

In [122]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [123]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.379
Date:                Fri, 02 Sep 2022   Prob (F-statistic):              0.123
Time:                        23:25:18   Log-Likelihood:                -9139.1
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.830e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.055e-17      0.012  -3.25e-15      1.0

## Random phenotype 6 / LV455

In [124]:
lv_code = "LV455"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [125]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [126]:
y

NOC2L       0.127973
HES4        0.105742
ISG15       0.630088
AGRN        0.931549
TNFRSF18    0.440931
              ...   
CPT1B       0.539091
CHKB        1.162865
MAPK8IP2    0.064991
ARSA        0.177164
SHANK3      0.123442
Length: 6442, dtype: float64

In [127]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4638
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.496
Time:                        03:50:08   Log-Likelihood:                -8671.2
No. Observations:                6442   AIC:                         1.735e+04
Df Residuals:                    6440   BIC:                         1.736e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0250      0.024      1.037      0.3

In [128]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.0784587858266203
exp_coef_se = 0.1152051853461905
exp_tvalue = 0.6810351946472929
exp_pval_twosided = 0.4958737072729271
exp_pval_onesided = 0.24793685363646356
    


In [129]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV455
POU2F2,1.0,1.0
EPHB2,1.0,1.0
SETD1B,1.0,1.0
LIG3,1.0,1.0
RFFL,1.0,1.0
...,...,...
FLOT1,1.0,0.0
TUBB,1.0,0.0
MDC1,1.0,0.0
DHX16,1.0,0.0


In [131]:
y.sort_values(ascending=False)

CHPF2     4.271450
PRR5      3.711042
MMP12     3.642685
RBM38     3.465685
SOS1      3.378385
            ...   
GPX3      0.000286
SUOX      0.000238
SPRED2    0.000126
DEGS1     0.000053
SAFB      0.000025
Length: 6442, dtype: float64

In [132]:
ys.sort_values(ascending=False)

CHPF2     9.182597
PRR5      7.849822
MMP12     7.687254
RBM38     7.266307
SOS1      7.058689
            ...   
GPX3     -0.975186
SUOX     -0.975301
SPRED2   -0.975567
DEGS1    -0.975742
SAFB     -0.975808
Length: 6442, dtype: float64

In [133]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [134]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.379
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.123
Time:                        03:51:04   Log-Likelihood:                -9139.1
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.830e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.055e-17      0.012  -3.25e-15      1.0

## Random phenotype 0 / LV801

In [135]:
lv_code = "LV801"
phenotype_code = 0

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [136]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [137]:
y

NOC2L       0.632640
HES4        0.269215
ISG15       0.782674
AGRN        0.035508
TNFRSF18    0.030462
              ...   
CPT1B       0.021505
CHKB        0.102069
MAPK8IP2    0.459467
ARSA        0.841263
SHANK3      0.005539
Length: 6442, dtype: float64

In [138]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9747
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.324
Time:                        03:51:52   Log-Likelihood:                -8669.9
No. Observations:                6442   AIC:                         1.734e+04
Df Residuals:                    6440   BIC:                         1.736e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0218      0.024     -0.907      0.3

In [139]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.10885793222623774
exp_coef_se = 0.11026400471153004
exp_tvalue = 0.9872481279002079
exp_pval_twosided = 0.32355810271013086
exp_pval_onesided = 0.16177905135506543
    


In [140]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV801
RALGPS2,1.0,1.0
FOXO3,1.0,1.0
AP2S1,1.0,1.0
RAB5C,1.0,1.0
ETFA,1.0,1.0
...,...,...
DHX16,1.0,0.0
MRPS18B,1.0,0.0
TRIM10,1.0,0.0
ZNRD1,1.0,0.0


In [141]:
y.sort_values(ascending=False)

GPATCH1    4.109027
NFKBIA     4.018127
TTC5       3.963686
ZNF17      3.853562
ZNF563     3.747638
             ...   
TXNDC5     0.000225
SEC22B     0.000139
AP3B2      0.000109
WWTR1      0.000108
PCBP4      0.000053
Length: 6442, dtype: float64

In [142]:
ys.sort_values(ascending=False)

GPATCH1    8.400360
NFKBIA     8.193397
TTC5       8.069447
ZNF17      7.818714
ZNF563     7.577544
             ...   
TXNDC5    -0.954608
SEC22B    -0.954804
AP3B2     -0.954872
WWTR1     -0.954875
PCBP4     -0.955001
Length: 6442, dtype: float64

In [143]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [144]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.112
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.146
Time:                        03:51:52   Log-Likelihood:                -9139.2
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.830e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.748e-17      0.012   3.81e-15      1.0

# [sub corr matrix ] GLS on randomly generated phenotypes

In [208]:
PERC_NONZERO_GENES = 0.01

## Random phenotype 6 / LV45

In [209]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [210]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [215]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.8543
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.355
Time:                        04:29:12   Log-Likelihood:                -9136.7
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.829e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0019      0.013      0.150      0.8

In [216]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = -0.13122859992487934
exp_coef_se = 0.14197922645725486
exp_tvalue = -0.9242802852175549
exp_pval_twosided = 0.3553750345424116
exp_pval_onesided = 0.8223124827287942
    


In [217]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45
CENPH,1.0,1.0
HIST1H2AD,1.0,1.0
HIST1H4D,1.0,1.0
HIST1H2BE,1.0,1.0
HIST1H2BD,1.0,1.0
...,...,...
AIF1,1.0,0.0
NCR3,1.0,0.0
LST1,1.0,0.0
LTB,1.0,0.0


In [218]:
y.sort_values(ascending=False)

CHPF2     4.271450
PRR5      3.711042
MMP12     3.642685
RBM38     3.465685
SOS1      3.378385
            ...   
GPX3      0.000286
SUOX      0.000238
SPRED2    0.000126
DEGS1     0.000053
SAFB      0.000025
Length: 6442, dtype: float64

In [219]:
ys.sort_values(ascending=False)

CHPF2     9.182597
PRR5      7.849822
MMP12     7.687254
RBM38     7.266307
SOS1      7.058689
            ...   
GPX3     -0.975186
SUOX     -0.975301
SPRED2   -0.975567
DEGS1    -0.975742
SAFB     -0.975808
Length: 6442, dtype: float64

In [220]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 6 / LV455

In [237]:
lv_code = "LV455"
phenotype_code = 6

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [238]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [239]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.206
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.272
Time:                        04:35:42   Log-Likelihood:                -9139.0
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.830e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0012      0.013     -0.100      0.9

In [240]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.14024604432028576
exp_coef_se = 0.12772108665506693
exp_tvalue = 1.0980649162424108
exp_pval_twosided = 0.2722171958691433
exp_pval_onesided = 0.13610859793457164
    


In [241]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV455
POU2F2,1.0,1.0
EPHB2,1.0,1.0
SETD1B,1.0,1.0
LIG3,1.0,1.0
RFFL,1.0,1.0
...,...,...
FLOT1,1.0,0.0
TUBB,1.0,0.0
MDC1,1.0,0.0
DHX16,1.0,0.0


In [242]:
y.sort_values(ascending=False)

CHPF2     4.271450
PRR5      3.711042
MMP12     3.642685
RBM38     3.465685
SOS1      3.378385
            ...   
GPX3      0.000286
SUOX      0.000238
SPRED2    0.000126
DEGS1     0.000053
SAFB      0.000025
Length: 6442, dtype: float64

In [243]:
ys.sort_values(ascending=False)

CHPF2     9.182597
PRR5      7.849822
MMP12     7.687254
RBM38     7.266307
SOS1      7.058689
            ...   
GPX3     -0.975186
SUOX     -0.975301
SPRED2   -0.975567
DEGS1    -0.975742
SAFB     -0.975808
Length: 6442, dtype: float64

In [244]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 10 / LV100

In [221]:
lv_code = "LV100"
phenotype_code = 10

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype10-pvalues'

In [222]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [223]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                  0.006761
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.934
Time:                        04:30:23   Log-Likelihood:                -9140.4
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.830e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -7.593e-05      0.013     -0.006      0.9

In [224]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.01045784784294252
exp_coef_se = 0.12718873964387706
exp_tvalue = 0.08222306371007403
exp_pval_twosided = 0.9344718886260288
exp_pval_onesided = 0.4672359443130144
    


In [225]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV100
RPL4,1.0,1.0
PTBP1,1.0,1.0
DNM1,1.0,1.0
FXYD5,1.0,1.0
HNRNPA0,1.0,1.0
...,...,...
MDC1,1.0,0.0
DHX16,1.0,0.0
MRPS18B,1.0,0.0
TRIM10,1.0,0.0


In [226]:
y.sort_values(ascending=False)

RPL15      3.931459
VAMP4      3.402868
HMGCS1     3.177896
MED9       3.109621
ABCB10     3.069121
             ...   
DYNLRB1    0.000160
ARPC1A     0.000159
FUS        0.000123
STARD5     0.000052
AMPH       0.000009
Length: 6442, dtype: float64

In [227]:
ys.sort_values(ascending=False)

RPL15      8.255021
VAMP4      7.013330
HMGCS1     6.484860
MED9       6.324478
ABCB10     6.229341
             ...   
DYNLRB1   -0.979822
ARPC1A    -0.979825
FUS       -0.979908
STARD5    -0.980075
AMPH      -0.980176
Length: 6442, dtype: float64

In [228]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

## Random phenotype 0 / LV800

In [229]:
lv_code = "LV800"
phenotype_code = 0

phenotype_name = f"multixcan-random_phenotype{phenotype_code}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype0-pvalues'

In [230]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [231]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.217
Date:                Sat, 03 Sep 2022   Prob (F-statistic):             0.0729
Time:                        04:33:15   Log-Likelihood:                -9114.7
No. Observations:                6442   AIC:                         1.823e+04
Df Residuals:                    6440   BIC:                         1.825e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0028      0.012     -0.227      0.8

In [232]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = 0.28001523677025164
exp_coef_se = 0.1561087448671094
exp_tvalue = 1.79371909631725
exp_pval_twosided = 0.07290491285969262
exp_pval_onesided = 0.03645245642984631
    


In [233]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV800
ZNF627,1.0,1.0
ZKSCAN4,1.0,1.0
ZNF287,1.0,1.0
ZNF180,1.0,1.0
ZNF235,1.0,1.0
...,...,...
ZNF311,1.0,0.0
ZKSCAN3,1.0,0.0
ZNF165,1.0,0.0
HIST1H2BO,1.0,0.0


In [234]:
y.sort_values(ascending=False)

GPATCH1    4.109027
NFKBIA     4.018127
TTC5       3.963686
ZNF17      3.853562
ZNF563     3.747638
             ...   
TXNDC5     0.000225
SEC22B     0.000139
AP3B2      0.000109
WWTR1      0.000108
PCBP4      0.000053
Length: 6442, dtype: float64

In [235]:
ys.sort_values(ascending=False)

GPATCH1    8.400360
NFKBIA     8.193397
TTC5       8.069447
ZNF17      7.818714
ZNF563     7.577544
             ...   
TXNDC5    -0.954608
SEC22B    -0.954804
AP3B2     -0.954872
WWTR1     -0.954875
PCBP4     -0.955001
Length: 6442, dtype: float64

In [236]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

# Test different covariates

In [245]:
covars = load_multixcan_random_phenotype(0)[["n", "n_indep"]]
covars = covars[~covars.index.duplicated(keep="first")]
covars = covars.dropna()
covars = covars.join(gene_tissues_df, how="inner")
assert not covars.isna().any(None)

In [246]:
covars = covars.drop(columns=[c for c in covars.columns if "model" in c])

In [247]:
covars.shape

(22308, 5)

In [248]:
covars.head()

Unnamed: 0_level_0,n,n_indep,tissue,n_snps_used_sum,unique_n_snps_used
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RHPN2,48.0,3.0,"(Colon_Transverse, Brain_Cerebellum, Brain_Hip...",79,25
GPATCH1,40.0,3.0,"(Colon_Transverse, Brain_Cerebellum, Brain_Hip...",50,19
NFKBIA,1.0,1.0,(Brain_Frontal_Cortex_BA9),1,1
TTC5,47.0,5.0,"(Colon_Transverse, Brain_Hippocampus, Prostate...",84,12
ADGRA3,41.0,12.0,"(Colon_Transverse, Brain_Cerebellum, Brain_Hip...",62,30


In [249]:
# gene_size and gene_density
covars = covars.rename(
    columns={
        "n_indep": "gene_size",
    }
)
covars = covars.assign(
    gene_density=covars.apply(lambda x: x["gene_size"] / x["n"], axis=1)
)
covars = covars.drop(columns=["n"])

In [250]:
_final_covars = [
    "gene_size",
    "gene_density",
]

In [251]:
covars[_final_covars].head()

Unnamed: 0_level_0,gene_size,gene_density
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1
RHPN2,3.0,0.0625
GPATCH1,3.0,0.075
NFKBIA,1.0,1.0
TTC5,5.0,0.106383
ADGRA3,12.0,0.292683


In [252]:
covars[_final_covars].describe()

Unnamed: 0,gene_size,gene_density
count,22308.0,22308.0
mean,3.806437,0.279629
std,2.274187,0.30709
min,1.0,0.020408
25%,2.0,0.081633
50%,3.0,0.139535
75%,5.0,0.333333
max,18.0,1.0


In [253]:
_tmp = covars.assign(**{f"{c}_log": np.log(covars[c]) for c in _final_covars})
display(_tmp[[c for c in _tmp.columns if "_log" in c]].describe())

Unnamed: 0,gene_size_log,gene_density_log
count,22308.0,22308.0
mean,1.148862,-1.801823
std,0.641076,1.018646
min,0.0,-3.89182
25%,0.693147,-2.505526
50%,1.098612,-1.969441
75%,1.609438,-1.098612
max,2.890372,0.0


In [254]:
covars[_final_covars].corr()

Unnamed: 0,gene_size,gene_density
gene_size,1.0,-0.171108
gene_density,-0.171108,1.0


In [255]:
_tmp[[c for c in _tmp.columns if "_log" in c]].corr()

Unnamed: 0,gene_size_log,gene_density_log
gene_size_log,1.0,-0.008408
gene_density_log,-0.008408,1.0


## [full corr matrix] GLS on randomly generated phenotypes using gene-level covariates

In [256]:
PERC_NONZERO_GENES = None

### Random phenotype 6 / LV45

In [257]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [258]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_covars=True)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [259]:
X.head()

Unnamed: 0,const,LV45,gene_size,gene_density
NOC2L,1.0,0.0,5.0,0.106383
HES4,1.0,0.0,3.0,0.068182
ISG15,1.0,0.0,6.0,0.142857
AGRN,1.0,0.0,4.0,0.086957
TNFRSF18,1.0,0.0,4.0,0.088889


In [260]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.4215
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.738
Time:                        04:53:19   Log-Likelihood:                -8670.8
No. Observations:                6442   AIC:                         1.735e+04
Df Residuals:                    6438   BIC:                         1.738e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0324      0.030      1.063   

In [261]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = -0.10052902446730924
exp_coef_se = 0.09300042682237371
exp_tvalue = -1.0809522913192084
exp_pval_twosided = 0.27975882566803706
exp_pval_onesided = 0.8601205871659815
    


In [262]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45,gene_size,gene_density
CENPH,1.0,1.0,2.0,0.041667
HIST1H2AD,1.0,1.0,3.0,0.166667
HIST1H4D,1.0,1.0,1.0,0.142857
HIST1H2BE,1.0,1.0,2.0,0.074074
HIST1H2BD,1.0,1.0,4.0,0.129032
...,...,...,...,...
AIF1,1.0,0.0,8.0,0.177778
NCR3,1.0,0.0,7.0,0.175000
LST1,1.0,0.0,6.0,0.130435
LTB,1.0,0.0,3.0,0.750000


In [263]:
y.sort_values(ascending=False)

CHPF2     4.271450
PRR5      3.711042
MMP12     3.642685
RBM38     3.465685
SOS1      3.378385
            ...   
GPX3      0.000286
SUOX      0.000238
SPRED2    0.000126
DEGS1     0.000053
SAFB      0.000025
Length: 6442, dtype: float64

In [264]:
ys.sort_values(ascending=False)

CHPF2     9.182597
PRR5      7.849822
MMP12     7.687254
RBM38     7.266307
SOS1      7.058689
            ...   
GPX3     -0.975186
SUOX     -0.975301
SPRED2   -0.975567
DEGS1    -0.975742
SAFB     -0.975808
Length: 6442, dtype: float64

In [265]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")

In [266]:
# for debugging purposes I print the OLS results also
_tmp_model = sm.OLS(ys, Xs)
_tmp_results = _tmp_model.fit()
print(_tmp_results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.379
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.123
Time:                        04:53:20   Log-Likelihood:                -9139.1
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6440   BIC:                         1.830e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4.055e-17      0.012  -3.25e-15      1.0

## [sub corr matrix] GLS on randomly generated phenotypes using SNP-level covariates

In [267]:
PERC_NONZERO_GENES = 0.01

### Random phenotype 6 / LV45

In [268]:
lv_code = "LV45"
phenotype_code = 6

phenotype_name_base = f"multixcan-random_phenotype{phenotype_code}"
phenotype_name = f"{phenotype_name_base}-pvalues"
display(phenotype_name)

'multixcan-random_phenotype6-pvalues'

In [269]:
X, y = get_data(lv_code, random_phenotype_code=phenotype_code, add_covars=True)
corr_mat = get_aligned_corr_mat(X, perc=PERC_NONZERO_GENES)

In [270]:
_, ys = standardize_data(X, y)
_gls_results = train_statsmodels_gls(X, ys, corr_mat)
print(_gls_results.summary())

                            GLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.3591
Date:                Sat, 03 Sep 2022   Prob (F-statistic):              0.783
Time:                        04:56:38   Log-Likelihood:                -9136.5
No. Observations:                6442   AIC:                         1.828e+04
Df Residuals:                    6438   BIC:                         1.831e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.0061      0.027     -0.224   

In [271]:
# print full numbers
with np.printoptions(threshold=sys.maxsize, precision=20):
    print(
        f"""
exp_coef = {_gls_results.params.to_numpy()[1]}
exp_coef_se = {_gls_results.bse.to_numpy()[1]}
exp_tvalue = {_gls_results.tvalues.to_numpy()[1]}
exp_pval_twosided = {_gls_results.pvalues.to_numpy()[1]}
exp_pval_onesided = {stats.t.sf(_gls_results.tvalues.to_numpy()[1], _gls_results.df_resid)}
    """
    )


exp_coef = -0.13057289396289432
exp_coef_se = 0.14208177608559344
exp_tvalue = -0.9189981823160354
exp_pval_twosided = 0.35813095213395807
exp_pval_onesided = 0.820934523933021
    


In [272]:
X.sort_values(lv_code, ascending=False)

Unnamed: 0,const,LV45,gene_size,gene_density
CENPH,1.0,1.0,2.0,0.041667
HIST1H2AD,1.0,1.0,3.0,0.166667
HIST1H4D,1.0,1.0,1.0,0.142857
HIST1H2BE,1.0,1.0,2.0,0.074074
HIST1H2BD,1.0,1.0,4.0,0.129032
...,...,...,...,...
AIF1,1.0,0.0,8.0,0.177778
NCR3,1.0,0.0,7.0,0.175000
LST1,1.0,0.0,6.0,0.130435
LTB,1.0,0.0,3.0,0.750000


In [273]:
y.sort_values(ascending=False)

CHPF2     4.271450
PRR5      3.711042
MMP12     3.642685
RBM38     3.465685
SOS1      3.378385
            ...   
GPX3      0.000286
SUOX      0.000238
SPRED2    0.000126
DEGS1     0.000053
SAFB      0.000025
Length: 6442, dtype: float64

In [274]:
ys.sort_values(ascending=False)

CHPF2     9.182597
PRR5      7.849822
MMP12     7.687254
RBM38     7.266307
SOS1      7.058689
            ...   
GPX3     -0.975186
SUOX     -0.975301
SPRED2   -0.975567
DEGS1    -0.975742
SAFB     -0.975808
Length: 6442, dtype: float64

In [275]:
# save phenotype
y.to_pickle(OUTPUT_DIR / f"{phenotype_name}.pkl.xz")