# Description

(Please, take a look at the README.md file in this directory for instructions on how to run this notebook)

This notebook compiles information about the GWAS and TWAS for a particular cohort. For example, the set of GWAS variants, variance of predicted expression of genes, etc.

It has specific parameters for papermill (see under `Settings` below).

This notebook is not directly run. See README.md.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pickle

import numpy as np

import pandas as pd
from tqdm import tqdm
import pytest

import conf
from entity import Gene

# Settings

In [3]:
# a cohort name (it could be something like UK_BIOBANK, etc); it is lowercased
# below and used as part of the output directory path
COHORT_NAME = None

# reference panel such as 1000G or GTEX_V8
REFERENCE_PANEL = None

# prediction models such as MASHR or ELASTIC_NET
EQTL_MODEL = None

# a string with a path pointing to an imputed GWAS (tab-separated file that
# must include the column 'panel_variant_id')
GWAS_FILE = None

# a string with a path pointing to the folder where the S-PrediXcan results
# (tissue-specific) are located
SPREDIXCAN_FOLDER = None

# a format string (filled in with str.format) with one placeholder {tissue}
SPREDIXCAN_FILE_PATTERN = None

# a string with a path pointing to an S-MultiXcan result
SMULTIXCAN_FILE = None

In [4]:
# Parameters
COHORT_NAME = "1000G_EUR"
REFERENCE_PANEL = "1000G"
EQTL_MODEL = "MASHR"
GWAS_FILE = "/opt/data/results/gls/null_sims/final_imputed_gwas/random.pheno0.glm-imputed.txt.gz"
SPREDIXCAN_FOLDER = "/opt/data/results/gls/null_sims/twas/spredixcan/"
SPREDIXCAN_FILE_PATTERN = "random.pheno0-gtex_v8-mashr-{tissue}.csv"
SMULTIXCAN_FILE = "/opt/data/results/gls/null_sims/twas/smultixcan/random.pheno0-gtex_v8-mashr-smultixcan.txt"


In [5]:
assert COHORT_NAME is not None and len(COHORT_NAME) > 0, "A cohort name must be given"

COHORT_NAME = COHORT_NAME.lower()
display(f"Cohort name: {COHORT_NAME}")

'Cohort name: 1000g_eur'

In [6]:
assert (
    REFERENCE_PANEL is not None and len(REFERENCE_PANEL) > 0
), "A reference panel must be given"

display(f"Reference panel: {REFERENCE_PANEL}")

'Reference panel: 1000G'

In [7]:
assert GWAS_FILE is not None and len(GWAS_FILE) > 0, "A GWAS file path must be given"
GWAS_FILE = Path(GWAS_FILE).resolve()
assert GWAS_FILE.exists(), "GWAS file does not exist"

display(f"GWAS file path: {str(GWAS_FILE)}")

'GWAS file path: /opt/data/results/gls/null_sims/final_imputed_gwas/random.pheno0.glm-imputed.txt.gz'

In [8]:
assert (
    SPREDIXCAN_FOLDER is not None and len(SPREDIXCAN_FOLDER) > 0
), "An S-PrediXcan folder path must be given"
SPREDIXCAN_FOLDER = Path(SPREDIXCAN_FOLDER).resolve()
assert SPREDIXCAN_FOLDER.exists(), "S-PrediXcan folder does not exist"

display(f"S-PrediXcan folder path: {str(SPREDIXCAN_FOLDER)}")

'S-PrediXcan folder path: /opt/data/results/gls/null_sims/twas/spredixcan'

In [9]:
assert (
    SPREDIXCAN_FILE_PATTERN is not None and len(SPREDIXCAN_FILE_PATTERN) > 0
), "An S-PrediXcan file pattern must be given"
assert (
    "{tissue}" in SPREDIXCAN_FILE_PATTERN
), "S-PrediXcan file pattern must have a '{tissue}' placeholder"

display(f"S-PrediXcan file template: {SPREDIXCAN_FILE_PATTERN}")

'S-PrediXcan file template: random.pheno0-gtex_v8-mashr-{tissue}.csv'

In [10]:
assert (
    SMULTIXCAN_FILE is not None and len(SMULTIXCAN_FILE) > 0
), "An S-MultiXcan result file path must be given"
SMULTIXCAN_FILE = Path(SMULTIXCAN_FILE).resolve()
assert SMULTIXCAN_FILE.exists(), "S-MultiXcan result file does not exist"

display(f"S-MultiXcan file path: {str(SMULTIXCAN_FILE)}")

'S-MultiXcan file path: /opt/data/results/gls/null_sims/twas/smultixcan/random.pheno0-gtex_v8-mashr-smultixcan.txt'

In [11]:
assert (
    EQTL_MODEL is not None and len(EQTL_MODEL) > 0
), "A prediction/eQTL model must be given"

display(f"eQTL model: {EQTL_MODEL}")

'eQTL model: MASHR'

In [12]:
OUTPUT_DIR_BASE = (
    conf.RESULTS["GLS"]
    / "gene_corrs"
    / "cohorts"
    / COHORT_NAME
    / REFERENCE_PANEL.lower()
    / EQTL_MODEL.lower()
)

OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/results/gls/gene_corrs/cohorts/1000g_eur/1000g/mashr'

# Load MultiPLIER Z genes

In [13]:
multiplier_z_genes = pd.read_pickle(
    conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"]
).index.tolist()

In [14]:
len(multiplier_z_genes)

6750

In [15]:
assert len(multiplier_z_genes) == len(set(multiplier_z_genes))

In [16]:
multiplier_z_genes[:5]

['GAS6', 'MMP14', 'DSP', 'MARCKSL1', 'SPARC']

# GWAS

In [17]:
# Columns that this notebook loads from the imputed GWAS file (the same ones
# requested with `usecols` when the full file is read). Checking all of them
# here gives a clear error message before the large file is parsed.
_gwas_required_columns = ["panel_variant_id", "pvalue", "zscore", "imputation_status"]

# only the header is needed for this check, so read just a couple of rows
gwas_file_columns = pd.read_csv(GWAS_FILE, sep="\t", nrows=2).columns
assert (
    "panel_variant_id" in gwas_file_columns
), "GWAS file must be final imputed one with column 'panel_variant_id'"

_gwas_missing_columns = [
    c for c in _gwas_required_columns if c not in gwas_file_columns
]
assert (
    not _gwas_missing_columns
), f"GWAS file is missing required columns: {_gwas_missing_columns}"

In [18]:
gwas_data = pd.read_csv(
    GWAS_FILE,
    sep="\t",
    usecols=["panel_variant_id", "pvalue", "zscore", "imputation_status"],
)

In [19]:
gwas_data.shape

(8339505, 4)

In [20]:
gwas_data.head()

Unnamed: 0,panel_variant_id,zscore,pvalue,imputation_status
0,chr1_54490_G_A_b38,1.567452,0.117009,original
1,chr1_87021_T_C_b38,0.244372,0.806943,imputed
2,chr1_263722_C_G_b38,1.155239,0.247993,imputed
3,chr1_594402_C_T_b38,0.851234,0.394639,imputed
4,chr1_630555_C_T_b38,0.855129,0.39248,imputed


In [21]:
gwas_data["imputation_status"].unique()

array(['original', 'imputed'], dtype=object)

In [22]:
gwas_data.dropna().shape

(8339505, 4)

In [23]:
# remove SNPs with no results
gwas_data = gwas_data.dropna()

In [24]:
gwas_data.shape

(8339505, 4)

## Save GWAS variants

In [25]:
gwas_data.head()

Unnamed: 0,panel_variant_id,zscore,pvalue,imputation_status
0,chr1_54490_G_A_b38,1.567452,0.117009,original
1,chr1_87021_T_C_b38,0.244372,0.806943,imputed
2,chr1_263722_C_G_b38,1.155239,0.247993,imputed
3,chr1_594402_C_T_b38,0.851234,0.394639,imputed
4,chr1_630555_C_T_b38,0.855129,0.39248,imputed


In [26]:
assert gwas_data["panel_variant_id"].is_unique

In [27]:
gwas_variants_ids_set = frozenset(gwas_data["panel_variant_id"])
list(gwas_variants_ids_set)[:5]

['chr2_119011847_T_C_b38',
 'chr14_69314678_G_A_b38',
 'chr6_153545649_G_C_b38',
 'chr16_76797711_A_C_b38',
 'chr1_54582638_G_A_b38']

In [28]:
with open(OUTPUT_DIR_BASE / "gwas_variant_ids.pkl", "wb") as handle:
    pickle.dump(gwas_variants_ids_set, handle, protocol=pickle.HIGHEST_PROTOCOL)

# TWAS

## Available tissues for eQTL model

In [29]:
prediction_model_tissues = conf.PHENOMEXCAN["PREDICTION_MODELS"][
    f"{EQTL_MODEL}_TISSUES"
].split(" ")

In [30]:
len(prediction_model_tissues)

49

In [31]:
prediction_model_tissues[:5]

['Skin_Not_Sun_Exposed_Suprapubic',
 'Cells_EBV-transformed_lymphocytes',
 'Brain_Frontal_Cortex_BA9',
 'Kidney_Cortex',
 'Brain_Substantia_nigra']

## S-MultiXcan results

In [32]:
smultixcan_results = pd.read_csv(
    SMULTIXCAN_FILE, sep="\t", usecols=["gene", "gene_name", "pvalue", "n", "n_indep"]
)

In [33]:
smultixcan_results.shape

(22317, 5)

In [34]:
smultixcan_results = smultixcan_results.dropna()

In [35]:
smultixcan_results.shape

(22314, 5)

In [36]:
smultixcan_results = smultixcan_results.assign(
    gene_id=smultixcan_results["gene"].apply(lambda g: g.split(".")[0])
)

In [37]:
smultixcan_results.head()

Unnamed: 0,gene,gene_name,pvalue,n,n_indep,gene_id
0,ENSG00000131941.7,RHPN2,4e-05,48.0,3.0,ENSG00000131941
1,ENSG00000076650.6,GPATCH1,7.8e-05,40.0,3.0,ENSG00000076650
2,ENSG00000100906.10,NFKBIA,9.6e-05,1.0,1.0,ENSG00000100906
3,ENSG00000136319.11,TTC5,0.000109,47.0,5.0,ENSG00000136319
4,ENSG00000152990.13,ADGRA3,0.000135,41.0,12.0,ENSG00000152990


In [38]:
assert smultixcan_results["gene_id"].is_unique

### Remove duplicated gene names

In [39]:
smultixcan_results["gene_name"].is_unique

False

In [40]:
# list duplicated gene names
_smultixcan_duplicated_gene_names = smultixcan_results[
    smultixcan_results["gene_name"].duplicated(keep=False)
]
display(_smultixcan_duplicated_gene_names)

Unnamed: 0,gene,gene_name,pvalue,n,n_indep,gene_id
1461,ENSG00000235641.4,LINC00484,0.06845,35.0,3.0,ENSG00000235641
2390,ENSG00000237667.5,LINC01115,0.112329,22.0,1.0,ENSG00000237667
5198,ENSG00000272342.1,LINC01115,0.243122,4.0,1.0,ENSG00000272342
5883,ENSG00000229694.6,LINC00484,0.274814,40.0,5.0,ENSG00000229694
6129,ENSG00000147676.13,MAL2,0.286003,29.0,5.0,ENSG00000147676
6207,ENSG00000283992.1,LYNX1,0.289628,45.0,3.0,ENSG00000283992
10559,ENSG00000180155.19,LYNX1,0.48696,48.0,3.0,ENSG00000180155
14026,ENSG00000235271.5,LINC01422,0.643355,38.0,6.0,ENSG00000235271
14251,ENSG00000253972.5,MAL2,0.654873,6.0,4.0,ENSG00000253972
20086,ENSG00000182957.15,SPATA13,0.906729,40.0,14.0,ENSG00000182957


In [41]:
# TODO: my strategy below to handle duplicated gene names is to keep the first one
#  it might be better to have another strategy, maybe keeping the most significant

In [42]:
smultixcan_results = smultixcan_results.drop_duplicates(
    subset=["gene_name"], keep="first"
)
display(smultixcan_results.shape)

(22308, 6)

### Get common genes with MultiPLIER

In [43]:
common_genes = set(multiplier_z_genes).intersection(
    set(smultixcan_results["gene_name"])
)

In [44]:
len(common_genes)

6444

In [45]:
sorted(list(common_genes))[:5]

['A2M', 'AAAS', 'AANAT', 'AARS', 'AARS2']

In [46]:
assert smultixcan_results[smultixcan_results["gene_name"].isin(common_genes)].shape[
    0
] == len(common_genes)

In [47]:
smultixcan_gene_id_common = smultixcan_results[
    smultixcan_results["gene_name"].isin(common_genes)
]["gene_id"]

In [48]:
smultixcan_gene_id_common.shape

(6444,)

In [49]:
assert smultixcan_gene_id_common.is_unique

In [50]:
smultixcan_gene_id_common = set(smultixcan_gene_id_common)

### Save

In [51]:
with open(OUTPUT_DIR_BASE / "common_genes.pkl", "wb") as handle:
    pickle.dump(common_genes, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Genes info

In [52]:
multiplier_gene_obj = {
    gene_name: Gene(name=gene_name)
    for gene_name in common_genes
    if gene_name in Gene.GENE_NAME_TO_ID_MAP
}

In [53]:
len(multiplier_gene_obj)

6444

In [54]:
assert multiplier_gene_obj["GAS6"].ensembl_id == "ENSG00000183087"

In [55]:
_gene_obj = list(multiplier_gene_obj.values())

genes_info = pd.DataFrame(
    {
        "name": [g.name for g in _gene_obj],
        "id": [g.ensembl_id for g in _gene_obj],
        "chr": [g.chromosome for g in _gene_obj],
        "band": [g.band for g in _gene_obj],
        "start_position": [g.get_attribute("start_position") for g in _gene_obj],
        "end_position": [g.get_attribute("end_position") for g in _gene_obj],
    }
)

In [56]:
# Gene length is the difference between the end and start positions. It is
# computed column-wise (vectorized) instead of row by row with `apply`; rows
# with a missing position yield NaN.
genes_info = genes_info.assign(
    gene_length=genes_info["end_position"] - genes_info["start_position"]
)

In [57]:
genes_info.shape

(6444, 7)

In [58]:
genes_info.head()

Unnamed: 0,name,id,chr,band,start_position,end_position,gene_length
0,SYNJ2,ENSG00000078269,6,6q25.3,157981863.0,158099176.0,117313.0
1,AMDHD2,ENSG00000162066,16,16p13.3,2520357.0,2531422.0,11065.0
2,CENPL,ENSG00000120334,1,1q25.1,173799550.0,173824720.0,25170.0
3,NPR2,ENSG00000159899,9,9p13.3,35792154.0,35809732.0,17578.0
4,H2AFZ,ENSG00000164032,4,4q23,99948086.0,99950355.0,2269.0


In [59]:
genes_info.sort_values("chr")

Unnamed: 0,name,id,chr,band,start_position,end_position,gene_length
5412,BEND5,ENSG00000162373,1,1p33,48727519.0,48776969.0,49450.0
3896,PTGS2,ENSG00000073756,1,1q31.1,186671791.0,186680423.0,8632.0
3886,PHLDA3,ENSG00000174307,1,1q32.1,201464383.0,201469237.0,4854.0
1021,SORT1,ENSG00000134243,1,1p13.3,109309568.0,109397918.0,88350.0
3877,MEF2D,ENSG00000116604,1,1q22,156463727.0,156500779.0,37052.0
...,...,...,...,...,...,...,...
4718,NCBP1,ENSG00000136937,9,9q22.33,97633668.0,97673748.0,40080.0
3739,ZFP37,ENSG00000136866,9,9q32,113038380.0,113056759.0,18379.0
1526,RPL7A,ENSG00000148303,9,9q34.2,133348218.0,133351426.0,3208.0
951,TMEM133,ENSG00000170647,,,,,


### Save

In [60]:
genes_info.to_pickle(OUTPUT_DIR_BASE / "genes_info.pkl")

## S-PrediXcan results

### Load results across all tissues

In [61]:
spredixcan_result_files = {
    t: SPREDIXCAN_FOLDER / SPREDIXCAN_FILE_PATTERN.format(tissue=t)
    for t in prediction_model_tissues
}

In [62]:
assert len(spredixcan_result_files) == len(prediction_model_tissues)
display(list(spredixcan_result_files.values())[:5])

[PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Skin_Not_Sun_Exposed_Suprapubic.csv'),
 PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Cells_EBV-transformed_lymphocytes.csv'),
 PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Brain_Frontal_Cortex_BA9.csv'),
 PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Kidney_Cortex.csv'),
 PosixPath('/opt/data/results/gls/null_sims/twas/spredixcan/random.pheno0-gtex_v8-mashr-Brain_Substantia_nigra.csv')]

In [63]:
# look at the structure of one result
pd.read_csv(spredixcan_result_files["Whole_Blood"]).head()

Unnamed: 0,gene,gene_name,zscore,effect_size,pvalue,var_g,pred_perf_r2,pred_perf_pval,pred_perf_qval,n_snps_used,n_snps_in_cov,n_snps_in_model,best_gwas_p,largest_weight
0,ENSG00000131236.16,CAP1,-3.882684,,0.000103,8.3699e-05,,,,1,1,1,0.000103,0.021593
1,ENSG00000130787.13,HIP1R,3.862583,,0.000112,0.003039669,,,,2,2,2,0.001681,0.087495
2,ENSG00000103018.16,CYB5B,3.478682,,0.000504,0.006455623,,,,1,1,1,0.000504,0.114291
3,ENSG00000117906.13,RCN2,3.431258,,0.000601,5.848182e-07,,,,1,1,1,0.000601,0.001696
4,ENSG00000135469.13,COQ10A,3.363631,,0.000769,0.0004312235,,,,2,2,2,0.013565,0.06446


In [64]:
assert all(f.exists() for f in spredixcan_result_files.values())

In [65]:
spredixcan_dfs = [
    pd.read_csv(
        f,
        usecols=[
            "gene",
            "gene_name",
            "zscore",
            "pvalue",
            "n_snps_used",
            "n_snps_in_model",
        ],
    )
    .dropna(subset=["gene", "zscore", "pvalue"])
    .assign(tissue=t)
    for t, f in spredixcan_result_files.items()
]

In [66]:
assert len(spredixcan_dfs) == len(prediction_model_tissues)

In [67]:
spredixcan_dfs = pd.concat(spredixcan_dfs)

In [68]:
assert spredixcan_dfs["tissue"].unique().shape[0] == len(prediction_model_tissues)

In [69]:
spredixcan_dfs.shape

(653140, 7)

In [70]:
spredixcan_dfs = spredixcan_dfs.assign(
    gene_id=spredixcan_dfs["gene"].apply(lambda g: g.split(".")[0])
)

In [71]:
spredixcan_dfs.head()

Unnamed: 0,gene,gene_name,zscore,pvalue,n_snps_used,n_snps_in_model,tissue,gene_id
0,ENSG00000131236.16,CAP1,-3.882684,0.000103,1,1,Skin_Not_Sun_Exposed_Suprapubic,ENSG00000131236
1,ENSG00000196172.9,ZNF681,-3.710444,0.000207,1,1,Skin_Not_Sun_Exposed_Suprapubic,ENSG00000196172
2,ENSG00000280789.1,PAGR1,3.403263,0.000666,2,2,Skin_Not_Sun_Exposed_Suprapubic,ENSG00000280789
3,ENSG00000215218.3,UBE2QL1,3.385093,0.000712,3,3,Skin_Not_Sun_Exposed_Suprapubic,ENSG00000215218
4,ENSG00000167371.17,PRRT2,3.381834,0.00072,2,2,Skin_Not_Sun_Exposed_Suprapubic,ENSG00000167371


In [72]:
# leave only common genes
spredixcan_dfs = spredixcan_dfs[
    spredixcan_dfs["gene_id"].isin(smultixcan_gene_id_common)
].drop(columns=["gene_name"])

In [73]:
spredixcan_dfs.shape

(233567, 7)

### Count number of tissues available per gene

In [74]:
spredixcan_genes_n_models = spredixcan_dfs.groupby("gene_id")["tissue"].nunique()

In [75]:
spredixcan_genes_n_models

gene_id
ENSG00000000419     2
ENSG00000000938    36
ENSG00000000971    34
ENSG00000001084    32
ENSG00000001167    40
                   ..
ENSG00000278540    36
ENSG00000278828     4
ENSG00000278845    49
ENSG00000281005    49
ENSG00000282608    36
Name: tissue, Length: 6444, dtype: int64

In [76]:
# testing that in S-MultiXcan I get the same number of tissues per gene
_tmp_smultixcan_results_n_models = (
    smultixcan_results.set_index("gene_id")["n"].astype(int).rename("tissue")
)

_cg = _tmp_smultixcan_results_n_models.index.intersection(
    spredixcan_genes_n_models.index
)
_tmp_smultixcan_results_n_models = _tmp_smultixcan_results_n_models.loc[_cg]
_spredixcan = spredixcan_genes_n_models.loc[_cg]

assert _spredixcan.shape[0] == _tmp_smultixcan_results_n_models.shape[0]
assert _spredixcan.equals(_tmp_smultixcan_results_n_models.loc[_spredixcan.index])

### Get tissues available per gene

In [77]:
spredixcan_genes_models = spredixcan_dfs.groupby("gene_id")["tissue"].apply(
    lambda x: frozenset(x.tolist())
)

In [78]:
spredixcan_genes_models

gene_id
ENSG00000000419         (Brain_Hypothalamus, Brain_Substantia_nigra)
ENSG00000000938    (Brain_Hippocampus, Artery_Tibial, Brain_Anter...
ENSG00000000971    (Brain_Hippocampus, Artery_Tibial, Brain_Anter...
ENSG00000001084    (Brain_Hippocampus, Artery_Tibial, Brain_Anter...
ENSG00000001167    (Brain_Hippocampus, Artery_Tibial, Brain_Anter...
                                         ...                        
ENSG00000278540    (Brain_Hippocampus, Artery_Tibial, Brain_Anter...
ENSG00000278828    (Esophagus_Muscularis, Artery_Coronary, Adipos...
ENSG00000278845    (Brain_Hippocampus, Artery_Tibial, Brain_Anter...
ENSG00000281005    (Brain_Hippocampus, Artery_Tibial, Brain_Anter...
ENSG00000282608    (Brain_Hippocampus, Artery_Tibial, Brain_Anter...
Name: tissue, Length: 6444, dtype: object

In [79]:
assert spredixcan_genes_n_models.shape[0] == spredixcan_genes_models.shape[0]

In [80]:
assert spredixcan_genes_n_models.index.equals(spredixcan_genes_models.index)

In [81]:
assert (spredixcan_genes_models.apply(len) <= len(prediction_model_tissues)).all()

In [82]:
spredixcan_genes_models.apply(len).describe()

count    6444.000000
mean       36.245655
std        12.962835
min         1.000000
25%        29.000000
50%        41.000000
75%        47.000000
max        49.000000
Name: tissue, dtype: float64

In [83]:
# testing
assert (
    spredixcan_genes_models.loc[spredixcan_genes_n_models.index]
    .apply(len)
    .equals(spredixcan_genes_n_models)
)

### Add gene name and set index

In [84]:
spredixcan_genes_models = spredixcan_genes_models.to_frame().reset_index()

In [85]:
spredixcan_genes_models.head()

Unnamed: 0,gene_id,tissue
0,ENSG00000000419,"(Brain_Hypothalamus, Brain_Substantia_nigra)"
1,ENSG00000000938,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter..."
2,ENSG00000000971,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter..."
3,ENSG00000001084,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter..."
4,ENSG00000001167,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter..."


In [86]:
spredixcan_genes_models = spredixcan_genes_models.assign(
    gene_name=spredixcan_genes_models["gene_id"].apply(
        lambda g: Gene.GENE_ID_TO_NAME_MAP[g]
    )
)

In [87]:
spredixcan_genes_models = spredixcan_genes_models[["gene_id", "gene_name", "tissue"]]

In [88]:
spredixcan_genes_models = spredixcan_genes_models.set_index("gene_id")

In [89]:
spredixcan_genes_models.head()

Unnamed: 0_level_0,gene_name,tissue
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)"
ENSG00000000938,FGR,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter..."
ENSG00000000971,CFH,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter..."
ENSG00000001084,GCLC,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter..."
ENSG00000001167,NFYA,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter..."


### Add number of tissues

In [90]:
spredixcan_genes_models = spredixcan_genes_models.assign(
    n_tissues=spredixcan_genes_models["tissue"].apply(len)
)

In [91]:
spredixcan_genes_models.head()

Unnamed: 0_level_0,gene_name,tissue,n_tissues
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2
ENSG00000000938,FGR,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",36
ENSG00000000971,CFH,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",34
ENSG00000001084,GCLC,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",32
ENSG00000001167,NFYA,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",40


### Get gene's objects

In [92]:
spredixcan_gene_obj = {
    gene_id: Gene(ensembl_id=gene_id) for gene_id in spredixcan_genes_models.index
}

In [93]:
len(spredixcan_gene_obj)

6444

### Add genes' variance captured by principal components

In [94]:
def _get_gene_pc_variance(gene_row):
    """
    Runs the SVD of a gene's tissue correlations and returns its `s` component.

    Args:
        gene_row: a row of `spredixcan_genes_models`; the row name is the gene
            ID and its "tissue" entry holds the tissues available for the gene.

    Returns:
        The second element of the tuple returned by
        `Gene.get_tissues_correlations_svd` (presumably the variance captured
        by each principal component; confirm against the `Gene` class).
    """
    tissues = gene_row["tissue"]
    gene = spredixcan_gene_obj[gene_row.name]

    svd_results = gene.get_tissues_correlations_svd(
        tissues=tissues,
        snps_subset=gwas_variants_ids_set,
        reference_panel=REFERENCE_PANEL,
        model_type=EQTL_MODEL,
        # `use_covariance_matrix` is deliberately not given in this version
    )

    # only the middle component of the decomposition is needed
    _, pc_variances, _ = svd_results
    return pc_variances

In [95]:
_tmp = spredixcan_genes_models.loc["ENSG00000188976"]
_get_gene_pc_variance(_tmp)

array([34.04228543,  4.46235102,  2.10095291,  2.01662842,  1.39383852])

In [96]:
spredixcan_genes_tissues_pc_variance = spredixcan_genes_models.apply(
    _get_gene_pc_variance, axis=1
)

In [97]:
spredixcan_genes_tissues_pc_variance

gene_id
ENSG00000000419             [1.0372585612589562, 0.9627414387410438]
ENSG00000000938    [30.57880701512375, 2.0297326456001112, 1.4017...
ENSG00000000971    [20.772608276639335, 8.177158142324833, 1.8646...
ENSG00000001084    [20.97644994026909, 4.772354749306766, 2.23620...
ENSG00000001167                                  [37.63836978240501]
                                         ...                        
ENSG00000278540    [30.32818460171692, 3.2291618901090953, 1.3858...
ENSG00000278828              [3.053206216887738, 0.9467937831122616]
ENSG00000278845              [45.592035288667724, 2.289944011540383]
ENSG00000281005                                  [48.39224693554061]
ENSG00000282608    [22.226039467557197, 6.465743613747885, 2.7570...
Length: 6444, dtype: object

In [98]:
# testing
assert spredixcan_genes_tissues_pc_variance.loc[
    "ENSG00000188976"
].sum() == pytest.approx(44.01605629086847)
# this is using the covariance:
# assert spredixcan_genes_tissues_pc_variance.loc["ENSG00000188976"].sum() == pytest.approx(1.1492946006449425)

In [99]:
# add to spredixcan_genes_models
spredixcan_genes_models = spredixcan_genes_models.join(
    spredixcan_genes_tissues_pc_variance.rename("tissues_pc_variances")
)

In [100]:
spredixcan_genes_models.shape

(6444, 4)

In [101]:
spredixcan_genes_models.head()

Unnamed: 0_level_0,gene_name,tissue,n_tissues,tissues_pc_variances
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,"[1.0372585612589562, 0.9627414387410438]"
ENSG00000000938,FGR,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",36,"[30.57880701512375, 2.0297326456001112, 1.4017..."
ENSG00000000971,CFH,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",34,"[20.772608276639335, 8.177158142324833, 1.8646..."
ENSG00000001084,GCLC,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",32,"[20.97644994026909, 4.772354749306766, 2.23620..."
ENSG00000001167,NFYA,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",40,[37.63836978240501]


### Add genes' variance captured by principal components (covariance)

In [102]:
# NOTE: this redefines (shadows) the function with the same name defined
# earlier in this notebook; the only difference is `use_covariance_matrix=True`.
def _get_gene_pc_variance(gene_row):
    """
    Runs the SVD of a gene's tissue matrix, requesting the covariance matrix,
    and returns its `s` component.

    Args:
        gene_row: a row of `spredixcan_genes_models`; the row name is the gene
            ID and its "tissue" entry holds the tissues available for the gene.

    Returns:
        The second element of the tuple returned by
        `Gene.get_tissues_correlations_svd` when called with
        `use_covariance_matrix=True` (presumably the variance captured by each
        principal component; confirm against the `Gene` class).
    """
    tissues = gene_row["tissue"]
    gene = spredixcan_gene_obj[gene_row.name]

    svd_results = gene.get_tissues_correlations_svd(
        tissues=tissues,
        snps_subset=gwas_variants_ids_set,
        reference_panel=REFERENCE_PANEL,
        model_type=EQTL_MODEL,
        use_covariance_matrix=True,
    )

    # only the middle component of the decomposition is needed
    _, pc_variances, _ = svd_results
    return pc_variances

In [103]:
_tmp = spredixcan_genes_models.loc["ENSG00000188976"]
_get_gene_pc_variance(_tmp)

array([0.99215632, 0.1187998 , 0.03833847])

In [104]:
spredixcan_genes_tissues_pc_variance = spredixcan_genes_models.apply(
    _get_gene_pc_variance, axis=1
)

In [105]:
spredixcan_genes_tissues_pc_variance

gene_id
ENSG00000000419        [0.008284978865240098, 0.0007326748393412591]
ENSG00000000938          [0.22478471249969328, 0.008813873813031786]
ENSG00000000971    [0.196995342795801, 0.057318253163491664, 0.02...
ENSG00000001084    [0.36578096389253933, 0.1264541371042455, 0.01...
ENSG00000001167            [1.4787144508923993, 0.04969061838773912]
                                         ...                        
ENSG00000278540          [0.12964331457191375, 0.012149255372479935]
ENSG00000278828       [0.0038984582827012834, 0.0024822589593948897]
ENSG00000278845            [1.0985436416424623, 0.04839888608750839]
ENSG00000281005                                 [3.7503119195560304]
ENSG00000282608    [0.23852663455925402, 0.034896956647310455, 0....
Length: 6444, dtype: object

In [106]:
# testing
# assert spredixcan_genes_tissues_pc_variance.loc["ENSG00000188976"].sum() == pytest.approx(44.01605629086847)
# this is using the covariance:
assert spredixcan_genes_tissues_pc_variance.loc[
    "ENSG00000188976"
].sum() == pytest.approx(1.1492946006449425)

In [107]:
# add to spredixcan_genes_models
spredixcan_genes_models = spredixcan_genes_models.join(
    spredixcan_genes_tissues_pc_variance.rename("tissues_pc_variances_cov")
)

In [108]:
spredixcan_genes_models.shape

(6444, 5)

In [109]:
spredixcan_genes_models.head()

Unnamed: 0_level_0,gene_name,tissue,n_tissues,tissues_pc_variances,tissues_pc_variances_cov
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,"[1.0372585612589562, 0.9627414387410438]","[0.008284978865240098, 0.0007326748393412591]"
ENSG00000000938,FGR,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",36,"[30.57880701512375, 2.0297326456001112, 1.4017...","[0.22478471249969328, 0.008813873813031786]"
ENSG00000000971,CFH,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",34,"[20.772608276639335, 8.177158142324833, 1.8646...","[0.196995342795801, 0.057318253163491664, 0.02..."
ENSG00000001084,GCLC,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",32,"[20.97644994026909, 4.772354749306766, 2.23620...","[0.36578096389253933, 0.1264541371042455, 0.01..."
ENSG00000001167,NFYA,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",40,[37.63836978240501],"[1.4787144508923993, 0.04969061838773912]"


### Add gene variance per tissue

In [110]:
def _get_gene_variances(gene_row):
    """
    Collects the variance of a gene's predicted expression in each tissue.

    Args:
        gene_row: a row of `spredixcan_genes_models`; the row name is the gene
            ID and its "tissue" entry holds the tissues available for the gene.

    Returns:
        A dictionary mapping each tissue name to the value returned by
        `Gene.get_pred_expression_variance` for that tissue. Tissues for which
        that method returns None are left out.
    """
    tissues = gene_row["tissue"]
    gene = spredixcan_gene_obj[gene_row.name]

    variances_by_tissue = {}
    for tissue_name in tissues:
        variance = gene.get_pred_expression_variance(
            tissue=tissue_name,
            reference_panel=REFERENCE_PANEL,
            model_type=EQTL_MODEL,
            snps_subset=gwas_variants_ids_set,
        )

        # skip tissues with no variance available
        if variance is None:
            continue

        variances_by_tissue[tissue_name] = variance

    return variances_by_tissue

In [111]:
_tmp = spredixcan_genes_models.loc["ENSG00000000419"]
_get_gene_variances(_tmp)

{'Brain_Hypothalamus': 0.0082838613770255,
 'Brain_Substantia_nigra': 0.0007337923275558569}

In [112]:
spredixcan_genes_tissues_variance = spredixcan_genes_models.apply(
    _get_gene_variances, axis=1
)

In [113]:
spredixcan_genes_tissues_variance

gene_id
ENSG00000000419    {'Brain_Hypothalamus': 0.0082838613770255, 'Br...
ENSG00000000938    {'Brain_Hippocampus': 0.006367955494129906, 'A...
ENSG00000000971    {'Brain_Hippocampus': 0.005965148549549016, 'A...
ENSG00000001084    {'Brain_Hippocampus': 0.009716241262648275, 'A...
ENSG00000001167    {'Brain_Hippocampus': 0.016125723366546694, 'A...
                                         ...                        
ENSG00000278540    {'Brain_Hippocampus': 0.003810106598614404, 'A...
ENSG00000278828    {'Esophagus_Muscularis': 0.0028219750571598346...
ENSG00000278845    {'Brain_Hippocampus': 0.03965092336361112, 'Ar...
ENSG00000281005    {'Brain_Hippocampus': 0.08252181457159478, 'Ar...
ENSG00000282608    {'Brain_Hippocampus': 0.0022477482402160683, '...
Length: 6444, dtype: object

In [114]:
# testing
_gene_id = "ENSG00000188976"
x = spredixcan_genes_tissues_variance.loc[_gene_id]
# expected value obtained by sum of PCA eigenvalues on this gene's predicted expression
assert np.sum(list(x.values())) == pytest.approx(1.2326202607409493)

In [115]:
# testing
spredixcan_genes_tissues_variance.loc["ENSG00000000419"]

{'Brain_Hypothalamus': 0.0082838613770255,
 'Brain_Substantia_nigra': 0.0007337923275558569}

In [116]:
# FIXME: maybe add more tests, these differ from GTEX V8
# # testing
# # here values were obtained from S-PrediXcan results, where the reference panel is GTEX V8, not 1000G, so just approximations)
# _gene_id = "ENSG00000000419"
# assert spredixcan_genes_tissues_variance.loc[_gene_id]["Brain_Substantia_nigra"] == pytest.approx(0.0004266255268163448)
# assert spredixcan_genes_tissues_variance.loc[_gene_id]["Brain_Hypothalamus"] == pytest.approx(0.011235877515236132)

In [117]:
# add to spredixcan_genes_models
spredixcan_genes_models = spredixcan_genes_models.join(
    spredixcan_genes_tissues_variance.rename("tissues_variances")
)

In [118]:
spredixcan_genes_models.shape

(6444, 6)

In [119]:
spredixcan_genes_models.head()

Unnamed: 0_level_0,gene_name,tissue,n_tissues,tissues_pc_variances,tissues_pc_variances_cov,tissues_variances
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,"[1.0372585612589562, 0.9627414387410438]","[0.008284978865240098, 0.0007326748393412591]","{'Brain_Hypothalamus': 0.0082838613770255, 'Br..."
ENSG00000000938,FGR,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",36,"[30.57880701512375, 2.0297326456001112, 1.4017...","[0.22478471249969328, 0.008813873813031786]","{'Brain_Hippocampus': 0.006367955494129906, 'A..."
ENSG00000000971,CFH,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",34,"[20.772608276639335, 8.177158142324833, 1.8646...","[0.196995342795801, 0.057318253163491664, 0.02...","{'Brain_Hippocampus': 0.005965148549549016, 'A..."
ENSG00000001084,GCLC,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",32,"[20.97644994026909, 4.772354749306766, 2.23620...","[0.36578096389253933, 0.1264541371042455, 0.01...","{'Brain_Hippocampus': 0.009716241262648275, 'A..."
ENSG00000001167,NFYA,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",40,[37.63836978240501],"[1.4787144508923993, 0.04969061838773912]","{'Brain_Hippocampus': 0.016125723366546694, 'A..."


### Count number of SNPs predictors used across tissue models

In [120]:
spredixcan_genes_sum_of_n_snps_used = (
    spredixcan_dfs.groupby("gene_id")["n_snps_used"].sum().rename("n_snps_used_sum")
)

In [121]:
spredixcan_genes_sum_of_n_snps_used

gene_id
ENSG00000000419     2
ENSG00000000938    40
ENSG00000000971    44
ENSG00000001084    46
ENSG00000001167    47
                   ..
ENSG00000278540    44
ENSG00000278828     5
ENSG00000278845    89
ENSG00000281005    81
ENSG00000282608    40
Name: n_snps_used_sum, Length: 6444, dtype: int64

In [122]:
# add sum of snps used to spredixcan_genes_models
spredixcan_genes_models = spredixcan_genes_models.join(
    spredixcan_genes_sum_of_n_snps_used
)

In [123]:
spredixcan_genes_models.shape

(6444, 7)

In [124]:
spredixcan_genes_models.head()

Unnamed: 0_level_0,gene_name,tissue,n_tissues,tissues_pc_variances,tissues_pc_variances_cov,tissues_variances,n_snps_used_sum
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,"[1.0372585612589562, 0.9627414387410438]","[0.008284978865240098, 0.0007326748393412591]","{'Brain_Hypothalamus': 0.0082838613770255, 'Br...",2
ENSG00000000938,FGR,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",36,"[30.57880701512375, 2.0297326456001112, 1.4017...","[0.22478471249969328, 0.008813873813031786]","{'Brain_Hippocampus': 0.006367955494129906, 'A...",40
ENSG00000000971,CFH,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",34,"[20.772608276639335, 8.177158142324833, 1.8646...","[0.196995342795801, 0.057318253163491664, 0.02...","{'Brain_Hippocampus': 0.005965148549549016, 'A...",44
ENSG00000001084,GCLC,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",32,"[20.97644994026909, 4.772354749306766, 2.23620...","[0.36578096389253933, 0.1264541371042455, 0.01...","{'Brain_Hippocampus': 0.009716241262648275, 'A...",46
ENSG00000001167,NFYA,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",40,[37.63836978240501],"[1.4787144508923993, 0.04969061838773912]","{'Brain_Hippocampus': 0.016125723366546694, 'A...",47


### Count number of SNPs predictors in models across tissue models

In [125]:
spredixcan_genes_sum_of_n_snps_in_model = (
    spredixcan_dfs.groupby("gene_id")["n_snps_in_model"]
    .sum()
    .rename("n_snps_in_model_sum")
)

In [126]:
spredixcan_genes_sum_of_n_snps_in_model

gene_id
ENSG00000000419     2
ENSG00000000938    40
ENSG00000000971    44
ENSG00000001084    46
ENSG00000001167    48
                   ..
ENSG00000278540    44
ENSG00000278828     5
ENSG00000278845    91
ENSG00000281005    81
ENSG00000282608    40
Name: n_snps_in_model_sum, Length: 6444, dtype: int64

In [127]:
# add sum of snps in model to spredixcan_genes_models
spredixcan_genes_models = spredixcan_genes_models.join(
    spredixcan_genes_sum_of_n_snps_in_model
)

In [128]:
spredixcan_genes_models.shape

(6444, 8)

In [129]:
spredixcan_genes_models.head()

Unnamed: 0_level_0,gene_name,tissue,n_tissues,tissues_pc_variances,tissues_pc_variances_cov,tissues_variances,n_snps_used_sum,n_snps_in_model_sum
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,"[1.0372585612589562, 0.9627414387410438]","[0.008284978865240098, 0.0007326748393412591]","{'Brain_Hypothalamus': 0.0082838613770255, 'Br...",2,2
ENSG00000000938,FGR,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",36,"[30.57880701512375, 2.0297326456001112, 1.4017...","[0.22478471249969328, 0.008813873813031786]","{'Brain_Hippocampus': 0.006367955494129906, 'A...",40,40
ENSG00000000971,CFH,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",34,"[20.772608276639335, 8.177158142324833, 1.8646...","[0.196995342795801, 0.057318253163491664, 0.02...","{'Brain_Hippocampus': 0.005965148549549016, 'A...",44,44
ENSG00000001084,GCLC,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",32,"[20.97644994026909, 4.772354749306766, 2.23620...","[0.36578096389253933, 0.1264541371042455, 0.01...","{'Brain_Hippocampus': 0.009716241262648275, 'A...",46,46
ENSG00000001167,NFYA,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",40,[37.63836978240501],"[1.4787144508923993, 0.04969061838773912]","{'Brain_Hippocampus': 0.016125723366546694, 'A...",47,48


### Summarize prediction models for each gene

In [130]:
def _summarize_gene_models(gene_id):
    """
    Builds a SNP-by-tissue matrix with the prediction weights of a gene.

    Args:
        gene_id: gene identifier; it must be a key of `spredixcan_gene_obj` and
            a row label of `spredixcan_genes_models`.

    Returns:
        A dataframe with predictor SNPs in rows and the gene's tissues in columns,
        where values are the weights of SNPs in those tissues. Rows are the union
        of the SNPs of all tissue models, sorted so the layout is reproducible
        across runs. A cell is NaN when the SNP is not part of that tissue's model.
    """
    gene_obj = spredixcan_gene_obj[gene_id]
    gene_tissues = spredixcan_genes_models.loc[gene_id, "tissue"]

    # weights (indexed by SNP) of each tissue-specific model of this gene
    tissue_weights = {
        t: gene_obj.get_prediction_weights(tissue=t, model_type=EQTL_MODEL)
        for t in gene_tissues
    }

    gene_unique_snps = set()
    for weights in tissue_weights.values():
        gene_unique_snps.update(weights.index)

    # iterating a set of strings has no stable order between interpreter runs,
    # so the row labels are sorted to get a deterministic result
    df = pd.DataFrame(
        data=np.nan, index=sorted(gene_unique_snps), columns=list(gene_tissues)
    )

    # fill one whole column per tissue: reindex aligns the model's weights to the
    # rows of `df` and leaves NaN for SNPs that the tissue model does not include
    for t, weights in tissue_weights.items():
        df[t] = weights.reindex(df.index)

    return df

In [131]:
# testing: inspect the weights of this gene's Brain_Hypothalamus model; the values
# shown here are the ones asserted against the summarized model below
spredixcan_gene_obj["ENSG00000000419"].get_prediction_weights(
    tissue="Brain_Hypothalamus", model_type=EQTL_MODEL
)

varID
chr20_50862947_C_T_b38    0.431375
Name: weight, dtype: float64

In [132]:
# testing: same gene, weights of its Brain_Substantia_nigra model
spredixcan_gene_obj["ENSG00000000419"].get_prediction_weights(
    tissue="Brain_Substantia_nigra", model_type=EQTL_MODEL
)

varID
chr20_50957480_C_T_b38   -0.146796
Name: weight, dtype: float64

In [133]:
# testing: each SNP must carry its weight in its own tissue and be missing in the other
_gene_id = "ENSG00000000419"
_gene_model = _summarize_gene_models(_gene_id)

# Brain_Hypothalamus column
assert _gene_model.at["chr20_50862947_C_T_b38", "Brain_Hypothalamus"].round(5) == 0.43138
assert pd.isna(_gene_model.at["chr20_50957480_C_T_b38", "Brain_Hypothalamus"])

# Brain_Substantia_nigra column
assert pd.isna(_gene_model.at["chr20_50862947_C_T_b38", "Brain_Substantia_nigra"])
assert (
    _gene_model.at["chr20_50957480_C_T_b38", "Brain_Substantia_nigra"].round(5) == -0.1468
)

In [134]:
# summarize the prediction models of every gene: maps each gene ID to its
# SNP-by-tissue dataframe of weights (see _summarize_gene_models)
gene_models = {}

for gene_id in spredixcan_genes_models.index:
    gene_models[gene_id] = _summarize_gene_models(gene_id)

In [135]:
# testing: the entry stored in `gene_models` must hold the same weights checked before
_gene_id = "ENSG00000000419"
_gene_model = gene_models[_gene_id]

# Brain_Hypothalamus column
assert _gene_model.at["chr20_50862947_C_T_b38", "Brain_Hypothalamus"].round(5) == 0.43138
assert pd.isna(_gene_model.at["chr20_50957480_C_T_b38", "Brain_Hypothalamus"])

# Brain_Substantia_nigra column
assert pd.isna(_gene_model.at["chr20_50862947_C_T_b38", "Brain_Substantia_nigra"])
assert (
    _gene_model.at["chr20_50957480_C_T_b38", "Brain_Substantia_nigra"].round(5) == -0.1468
)

In [136]:
# save: write the per-gene weight matrices as a gzip-compressed pickle
import gzip

with gzip.open(OUTPUT_DIR_BASE / "gene_tissues_models.pkl.gz", "wb") as f:
    pickle.dump(gene_models, f)

In [137]:
# testing saved file: read back the pickle that was just written
# (pickle.load is only used here on a file produced by this notebook)
with gzip.open(OUTPUT_DIR_BASE / "gene_tissues_models.pkl.gz", "rb") as f:
    _tmp = pickle.load(f)

In [138]:
# the reloaded dictionary must have the same number of genes, and the dataframe
# of a known gene must be equal to the in-memory one
assert len(gene_models) == len(_tmp)
assert gene_models["ENSG00000000419"].equals(_tmp["ENSG00000000419"])

### Count the number of _unique_ SNP predictors available and used across tissue models

In [139]:
def _count_unique_snps(gene_id):
    """
    For a gene_id, it counts unique SNPs in all models and their intersection with GWAS SNPs (therefore, used by S-PrediXcan).

    Args:
        gene_id: gene identifier; it must be a key of `gene_models`.

    Returns:
        A pandas Series with two entries:
            - "unique_n_snps_in_model": number of distinct SNPs across the gene's
              tissue models.
            - "unique_n_snps_used": how many of those SNPs are also present in
              `gwas_variants_ids_set`.
    """
    # the rows of the gene's summarized model are the SNPs of all its tissue
    # models, so the set is taken once instead of once per tissue
    gene_unique_snps = set(gene_models[gene_id].index)

    gene_unique_snps_in_gwas = gwas_variants_ids_set.intersection(gene_unique_snps)

    return pd.Series(
        {
            "unique_n_snps_in_model": len(gene_unique_snps),
            "unique_n_snps_used": len(gene_unique_snps_in_gwas),
        }
    )

In [140]:
# testing: list genes whose summed number of used SNPs is 2, to pick a small
# case for checking _count_unique_snps
spredixcan_genes_models[spredixcan_genes_models["n_snps_used_sum"] == 2].head()

Unnamed: 0_level_0,gene_name,tissue,n_tissues,tissues_pc_variances,tissues_pc_variances_cov,tissues_variances,n_snps_used_sum,n_snps_in_model_sum
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,"[1.0372585612589562, 0.9627414387410438]","[0.008284978865240098, 0.0007326748393412591]","{'Brain_Hypothalamus': 0.0082838613770255, 'Br...",2,2
ENSG00000010256,UQCRC1,"(Thyroid, Whole_Blood)",2,"[1.0093689729110555, 0.9906310270889445]","[0.0009273176884979874, 0.00020344782573448428]","{'Thyroid': 0.0009272948084506859, 'Whole_Bloo...",2,2
ENSG00000017427,IGF1,"(Brain_Amygdala, Testis)",2,"[1.0771380046097814, 0.9228619953902187]","[0.00902934458441735, 0.004240396755557003]","{'Brain_Amygdala': 0.008980998878714942, 'Test...",2,2
ENSG00000043093,DCUN1D1,"(Esophagus_Muscularis, Esophagus_Gastroesophag...",2,"[1.3868202090634627, 0.6131797909365372]","[0.0004918171518042509, 0.00015146433378921632]",{'Esophagus_Muscularis': 0.0004475469555083852...,2,2
ENSG00000081377,CDC14B,"(Muscle_Skeletal, Brain_Nucleus_accumbens_basa...",2,"[1.0605267240586365, 0.9394732759413637]","[0.014269778064730796, 0.008102106779328003]","{'Muscle_Skeletal': 0.008171820631792033, 'Bra...",2,2


In [141]:
# case with two snps, not repeated across tissues: show the weights of the
# gene's model in each of its two tissues
_gene_id = "ENSG00000000419"
display(
    *(
        spredixcan_gene_obj[_gene_id].get_prediction_weights(
            tissue=_tissue, model_type=EQTL_MODEL
        )
        for _tissue in ("Brain_Hypothalamus", "Brain_Substantia_nigra")
    )
)

varID
chr20_50862947_C_T_b38    0.431375
Name: weight, dtype: float64

varID
chr20_50957480_C_T_b38   -0.146796
Name: weight, dtype: float64

In [142]:
# two distinct SNPs across the two tissue models; both counts are expected to be 2
_tmp = _count_unique_snps(_gene_id)
assert len(_tmp) == 2
assert _tmp.loc["unique_n_snps_in_model"] == 2
assert _tmp.loc["unique_n_snps_used"] == 2

In [143]:
# get unique snps for all genes
# grouping by "gene_id" gives one group per distinct gene ID; `x.name` is the
# group key (the gene ID), which is all _count_unique_snps needs, so the
# contents of each group are not used
spredixcan_genes_unique_n_snps = spredixcan_genes_models.groupby("gene_id").apply(
    lambda x: _count_unique_snps(x.name)
)

In [144]:
# preview the per-gene counts (unique_n_snps_in_model, unique_n_snps_used)
spredixcan_genes_unique_n_snps.head()

Unnamed: 0_level_0,unique_n_snps_in_model,unique_n_snps_used
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000000419,2,2
ENSG00000000938,5,5
ENSG00000000971,12,12
ENSG00000001084,23,23
ENSG00000001167,14,13


In [145]:
# the SNPs used are a subset of the SNPs in the models, so for every gene the
# count of SNPs in the models cannot be smaller than the count of SNPs used
assert (
    spredixcan_genes_unique_n_snps["unique_n_snps_in_model"]
    .ge(spredixcan_genes_unique_n_snps["unique_n_snps_used"])
    .all()
)

In [146]:
# add unique snps to spredixcan_genes_models
# NOTE: this rebinds `spredixcan_genes_models`, so the cell is not idempotent:
# re-running it would attempt to join columns that are already present.
spredixcan_genes_models = spredixcan_genes_models.join(spredixcan_genes_unique_n_snps)

In [147]:
# dimensions after joining the two unique-SNP count columns
spredixcan_genes_models.shape

(6444, 10)

In [148]:
# preview the first rows, now including unique_n_snps_in_model and unique_n_snps_used
spredixcan_genes_models.head()

Unnamed: 0_level_0,gene_name,tissue,n_tissues,tissues_pc_variances,tissues_pc_variances_cov,tissues_variances,n_snps_used_sum,n_snps_in_model_sum,unique_n_snps_in_model,unique_n_snps_used
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSG00000000419,DPM1,"(Brain_Hypothalamus, Brain_Substantia_nigra)",2,"[1.0372585612589562, 0.9627414387410438]","[0.008284978865240098, 0.0007326748393412591]","{'Brain_Hypothalamus': 0.0082838613770255, 'Br...",2,2,2,2
ENSG00000000938,FGR,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",36,"[30.57880701512375, 2.0297326456001112, 1.4017...","[0.22478471249969328, 0.008813873813031786]","{'Brain_Hippocampus': 0.006367955494129906, 'A...",40,40,5,5
ENSG00000000971,CFH,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",34,"[20.772608276639335, 8.177158142324833, 1.8646...","[0.196995342795801, 0.057318253163491664, 0.02...","{'Brain_Hippocampus': 0.005965148549549016, 'A...",44,44,12,12
ENSG00000001084,GCLC,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",32,"[20.97644994026909, 4.772354749306766, 2.23620...","[0.36578096389253933, 0.1264541371042455, 0.01...","{'Brain_Hippocampus': 0.009716241262648275, 'A...",46,46,23,23
ENSG00000001167,NFYA,"(Brain_Hippocampus, Artery_Tibial, Brain_Anter...",40,[37.63836978240501],"[1.4787144508923993, 0.04969061838773912]","{'Brain_Hippocampus': 0.016125723366546694, 'A...",47,48,14,13


### Save

In [149]:
# before saving: no gene name may appear in more than one row
assert spredixcan_genes_models["gene_name"].is_unique

In [150]:
# before saving: no cell of the table may be missing.
# `axis=None` reduces over both axes to a single boolean; it is passed by keyword
# because `DataFrame.any` no longer accepts positional arguments in pandas >= 2.0
assert not spredixcan_genes_models.isna().any(axis=None)

In [151]:
# write the final per-gene table to disk as a pickle
spredixcan_genes_models.to_pickle(OUTPUT_DIR_BASE / "gene_tissues.pkl")