# Description

**TODO** update

# Modules

In [17]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.graphics.gofplots import qqplot_2samples
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import conf
from entity import Gene

# Settings

In [3]:
INPUT_DIR = conf.RESULTS["GLS_NULL_SIMS"] / "phenoplier"
display(INPUT_DIR)

PosixPath('/opt/data/results/gls/null_sims/phenoplier')

# Load MultiXcan genes present in results

In [4]:
_tmp = pd.read_csv(
    conf.RESULTS["GLS_NULL_SIMS"]
    / "twas"
    / "smultixcan"
    / "random.pheno0-gtex_v8-mashr-smultixcan.txt",
    sep="\t",
)

In [5]:
_tmp.shape

(22317, 18)

In [6]:
_tmp.head()

Unnamed: 0,gene,gene_name,pvalue,n,n_indep,p_i_best,t_i_best,p_i_worst,t_i_worst,eigen_max,eigen_min,eigen_min_kept,z_min,z_max,z_mean,z_sd,tmi,status
0,ENSG00000131941.7,RHPN2,4e-05,48.0,3.0,0.000213947,Artery_Tibial,0.990132,Brain_Nucleus_accumbens_basal_ganglia,36.556432,7.692089e-16,2.519701,-2.721185,3.701952,1.283152,1.825567,3.0,0
1,ENSG00000076650.6,GPATCH1,7.8e-05,40.0,3.0,0.000453439,Brain_Cerebellum,0.817384,Brain_Frontal_Cortex_BA9,29.990208,2.086487e-15,1.815203,-3.506853,2.383485,-2.016745,1.715495,3.0,0
2,ENSG00000100906.10,NFKBIA,9.6e-05,1.0,1.0,9.591208e-05,Brain_Frontal_Cortex_BA9,9.6e-05,Brain_Frontal_Cortex_BA9,1.0,1.0,1.0,-3.900707,-3.900707,-3.900707,,1.0,0
3,ENSG00000136319.11,TTC5,0.000109,47.0,5.0,0.001402826,Brain_Hippocampus,0.961887,Colon_Sigmoid,21.272442,8.142339e-16,0.732606,-3.194069,1.397514,-0.916662,1.068989,5.0,0
4,ENSG00000152990.13,ADGRA3,0.000135,41.0,12.0,3.211289e-07,Heart_Atrial_Appendage,0.653657,Whole_Blood,12.988248,3.499412e-16,0.444682,-5.110605,3.59941,-0.464735,2.316607,12.0,0


In [7]:
multixcan_genes = set(_tmp["gene_name"])
display(len(multixcan_genes))
display(list(multixcan_genes)[:10])

22311

['ISY1',
 'ZNF621',
 'RSBN1',
 'SEPT5',
 'XXbac-BPG55C20.7',
 'TNFRSF10A',
 'CTD-2532N20.1',
 'RNGTT',
 'LA16c-312E8.5',
 'LINC00957']

# Load MultiPLIER Z matrix

In [8]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [9]:
multiplier_z.shape

(6750, 987)

In [10]:
# keep genes only present in MultiXcan
multiplier_z = multiplier_z.loc[
    sorted(multixcan_genes.intersection(multiplier_z.index))
]

In [11]:
multiplier_z.shape

(6446, 987)

In [12]:
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
A2M,0.0,0.0,0.0,0.078428,0.0,0.0,0.0,0.046772,0.0,0.011033,...,0.01091,0.0,0.052869,0.012749,0.0,0.013982,0.152241,0.0,0.0,0.108884
AAAS,0.271162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.513337,0.0,...,0.0,0.0,0.0,0.001385,0.0,0.0,0.711896,0.030534,0.447105,0.424685
AANAT,0.005099,0.308497,0.028815,0.0,0.0,0.0,0.371725,0.0,0.135238,0.0,...,0.117163,0.018815,0.0,0.193142,0.0,0.040401,0.180436,0.0,0.0,0.042064
AARS,0.512978,0.0,0.451285,0.319568,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.006208,0.021274,0.0,0.0,0.062143,0.0,0.0,0.364892
AARS2,1.008157,0.0,0.197424,0.024832,0.0,0.57081,0.0,0.228675,0.0,0.116243,...,0.0,0.0,0.153345,0.0,0.0,0.0,1.004578,0.01447,0.0,0.0


# Common genes

In [13]:
common_genes = multixcan_genes.intersection(set(multiplier_z.index))

In [14]:
len(common_genes)

6446

In [15]:
list(common_genes)[:10]

['ISY1',
 'RHOB',
 'ZNF621',
 'HNRNPR',
 'SEPT5',
 'TNFRSF10A',
 'RNGTT',
 'PRKAB2',
 'FTL',
 'NUP50']

In [29]:
common_genes_objs = {
    gene_name: Gene(name=gene_name)
    for gene_name in common_genes
    if gene_name in Gene.GENE_NAME_TO_ID_MAP
}

In [30]:
len(common_genes_objs)

6446

In [31]:
common_genes_objs["GAS6"].ensembl_id

'ENSG00000183087'

In [32]:
_gene_obj = list(common_genes_objs.values())

genes_info = pd.DataFrame(
    {
        "name": [g.name for g in _gene_obj],
        "id": [g.ensembl_id for g in _gene_obj],
        "chr": [g.chromosome for g in _gene_obj],
        "band": [g.band for g in _gene_obj],
        "start_position": [g.get_attribute("start_position") for g in _gene_obj],
        "end_position": [g.get_attribute("end_position") for g in _gene_obj],
    }
)

In [33]:
genes_info.shape

(6446, 6)

In [39]:
genes_info[genes_info.isna().any(axis=1)]

Unnamed: 0,name,id,chr,band,start_position,end_position
1045,TMEM133,ENSG00000170647,,,,
5365,TBCE,ENSG00000116957,,,,


In [40]:
genes_info = genes_info.dropna()
display(genes_info.shape)

(6444, 6)

In [43]:
genes_info["chr"] = genes_info["chr"].astype(int)
genes_info["start_position"] = genes_info["start_position"].astype(int)
genes_info["end_position"] = genes_info["end_position"].astype(int)

In [44]:
genes_info.dtypes

name              object
id                object
chr                int64
band              object
start_position     int64
end_position       int64
dtype: object

In [45]:
genes_info.head()

Unnamed: 0,name,id,chr,band,start_position,end_position
0,ISY1,ENSG00000240682,3,3q21.3,129127415,129161293
1,RHOB,ENSG00000143878,2,2p24.1,20447074,20449445
2,ZNF621,ENSG00000172888,3,3p22.1,40524878,40574685
3,HNRNPR,ENSG00000125944,1,1p36.12,23303771,23344336
4,SEPT5,ENSG00000184702,22,22q11.21,19714503,19724224


# List genes by chromosome and position

In [46]:
genes_info.sort_values(["chr", "start_position"])

Unnamed: 0,name,id,chr,band,start_position,end_position
6280,NOC2L,ENSG00000188976,1,1p36.33,944203,959309
6249,HES4,ENSG00000188290,1,1p36.33,998962,1000172
2851,ISG15,ENSG00000187608,1,1p36.33,1001138,1014540
2848,AGRN,ENSG00000188157,1,1p36.33,1020120,1056118
1745,TNFRSF18,ENSG00000186891,1,1p36.33,1203508,1206592
...,...,...,...,...,...,...
3168,CPT1B,ENSG00000205560,22,22q13.33,50568861,50578465
996,CHKB,ENSG00000100288,22,22q13.33,50578949,50601455
4152,MAPK8IP2,ENSG00000008735,22,22q13.33,50600793,50613981
2682,ARSA,ENSG00000100299,22,22q13.33,50622754,50628173


## Same chromosome and close

In [52]:
with pd.option_context("display.max_rows", None):
    _tmp = genes_info[genes_info["band"].str.startswith("17q")].sort_values(
        ["start_position"]
    )
    display(_tmp)

Unnamed: 0,name,id,chr,band,start_position,end_position
1855,WSB1,ENSG00000109046,17,17q11.1,27294076,27315926
4557,KSR1,ENSG00000141068,17,17q11.2,27456470,27626438
5961,NOS2,ENSG00000007171,17,17q11.2,27756766,27800499
5154,NLK,ENSG00000087095,17,17q11.2,28041737,28196381
3960,TMEM97,ENSG00000109084,17,17q11.2,28319200,28328685
4882,VTN,ENSG00000109072,17,17q11.2,28367284,28373091
1896,SLC46A1,ENSG00000076351,17,17q11.2,28394642,28407197
2571,PIGS,ENSG00000087111,17,17q11.2,28553383,28571794
3074,ALDOC,ENSG00000109107,17,17q11.2,28573115,28576948
5895,RPL23A,ENSG00000198242,17,17q11.2,28719985,28724359


## Same chromosome but far away

In [47]:
genes_info[genes_info["chr"] == 6].sort_values(["start_position"])

Unnamed: 0,name,id,chr,band,start_position,end_position
474,IRF4,ENSG00000137265,6,6p25.3,391752,411443
3151,EXOC2,ENSG00000112685,6,6p25.3,485154,693139
426,GMDS,ENSG00000112699,6,6p25.3,1623806,2245605
1676,SERPINB6,ENSG00000124570,6,6p25.2,2948159,2972165
3486,RIPK1,ENSG00000137275,6,6p25.2,3063991,3115187
...,...,...,...,...,...,...
6369,CCR6,ENSG00000112486,6,6q27,167111807,167139696
2165,THBS2,ENSG00000186340,6,6q27,169215780,169254044
5003,DLL1,ENSG00000198719,6,6q27,170282206,170306565
3495,PSMB1,ENSG00000008018,6,6q27,170535120,170553307


# Explore specific LVs

In [56]:
multiplier_z["LV45"].sort_values(ascending=False).head(20)

HIST1H2BO    8.480948
HIST1H2BF    8.426226
HIST1H2BK    8.245903
HIST1H2BD    8.119013
HIST1H2BC    7.744137
HIST1H2AC    7.447932
HIST2H2BF    6.700409
HIST1H2AG    6.405812
HIST1H2BN    5.913649
HIST1H2AE    5.898213
HIST1H2BE    3.489655
HIST1H2BH    2.522366
HIST1H2BJ    2.467813
HIST1H2AD    2.403343
H2AFX        1.594301
HIST1H2BG    1.525977
HIST3H2BB    1.436435
HIST3H2A     1.268004
H2AFZ        1.221418
HIST1H4D     1.056777
Name: LV45, dtype: float64

In [57]:
lv_top_genes = multiplier_z["LV45"].sort_values(ascending=False).head(20).index.tolist()

In [59]:
genes_info[genes_info["name"].isin(lv_top_genes)].sort_values(["chr", "start_position"])

Unnamed: 0,name,id,chr,band,start_position,end_position
6114,HIST2H2BF,ENSG00000203814,1,1q21.2,149782689,149812373
4755,HIST3H2A,ENSG00000181218,1,1q42.13,228456979,228457873
6125,HIST3H2BB,ENSG00000196890,1,1q42.13,228458107,228460470
2421,H2AFZ,ENSG00000164032,4,4q23,99948086,99950355
282,HIST1H2BC,ENSG00000180596,6,6p22.2,26114873,26123926
6410,HIST1H2AC,ENSG00000180573,6,6p22.2,26124145,26139116
4418,HIST1H2BD,ENSG00000158373,6,6p22.2,26158146,26171349
3683,HIST1H2BE,ENSG00000274290,6,6p22.2,26172059,26184655
124,HIST1H4D,ENSG00000277157,6,6p22.2,26188765,26189076
3311,HIST1H2AD,ENSG00000196866,6,6p22.2,26198851,26199243
