# Description

This notebook analyzes the chromosomal bands of the top genes of each LV and, in particular, of LV246 (associated with lipids).

# Modules

In [1]:
import pandas as pd

import conf
from entity import Gene
from data.recount2 import LVAnalysis

# Settings

In [2]:
BAND = "1p13"

In [3]:
# Parameters
PHENOPLIER_NOTEBOOK_FILEPATH = "nbs/15_gsa_gls/misc/explore_lv_genes_in_1p13.ipynb"

# Load data

## MultiPLIER Z

In [4]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [5]:
multiplier_z.shape

(6750, 987)

In [6]:
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


# Get gene info

In [7]:
# Build a Gene object for every gene symbol in the MultiPLIER Z matrix that is
# present in Gene.GENE_NAME_TO_ID_MAP; symbols missing from that map are skipped.
gene_objs = [
    Gene(name=symbol)
    for symbol in multiplier_z.index
    if symbol in Gene.GENE_NAME_TO_ID_MAP
]

In [8]:
len(gene_objs)

6454

In [9]:
gene_bands = [g.band for g in gene_objs]

In [10]:
gene_bands[:5]

['13q34', '14q11.2', '6p24.3', '1p35.1', '5q33.1']

In [11]:
# Gene symbol -> band lookup. Despite its name, gene_df ends up being a pandas
# Series (named "band", indexed by "symbol"): squeeze() collapses the
# single-column dataframe that is left after moving "symbol" to the index.
gene_df = pd.DataFrame(
    data={
        "symbol": [gene.name for gene in gene_objs],
        "band": [gene.band for gene in gene_objs],
    }
).set_index("symbol").squeeze()

In [12]:
gene_df.shape

(6454,)

In [13]:
gene_df.isna().any()

True

In [14]:
gene_df = gene_df.dropna()

In [15]:
gene_df.shape

(6452,)

In [16]:
gene_df.head()

symbol
GAS6          13q34
MMP14       14q11.2
DSP          6p24.3
MARCKSL1     1p35.1
SPARC        5q33.1
Name: band, dtype: object

# Create LV-band dataframe

In [17]:
def get_lv_genes(lv_code: str):
    """
    Return the genes of an LV ordered from the highest to the lowest weight.

    Args:
        lv_code: the LV code (such as "LV123"); it must be a column of the
            MultiPLIER Z matrix.

    Returns:
        The LVAnalysis gene table of that LV, indexed by gene symbol and with
        two columns (the LV name with the gene weights, and "gene_band"). Rows
        follow the descending order of the gene weights in the MultiPLIER Z
        matrix.
    """
    # the row order is taken from the Z matrix, not from the LVAnalysis table
    ordered_symbols = multiplier_z[lv_code].sort_values(ascending=False).index
    genes_by_symbol = LVAnalysis(lv_code).lv_genes.set_index("gene_name")
    return genes_by_symbol.loc[ordered_symbols]

In [18]:
get_lv_genes("LV246")

Unnamed: 0,LV246,gene_band
SCD,6.672060,10q24.31
ACSS2,6.258514,20q11.22
GPAM,5.714077,10q25.2
DGAT2,4.738347,11q13.5
ACLY,3.708678,17q21.2
...,...,...
FBXO2,0.000000,1p36.22
FBXO3,0.000000,11p13
FBXO5,0.000000,6q25.2
FBXO6,0.000000,1p36.22


In [19]:
# For every LV in the MultiPLIER Z matrix, keep its sorted gene table with the
# LV-specific weight column renamed to a common name ("lv"), so that all
# tables share the same columns.
lv_gene_bands = {
    code: get_lv_genes(code).rename(columns={code: "lv"})
    for code in multiplier_z.columns
}

In [20]:
lv_gene_bands["LV1"]

Unnamed: 0,lv,gene_band
POLD1,2.989508,19q13.33
TRIM28,2.967235,19q13.43
TOMM40,2.941369,19q13.32
PKMYT1,2.810091,16p13.3
CDT1,2.776054,16q24.3
...,...,...
HIST1H2BF,0.000000,6p22.2
HIST1H2BD,0.000000,6p22.2
NACA,0.000000,12q13.3
PLEKHA1,0.000000,10q26.13


# Summarize LV-band

In [21]:
_tmp = lv_gene_bands["LV246"]

In [22]:
_tmp2 = _tmp.head(70)["gene_band"].value_counts()

In [23]:
_tmp2

4q25        2
17q25.3     2
11q13.1     2
11p11.2     2
12q24.11    2
           ..
3q21.2      0
3q21.1      0
3q13.33     0
3q13.32     0
10p11.1     0
Name: gene_band, Length: 701, dtype: int64

In [24]:
_tmp2[_tmp2.index.str.startswith("1p13")]

1p13.3    0
1p13.2    0
1p13.1    0
Name: gene_band, dtype: int64

In [25]:
def count_band(lv_gene_bands, band, top_n=70):
    """
    Count how many of the top genes of an LV are located in a given band.

    It takes the first top_n rows of the LV data (given by lv_gene_bands) and
    counts how many genes have a band that starts with the value given by band.
    For instance, if band="1p13", it counts the genes in bands "1p13.1",
    "1p13.2" and any other "1p13*".

    Args:
        lv_gene_bands: dataframe with one gene per row and a "gene_band"
            column. The caller is expected to pass it sorted by gene weight in
            descending order, so the first rows are the top genes of the LV.
        band: band prefix to look for (such as "1p13").
        top_n: number of top rows to consider. The default, 70, is around 1%
            of the genes in the MultiPLIER Z matrix.

    Returns:
        The number of genes (as an int) among the first top_n rows whose band
        starts with band. Genes with a missing band are never counted.
    """
    top_bands = lv_gene_bands.head(top_n)["gene_band"].dropna()

    # Test the band of each gene directly instead of going through
    # value_counts(): the count is the same, but it does not depend on the
    # dtype of the value_counts() index (categorical, object or empty).
    return int(top_bands.astype(str).str.startswith(band).sum())

In [26]:
count_band(lv_gene_bands["LV1"], "1p13")

0

In [27]:
BAND

'1p13'

In [28]:
lv_band_summary = {k: count_band(v, BAND) for k, v in lv_gene_bands.items()}

In [29]:
lv_band_summary_df = pd.Series(lv_band_summary).sort_values(ascending=False)

In [30]:
lv_band_summary_df.shape

(987,)

In [31]:
lv_band_summary_df.head()

LV856    7
LV227    6
LV418    5
LV805    5
LV481    4
dtype: int64

# LV246

In [32]:
# count how many top genes in LV246 are in 1p13
lv_band_summary_df["LV246"]

0

In [33]:
# now, take a look at what are the top bands in LV246
lv_gene_bands["LV246"].head(20)

Unnamed: 0,lv,gene_band
SCD,6.67206,10q24.31
ACSS2,6.258514,20q11.22
GPAM,5.714077,10q25.2
DGAT2,4.738347,11q13.5
ACLY,3.708678,17q21.2
GPD1,3.346001,12q13.12
MVD,3.320598,16q24.2
FASN,2.915241,17q25.3
LPL,2.858278,8p21.3
ACSL1,2.682199,4q35.1


In [34]:
lv_gene_bands["LV246"].head(70)["gene_band"].value_counts().head(20)

4q25        2
17q25.3     2
11q13.1     2
11p11.2     2
12q24.11    2
19q13.11    2
19p13.2     2
11q12.2     2
17q12       1
12p11.22    1
12q13.12    1
12p13.33    1
12p13.31    1
7q21.11     1
1p34.2      1
7q21.2      1
12p12.3     1
11q23.1     1
17q21.2     1
11q14.2     1
Name: gene_band, dtype: int64