# Linger : baseline - explore Cell population gene regulatory network
From : https://github.com/Durenlab/LINGER/blob/main/docs/GRN_infer.md#cell-population-gene-regulatory-network

In [1]:
!ls

CECI				PBMCs_tutorial.ipynb  launch_jupyter
LINGER_data			code		      logs
LINGER_output_baseline		data		      scripts
LINGER_output_linger		dry_sub.sh	      sub.sh
Linger_baseline_explorer.ipynb	images		      test_model.pt


## 1. Input Data
- Multi omics matrix (both RNA and ATAC) from [10x genomics](https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5)


In [1]:
import scanpy as sc
import scipy.sparse as sp
import pandas as pd

adata = sc.read_10x_h5('data/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5', gex_only=False)
adata

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 11909 × 144978
    var: 'gene_ids', 'feature_types', 'genome'

In [3]:
matrix = adata.X.T    # linger expects k_features x n_cells
adata.var['gene_ids'] = adata.var.index

# features are genes and peaks grouped together (col1 for gene/peak name and col2 for category: gene or peak)
features = pd.DataFrame(adata.var['gene_ids'].values.tolist(),columns=[1])
features[2] = adata.var['feature_types'].values

barcodes = pd.DataFrame(adata.obs_names,columns=[0])
label = pd.read_csv('data/PBMC_label.txt',sep='\t',header=0)

from LingerGRN.preprocess import *
adata_RNA, adata_ATAC = get_adata(matrix,features,barcodes,label)     # adata_RNA and adata_ATAC are scRNA and scATAC

  adata_RNA.obs['label']=label.loc[adata_RNA.obs['barcode']]['label'].values
  adata_ATAC.obs['label']=label.loc[adata_ATAC.obs['barcode']]['label'].values


## 2. adata_RNA & adata_ATAC
From 1. we get two matrices : 
- `adata_RNA` (cells x genes)
- `adata_ATAC` (cells x peaks)

In [41]:
adata_RNA

View of AnnData object with n_obs × n_vars = 9543 × 36601
    obs: 'barcode', 'sample', 'label', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [42]:
adata_ATAC

View of AnnData object with n_obs × n_vars = 9543 × 108377
    obs: 'barcode', 'sample', 'label'
    var: 'gene_ids'

## 3. Preprocess - Pseudobulking - Training
[Skipped part]

## 4. Explore output files

In [1]:
import pandas as pd
method = "linger"

### 4.1 Cell population gene regulatory network

#### 4.1.1 TF binding potential (TF-RE)

In [2]:
tf_re = pd.read_csv(f"LINGER_output_{method}/cell_population_TF_RE_binding.txt", sep="\t", index_col=0)
tf_re.shape    # regions x TFs

(96533, 451)

In [3]:
# Sparsity
(tf_re == 0).mean().mean()

9.784919431639508e-06

In [4]:
tf_re.head(3)

Unnamed: 0,MEIS3,NR1H4,NHLH1,NR5A2,E2F7,SHOX,FOXN1,TCF7L1,NR2C2,MEF2C,...,TAL2,MAX,GFI1B,SOX7,ZNF354C,BACH2,GMEB1,NR5A1,TCF7L2,E2F6
chr1:100028489-100029404,0.998669,0.996867,0.997867,0.99687,0.996745,0.999094,0.99807,0.997644,0.995784,0.99694,...,0.996962,0.998746,0.998721,0.997613,0.994543,0.998448,0.998306,0.997077,0.996636,0.997977
chr1:100034436-100035279,0.997485,0.997344,0.997752,0.994378,0.997132,0.999294,0.997306,0.997492,0.996664,0.997971,...,0.99711,0.996831,0.998079,0.997594,0.994405,0.997962,0.998407,0.996862,0.99712,0.998391
chr1:100035922-100040109,0.99875,0.997446,0.999018,0.997597,0.997613,0.999321,0.997754,0.998279,0.998303,0.996455,...,0.997079,0.998132,0.998435,0.998172,0.995969,0.998702,0.998389,0.997959,0.998297,0.996559


`describe()` info 
- count : non missing values for each TF (=nb regions)
- mean : average binding score of a TF across all regions
- quantiles ex Q3 : 75% of values are below this
- max : largest binding score

In [5]:
df_describe = tf_re.describe()
df_describe

Unnamed: 0,MEIS3,NR1H4,NHLH1,NR5A2,E2F7,SHOX,FOXN1,TCF7L1,NR2C2,MEF2C,...,TAL2,MAX,GFI1B,SOX7,ZNF354C,BACH2,GMEB1,NR5A1,TCF7L2,E2F6
count,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,...,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0,96533.0
mean,0.995271,0.995136,0.995626,0.995384,0.99425,0.996126,0.995972,0.994839,0.995883,0.993908,...,0.994686,0.995315,0.996078,0.995137,0.994364,0.994658,0.994357,0.994642,0.994853,0.994094
std,0.013783,0.015569,0.017299,0.01793,0.024698,0.012961,0.018553,0.012555,0.011871,0.021521,...,0.014254,0.01851,0.012351,0.01238,0.013983,0.028179,0.027245,0.019068,0.017235,0.028403
min,-0.683951,-0.356404,-0.318621,-0.521127,-0.440177,-0.092176,-0.454205,0.301877,0.398704,-0.332693,...,-0.377387,-0.324616,0.110272,0.478065,0.0,-0.389163,-0.431924,-0.709202,0.0,-0.603531
25%,0.995913,0.995882,0.996626,0.996332,0.99537,0.99687,0.99702,0.995303,0.996637,0.994911,...,0.995013,0.996242,0.996816,0.995719,0.99479,0.996632,0.996186,0.995415,0.995905,0.995497
50%,0.997499,0.997605,0.998085,0.997887,0.997442,0.998138,0.998355,0.997174,0.997995,0.997197,...,0.997334,0.99782,0.998088,0.997433,0.997117,0.998064,0.997785,0.997466,0.997503,0.99748
75%,0.998385,0.99854,0.998824,0.998686,0.998528,0.998816,0.998997,0.998239,0.99872,0.998357,...,0.998548,0.998654,0.998765,0.998393,0.998327,0.998778,0.998621,0.998505,0.998402,0.998507
max,0.999865,0.999922,0.999912,0.999898,0.999897,0.999967,0.999954,0.999965,0.999923,0.999933,...,0.999976,0.999858,0.999888,0.999978,0.999978,0.999937,0.999939,0.999937,0.999963,0.999868


In [6]:
# Strongest biding (best TF) per region
best_TF_per_region = tf_re.stack().sort_values(ascending=False).head(10)    
best_TF_per_region

chr14:104722824-104726227  TFAP2E    0.999987
chr16:2002756-2011018      TFAP2E    0.999984
chr19:34962340-34964729    TCF12     0.999984
chr10:118792126-118794019  RXRA      0.999984
chr15:89834664-89835386    SP4       0.999983
chr19:17321524-17325462    TP63      0.999983
chr20:36491235-36493800    ZFX       0.999983
chr15:89836346-89838031    SP4       0.999983
chr16:28168244-28169974    SP7       0.999983
chr19:17127562-17128678    RFX2      0.999983
dtype: float64

In [7]:
# Strongest biding (best region) for one particular TF
tf = 'FOXB1'
strongest_binding_for_FOXB1 = tf_re[tf].sort_values(ascending=False)
strongest_binding_for_FOXB1

chr3:15635801-15636900      0.999930
chr3:15629909-15630937      0.999929
chr2:55161304-55162195      0.999927
chr19:35022494-35024191     0.999927
chr6:126337184-126337963    0.999924
                              ...   
chr9:134887783-134888066    0.605639
chr6:154306535-154307193    0.605358
chr6:107756749-107757054    0.512610
chr15:60017798-60018189     0.000000
chr15:60002079-60002424     0.000000
Name: FOXB1, Length: 96533, dtype: float64

In [8]:
# Active regions = all regions who have at least one TF expressed more than 'thres'
thres = 0.5
active_re = tf_re.loc[(tf_re > thres).any(axis=1)]
active_re.head(3)

Unnamed: 0,MEIS3,NR1H4,NHLH1,NR5A2,E2F7,SHOX,FOXN1,TCF7L1,NR2C2,MEF2C,...,TAL2,MAX,GFI1B,SOX7,ZNF354C,BACH2,GMEB1,NR5A1,TCF7L2,E2F6
chr1:100028489-100029404,0.998669,0.996867,0.997867,0.99687,0.996745,0.999094,0.99807,0.997644,0.995784,0.99694,...,0.996962,0.998746,0.998721,0.997613,0.994543,0.998448,0.998306,0.997077,0.996636,0.997977
chr1:100034436-100035279,0.997485,0.997344,0.997752,0.994378,0.997132,0.999294,0.997306,0.997492,0.996664,0.997971,...,0.99711,0.996831,0.998079,0.997594,0.994405,0.997962,0.998407,0.996862,0.99712,0.998391
chr1:100035922-100040109,0.99875,0.997446,0.999018,0.997597,0.997613,0.999321,0.997754,0.998279,0.998303,0.996455,...,0.997079,0.998132,0.998435,0.998172,0.995969,0.998702,0.998389,0.997959,0.998297,0.996559


In [9]:
tf_corr = tf_re.corr()

In [10]:
tf_corr.head(3)

Unnamed: 0,MEIS3,NR1H4,NHLH1,NR5A2,E2F7,SHOX,FOXN1,TCF7L1,NR2C2,MEF2C,...,TAL2,MAX,GFI1B,SOX7,ZNF354C,BACH2,GMEB1,NR5A1,TCF7L2,E2F6
MEIS3,1.0,0.751707,0.58572,0.661443,0.495097,0.752709,0.567599,0.804534,0.776298,0.657041,...,0.716876,0.665966,0.802588,0.794435,0.770809,0.532537,0.470772,0.587646,0.734883,0.487823
NR1H4,0.751707,1.0,0.611832,0.702828,0.508994,0.756283,0.575493,0.701101,0.71823,0.689899,...,0.809215,0.705883,0.835359,0.704119,0.695015,0.533332,0.477703,0.604497,0.662956,0.50958
NHLH1,0.58572,0.611832,1.0,0.638351,0.509257,0.570848,0.626903,0.637533,0.646243,0.647869,...,0.71377,0.547742,0.681374,0.628966,0.619119,0.497208,0.477691,0.64207,0.602383,0.500226


In [11]:
top10 = (
    tf_corr
    .where(tf_corr < 1.0)   # remove self-correlation
    .stack()                # DataFrame → Series
    .nlargest(10)           # get top 10
)
top10

CEBPA  CEBPB    0.972354
CEBPB  CEBPA    0.972354
FOSB   FOS      0.968729
FOS    FOSB     0.968729
GLI3   GLI2     0.962595
GLI2   GLI3     0.962595
CEBPD  CEBPB    0.956742
CEBPB  CEBPD    0.956742
GLI3   GLI1     0.953974
GLI1   GLI3     0.953974
dtype: float64

To save info : 

In [12]:
save = False

dir = f"results/{method}/cell_pop/"

if save:
    df_describe.to_csv(f"{dir}tf_re_describe.csv")
    best_TF_per_region.to_csv(f"{dir}best_tf_per_region")
    active_re.to_csv(f"{dir}tf_re_active_regions.csv")

    tf_corr.to_csv(f"{dir}tf_tf_correlation_matrix.csv")
    top10.to_csv(f"{dir}tf_tf_top10_correlations.csv", index=True)

#### 4.1.2 cis-regulatory network (RE-TG)

In [13]:
re_tg = pd.read_csv(f"LINGER_output_{method}/cell_population_cis_regulatory.txt", sep="\t", index_col=0)
re_tg.columns = ['TG', 'Score']
re_tg = re_tg.reset_index()
re_tg = re_tg.rename(columns={'index': 'RE', re_tg.columns[0]: 'RE'})
re_tg.shape

(1529212, 3)

In [14]:
re_tg.head(3)

Unnamed: 0,RE,TG,Score
0,chr1:100028489-100029404,CDC14A,0.000458
1,chr1:100028489-100029404,DBT,4e-06
2,chr1:100028489-100029404,DPH5,0.000127


In [15]:
re_tg.describe()

Unnamed: 0,Score
count,1529212.0
mean,0.0002272993
std,0.0004522803
min,0.0
25%,2.607833e-06
50%,5.016058e-05
75%,0.0002477527
max,0.02290544


In [16]:
top_links = re_tg.sort_values("Score", ascending=False)
top_links.head(10)

Unnamed: 0,RE,TG,Score
779296,chr12:10371941-10373159,KLRB1,0.022905
779302,chr12:10371941-10373159,KLRF1,0.020828
1209026,chr18:63154210-63163356,BCL2,0.020134
1209015,chr18:63151875-63153131,BCL2,0.016023
490238,chr7:107044079-107045823,COG5,0.014556
1126826,chr17:48546588-48549046,SKAP1,0.01435
1126958,chr17:48564087-48564786,SKAP1,0.013387
1127090,chr17:48587835-48588683,SKAP1,0.013071
490244,chr7:107044079-107045823,NAMPT,0.012972
272306,chr3:170907548-170910123,TNIK,0.012801


In [17]:
# master enhancers
re_degree = (
    re_tg.groupby("RE")
    .size()
    .sort_values(ascending=False)
    .reset_index(name="Num_TGs")
)

re_degree.head(10)

Unnamed: 0,RE,Num_TGs
0,chr19:49064916-49065346,79
1,chr19:49210028-49210817,79
2,chr19:49084515-49086914,79
3,chr19:49071846-49073027,79
4,chr19:49055830-49058250,79
5,chr19:49205041-49205635,79
6,chr17:7434525-7441924,79
7,chr19:49207430-49208748,79
8,chr19:49224837-49225245,79
9,chr19:49225771-49226661,79


#### 4.1.3 trans-regulatory network (TF-TG)

In [27]:
tf_tg = pd.read_csv(f"LINGER_output_{method}/cell_population_trans_regulatory.txt", sep="\t", index_col=0)
tf_tg.shape

(14907, 451)

In [28]:
tf_tg.head(3)

Unnamed: 0,AHR,AIRE,ALX3,ALX4,AR,ARID3A,ARID3B,ARID5A,ARID5B,ARNT,...,ZNF281,ZNF35,ZNF354C,ZNF410,ZNF415,ZNF423,ZNF691,ZNF711,ZNF740,ZSCAN22
SAMD11,1.025488e-05,1.675971e-06,1.359485e-06,1.063659e-06,3.548121e-06,3.826046e-06,5.743045e-06,6.108635e-06,6.649317e-06,1.1e-05,...,1.5e-05,5.457749e-06,8.718463e-06,1.3e-05,4.873458e-06,6e-06,1.115418e-05,6e-06,1.5e-05,9e-06
NOC2L,3.147416e-05,3.128641e-06,1.362579e-05,1.867031e-05,4.330618e-05,1.45637e-05,1.474275e-05,2.338594e-05,3.025998e-05,3.8e-05,...,4.9e-05,1.571243e-05,4.167249e-05,4.4e-05,2.633863e-05,3e-05,3.62773e-05,5.7e-05,4.4e-05,2.8e-05
KLHL17,8.102367e-07,2.263191e-07,2.404562e-07,3.176238e-07,8.353276e-07,5.289991e-07,5.326239e-07,3.928298e-07,5.909078e-07,1e-06,...,1e-06,4.130948e-07,7.350451e-07,1e-06,2.039892e-07,1e-06,6.866615e-07,2e-06,2e-06,1e-06


In [29]:
tf_tg.describe()

Unnamed: 0,AHR,AIRE,ALX3,ALX4,AR,ARID3A,ARID3B,ARID5A,ARID5B,ARNT,...,ZNF281,ZNF35,ZNF354C,ZNF410,ZNF415,ZNF423,ZNF691,ZNF711,ZNF740,ZSCAN22
count,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,...,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0,14907.0
mean,0.00027,0.000589,6.769711e-05,0.0001094314,0.0001213207,0.000295,0.000195,0.000216,0.000261,0.000211,...,0.000197,0.000147,0.000159,0.000179,8.127084e-05,0.000628,0.000164,0.000184,0.000184,0.00037
std,0.000627,0.002898,0.0001155246,0.0002124556,0.0002398669,0.000554,0.0004,0.000431,0.000945,0.0004,...,0.000492,0.00029,0.000315,0.000345,0.0001907076,0.003544,0.000387,0.000399,0.000435,0.002592
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2e-06,1e-06,4.992419e-07,6.971029e-07,8.482563e-07,2e-06,1e-06,2e-06,2e-06,2e-06,...,1e-06,1e-06,1e-06,2e-06,5.902475e-07,1e-06,1e-06,1e-06,1e-06,2e-06
50%,5.6e-05,2.7e-05,1.752556e-05,2.323521e-05,3.205532e-05,5.4e-05,4e-05,4.8e-05,4.5e-05,4.7e-05,...,4.4e-05,3.7e-05,3.5e-05,4.8e-05,1.732712e-05,3.3e-05,4e-05,4.1e-05,4.1e-05,4.2e-05
75%,0.000285,0.00017,8.203326e-05,0.0001208007,0.0001335287,0.000332,0.00021,0.000241,0.000226,0.000227,...,0.000207,0.000164,0.000161,0.000203,7.390531e-05,0.000166,0.000171,0.000197,0.000183,0.00019
max,0.037435,0.085432,0.001090909,0.002869602,0.003526705,0.008021,0.008919,0.008601,0.039696,0.006772,...,0.026292,0.004226,0.005166,0.008691,0.006025557,0.11843,0.012502,0.020052,0.021299,0.152793


In [30]:
(tf_tg == 0).mean().mean()   # sparsity

0.0045013748953786955

In [31]:
top_links = (
    tf_tg.stack()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

top_links.columns = ["TG", "TF", "Score"]
top_links

Unnamed: 0,TG,TF,Score
0,B2M,NR6A1,0.232767
1,RPL41,NR6A1,0.217422
2,RPS28,NR6A1,0.215272
3,RPS27A,NR6A1,0.204603
4,B2M,ZBTB16,0.195513
5,RPL10,ZBED1,0.194291
6,RPL19,ZBTB16,0.193423
7,RPL30,NR6A1,0.192573
8,RPL10,TOPORS,0.188245
9,EEF1A1,EWSR1,0.186676


In [32]:
best_tf_per_gene = tf_tg.idxmax(axis=1).to_frame("Best_TF")
best_tf_per_gene["Score"] = tf_tg.max(axis=1)

best_tf_per_gene.head()

Unnamed: 0,Best_TF,Score
SAMD11,PRDM4,0.000188
NOC2L,NR6A1,0.001416
KLHL17,ZBED1,2.2e-05
PLEKHN1,ZBED1,5.1e-05
HES4,NR6A1,0.007218


### 4.2 Other files
- TF-RE : `cell_population_TF_RE_binding.txt`
- TF-RE binding split by chr : `chr*_cell_population_TF_RE_binding.txt`
- RE-TG : `cell_population_cis_regulatory.txt`
- TF-TG : `cell_population_trans_regulatory`
<br><br>
- `Region.bed` : [prior knowledge] list of regulatory elements (genomic coordinates)
- `Region_overlap_chr*.bed` : overlap of regions with chromosome-specific elements

In [33]:
!ls LINGER_output_baseline

 Region.bed
 Region_overlap_chr1.bed
 Region_overlap_chr10.bed
 Region_overlap_chr11.bed
 Region_overlap_chr12.bed
 Region_overlap_chr13.bed
 Region_overlap_chr14.bed
 Region_overlap_chr15.bed
 Region_overlap_chr16.bed
 Region_overlap_chr17.bed
 Region_overlap_chr18.bed
 Region_overlap_chr19.bed
 Region_overlap_chr2.bed
 Region_overlap_chr20.bed
 Region_overlap_chr21.bed
 Region_overlap_chr22.bed
 Region_overlap_chr3.bed
 Region_overlap_chr4.bed
 Region_overlap_chr5.bed
 Region_overlap_chr6.bed
 Region_overlap_chr7.bed
 Region_overlap_chr8.bed
 Region_overlap_chr9.bed
 Region_overlap_chrX.bed
'box_plot_ATF1_activity_naive CD4 T cells_Others.png'
'box_plot_ATF1_expression_naive CD4 T cells_Others.png'
'box_plot_FOXN1_activity_naive CD4 T cells_Others.png'
'box_plot_FOXN1_expression_naive CD4 T cells_Others.png'
'box_plot_NPAS2_activity_naive CD4 T cells_Others.png'
'box_plot_NPAS2_expression_naive CD4 T cells_Others.png'
'box_plot_SMAD3_activity_naive CD4 T cells_Others.png'
'box_plot_S

In [34]:
# Region.bed is the list of all regulatory elements (REs) = row in the df
df = pd.read_csv("LINGER_output_baseline/Region.bed", sep="\t", header=None)
df.columns = ["chrom", "start", "end"]
print(df.head(n=10))

# we can see that row 5 :                       	chr1 	629721 	630172
# appears in the cell below, overlapping with       chr1 	629715 	630715
df.shape

  chrom   start     end
0  chr1   10109   10357
1  chr1  180730  181630
2  chr1  191491  191736
3  chr1  267816  268196
4  chr1  586028  586373
5  chr1  629721  630172
6  chr1  633793  634264
7  chr1  777634  779926
8  chr1  816881  817647
9  chr1  819912  823500


(107174, 3)

In [35]:
# overlapping regulatory regions on chromosome 1
df = pd.read_csv("LINGER_output_baseline/Region_overlap_chr1.bed", sep="\t", header=None)
df.columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
print(df.head())

# from 107174 REs to 78873 overlapping REs
df.shape

  chrom1  start1    end1 chrom2  start2    end2
0   chr1  629715  630715   chr1  629721  630172
1   chr1  630021  630050   chr1  629721  630172
2   chr1  630061  630170   chr1  629721  630172
3   chr1  633715  634715   chr1  633793  634264
4   chr1  633741  633810   chr1  633793  634264


(78873, 6)