# Linger : baseline - explore Cell population gene regulatory network
From : https://github.com/Durenlab/LINGER/blob/main/docs/GRN_infer.md#cell-population-gene-regulatory-network

In [1]:
!ls

CECI				PBMCs_tutorial.ipynb  launch_jupyter
LINGER_data			code		      logs
LINGER_output_baseline		data		      scripts
LINGER_output_linger		dry_sub.sh	      sub.sh
Linger_baseline_explorer.ipynb	images		      test_model.pt


## 1. Input Data
- Multi omics matrix (both RNA and ATAC) from [10x genomics](https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5)


In [1]:
import scanpy as sc
import scipy.sparse as sp
import pandas as pd

adata = sc.read_10x_h5('data/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5', gex_only=False)
adata

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 11909 × 144978
    var: 'gene_ids', 'feature_types', 'genome'

In [3]:
matrix = adata.X.T    # linger expects k_features x n_cells
adata.var['gene_ids'] = adata.var.index

# features are genes and peaks grouped together (col1 for gene/peak name and col2 for category: gene or peak)
features = pd.DataFrame(adata.var['gene_ids'].values.tolist(),columns=[1])
features[2] = adata.var['feature_types'].values

barcodes = pd.DataFrame(adata.obs_names,columns=[0])
label = pd.read_csv('data/PBMC_label.txt',sep='\t',header=0)

from LingerGRN.preprocess import *
adata_RNA, adata_ATAC = get_adata(matrix,features,barcodes,label)     # adata_RNA and adata_ATAC are scRNA and scATAC

  adata_RNA.obs['label']=label.loc[adata_RNA.obs['barcode']]['label'].values
  adata_ATAC.obs['label']=label.loc[adata_ATAC.obs['barcode']]['label'].values


## 2. adata_RNA & adata_ATAC
From 1. we get two matrices : 
- `adata_RNA` (cells x genes)
- `adata_ATAC` (cells x peaks)

In [41]:
adata_RNA

View of AnnData object with n_obs × n_vars = 9543 × 36601
    obs: 'barcode', 'sample', 'label', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [42]:
adata_ATAC

View of AnnData object with n_obs × n_vars = 9543 × 108377
    obs: 'barcode', 'sample', 'label'
    var: 'gene_ids'

## 3. Preprocess - Pseudobulking - Training
[Skipped part]

## 4. Explore output files

In [47]:
import pandas as pd
method = "baseline"

### 4.1 Cell population gene regulatory network

#### 4.1.1 TF binding potential (TF-RE)

In [48]:
tf_re = pd.read_csv(f"LINGER_output_{method}/cell_population_TF_RE_binding.txt", sep="\t", index_col=0)
tf_re.shape    # regions x TFs

(97042, 451)

In [49]:
# Sparsity
(tf_re == 0).mean().mean()

0.8419419831064073

In [50]:
tf_re.head(3)

Unnamed: 0,ALX4,POU3F1,FOXJ3,MAX,HSF2,ZFP3,GZF1,GLI2,HOXB7,HSF1,...,RELA,ARID5A,PBX2,MLXIPL,MEF2C,HOXB4,MSX2,TCF12,MECOM,NFATC4
chr1:100028489-100029404,0.010469,0.380009,0.0,0.412295,0.0,0.0,0.324316,0.0,0.0,0.380206,...,0.63457,0.363192,0.0,0.225078,0.0,0.0,0.265811,0.0,0.0,0.0
chr1:100034436-100035279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1:100035922-100040109,0.0,0.0,0.077789,0.0,0.093418,0.072623,0.0,0.338647,0.38639,0.0,...,0.0,0.0,0.64465,0.0,0.291201,0.42141,0.0,0.244719,0.315431,0.291374


`describe()` info 
- count : non missing values for each TF (=nb regions)
- mean : average binding score of a TF across all regions
- quantiles ex Q3 : 75% of values are below this
- max : largest binding score

In [51]:
df_describe = tf_re.describe()
df_describe

Unnamed: 0,ALX4,POU3F1,FOXJ3,MAX,HSF2,ZFP3,GZF1,GLI2,HOXB7,HSF1,...,RELA,ARID5A,PBX2,MLXIPL,MEF2C,HOXB4,MSX2,TCF12,MECOM,NFATC4
count,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,...,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0,97042.0
mean,0.055502,0.056708,0.052044,0.057451,0.051512,0.056062,0.056943,0.054174,0.051681,0.054406,...,0.053124,0.055405,0.059373,0.054137,0.051818,0.057295,0.056066,0.049998,0.054652,0.049985
std,0.15584,0.152585,0.145518,0.153628,0.147215,0.156662,0.15608,0.155317,0.14698,0.14971,...,0.14536,0.151382,0.163526,0.152395,0.150782,0.157566,0.154775,0.141722,0.155864,0.145618
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.946808,0.953977,0.961167,0.948049,0.96697,0.969294,0.946504,0.964803,0.949356,0.957261,...,0.955897,0.951754,0.948516,0.974334,0.945608,0.965645,0.973815,0.951759,0.923593,0.962458


In [52]:
# Strongest biding (best TF) per region
best_TF_per_region = tf_re.stack().sort_values(ascending=False).head(10)    
best_TF_per_region

chr12:7588629-7589241  SP7       0.992625
chr12:7994608-7996442  FOXB1     0.991988
chr12:7993566-7993739  FOXB1     0.991988
chr12:7588629-7589241  PRDM14    0.990154
chr12:7994608-7996442  PRDM14    0.989562
chr12:7993566-7993739  PRDM14    0.989562
chr12:7588629-7589241  FOXB1     0.989552
chr12:7345278-7345603  PRDM14    0.989529
                       FOXB1     0.988681
chr12:7993566-7993739  SP7       0.987949
dtype: float64

In [53]:
# Strongest biding (best region) for one particular TF
tf = 'FOXB1'
strongest_binding_for_FOXB1 = tf_re[tf].sort_values(ascending=False)
strongest_binding_for_FOXB1

chr12:7994608-7996442     0.991988
chr12:7993566-7993739     0.991988
chr12:7588629-7589241     0.989552
chr12:7345278-7345603     0.988681
chr12:6638054-6639968     0.973200
                            ...   
chr6:3155301-3158719      0.000000
chr6:31502666-31502944    0.000000
chr6:31476489-31476903    0.000000
chr6:31469891-31469933    0.000000
chrX:9997556-9997868      0.000000
Name: FOXB1, Length: 97042, dtype: float64

In [54]:
# Active regions = all regions who have at least one TF expressed more than 'thres'
thres = 0.5
active_re = tf_re.loc[(tf_re > thres).any(axis=1)]
active_re.head(3)

Unnamed: 0,ALX4,POU3F1,FOXJ3,MAX,HSF2,ZFP3,GZF1,GLI2,HOXB7,HSF1,...,RELA,ARID5A,PBX2,MLXIPL,MEF2C,HOXB4,MSX2,TCF12,MECOM,NFATC4
chr1:100028489-100029404,0.010469,0.380009,0.0,0.412295,0.0,0.0,0.324316,0.0,0.0,0.380206,...,0.63457,0.363192,0.0,0.225078,0.0,0.0,0.265811,0.0,0.0,0.0
chr1:100035922-100040109,0.0,0.0,0.077789,0.0,0.093418,0.072623,0.0,0.338647,0.38639,0.0,...,0.0,0.0,0.64465,0.0,0.291201,0.42141,0.0,0.244719,0.315431,0.291374
chr1:100046068-100047735,0.0,0.0,0.0,0.758438,0.0,0.0,0.274057,0.0,0.071968,0.228773,...,0.428219,0.104424,0.0,0.130865,0.0,0.0,0.368671,0.0,0.0,0.0


In [55]:
tf_corr = tf_re.corr()

In [56]:
tf_corr.head(3)

Unnamed: 0,ALX4,POU3F1,FOXJ3,MAX,HSF2,ZFP3,GZF1,GLI2,HOXB7,HSF1,...,RELA,ARID5A,PBX2,MLXIPL,MEF2C,HOXB4,MSX2,TCF12,MECOM,NFATC4
ALX4,1.0,0.554478,0.564401,0.445777,0.654223,0.667032,0.411907,0.682505,0.516092,0.437673,...,0.500391,0.51563,0.566405,0.310611,0.738674,0.551268,0.491359,0.588458,0.474115,0.587764
POU3F1,0.554478,1.0,0.53102,0.545719,0.524307,0.495514,0.590449,0.469985,0.435218,0.625745,...,0.583225,0.641953,0.442418,0.496327,0.487455,0.457916,0.627725,0.489955,0.386249,0.454525
FOXJ3,0.564401,0.53102,1.0,0.502665,0.638456,0.648257,0.535785,0.487904,0.433615,0.50206,...,0.465111,0.594562,0.47483,0.468404,0.635137,0.467466,0.497791,0.567812,0.450475,0.432126


In [57]:
top10 = (
    tf_corr
    .where(tf_corr < 1.0)   # remove self-correlation
    .stack()                # DataFrame → Series
    .nlargest(10)           # get top 10
)
top10

TAL2    PAX7      0.926446
PAX7    TAL2      0.926446
KLF1    GATA1     0.912312
GATA1   KLF1      0.912312
CEBPD   ZBTB16    0.911310
ZBTB16  CEBPD     0.911310
KLF9    ZBTB16    0.910820
ZBTB16  KLF9      0.910820
HNF4G   HNF1A     0.906876
HNF1A   HNF4G     0.906876
dtype: float64

To save info : 

In [58]:
save = False

dir = f"results/{method}/cell_pop/"

if save:
    df_describe.to_csv(f"{dir}tf_re_describe.csv")
    best_TF_per_region.to_csv(f"{dir}best_tf_per_region")
    active_re.to_csv(f"{dir}tf_re_active_regions.csv")

    tf_corr.to_csv(f"{dir}tf_tf_correlation_matrix.csv")
    top10.to_csv(f"{dir}tf_tf_top10_correlations.csv", index=True)

#### 4.1.2 cis-regulatory network (RE-TG)

In [59]:
re_tg = pd.read_csv(f"LINGER_output_{method}/cell_population_cis_regulatory.txt", sep="\t", index_col=0)
re_tg.columns = ['TG', 'Score']
re_tg = re_tg.reset_index()
re_tg = re_tg.rename(columns={'index': 'RE', re_tg.columns[0]: 'RE'})
re_tg.shape

(1528832, 3)

In [60]:
re_tg.head(3)

Unnamed: 0,RE,TG,Score
0,chr1:100028489-100029404,SLC35A3,0.006177391
1,chr1:100028489-100029404,EXTL2,5.4695940000000004e-17
2,chr1:100028489-100029404,CDC14A,2.005422e-07


In [61]:
re_tg.describe()

Unnamed: 0,Score
count,1528832.0
mean,0.002978429
std,0.01012052
min,5.102586e-72
25%,3.020348e-14
50%,1.805959e-09
75%,4.105457e-05
max,0.1182748


In [62]:
top_links = re_tg.sort_values("Score", ascending=False)
top_links.head(10)

Unnamed: 0,RE,TG,Score
6175,chr1:109687254-109688620,GSTM1,0.118275
920063,chr14:75267507-75300328,FOS,0.083809
1389597,chr19:8319797-8325436,RPS28,0.080924
1527901,chrX:72275851-72277795,RPS4X,0.080416
838760,chr12:56115363-56122261,RPL41,0.080331
1314542,chr19:44905418-44906835,APOE,0.077403
1094717,chr17:39194552-39201821,RPL19,0.076953
1221416,chr19:11420302-11424256,RGL3,0.076683
1221364,chr19:11418159-11419612,RGL3,0.076683
1434195,chr20:62385574-62388182,RPS21,0.076376


In [63]:
# master enhancers
re_degree = (
    re_tg.groupby("RE")
    .size()
    .sort_values(ascending=False)
    .reset_index(name="Num_TGs")
)

re_degree.head(10)

Unnamed: 0,RE,Num_TGs
0,chr19:49084515-49086914,79
1,chr19:49231476-49232032,79
2,chr19:49225771-49226661,79
3,chr19:49224837-49225245,79
4,chr19:49210028-49210817,79
5,chr19:49207430-49208748,79
6,chr19:49205041-49205635,79
7,chr19:49106491-49107105,79
8,chr19:49097690-49098385,79
9,chr19:49055830-49058250,79


#### 4.1.3 trans-regulatory network (TF-TG)

In [64]:
tf_tg = pd.read_csv("LINGER_output_baseline/cell_population_trans_regulatory.txt", sep="\t", index_col=0)
tf_tg.shape

(14842, 451)

In [65]:
tf_tg.head(3)

Unnamed: 0,ALX4,POU3F1,FOXJ3,MAX,HSF2,ZFP3,GZF1,GLI2,HOXB7,HSF1,...,RELA,ARID5A,PBX2,MLXIPL,MEF2C,HOXB4,MSX2,TCF12,MECOM,NFATC4
LRRC39,3.782764e-08,3.211212e-07,1.020716e-07,2.470217e-05,7.661409e-07,4.68427e-07,4.855493e-07,2.294528e-06,2.34972e-05,7.41195e-06,...,1.562477e-05,2.891821e-07,3.033045e-05,8.256888e-07,8.253104e-07,2.281567e-05,4.071136e-07,3.72711e-07,1.94248e-05,1.080851e-07
SLC35A3,3.759316e-05,6.571791e-05,1.26786e-05,6.152784e-05,2.232967e-05,1.701231e-05,1.162876e-05,4.030463e-05,0.0002088749,2.78888e-05,...,0.0001082445,6.505713e-05,0.0001631155,9.76667e-05,4.071283e-05,5.319896e-05,4.645208e-05,2.006835e-05,0.0001106104,3.298994e-05
EXTL2,1.783812e-08,5.098728e-08,2.457041e-08,9.338036e-08,3.330273e-08,2.467961e-08,4.5231e-08,2.422341e-08,4.623898e-09,1.275452e-07,...,2.718584e-07,1.578006e-07,3.989572e-08,3.711081e-08,3.695238e-08,2.238947e-09,2.339493e-08,2.637933e-08,4.100492e-08,3.132873e-08


In [66]:
tf_tg.describe()

Unnamed: 0,ALX4,POU3F1,FOXJ3,MAX,HSF2,ZFP3,GZF1,GLI2,HOXB7,HSF1,...,RELA,ARID5A,PBX2,MLXIPL,MEF2C,HOXB4,MSX2,TCF12,MECOM,NFATC4
count,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,...,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0,14842.0
mean,9.2e-05,8.2e-05,0.000126,9.7e-05,0.000117,0.000162,0.000139,0.000156,0.00016,0.000134,...,0.000102,0.000252,0.000231,0.00014,0.00011,0.00012,6.9e-05,0.000112,0.000176,0.000171
std,0.000239,0.000232,0.000347,0.000249,0.000314,0.000423,0.00034,0.00041,0.000414,0.000335,...,0.000258,0.000628,0.00056,0.000363,0.000282,0.000313,0.000179,0.000308,0.000439,0.000466
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.6e-05,4.3e-05,6.2e-05,6.4e-05,6.8e-05,8.6e-05,8.9e-05,8.4e-05,9.3e-05,8.5e-05,...,6.5e-05,0.000141,0.000152,8.6e-05,6.5e-05,7.8e-05,4.3e-05,6e-05,0.000108,8e-05
max,0.004768,0.005994,0.007556,0.003941,0.004508,0.007029,0.0058,0.006717,0.005561,0.00443,...,0.003547,0.008319,0.010202,0.007099,0.006483,0.004757,0.002869,0.004791,0.008357,0.007134


In [67]:
(tf_tg == 0).mean().mean()   # sparsity

0.5121945243781431

In [68]:
top_links = (
    tf_tg.stack()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

top_links.columns = ["TG", "TF", "Score"]
top_links

Unnamed: 0,TG,TF,Score
0,FOS,EGR1,0.015604
1,RPL29,TCF7L1,0.013063
2,RPL29,HOXB3,0.012511
3,FOS,JUN,0.012464
4,FOS,FOSB,0.011517
5,RPL29,ESRRB,0.011421
6,RPL29,ARNT2,0.011373
7,RPL29,SOX6,0.011149
8,RPL29,ZNF423,0.010684
9,RPL29,SOX4,0.010506


In [69]:
best_tf_per_gene = tf_tg.idxmax(axis=1).to_frame("Best_TF")
best_tf_per_gene["Score"] = tf_tg.max(axis=1)

best_tf_per_gene.head()

Unnamed: 0,Best_TF,Score
LRRC39,TRIM28,9.58808e-05
SLC35A3,IRF6,0.0005017259
EXTL2,SMAD3,3.750098e-07
CDC14A,MXI1,0.0007095962
SLC30A7,NRF1,2.850094e-07


### 4.2 Other files
- TF-RE : `cell_population_TF_RE_binding.txt`
- TF-RE binding split by chr : `chr*_cell_population_TF_RE_binding.txt`
- RE-TG : `cell_population_cis_regulatory.txt`
- TF-TG : `cell_population_trans_regulatory`
<br><br>
- `Region.bed` : [prior knowledge] list of regulatory elements (genomic coordinates)
- `Region_overlap_chr*.bed` : overlap of regions with chromosome-specific elements

In [70]:
!ls LINGER_output_baseline

 Region.bed
 Region_overlap_chr1.bed
 Region_overlap_chr10.bed
 Region_overlap_chr11.bed
 Region_overlap_chr12.bed
 Region_overlap_chr13.bed
 Region_overlap_chr14.bed
 Region_overlap_chr15.bed
 Region_overlap_chr16.bed
 Region_overlap_chr17.bed
 Region_overlap_chr18.bed
 Region_overlap_chr19.bed
 Region_overlap_chr2.bed
 Region_overlap_chr20.bed
 Region_overlap_chr21.bed
 Region_overlap_chr22.bed
 Region_overlap_chr3.bed
 Region_overlap_chr4.bed
 Region_overlap_chr5.bed
 Region_overlap_chr6.bed
 Region_overlap_chr7.bed
 Region_overlap_chr8.bed
 Region_overlap_chr9.bed
 Region_overlap_chrX.bed
'box_plot_ATF1_activity_naive CD4 T cells_Others.png'
'box_plot_ATF1_expression_naive CD4 T cells_Others.png'
'box_plot_FOXN1_activity_naive CD4 T cells_Others.png'
'box_plot_FOXN1_expression_naive CD4 T cells_Others.png'
'box_plot_NPAS2_activity_naive CD4 T cells_Others.png'
'box_plot_NPAS2_expression_naive CD4 T cells_Others.png'
'box_plot_SMAD3_activity_naive CD4 T cells_Others.png'
'box_plot_S

In [71]:
# Region.bed is the list of all regulatory elements (REs) = row in the df
df = pd.read_csv("LINGER_output_baseline/Region.bed", sep="\t", header=None)
df.columns = ["chrom", "start", "end"]
print(df.head(n=10))

# we can see that row 5 :                       	chr1 	629721 	630172
# appears in the cell below, overlapping with       chr1 	629715 	630715
df.shape

  chrom   start     end
0  chr1   10109   10357
1  chr1  180730  181630
2  chr1  191491  191736
3  chr1  267816  268196
4  chr1  586028  586373
5  chr1  629721  630172
6  chr1  633793  634264
7  chr1  777634  779926
8  chr1  816881  817647
9  chr1  819912  823500


(107174, 3)

In [72]:
# overlapping regulatory regions on chromosome 1
df = pd.read_csv("LINGER_output_baseline/Region_overlap_chr1.bed", sep="\t", header=None)
df.columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
print(df.head())

# from 107174 REs to 78873 overlapping REs
df.shape

  chrom1  start1    end1 chrom2  start2    end2
0   chr1  629715  630715   chr1  629721  630172
1   chr1  630021  630050   chr1  629721  630172
2   chr1  630061  630170   chr1  629721  630172
3   chr1  633715  634715   chr1  633793  634264
4   chr1  633741  633810   chr1  633793  634264


(78873, 6)