# Notebook to extract nodes corresponding to network motifs from a GRN (and mFinder results)

- Last updated: 10/4/2023
- Author: Yang-Joon Kim

- inputs: a GRN (CellOracle Links object), and a mFinder result (network motifs, a txt file)
- outputs(TBD): a dataframe of nodes, edges for corresponding motifs

- Tentative df-  columns: index_triplet, source_node, target_node, edge width, motif_name


In [1]:
# Import libraries
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

import celloracle as co
co.__version__

from itertools import combinations

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:
INFO:matplotlib.font_manager:Failed to extract font properties from /usr/share/fonts/google-noto-emoji/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)


'0.14.0'

In [4]:
# plotting module
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault) #Reset rcParams to default

# Editable text and proper LaTeX fonts in illustrator
# matplotlib.rcParams['ps.useafm'] = True
# Editable fonts. 42 is the magic number
mpl.rcParams['pdf.fonttype'] = 42
sns.set(style='whitegrid', context='paper')

# Plotting style function (run this before plotting the final figure)
def set_plotting_style():
    plt.style.use('seaborn-paper')
    plt.rc('axes', labelsize=12)
    plt.rc('axes', titlesize=12)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.rc('legend', fontsize=10)
    plt.rc('text.latex', preamble=r'\usepackage{sfmath}')
    plt.rc('xtick.major', pad=2)
    plt.rc('ytick.major', pad=2)
    plt.rc('mathtext', fontset='stixsans', sf='sansserif')
    plt.rc('figure', figsize=[10,9])
    plt.rc('svg', fonttype='none')

set_plotting_style()

In [166]:
# import a custom function to process the mFinder OUTPUT file (txt)
sys.path.append("/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/zebrahub-multiome-analysis/scripts/")
from extract_motifs_mFinder_output import extract_motifs_mFinder_output

In [164]:
help(extract_motifs_mFinder_output)

Help on function extract_motifs_mFinder_output in module extract_motifs_mFinder_output:

extract_motifs_mFinder_output(mFinder_output)



# Section 1. extract nodes from one GRN 

- We took the GRN and motif for "0budstage_Somites" (It's always a combination of timepoint and cell-type).
- Once we establish a pipeline/workflow for extracting nodes for one GRN, we will script it, and scale it up for all GRNs.

In [261]:
mFinder_output = "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/09_network_motifs/motifs_0budstage_Somites_OUT.txt"
df_motifs = extract_motifs_mFinder_output(mFinder_output)
df_motifs

MOTIF	NREAL	NRAND		NREAL	NREAL	UNIQ	CREAL
ID		STATS		ZSCORE	PVAL	VAL	[MILI]

46	675	401.4+-19.6	13.99	0.000	9	13.73

0 1 1
1 0 1
0 0 0

108	100	64.2+-8.6	4.15	0.000	8	2.03

0 0 1
1 0 1
1 0 0

238	16	6.1+-2.3	4.30	0.000	4	0.33

0 1 1
1 0 1
1 1 0




Unnamed: 0,MOTIF_ID,NREAL,NRAND_STATS,NREAL_ZSCORE,NREAL_PVAL,UNIQ_VAL,CREAL_[MILI],motifs
0,46,675,401.4+-19.6,13.99,0.0,9,13.73,"[[0, 1, 1], [1, 0, 1], [0, 0, 0]]"
1,108,100,64.2+-8.6,4.15,0.0,8,2.03,"[[0, 0, 1], [1, 0, 1], [1, 0, 0]]"
2,238,16,6.1+-2.3,4.3,0.0,4,0.33,"[[0, 1, 1], [1, 0, 1], [1, 1, 0]]"


## load the GRN

In [148]:
filepath_GRN = "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/08_0budstage_celltype_GRNs.celloracle.links"
GRN = co.load_hdf5(filepath_GRN)

<celloracle.network_analysis.links_object.Links at 0x150d97cfce20>

In [151]:
celltype = "Somites"
GRN_celltype = GRN.filtered_links[celltype]
GRN_celltype

Unnamed: 0,source,target,coef_mean,coef_abs,p,-logp
358430,foxd2,ved,-0.201453,0.201453,8.021183e-16,15.095762
197037,dmrt2a,mespab,0.198571,0.198571,1.016912e-16,15.992717
196815,hoxc6b,mespaa,0.196897,0.196897,7.465155e-17,16.126961
332389,dmrt2a,tbx6,0.190522,0.190522,5.323086e-15,14.273836
196758,hoxb3a,meox1,0.186918,0.186918,1.006142e-12,11.997341
...,...,...,...,...,...,...
87037,barx1,dlc,0.037743,0.037743,5.479247e-11,10.261279
285163,alx4a,sema3aa,0.037727,0.037727,8.268290e-13,12.082584
22938,tbx16,aspm,0.037720,0.037720,1.547360e-16,15.810409
154452,nr2f1a,hoxb3a,0.037703,0.037703,2.981958e-06,5.525498


In [155]:
source_nodes = set(GRN_celltype.source)
target_nodes = set(GRN_celltype.target)

In [159]:
gene_nodes = source_nodes|target_nodes 
len(gene_nodes)

331

In [167]:
print(len(source_nodes))
print(len(target_nodes))
print(len(source_nodes & target_nodes))
print(len(source_nodes | target_nodes))

167
215
51
331


In [169]:
GRN_celltype

Unnamed: 0,source,target,coef_mean,coef_abs,p,-logp
358430,foxd2,ved,-0.201453,0.201453,8.021183e-16,15.095762
197037,dmrt2a,mespab,0.198571,0.198571,1.016912e-16,15.992717
196815,hoxc6b,mespaa,0.196897,0.196897,7.465155e-17,16.126961
332389,dmrt2a,tbx6,0.190522,0.190522,5.323086e-15,14.273836
196758,hoxb3a,meox1,0.186918,0.186918,1.006142e-12,11.997341
...,...,...,...,...,...,...
87037,barx1,dlc,0.037743,0.037743,5.479247e-11,10.261279
285163,alx4a,sema3aa,0.037727,0.037727,8.268290e-13,12.082584
22938,tbx16,aspm,0.037720,0.037720,1.547360e-16,15.810409
154452,nr2f1a,hoxb3a,0.037703,0.037703,2.981958e-06,5.525498


In [258]:
# From ChatGPT

# Load your GRN dataframe
# df = pd.read_csv("path_to_your_GRN.csv")  # replace with your file path
df = GRN_celltype

# Convert the dataframe into a set of directed edges for easy querying
edges = set(tuple(row) for row in df[['source', 'target']].values)


#df_motifs[['MOTIF_ID', 'motifs']]

# Define your motifs
motif_matrices = [
    [[0, 1, 1], [1, 0, 1], [0, 0, 0]],
    [[0, 0, 1], [1, 0, 1], [1, 0, 0]],
    [[0, 1, 1], [1, 0, 1], [1, 1, 0]]
]

# Function to get adjacency matrix for a triplet from the GRN
def get_adjacency_matrix(triplet, edges):
    matrix = np.zeros((3, 3))
    for i in range(3):
        for j in range(3):
            if (triplet[i], triplet[j]) in edges:
                matrix[i][j] = 1
    return matrix.tolist()

# # Check each triplet in the GRN against the motifs
# instances = {}
# for idx, motif in enumerate(motif_matrices):
#     instances[idx] = []

#     for triplet in combinations(df['source'].unique(), 3):
#         matrix = get_adjacency_matrix(triplet, edges)
#         if matrix == motif:
#             instances[idx].append(triplet)

# Union of unique 'source' and 'target' nodes
all_unique_nodes = set(df['source'].unique()).union(set(df['target'].unique()))

# Check each triplet in the GRN against the motifs
instances = {}
for idx, motif in enumerate(motif_matrices):
    instances[idx] = []

    for triplet in combinations(all_unique_nodes, 3):
        matrix = get_adjacency_matrix(triplet, edges)
        if matrix == motif:
            instances[idx].append(triplet)

# Convert instances into a dataframe
rows = []
for idx, instance_list in instances.items():
    for instance in instance_list:
        rows.append({'MOTIF_ID': idx, 'INSTANCE': instance})

instances_df = pd.DataFrame(rows)

print(instances_df)

     MOTIF_ID                    INSTANCE
0           0  (sox11b, hoxb3a, gadd45ba)
1           0      (raraa, hoxb6b, sulf1)
2           0    (raraa, hoxb6b, ripply2)
3           0      (raraa, hoxb6b, msgn1)
4           0      (raraa, hoxb6b, tob1a)
..        ...                         ...
325         2       (lbx2, meox1, hoxc6b)
326         2       (lbx2, meox1, hoxb3a)
327         2      (lbx2, hoxc6b, hoxc3a)
328         2    (hoxc6b, hoxb7a, hoxb9a)
329         2     (hoxa9a, hoxb10a, vent)

[330 rows x 2 columns]


In [267]:
instances_df["MOTIF_ID"].unique()

array([0, 1, 2])

In [270]:
dict_motif_id = dict(zip(instances_df["MOTIF_ID"].unique(), df_motifs["MOTIF_ID"]))
dict_motif_id

{0: '46', 1: '108', 2: '238'}

In [199]:
# dict_motif_id = {0:"46",
#                  1:"108",
#                  2:"238"}

instances_df.MOTIF_ID = instances_df.MOTIF_ID.map(dict_motif_id)
instances_df

Unnamed: 0,MOTIF_ID,INSTANCE
0,46,"(sox11b, hoxb3a, gadd45ba)"
1,46,"(raraa, hoxb6b, sulf1)"
2,46,"(raraa, hoxb6b, ripply2)"
3,46,"(raraa, hoxb6b, msgn1)"
4,46,"(raraa, hoxb6b, tob1a)"
...,...,...
325,238,"(lbx2, meox1, hoxc6b)"
326,238,"(lbx2, meox1, hoxb3a)"
327,238,"(lbx2, hoxc6b, hoxc3a)"
328,238,"(hoxc6b, hoxb7a, hoxb9a)"


In [193]:
instances_df.value_counts("MOTIF_ID")

MOTIF_ID
1    43
0    26
2    16
dtype: int64

In [203]:
instances_df.to_csv("/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/10_extracted_nodes_motifs/motifs_node_genes_0budstage_Somites.csv")


In [190]:
GRN_celltype[GRN_celltype.source.isin(["hoxc6b"])]

Unnamed: 0,source,target,coef_mean,coef_abs,p,-logp
196815,hoxc6b,mespaa,0.196897,0.196897,7.465155e-17,16.126961
196962,hoxc6b,mespab,0.146066,0.146066,1.072631e-13,12.969550
113871,hoxc6b,fn1a,0.140702,0.140702,2.977078e-17,16.526210
156109,hoxc6b,hoxc3a,0.109417,0.109417,1.386627e-17,16.858040
8743,hoxc6b,aldh1a2,0.105079,0.105079,8.696644e-15,14.060648
...,...,...,...,...,...,...
147163,hoxc6b,her2,0.042444,0.042444,8.353280e-13,12.078143
150082,hoxc6b,hhip,0.041315,0.041315,2.799864e-14,13.552863
154883,hoxc6b,hoxb6b,0.041053,0.041053,3.281597e-12,11.483915
196329,hoxc6b,meis2a,-0.040498,0.040498,2.947274e-18,17.530579


In [192]:
# Check the 3-size subgraph to make sure that we're detecting real motifs
GRN_celltype[GRN_celltype.source.isin(["hoxc6b","meox1","foxc1a"]) & GRN_celltype.target.isin(["hoxc6b","meox1","foxc1a"])]

Unnamed: 0,source,target,coef_mean,coef_abs,p,-logp
115685,meox1,foxc1a,0.111256,0.111256,1.320519e-16,15.879255
196659,hoxc6b,meox1,0.072215,0.072215,3.578723e-10,9.446272
115594,hoxc6b,foxc1a,0.055481,0.055481,1.162409e-09,8.934641
156420,meox1,hoxc6b,0.039263,0.039263,1.769696e-10,9.752101


# Step 2. extract the motif information from all GRNs (timepoints + cell-types)

- 

In [231]:
all_files = os.listdir("/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/09_network_motifs/")

filtered_files = [f for f in all_files if 'motifs' in f]
print(f"Number of files containing 'motifs': {len(filtered_files)}")
filtered_files.sort() # sort numerically (timepoints), and alphabetically (cell-types)

# Sets to store unique timepoints and cell-types
timepoints = set()
cell_types = set()

for filename in filtered_files:
    segments = filename.split('_')[1:-1]  # Splitting filename and excluding 'motifs' and 'OUT.txt'
    timepoints.add(segments[0])  # The timepoint is the second segment
    cell_type = '_'.join(segments[1:])  # Joining the segments to get the cell-type
    cell_types.add(cell_type)

# Convert sets to lists
timepoints = list(timepoints)
cell_types = list(cell_types)

# sort the timepoints list from early to late timepoints
def extract_numeric(s):
    # Extracting digits from the string
    return int(''.join(filter(str.isdigit, s)))

timepoints = sorted(timepoints, key=extract_numeric)

# sort the cell-types (based on the pseudotime/RNA velocity cell-cell transition graphs)
cell_types = ["Neural_Crest","Neural_Anterior","Differentiating_Neurons","Neural_Posterior","NMPs",
              "PSM","Somites","Muscle","Adaxial_Cells","Lateral_Mesoderm","Endoderm",
              "Germline","Epidermal","Notochord",
              "unassigned"]

print("Timepoints:", timepoints)
print("Cell Types:", cell_types)

Number of files containing 'motifs': 80
Timepoints: ['0budstage', '5somite', '10somite', '15somite', '20somite', '30somite']
Cell Types: ['Neural_Crest', 'Neural_Anterior', 'Differentiating_Neurons', 'Neural_Posterior', 'NMPs', 'PSM', 'Somites', 'Muscle', 'Adaxial_Cells', 'Lateral_Mesoderm', 'Endoderm', 'Germline', 'Epidermal', 'Notochord', 'unassigned']


In [249]:
# define the filepath where all the mFinder output files are saved
mfinder_output_path = "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/09_network_motifs/"

# define the master dataframe
df_motifs = pd.DataFrame(columns=['MOTIF_ID', 'NREAL', 'NRAND_STATS', 'NREAL_ZSCORE', 'NREAL_PVAL',
       'UNIQ_VAL', 'CREAL_[MILI]', 'motifs', 'timepoint', 'cell-type'])

for stage in timepoints:
    for celltype in cell_types:
        # define the filename
        filename = "motifs"+"_"+stage+"_"+celltype+"_OUT.txt"
        # if the file exists, then extract the information
        if filename in filtered_files:
            df_temp = extract_motifs_mFinder_output(mfinder_output_path + filename)
            
            # Add timepoints and cell-types into the dataframe
            df_temp["timepoint"] = stage
            df_temp["cell-type"] = celltype
            
            # Append the dataframe to the master dataframe
            df_motifs.append(df_temp)
df_motifs
            

MOTIF	NREAL	NRAND		NREAL	NREAL	UNIQ	CREAL
ID		STATS		ZSCORE	PVAL	VAL	[MILI]

46	675	401.4+-19.6	13.99	0.000	9	13.73

0 1 1
1 0 1
0 0 0

108	100	64.2+-8.6	4.15	0.000	8	2.03

0 0 1
1 0 1
1 0 0

238	16	6.1+-2.3	4.30	0.000	4	0.33

0 1 1
1 0 1
1 1 0


MOTIF	NREAL	NRAND		NREAL	NREAL	UNIQ	CREAL
ID		STATS		ZSCORE	PVAL	VAL	[MILI]

46	675	401.4+-19.6	13.99	0.000	9	13.73

0 1 1
1 0 1
0 0 0

108	100	64.2+-8.6	4.15	0.000	8	2.03

0 0 1
1 0 1
1 0 0

238	16	6.1+-2.3	4.30	0.000	4	0.33

0 1 1
1 0 1
1 1 0


MOTIF	NREAL	NRAND		NREAL	NREAL	UNIQ	CREAL
ID		STATS		ZSCORE	PVAL	VAL	[MILI]

46	675	401.4+-19.6	13.99	0.000	9	13.73

0 1 1
1 0 1
0 0 0

108	100	64.2+-8.6	4.15	0.000	8	2.03

0 0 1
1 0 1
1 0 0

238	16	6.1+-2.3	4.30	0.000	4	0.33

0 1 1
1 0 1
1 1 0


MOTIF	NREAL	NRAND		NREAL	NREAL	UNIQ	CREAL
ID		STATS		ZSCORE	PVAL	VAL	[MILI]

46	675	401.4+-19.6	13.99	0.000	9	13.73

0 1 1
1 0 1
0 0 0

108	100	64.2+-8.6	4.15	0.000	8	2.03

0 0 1
1 0 1
1 0 0

238	16	6.1+-2.3	4.30	0.000	4	0.33

0 1 1
1 0 1
1 1 0


MOTIF	NREAL	

Unnamed: 0,MOTIF_ID,NREAL,NRAND_STATS,NREAL_ZSCORE,NREAL_PVAL,UNIQ_VAL,CREAL_[MILI],motifs,timepoint,cell-type


In [250]:
df_temp

Unnamed: 0,MOTIF_ID,NREAL,NRAND_STATS,NREAL_ZSCORE,NREAL_PVAL,UNIQ_VAL,CREAL_[MILI],motifs,timepoint,cell-type
0,46,675,401.4+-19.6,13.99,0.0,9,13.73,"[[0, 1, 1], [1, 0, 1], [0, 0, 0]]",30somite,unassigned
1,108,100,64.2+-8.6,4.15,0.0,8,2.03,"[[0, 0, 1], [1, 0, 1], [1, 0, 0]]",30somite,unassigned
2,238,16,6.1+-2.3,4.3,0.0,4,0.33,"[[0, 1, 1], [1, 0, 1], [1, 1, 0]]",30somite,unassigned


In [253]:
df_motifs

Unnamed: 0,MOTIF_ID,NREAL,NRAND_STATS,NREAL_ZSCORE,NREAL_PVAL,UNIQ_VAL,CREAL_[MILI],motifs,timepoint,cell-type


In [256]:
df_temp.append(df_motifs)

Unnamed: 0,MOTIF_ID,NREAL,NRAND_STATS,NREAL_ZSCORE,NREAL_PVAL,UNIQ_VAL,CREAL_[MILI],motifs,timepoint,cell-type
0,46,675,401.4+-19.6,13.99,0.0,9,13.73,"[[0, 1, 1], [1, 0, 1], [0, 0, 0]]",30somite,unassigned
1,108,100,64.2+-8.6,4.15,0.0,8,2.03,"[[0, 0, 1], [1, 0, 1], [1, 0, 0]]",30somite,unassigned
2,238,16,6.1+-2.3,4.3,0.0,4,0.33,"[[0, 1, 1], [1, 0, 1], [1, 1, 0]]",30somite,unassigned


In [254]:
df_motifs.append(df_temp, ignore_index=True)
df_motifs

Unnamed: 0,MOTIF_ID,NREAL,NRAND_STATS,NREAL_ZSCORE,NREAL_PVAL,UNIQ_VAL,CREAL_[MILI],motifs,timepoint,cell-type
