# A template notebook to run mFinder from Uri Alon's lab

- mFinder is a package to compute network motifs, a pattern of motifs that is over-represented than randomly permuted network.

- Designed for Windows machine, but can run on Linux environment.
- mfinder 1.21 is used here.

Last updated: 09/19/2023

Author: Yang-Joon Kim



In [2]:
# 0. Import
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns


In [3]:
import celloracle as co
co.__version__

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:
INFO:matplotlib.font_manager:Failed to extract font properties from /usr/share/fonts/google-noto-emoji/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)


'0.14.0'

In [4]:
# visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

# Main Command - using for loop to run mFinder for all cell-types and all timepoints(stages)

In [76]:
# Change the current working directory
#os.chdir("/hpc/projects/data.science/yangjoon.kim/github_repos/mfinder/mfinder1.21/")
#! export PATH=/hpc/projects/data.science/yangjoon.kim/github_repos/mfinder/mfinder1.21:$PATH
#os.system("export PATH=/hpc/projects/data.science/yangjoon.kim/github_repos/mfinder/mfinder1.21:$PATH")


In [13]:
# set the mfinder path in the PATH variable
mfinder_path = "/hpc/projects/data.science/yangjoon.kim/github_repos/mfinder/mfinder1.21"

cmd = "export PATH="+mfinder_path+":$PATH"
cmd

os.system(cmd)

0

## Step 1. Define the master adata object with all timepoints

- 

In [12]:
# move to the directory where the output files will be saved (motif txt)
filepath = "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/09_network_motifs/"
os.chdir(filepath)

In [8]:
# Load the zebrahub early timepoints
zebrahub = sc.read_h5ad("/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/annotated_data/ZF_atlas_v01/ZF_atlas_v01.h5ad")
zebrahub

AnnData object with n_obs × n_vars = 38189 × 27877
    obs: 'n_genes', 'n_counts', '10X_run', 'fish', 'fish_n', 'timepoint', 'seqrun', 'log_ncounts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_nc', 'pct_counts_nc', 'cell_annotation', 'global_annotation'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'nc', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'fish_colors', 'global_annotation_colors', 'hvg', 'neighbors', 'pca', 'timepoint_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [10]:
zebrahub.obs.timepoint.unique()

['10somite', '0 budstage', '20somite', '30somite', '5somite', '15somite']
Categories (6, object): ['0 budstage', '5somite', '10somite', '15somite', '20somite', '30somite']

In [11]:
timepoints = ['0 budstage','5somite','10somite','15somite','20somite','30somite']
timepoints

['0 budstage', '5somite', '10somite', '15somite', '20somite', '30somite']

In [16]:
stage = timepoints[0]

GRN_stage = co.load_hdf5(links_filepath + "08_"+stage+"_celltype_GRNs.celloracle.links")
GRN_stage.filtered_links

{'Differentiating_Neurons':          source   target  coef_mean  coef_abs             p      -logp
 86657     nhlh2      dla   0.143526  0.143526  1.248927e-17  16.903463
 34191     foxd2     btg1   0.129191  0.129191  4.572162e-10   9.339878
 86689    zbtb18      dla   0.127046  0.127046  1.403359e-13  12.852831
 110654  uncx4.1    fgf8a   0.111938  0.111938  4.035099e-13  12.394146
 80220   uncx4.1  cyp26c1   0.106434  0.106434  1.181142e-17  16.927698
 ...         ...      ...        ...       ...           ...        ...
 232407    meis3    olig4   0.023177  0.023177  1.894128e-08   7.722591
 169224   hoxc1a     irx7  -0.023163  0.023163  4.268921e-08   7.369682
 342603    mafbb     tmsb   0.023161  0.023161  7.182542e-09   8.143722
 16973     tlx3b    apoc1   0.023159  0.023159  4.092135e-04   3.388050
 154290     arxb   hoxb1b  -0.023143  0.023143  7.714583e-09   8.112688
 
 [2000 rows x 6 columns],
 'Endoderm':         source   target  coef_mean  coef_abs             p      -log

In [17]:
filepath

'/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/09_network_motifs/'

In [None]:
# Run mFinder for all timepoints and all cell-types

links_filepath = "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/"

for stage in timepoints:
    # import the Links object
    GRN_stage = co.load_hdf5(links_filepath + "08_"+stage+"_celltype_GRNs.celloracle.links")
    # grab only the filtered GRNs (2000 edges for each GRN)
    GRN_all = GRN_stage.filtered_links
    
    # for each cell-type, run mFinder
    for celltype in GRN_all.keys():

        print(celltype)

        # Step 1. subset the GRN for each celltype
        GRN_celltype = GRN_all[celltype]

        # Step 2. reformat the GRN (unique integers for gene_names)    
        list_genes_TFs = list(set(GRN_celltype.source).union(set(GRN_celltype.target)))
        # Create a dictionary mapping integers to gene names
        gene_dict = {index: gene_name for index, gene_name in enumerate(list_genes_TFs)}
        gene_dict

        # Now, we will reformat the GRN as described above
        # 1) grab the GRN, then extract the "source", "target", and create a dataframe
        # 2) add the "edge weight" as "1" for the third column
        df_mfinder = pd.DataFrame(columns =["source", "target", "edge_weight"])
        df_mfinder

        df_mfinder["source"] = GRN_celltype["source"]
        df_mfinder["target"] = GRN_celltype["target"]
        df_mfinder["edge_weight"] = 1

        df_mfinder

        # 3) convert the "source", "target" gene_names to "integers" using the gene_dict
        df_mfinder["source"] = df_mfinder["source"].map({v: k for k, v in gene_dict.items()})
        df_mfinder["target"] = df_mfinder["target"].map({v: k for k, v in gene_dict.items()})
        df_mfinder
        # save the reformatted GRN into a txt file
        df_mfinder.to_csv(filepath + "filtered_GRN_"+stage+"_"+celltype+"_mfinder_format.txt",
                          sep="\t", header=False, index=False)

        # Step 3. run mFinder
        # default setting (network_size=3, num_random_)
        # input filename
        input_file = "filtered_GRN_"+stage+"_"+celltype+"_mfinder_format.txt"
        # output filename
        output_file = "motifs_"+stage+"_"+celltype

        # define the mFinder command
        cmd = "mfinder "+input_file + " -f "+output_file
        cmd
        # run mFinder
        os.system(cmd)
        print(stage + "_" + celltype + " network motif computing completed")

Differentiating_Neurons
Usage : mfinder <Network input file name> -s <motif size> -r <no. of randon networks> [-f <output file name>] [more flags]

	-s <motif size>  :Motif size to search
	-r <rand net num> :Number of random networks to generate
	-f <output file name>  : Output file name
	-nd : Input network is a non-directed network.
	-p <num of samples>: run with Sampling method,
	-omem : output members list of all subgraphs
	-h : help

	Additional flags:

	Motif criteria flags:
	-m <value> : mfactor threshold to use when calculating motifs
	-z <value> : Z-score threshold to use when calculating motifs
	-u : Uniqueness threshold
	-nu : Dont count uniqueness and ignore uniqueness threshold

	Random networks randomization flags:

	-rs : use stubs method for generating random networks
	-rclust : Preserve clustering sequence in random networks
	-met :Use Metropolis algorithm to conserve triad-census
		in random networks
		(for s>3; Default : Do not use Metropolis)
	-t0 <(default 0.001)> 

In [6]:
# move to the directory where the output files will be saved
filepath = "/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/baseGRN_CisBP_RNA_zebrahub/09_network_motifs/"
os.chdir(filepath)

In [10]:
# import the Links object
GRN_links_TDR118 = co.load_hdf5("/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/data/processed_data/TDR118_cicero_output/08_TDR118_celltype_GRNs.celloracle.links")
GRN_links_TDR118

# grab only the filtered object (2000 edges)
GRN_all = GRN_links_TDR118.filtered_links

NameError: name 'stage' is not defined

In [11]:
# Run mFinder for each timepoint, for each celltype, then save the result to the "filepath"

for timepoints in stages:

    for celltype in GRN_all.keys():

        print(celltype)

        # Step 1. subset the GRN for each celltype
        GRN_celltype = GRN_all[celltype]

        # Step 2. reformat the GRN (unique integers for gene_names)    
        list_genes_TFs = list(set(GRN_celltype.source).union(set(GRN_celltype.target)))
        # Create a dictionary mapping integers to gene names
        gene_dict = {index: gene_name for index, gene_name in enumerate(list_genes_TFs)}
        gene_dict

        # Now, we will reformat the GRN as described above
        # 1) grab the GRN, then extract the "source", "target", and create a dataframe
        # 2) add the "edge weight" as "1" for the third column
        df_mfinder = pd.DataFrame(columns =["source", "target", "edge_weight"])
        df_mfinder

        df_mfinder["source"] = GRN_celltype["source"]
        df_mfinder["target"] = GRN_celltype["target"]
        df_mfinder["edge_weight"] = 1

        df_mfinder

        # 3) convert the "source", "target" gene_names to "integers" using the gene_dict
        df_mfinder["source"] = df_mfinder["source"].map({v: k for k, v in gene_dict.items()})
        df_mfinder["target"] = df_mfinder["target"].map({v: k for k, v in gene_dict.items()})
        df_mfinder
        # save the reformatted GRN into a txt file
        df_mfinder.to_csv(filepath + "filtered_GRN_"+celltype+"_mfinder_format.txt",
                          sep="\t", header=False, index=False)

        # Step 3. run mFinder
        # default setting (network_size=3, num_random_)
        # input filename
        input_file = "filtered_GRN_"+celltype+"_mfinder_format.txt"
        # output filename
        output_file = "motifs_" + celltype

        # define the mFinder command
        cmd = "mfinder "+input_file + " -f "+output_file
        cmd
        # run mFinder
        os.system(cmd)
        print(celltype + " network motif computing completed")

Adaxial_Cells
Input Network file is filtered_GRN_Adaxial_Cells_mfinder_format.txt
mfinder Version 1.20

Loading Network
	Reading Network file in <Source,Target,Weight> Format
Searching motifs size 3
Processing Real network...

 (Real network processing runtime was:    1.0 seconds.)
Processing Random networks
..........
 Estimated run time left :     18 seconds.

..........
 Estimated run time left :     20 seconds.

..........
 Estimated run time left :     19 seconds.

..........
 Estimated run time left :     16 seconds.

..........
 Estimated run time left :     14 seconds.

..........
 Estimated run time left :     11 seconds.

..........
 Estimated run time left :      9 seconds.

..........
 Estimated run time left :      6 seconds.

..........
 Estimated run time left :      3 seconds.

..........
 Estimated run time left :      0 seconds.


Calculating Results...

MOTIF FINDER RESULTS:

	Network name: filtered_GRN_Adaxial_Cells_mfinder_format.txt
	Network type: Directed
	Num of

In [None]:
# for celltype in GRN_all.keys():
    
#     print(celltype)
    
#     # Step 3. run mFinder
#     # default setting (network_size=3, num_random_)
#     # input filename
#     input_file = "filtered_GRN_"+celltype+"_mfinder_format.txt"
#     # output filename
#     output_file = "motifs_" + celltype

#     # define the mFinder command
#     cmd = "mfinder "+input_file + " -f "+output_file
#     cmd
#     # run mFinder
#     os.system(cmd)

NOTES
- Refer to mFinder documentation (Uri Alon's lab website)
- link: 

1) Use "screen"
2) the motif computation takes less than a minute (with the default settings, network_size=3, for a GRN with 2000 edges, from CellOracle).
3) however, if we increase the network_size=4, the runtime increased to 20 minutes, for the same input dataset.
