# Notebook to extract nodes corresponding to network motifs from a GRN (and mFinder results)

- Last updated: 9/28/2023
- Author: Yang-Joon Kim

- inputs: a GRN (CellOracle Links object) and an mFinder result (network motifs, a txt file)
- outputs (TBD): a dataframe of nodes and edges for the corresponding motifs

- Tentative dataframe columns: index_triplet, source_node, target_node, edge width, motif_name


In [1]:
# Import libraries
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

import celloracle as co
co.__version__

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:
INFO:matplotlib.font_manager:Failed to extract font properties from /usr/share/fonts/google-noto-emoji/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)


'0.14.0'

In [4]:
# Plotting configuration: reset matplotlib to its defaults first, then layer
# the notebook-wide settings on top so earlier customisations do not leak in.
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault) # reset rcParams to matplotlib's defaults

# Editable text and proper LaTeX fonts in Illustrator
# (alternative setting, currently disabled:)
# matplotlib.rcParams['ps.useafm'] = True
# Font type 42 keeps the text in exported PDFs editable
mpl.rcParams['pdf.fonttype'] = 42
# seaborn theme: white grid background, "paper" scaling context
sns.set(style='whitegrid', context='paper')

# Plotting style function (run this before plotting the final figure)
def set_plotting_style():
    """Apply the notebook's matplotlib style (run before plotting the final figure).

    Loads matplotlib's bundled seaborn "paper" style sheet and then overrides
    label/title/tick/legend font sizes, tick padding, the math-text font set,
    the default figure size and the SVG font handling through ``plt.rc``.

    Takes no arguments and returns nothing; it only modifies matplotlib's
    global rcParams.
    """
    # matplotlib 3.6 renamed its bundled seaborn style sheets from
    # 'seaborn-<name>' to 'seaborn-v0_8-<name>' and later dropped the old
    # names, so pick whichever name the installed version actually provides.
    if 'seaborn-v0_8-paper' in plt.style.available:
        plt.style.use('seaborn-v0_8-paper')
    else:
        plt.style.use('seaborn-paper')

    # Font sizes
    plt.rc('axes', labelsize=12)
    plt.rc('axes', titlesize=12)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.rc('legend', fontsize=10)

    # LaTeX preamble (only takes effect when LaTeX text rendering is enabled)
    plt.rc('text.latex', preamble=r'\usepackage{sfmath}')

    # Distance between major tick marks and their labels
    plt.rc('xtick.major', pad=2)
    plt.rc('ytick.major', pad=2)

    # Math text font, default figure size, and SVG text kept as text
    plt.rc('mathtext', fontset='stixsans', sf='sansserif')
    plt.rc('figure', figsize=[10,9])
    plt.rc('svg', fonttype='none')

set_plotting_style()

In [142]:
sys.path.append("/hpc/projects/data.science/yangjoon.kim/zebrahub_multiome/zebrahub-multiome-analysis/scripts/")
from extract_motifs_mFinder_output import extract_motifs_mFinder_output

In [143]:
help(extract_motifs_mFinder_output)

Help on function extract_motifs_mFinder_output in module extract_motifs_mFinder_output:

extract_motifs_mFinder_output(mFinder_output)



In [7]:
# NOTE(review): the help() output above reports the signature
# extract_motifs_mFinder_output(mFinder_output), i.e. one required argument,
# but this call passes none. This cell has a lower execution count than the
# import cell, so it presumably ran against an earlier version of the
# function — confirm and pass the mFinder output path/object if required.
df = extract_motifs_mFinder_output()

['\n',
 '   Summary motif results\n',
 'mfinder Version 1.20\n',
 '\n',
 'MOTIF FINDER RESULTS:\n',
 '\n',
 '\tNetwork name: filtered_GRN_0budstage_Somites_mfinder_format.txt\n',
 '\tNetwork type: Directed\n',
 '\tNum of Nodes: 330 Num of Edges: 2000\n',
 '\tNum of Nodes with edges: 330\n',
 '\tMaximal out degree (out-hub) : 76\n',
 '\tMaximal in degree (in-hub) : 43\n',
 '\tRoots num: 116 Leaves num: 163\n',
 '\tSingle Edges num: 1910 Mutual Edges num: 44\n',
 '\n',
 '\tMotif size searched  3\n',
 '\tTotal number of 3-node subgraphs : 49166\n',
 '\tNumber of random networks generated : 100\n',
 '\tRandom networks generation method: Switches\n',
 '\tNum of Switches range: 100.0-200.0, Success switches Ratio:0.556+-0.00\n',
 '\n',
 'The following motifs were found:\n',
 '\n',
 'Criteria taken : Nreal Zscore > 2.00\n',
 '                 Pval ignored (due to small number of random networks)\n',
 '                 Mfactor > 1.10\n',
 '                 Uniqueness >= 4\n',
 '\n',
 '\n',
 '\

In [90]:
# Pull the block describing the significant motifs (Z_score > 2) out of `lines`.

# Header line that opens a motif table. The same header also appears further
# down (for the insignificant motifs), so only its first occurrence is used
# to start collecting.
start_pattern = 'MOTIF\tNREAL\tNRAND\t\tNREAL\tNREAL\tUNIQ\tCREAL'

# Collected lines, seeded with the header line itself
extracted_lines = [start_pattern]

# Scan state: has the header been seen yet, and how many blank lines in a row
start_extraction = False
end_pattern_count = 0

for line in lines:
    stripped = line.strip()

    if not start_extraction:
        # Still looking for the header; nothing before it is collected
        if stripped == start_pattern:
            start_extraction = True
        continue

    # Past the header: keep every (whitespace-stripped) line, blank ones included
    extracted_lines.append(stripped)

    if stripped:
        # A non-empty line interrupts any run of blank lines
        end_pattern_count = 0
        continue

    # Two consecutive blank lines mark the end of the block
    end_pattern_count += 1
    if end_pattern_count == 2:
        break

# Show what was collected, one entry per output line
print("\n".join(extracted_lines))

MOTIF	NREAL	NRAND		NREAL	NREAL	UNIQ	CREAL
ID		STATS		ZSCORE	PVAL	VAL	[MILI]

46	675	401.4+-19.6	13.99	0.000	9	13.73

0 1 1
1 0 1
0 0 0

108	100	64.2+-8.6	4.15	0.000	8	2.03

0 0 1
1 0 1
1 0 0

238	16	6.1+-2.3	4.30	0.000	4	0.33

0 1 1
1 0 1
1 1 0




In [110]:
# Locate the per-motif statistics rows: keep the index of every extracted line
# that splits into exactly 7 tab-separated fields.
motif_line_indices = [index for index,line in enumerate(extracted_lines) if len(line.split('\t')) == 7]
motif_line_indices

[3, 9, 15]

In [100]:
# NOTE(review): `motifs` is not assigned in any cell visible in this export;
# presumably it was left in the kernel by a cell that was since removed — confirm.
motifs

[[['46', '675', '401.4+-19.6', '13.99', '0.000', '9', '13.73']],
 [['108', '100', '64.2+-8.6', '4.15', '0.000', '8', '2.03']],
 [['238', '16', '6.1+-2.3', '4.30', '0.000', '4', '0.33']]]

In [120]:
# Text rows of an example motif matrix (each row keeps its trailing " \n")
data = ['0 1 1 \n', '1 0 1 \n', '0 0 0 \n']
data

['0 1 1 \n', '1 0 1 \n', '0 0 0 \n']

In [122]:
# NOTE(review): the only visible assignment of `motif` is in a cell with a
# higher execution count than this one; this output presumably comes from an
# earlier kernel state — confirm.
motif

array([[0, 1, 1],
       [1, 0, 1],
       [0, 0, 0]])

In [128]:
# str1 = 'MOTIF\tNREAL\tNRAND\t\tNREAL\tNREAL\tUNIQ\tCREAL\n'
# str2 = 'ID\t\tSTATS\t\tZSCORE\tPVAL\tVAL\t[MILI]\t\n'

# # Remove extra whitespace and newline characters
# str1 = str1.strip()
# str2 = str2.strip()

# Build a dataframe with one row per motif: the statistics printed by mFinder
# plus the motif's adjacency matrix as a NumPy array.

# The first two lines of the extracted text block are the column names,
# printed by mFinder as a two-row header.
str1 = extracted_lines[0]
str2 = extracted_lines[1]

# Merge the two header rows element-wise ("MOTIF" + "ID" -> "MOTIF_ID");
# when the lower row is empty the upper name is used on its own.
col_names = [em1 + ("_" + em2 if em2 else "") for em1, em2 in zip(str1.split("\t"), str2.split("\t"))]

# Drop the element that is an empty string (mFinder's mistake in formatting:
# the header contains one empty tab-separated field in both rows).
col_names = [element for element in col_names if element]

# Statistics rows are the lines with exactly one tab-separated field per column.
motif_line_indices = [
    index for index, line in enumerate(extracted_lines)
    if len(line.split('\t')) == len(col_names)
]

stat_rows = []
motifs_list = []

for line_index in motif_line_indices:
    line = extracted_lines[line_index]
    stat_rows.append(line.split("\t"))

    # The adjacency matrix starts two lines below the statistics row (one
    # blank line in between) and runs up to the next blank line. Reading to
    # the blank line instead of taking a fixed 3-row slice keeps this working
    # for motif sizes other than 3.
    matrix_start = line_index + 2
    matrix_end = matrix_start
    while matrix_end < len(extracted_lines) and extracted_lines[matrix_end]:
        matrix_end += 1
    motif_matrice = extracted_lines[matrix_start:matrix_end]

    # Parse every text row into integers and stack them into a 2-D array
    rows = [list(map(int, row.split())) for row in motif_matrice]
    motif = np.array(rows)

    motifs_list.append(motif)

# Create the dataframe in one go rather than growing it row by row with .loc;
# all statistics are kept as the strings read from the file.
df = pd.DataFrame(stat_rows, columns=col_names)
df["motifs"] = motifs_list
df



Unnamed: 0,MOTIF_ID,NREAL,NRAND_STATS,NREAL_ZSCORE,NREAL_PVAL,UNIQ_VAL,CREAL_[MILI],motifs
0,46,675,401.4+-19.6,13.99,0.0,9,13.73,"[[0, 1, 1], [1, 0, 1], [0, 0, 0]]"
1,108,100,64.2+-8.6,4.15,0.0,8,2.03,"[[0, 0, 1], [1, 0, 1], [1, 0, 0]]"
2,238,16,6.1+-2.3,4.3,0.0,4,0.33,"[[0, 1, 1], [1, 0, 1], [1, 1, 0]]"


In [86]:
# # Initialize variables to store extracted lines
# extracted_lines = []
# start_pattern = 'MOTIF\tNREAL\tNRAND\t\tNREAL\tNREAL\tUNIQ\tCREAL'

# # Flag to indicate when to start and stop extraction
# start_extraction = False
# end_pattern_count = 0

# # Iterate through the lines
# for line in lines:
#     if start_extraction:
#         extracted_lines.append(line.strip())  # Add the stripped line to the result
#         if not line.strip():
#             end_pattern_count += 1
#             if end_pattern_count == 2:
#                 break  # Stop extraction when two consecutive empty lines are found
#         else:
#             end_pattern_count = 0  # Reset the count if a non-empty line is encountered
#     elif line.strip() == start_pattern:
#         start_extraction = True  # Start extraction when the start pattern is found

# # Print the extracted lines
# for extracted_line in extracted_lines:
#     print(extracted_line)

ID		STATS		ZSCORE	PVAL	VAL	[MILI]

46	675	401.4+-19.6	13.99	0.000	9	13.73

0 1 1
1 0 1
0 0 0

108	100	64.2+-8.6	4.15	0.000	8	2.03

0 0 1
1 0 1
1 0 0

238	16	6.1+-2.3	4.30	0.000	4	0.33

0 1 1
1 0 1
1 1 0




In [68]:
# Split each text row on whitespace, cast the tokens to int, and stack the
# resulting lists into a 2-D NumPy array.
rows = [[int(token) for token in row.split()] for row in data]
motif1 = np.array(rows)
motif1

array([[0, 1, 1],
       [1, 0, 1],
       [0, 0, 0]])

In [35]:
# The two header rows of a motif table, with surrounding whitespace and the
# trailing newline stripped right away.
str1 = 'MOTIF\tNREAL\tNRAND\t\tNREAL\tNREAL\tUNIQ\tCREAL\n'.strip()
str2 = 'ID\t\tSTATS\t\tZSCORE\tPVAL\tVAL\t[MILI]\t\n'.strip()

# Show the tab-separated fields of each header row
for header_row in (str1, str2):
    print(header_row.split("\t"))

['MOTIF', 'NREAL', 'NRAND', '', 'NREAL', 'NREAL', 'UNIQ', 'CREAL']
['ID', '', 'STATS', '', 'ZSCORE', 'PVAL', 'VAL', '[MILI]']


In [42]:
# Merge the two header rows field by field: "<upper>_<lower>" when the lower
# field is non-empty, otherwise just the upper field. Fields that end up as an
# empty string (mFinder's mistake in formatting) are dropped.
merged_names = (
    f"{upper}_{lower}" if lower else upper
    for upper, lower in zip(str1.split("\t"), str2.split("\t"))
)
result = [name for name in merged_names if name]
result

['MOTIF_ID',
 'NREAL',
 'NRAND_STATS',
 'NREAL_ZSCORE',
 'NREAL_PVAL',
 'UNIQ_VAL',
 'CREAL_[MILI]']

In [41]:
# One example statistics row, with the trailing newline stripped
str3 = '6\t23933\t24121.0+-74.2\t-2.53\t0.990\t486.78\t47\n'.strip()

# Show its tab-separated fields
fields = str3.split("\t")
print(fields)

['6', '23933', '24121.0+-74.2', '-2.53', '0.990', '486.78', '47']


In [49]:
# Tab-separated fields of the example statistics row, as a list of strings.
# NOTE: this rebinds `data`, a name other cells use for the motif-matrix text rows.
data=str3.split("\t")
data

['6', '23933', '24121.0+-74.2', '-2.53', '0.990', '486.78', '47']

In [57]:
# create a dataframe to save the motif ID and scores
# Start from an empty frame whose columns are the merged header names in `result`
df = pd.DataFrame(columns=result)
# Add the tab-separated fields of str3 as row 0 (the values stay strings)
df.loc[0] = str3.split("\t")
df

Unnamed: 0,MOTIF_ID,NREAL,NRAND_STATS,NREAL_ZSCORE,NREAL_PVAL,UNIQ_VAL,CREAL_[MILI]
0,6,23933,24121.0+-74.2,-2.53,0.99,486.78,47


In [62]:
# Build a NumPy array for an example motif (size=3) and attach it to df.

# Raw text rows of the motif matrix
data = ['0 1 1 \n', '1 0 1 \n', '0 0 0 \n']

# Tokenise every row on whitespace (this also discards the trailing " \n")
rows = [row.split() for row in data]

# Cast the string tokens to integers and stack them into a 2-D array
motif1 = np.array([list(map(int, row)) for row in rows])

# Print the resulting 3x3 NumPy array
print(motif1)
motif1

# Store the matrix in a new "motif" column (one value for the single row)
df["motif"] = [motif1]
df

[[0 1 1]
 [1 0 1]
 [0 0 0]]


array([[0, 1, 1],
       [1, 0, 1],
       [0, 0, 0]])