In [1]:
run_memelite ls

explore_richard_data.R.ipynb   test_memelite_py.ipynb
explore_richard_data_sh.ipynb  test_memelite_v1_sh.ipynb
parallel_py.ipynb	       test_memelite_v2_sh.ipynb


In [17]:
run_memelite python - <<'EOF'
import numpy as np
from memelite.io    import read_meme
from memelite.utils import one_hot_encode

def pwm_to_logodds(arr_motif_pwm_4xW, bg=(0.25,0.25,0.25,0.25), eps=1e-6):
    """Convert PWM (4,W) to log-odds (W,4)."""
    # Convert dtype and clip to avoid log2(0) = -inf
    arr_motif_pwm = arr_motif_pwm_4xW.astype(float)
    arr_motif_pwm = np.clip(arr_motif_pwm, eps, 1.0)

    # calculate the log-odds scores relative to background
    arr_motif_lod = np.log2(arr_motif_pwm / np.array(bg)[:,None])
    arr_motif_lod = arr_motif_lod.T # (4,W -> W,4)
    return arr_motif_lod # (W,4)

# Example usage
txt_fpath_motif = "/hpc/group/igvf/kk319/data/jaspar2024/JASPAR2024_CORE_vertebrates_non-redundant.meme"
dct_arr_motif   = read_meme(txt_fpath_motif)

# get a motif
txt_motif_name = "MA0002.3 Runx1"
arr_motif_pwm  = dct_arr_motif[txt_motif_name] # Shape (4, W)
print(arr_motif_pwm.shape)
print(arr_motif_pwm)
print()
arr_motif_lod = pwm_to_logodds(arr_motif_pwm)  
print(arr_motif_lod.shape)
print(arr_motif_lod)
EOF

(4, 9)
[[0.0615 0.0285 0.     0.0435 0.     0.0085 0.005  0.0655 0.25  ]
 [0.536  0.     0.0375 0.0635 0.     0.021  0.2    0.2315 0.079 ]
 [0.0745 0.0035 0.936  0.035  0.9935 0.924  0.1255 0.0405 0.1445]
 [0.328  0.968  0.0265 0.858  0.0065 0.0465 0.6695 0.6625 0.5265]]

(9, 4)
[[ -2.02326978   1.10030491  -1.74661576   0.39176772]
 [ -3.13289427 -17.93156857  -6.15842936   1.95307895]
 [-17.93156857  -2.73696559   1.90458043  -3.23786383]
 [ -2.52284079  -1.9770996   -2.83650127   1.77904955]
 [-17.93156857 -17.93156857   1.99059187  -5.26534457]
 [ -4.87832144  -3.57346686   1.88596476  -2.42662547]
 [ -5.64385619  -0.32192809  -0.99424073   1.42115596]
 [ -1.93236128  -0.1109159   -2.62593428   1.40599236]
 [  0.          -1.66200354  -0.7908586    1.07450544]]


In [20]:
run_memelite python - <<'EOF'
import numpy as np
from memelite.io    import read_meme
from memelite.utils import one_hot_encode

def pwm_to_logodds(arr_motif_pwm_4xW, bg=(0.25,0.25,0.25,0.25), eps=1e-6):
    """Convert PWM (4,W) to log-odds (W,4)."""
    # Convert dtype and clip to avoid log2(0) = -inf
    arr_motif_pwm = arr_motif_pwm_4xW.astype(float)
    arr_motif_pwm = np.clip(arr_motif_pwm, eps, 1.0)

    # calculate the log-odds scores relative to background
    arr_motif_lod = np.log2(arr_motif_pwm / np.array(bg)[:,None])
    arr_motif_lod = arr_motif_lod.T # (4,W -> W,4)
    return arr_motif_lod # (W,4)

def scan_motif(txt_seq, arr_motif_Wx4):
    """Scan a motif across sequence and return score vector."""
    # one hot encode sequence
    X = one_hot_encode(txt_seq).T  # (L,4)

    # get sequence and motif length
    L, W = X.shape[0], arr_motif_Wx4.shape[0]
    
    # matrix multiply
    scores = np.tensordot(X, arr_motif_Wx4, axes=([1],[1]))  # (L,W)
    scores = np.sum(scores, axis=1)  # (L,)
    return scores[:L-W+1]  # valid positions

# Example usage
txt_fpath_motif   = "/hpc/group/igvf/kk319/data/jaspar2024/JASPAR2024_CORE_vertebrates_non-redundant.meme"
dct_arr_motif_pwm = read_meme(txt_fpath_motif)

# get a motif
txt_motif_name = "MA0002.3 Runx1"
arr_motif_pwm  = dct_arr_motif_pwm[txt_motif_name] # Shape (4, W)
arr_motif_lod  = pwm_to_logodds(arr_motif_pwm)  

# 
txt_seq = "AAATGTGGTAACCCATGCGT"
scores = scan_motif(txt_seq, arr_motif_lod)
print("Scores:", scores)
print("Max score:", np.max(scores))
EOF

Scores: [-55.99668089 -55.99668089 -55.99668089  -2.90428389  -9.37144294
  -2.90428389  -9.37144294  -9.37144294  -2.90428389 -55.99668089
 -55.99668089 -45.14521182]
Max score: -2.904283888066286


In [4]:
run_memelite python - <<'EOF'
import numpy as np
from memelite.io    import read_meme
from memelite.utils import one_hot_encode

def pwm_to_logodds(arr_motif_pwm_4xW, bg=(0.25,0.25,0.25,0.25), eps=1e-6):
    """
    Convert PWM (4,W) to log-odds (W,4).

    Parameters
    ----------
    arr_motif_pwm_4xW : 
        Motif pwm matrix of shape (4,W).

    Returns
    -------
    np.ndarray
        Motif log-odds matrix of shape (W,4).
    """
    # Convert dtype and clip to avoid log2(0) = -inf
    arr_motif_pwm = arr_motif_pwm_4xW.astype(float)
    arr_motif_pwm = np.clip(arr_motif_pwm, eps, 1.0)

    # calculate the log-odds scores relative to background
    arr_motif_lod = np.log2(arr_motif_pwm / np.array(bg)[:,None])
    arr_motif_lod = arr_motif_lod.T # (4,W -> W,4)
    return arr_motif_lod # (W,4)

def reverse_complement_matrix(arr_seq_Nx4):
    """
    Reverse-complement any sequence matrix (N,4).
    Assumes columns order = [A, C, G, T].

    Parameters
    ----------
    arr_motif_Wx4 : 
        one-hot enocded matrix of a sequence with shape (N, 4).

    Returns
    -------
    np.ndarray
        one-hot enocded matrix of the Reverse-complement sequence with shape (N, 4)
    """
    lst_base_order = [3, 2, 1, 0]   # T,G,C,A
    arr_seq_rc = arr_seq_Nx4[::-1, lst_base_order] # flip positions + swap bases
    return arr_seq_rc

def scan_motif(txt_seq, arr_motif_Wx4):
    """
    Scan a motif across a sequence using sliding windows.

    Parameters
    ----------
    txt_seq : str
        DNA sequence (string of A/C/G/T).
    arr_motif_Wx4 : np.ndarray
        Motif log-odds matrix of shape (W,4).

    Returns
    -------
    np.ndarray
        Vector of scores, length = L - W + 1
    """
    # one hot encode sequence and get sequence/motif length
    X = one_hot_encode(txt_seq).T # (L,4)
    L, W = X.shape[0], arr_motif_Wx4.shape[0]

    if L < W:
        raise ValueError(f"Sequence length {L} shorter than motif length {W}")

    # build sliding windows: shape (L-W+1, W, 4)
    arr_windows = np.lib.stride_tricks.sliding_window_view(X, (W,4))
    arr_windows = arr_windows.reshape(-1, W, 4)

    # multiply each window with motif score and sum
    arr_scores = np.tensordot(arr_windows, arr_motif_Wx4, axes=([1,2],[0,1]))

    return arr_scores # shape (L-W+1,)

def scan_motif_both_strands(txt_seq, arr_motif_Wx4):
    """
    Scan motif on both forward and reverse-complement strands.

    Returns
    -------
    tuple of np.ndarray
        (scores_forward, scores_reverse)
    """
    # Forward motif scanning
    arr_score_fwd = scan_motif(txt_seq, arr_motif_Wx4)

    # Reverse motif scanning by reverse complement motif
    arr_motif_rc   = reverse_complement_matrix(arr_motif_Wx4)
    arr_score_rev = scan_motif(txt_seq, arr_motif_rc)

    return arr_score_fwd, arr_score_rev


# import motifs pwm
txt_fpath_motif   = "/hpc/group/igvf/kk319/data/jaspar2024/JASPAR2024_CORE_vertebrates_non-redundant.meme"
dct_arr_motif_pwm = read_meme(txt_fpath_motif)

# get a motif
txt_motif_name = "MA0002.3 Runx1"
arr_motif_pwm  = dct_arr_motif_pwm[txt_motif_name] # Shape (4, W)
arr_motif_lod  = pwm_to_logodds(arr_motif_pwm)  

# Example sequence
txt_seq = "ACGTGCTATGTGGTCTAATCGTACGATGCTAGCTAGTACG"

# Example scanning
arr_score_fwd, arr_score_rev = scan_motif_both_strands(txt_seq, arr_motif_lod)

num_score_fwd_max = np.max(arr_score_fwd)
num_score_rev_max = np.max(arr_score_rev)

num_index_fwd_max = np.argmax(arr_score_fwd)
num_index_rev_max = np.argmax(arr_score_rev)

print("Forward scores:", arr_score_fwd)
print("Reverse scores:", arr_score_rev)
print("Forward best score:", num_score_fwd_max, "at index", num_index_fwd_max)
print("Reverse best score:", num_score_rev_max, "at index", num_index_rev_max)
EOF

Forward scores: [-17.29078324 -36.51940603 -10.13999487 -27.4618787  -30.81390569
 -16.08083834 -13.44512337   9.87474129 -11.19299608 -10.0754744
 -28.94293773 -29.16354193 -23.04196141 -45.53205645 -25.8280853
 -37.11245824 -17.35279177 -18.73518608 -37.21330287 -31.89315532
 -25.45671641 -28.70343865 -22.86841425 -24.09558239 -41.24386318
 -10.3170272  -21.68611723 -28.32382787 -45.26816506 -12.54556009
 -27.35842785 -32.42055325]
Reverse scores: [-45.52150762 -15.92179967 -56.06925409 -18.54121839 -67.02991988
 -49.32659867 -35.33647449 -52.08004126 -30.61919443 -43.84455151
  -9.15146901 -29.80070375 -35.94291503 -36.53485921 -29.97485138
 -19.76278623 -30.02779128 -37.21330287 -18.73518608 -14.22921709
 -54.04951602 -21.86856532 -25.0451967  -40.00519453 -32.92163131
 -26.36500294 -14.06246836 -48.00467403 -28.60048188 -18.66058214
 -29.87652218 -41.10881713]
Forward best score: 9.874741286013993 at index 7
Reverse best score: -9.151469012735964 at index 10


## Parallelization

In [2]:
run_memelite python - <<'EOF'
from memelite.io import read_meme

# Load JASPAR motifs
txt_fpath_motif = "/hpc/group/igvf/kk319/data/jaspar2024/JASPAR2024_CORE_vertebrates_non-redundant.meme"
dct_arr_motif_pwm = read_meme(txt_fpath_motif)

# Compute motif lengths
motif_lengths = {name: pwm.shape[1] for name, pwm in dct_arr_motif_pwm.items()}

print("Total motifs:", len(motif_lengths))
print("Max motif length:", max(motif_lengths.values()))
print("Min motif length:", min(motif_lengths.values()))

# Show a few longest motifs
longest = sorted(motif_lengths.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top 5 longest motifs:")
for name, length in longest:
    print(f"{name}: {length} bp")
EOF

Total motifs: 879
Max motif length: 33
Min motif length: 5
Top 5 longest motifs:
MA1930.2 CTCF: 33 bp
MA1929.2 CTCF: 31 bp
MA2335.1 ZNF558: 29 bp
MA1594.1 ZNF382: 24 bp
MA1654.2 ZNF16: 21 bp


In [10]:
run_memelite python - <<'EOF'
import numpy as np
import os, time
import statistics

from memelite.io    import read_meme
from memelite.utils import one_hot_encode

def pwm_to_logodds(arr_motif_pwm_4xW, bg=(0.25,0.25,0.25,0.25), eps=1e-6):
    """
    Convert PWM (4,W) to log-odds (W,4).

    Parameters
    ----------
    arr_motif_pwm_4xW : 
        Motif pwm matrix of shape (4,W).

    Returns
    -------
    np.ndarray
        Motif log-odds matrix of shape (W,4).
    """
    # Convert dtype and clip to avoid log2(0) = -inf
    arr_motif_pwm = arr_motif_pwm_4xW.astype(float)
    arr_motif_pwm = np.clip(arr_motif_pwm, eps, 1.0)

    # calculate the log-odds scores relative to background
    arr_motif_lod = np.log2(arr_motif_pwm / np.array(bg)[:,None])
    arr_motif_lod = arr_motif_lod.T # (4,W -> W,4)
    return arr_motif_lod # (W,4)

def reverse_complement_matrix(arr_seq_Nx4):
    """
    Reverse-complement any sequence matrix (N,4).
    Assumes columns order = [A, C, G, T].

    Parameters
    ----------
    arr_motif_Wx4 : 
        one-hot enocded matrix of a sequence with shape (N, 4).

    Returns
    -------
    np.ndarray
        one-hot enocded matrix of the Reverse-complement sequence with shape (N, 4)
    """
    lst_base_order = [3, 2, 1, 0]   # T,G,C,A
    arr_seq_rc = arr_seq_Nx4[::-1, lst_base_order] # flip positions + swap bases
    return arr_seq_rc

def scan_motif(txt_seq, arr_motif_Wx4):
    """
    Scan a motif across a sequence using sliding windows.

    Parameters
    ----------
    txt_seq : str
        DNA sequence (string of A/C/G/T).
    arr_motif_Wx4 : np.ndarray
        Motif log-odds matrix of shape (W,4).

    Returns
    -------
    np.ndarray
        Vector of scores, length = L - W + 1
    """
    # one hot encode sequence and get sequence/motif length
    X = one_hot_encode(txt_seq).T # (L,4)
    L, W = X.shape[0], arr_motif_Wx4.shape[0]

    if L < W:
        raise ValueError(f"Sequence length {L} shorter than motif length {W}")

    # build sliding windows: shape (L-W+1, W, 4)
    arr_windows = np.lib.stride_tricks.sliding_window_view(X, (W,4))
    arr_windows = arr_windows.reshape(-1, W, 4)

    # multiply each window with motif score and sum
    arr_scores = np.tensordot(arr_windows, arr_motif_Wx4, axes=([1,2],[0,1]))

    return arr_scores # shape (L-W+1,)

def scan_motif_both_strands(txt_seq, arr_motif_Wx4):
    """
    Scan motif on both forward and reverse-complement strands.

    Returns
    -------
    tuple of np.ndarray
        (scores_forward, scores_reverse)
    """
    # Forward motif scanning
    arr_score_fwd = scan_motif(txt_seq, arr_motif_Wx4)

    # Reverse motif scanning by reverse complement motif
    arr_motif_rc   = reverse_complement_matrix(arr_motif_Wx4)
    arr_score_rev = scan_motif(txt_seq, arr_motif_rc)

    return arr_score_fwd, arr_score_rev

def scan_all_motifs_serial(txt_seq, dct_arr_motif_pwm):
    """
    Scan all motifs on a single sequence (serial version).
    Returns dict[motif_name] -> {"forward": fwd_scores, "reverse": rev_scores}.
    """
    dct_results = {}
    for txt_motif_name, arr_motif_pwm_4xW in dct_arr_motif_pwm.items():
        arr_motif_lod_Wx4 = pwm_to_logodds(arr_motif_pwm_4xW)
        arr_scores_fwd, arr_scores_rev = scan_motif_both_strands(txt_seq, arr_motif_lod_Wx4)
        dct_results[txt_motif_name] = {"forward": arr_scores_fwd, "reverse": arr_scores_rev}
    return dct_results

# import motifs pwm
txt_fpath_motif   = "/hpc/group/igvf/kk319/data/jaspar2024/JASPAR2024_CORE_vertebrates_non-redundant.meme"
dct_arr_motif_pwm = read_meme(txt_fpath_motif)

# Example sequence
txt_seq = "ACGTACGTACGTACGTACGTACGTACGTACGTACGATGCATGCATGCATGCATGCATGCATGCATGCATGCAT"

# Benchmark scanning (Serial)
lst_num_runtime_serial = []
for _ in range(5):
    t0 = time.time()
    _  = scan_all_motifs_serial(txt_seq, dct_arr_motif_pwm)
    lst_num_runtime_serial.append(time.time() - t0)

print("Mean:", statistics.mean(lst_num_runtime_serial))
print("Std:", statistics.stdev(lst_num_runtime_serial))
#time_start = time.time()
#dct_results_serial = scan_all_motifs_serial(txt_seq, dct_arr_motif_pwm)
#print(f"Serial: {time.time() - time_start:.3f} sec")
EOF

Mean: 0.128998327255249
Std: 0.0018945704806931603


In [11]:
run_memelite python - <<'EOF'
import numpy as np
import os, time
import statistics

from memelite.io    import read_meme
from memelite.utils import one_hot_encode

from concurrent.futures import ProcessPoolExecutor, as_completed

def pwm_to_logodds(arr_motif_pwm_4xW, bg=(0.25,0.25,0.25,0.25), eps=1e-6):
    """
    Convert PWM (4,W) to log-odds (W,4).

    Parameters
    ----------
    arr_motif_pwm_4xW : 
        Motif pwm matrix of shape (4,W).

    Returns
    -------
    np.ndarray
        Motif log-odds matrix of shape (W,4).
    """
    # Convert dtype and clip to avoid log2(0) = -inf
    arr_motif_pwm = arr_motif_pwm_4xW.astype(float)
    arr_motif_pwm = np.clip(arr_motif_pwm, eps, 1.0)

    # calculate the log-odds scores relative to background
    arr_motif_lod = np.log2(arr_motif_pwm / np.array(bg)[:,None])
    arr_motif_lod = arr_motif_lod.T # (4,W -> W,4)
    return arr_motif_lod # (W,4)

def reverse_complement_matrix(arr_seq_Nx4):
    """
    Reverse-complement any sequence matrix (N,4).
    Assumes columns order = [A, C, G, T].

    Parameters
    ----------
    arr_motif_Wx4 : 
        one-hot enocded matrix of a sequence with shape (N, 4).

    Returns
    -------
    np.ndarray
        one-hot enocded matrix of the Reverse-complement sequence with shape (N, 4)
    """
    lst_base_order = [3, 2, 1, 0]   # T,G,C,A
    arr_seq_rc = arr_seq_Nx4[::-1, lst_base_order] # flip positions + swap bases
    return arr_seq_rc

def scan_motif(txt_seq, arr_motif_Wx4):
    """
    Scan a motif across a sequence using sliding windows.

    Parameters
    ----------
    txt_seq : str
        DNA sequence (string of A/C/G/T).
    arr_motif_Wx4 : np.ndarray
        Motif log-odds matrix of shape (W,4).

    Returns
    -------
    np.ndarray
        Vector of scores, length = L - W + 1
    """
    # one hot encode sequence and get sequence/motif length
    X = one_hot_encode(txt_seq).T # (L,4)
    L, W = X.shape[0], arr_motif_Wx4.shape[0]

    if L < W:
        raise ValueError(f"Sequence length {L} shorter than motif length {W}")

    # build sliding windows: shape (L-W+1, W, 4)
    arr_windows = np.lib.stride_tricks.sliding_window_view(X, (W,4))
    arr_windows = arr_windows.reshape(-1, W, 4)

    # multiply each window with motif score and sum
    arr_scores = np.tensordot(arr_windows, arr_motif_Wx4, axes=([1,2],[0,1]))

    return arr_scores # shape (L-W+1,)

def scan_motif_both_strands(txt_seq, arr_motif_Wx4):
    """
    Scan motif on both forward and reverse-complement strands.

    Returns
    -------
    tuple of np.ndarray
        (scores_forward, scores_reverse)
    """
    # Forward motif scanning
    arr_score_fwd = scan_motif(txt_seq, arr_motif_Wx4)

    # Reverse motif scanning by reverse complement motif
    arr_motif_rc   = reverse_complement_matrix(arr_motif_Wx4)
    arr_score_rev = scan_motif(txt_seq, arr_motif_rc)

    return arr_score_fwd, arr_score_rev

def scan_all_motifs_serial(txt_seq, dct_arr_motif_pwm):
    """
    Scan all motifs on a single sequence (serial version).
    Returns dict[motif_name] -> {"forward": fwd_scores, "reverse": rev_scores}.
    """
    dct_results = {}
    for txt_motif_name, arr_motif_pwm_4xW in dct_arr_motif_pwm.items():
        arr_motif_lod_Wx4 = pwm_to_logodds(arr_motif_pwm_4xW)
        arr_scores_fwd, arr_scores_rev = scan_motif_both_strands(txt_seq, arr_motif_lod_Wx4)
        dct_results[txt_motif_name] = {"forward": arr_scores_fwd, "reverse": arr_scores_rev}
    return dct_results

def scan_one_motif(txt_motif_name, arr_motif_pwm_4xW, txt_seq):
    """Helper: scan one motif on one sequence."""
    arr_motif_lod_Wx4 = pwm_to_logodds(arr_motif_pwm_4xW)
    arr_scores_fwd, arr_scores_rev = scan_motif_both_strands(txt_seq, arr_motif_lod_Wx4)
    return txt_motif_name, arr_scores_fwd, arr_scores_rev

def scan_all_motifs_parallel(txt_seq, dct_arr_motif_pwm, max_workers=8):
    """
    Scan all motifs on a single sequence (parallel version).
    Returns dict[motif_name] -> {"forward": fwd_scores, "reverse": rev_scores}.
    """
    dct_results = {}
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        futures = [
            ex.submit(scan_one_motif, name, pwm, txt_seq)
            for name, pwm in dct_arr_motif_pwm.items()
        ]
        for f in as_completed(futures):
            name, fwd, rev = f.result()
            dct_results[name] = {"forward": fwd, "reverse": rev}
    return dct_results

def benchmark_scan(txt_seq, dct_arr_motif_pwm, num_runs=5, max_workers=8):
    # Serial
    times_serial = []
    for _ in range(num_runs):
        t0 = time.time()
        _ = scan_all_motifs_serial(txt_seq, dct_arr_motif_pwm)
        times_serial.append(time.time() - t0)

    # Parallel
    times_parallel = []
    for _ in range(num_runs):
        t0 = time.time()
        _ = scan_all_motifs_parallel(txt_seq, dct_arr_motif_pwm, max_workers=max_workers)
        times_parallel.append(time.time() - t0)

    print("---- Benchmark ----")
    print(f"Serial   mean={statistics.mean(times_serial):.3f}s  std={statistics.stdev(times_serial):.3f}")
    print(f"Parallel mean={statistics.mean(times_parallel):.3f}s  std={statistics.stdev(times_parallel):.3f} (workers={max_workers})")

# import motifs pwm
txt_fpath_motif   = "/hpc/group/igvf/kk319/data/jaspar2024/JASPAR2024_CORE_vertebrates_non-redundant.meme"
dct_arr_motif_pwm = read_meme(txt_fpath_motif)

# Example sequence
txt_seq = "ACGTACGTACGTACGTACGTACGTACGTACGTACGATGCATGCATGCATGCATGCATGCATGCATGCATGCAT"

# Benchmark scanning (Serial)
benchmark_scan(txt_seq, dct_arr_motif_pwm, num_runs=5, max_workers=8)
EOF

---- Benchmark ----
Serial   mean=0.132s  std=0.002
Parallel mean=0.184s  std=0.044 (workers=8)
