**Set environment**

In [1]:
# Load the project configuration script from the parent directory.
# NOTE(review): presumably this defines the FD_* directory variables and the
# show_env helper used below -- confirm in run_config_project.sh.
source ../run_config_project.sh
# Print the configured directory layout for reference.
show_env

BASE DIRECTORY (FD_BASE):      /hpc/group/igvf/kk319
REPO DIRECTORY (FD_REPO):      /hpc/group/igvf/kk319/repo
WORK DIRECTORY (FD_WORK):      /hpc/group/igvf/kk319/work
DATA DIRECTORY (FD_DATA):      /hpc/group/igvf/kk319/data
CONTAINER DIR. (FD_SING):      /hpc/group/igvf/kk319/container

You are working with           
PATH OF PROJECT (FD_PRJ):      /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR
PROJECT RESULTS (FD_RES):      /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/results
PROJECT SCRIPTS (FD_EXE):      /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/scripts
PROJECT DATA    (FD_DAT):      /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/data
PROJECT NOTE    (FD_NBK):      /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/notebooks
PROJECT DOCS    (FD_DOC):      /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/docs
PROJECT LOG     (FD_LOG):      /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/log
PROJECT REF     (FD_REF):      /hpc/group/igvf/kk319/repo/Proj_IGVF_BlueSTARR/references
PR

## Explore motif

```
MEME version 4

ALPHABET= ACGT

Background letter frequencies
A 0.25 C 0.25 G 0.25 T 0.25
```

In [2]:
# Location of the JASPAR 2024 motif collection (MEME format)
TXT_FDIRY="${FD_DATA}/jaspar2024"
TXT_FNAME="JASPAR2024_CORE_vertebrates_non-redundant.meme"
TXT_FPATH="${TXT_FDIRY}/${TXT_FNAME}"

# Preview the first 22 lines of the file.
# The path is quoted so directories containing spaces or glob characters work,
# and head reads the file directly instead of going through `cat |`.
head -n 22 "${TXT_FPATH}"

MEME version 4

ALPHABET= ACGT

strands: + -

Background letter frequencies
A 0.25 C 0.25 G 0.25 T 0.25

MOTIF MA0002.3 Runx1
letter-probability matrix: alength= 4 w= 9 nsites= 2000 E= 0
 0.061500  0.536000  0.074500  0.328000
 0.028500  0.000000  0.003500  0.968000
 0.000000  0.037500  0.936000  0.026500
 0.043500  0.063500  0.035000  0.858000
 0.000000  0.000000  0.993500  0.006500
 0.008500  0.021000  0.924000  0.046500
 0.005000  0.200000  0.125500  0.669500
 0.065500  0.231500  0.040500  0.662500
 0.250000  0.079000  0.144500  0.526500
URL http://jaspar.genereg.net/matrix/MA0002.3



## Export motifs to a NumPy archive (.npz)

In [5]:
run_memelite python - <<'EOF'
import numpy as np
import os

from memelite.io import read_meme

def pwm_to_logodds(arr_motif_pwm, bg=(0.25,0.25,0.25,0.25), eps=1e-6):
    """
    Convert a PWM (W,4) to a log2-odds matrix (W,4).

    Each entry is ``log2(p / bg)`` where ``p`` is the (clipped) motif
    probability and ``bg`` is the background frequency of that column's letter.

    Parameters
    ----------
    arr_motif_pwm : array-like
        Motif pwm matrix of shape (W,4). Any array-like (np.ndarray, nested
        list/tuple) is accepted; the input is never modified.
    bg : sequence of float, optional
        Background frequency per column of the pwm. Every entry must be
        strictly positive. Defaults to uniform (0.25, 0.25, 0.25, 0.25).
    eps : float, optional
        Lower bound applied to the probabilities before taking the log so that
        zero entries give a finite score instead of -inf. Defaults to 1e-6.

    Returns
    -------
    np.ndarray
        Motif log-odds matrix of shape (W,4), dtype float.

    Raises
    ------
    ValueError
        If any background frequency is not strictly positive.
    """
    # Coerce to float arrays; np.asarray also accepts plain lists/tuples,
    # which have no .astype method.
    arr_motif_pwm = np.asarray(arr_motif_pwm, dtype=float)
    arr_bg = np.asarray(bg, dtype=float)

    # A zero/negative/NaN background would silently yield inf or nan scores
    if not np.all(arr_bg > 0):
        raise ValueError(f"Background frequencies must all be > 0, got {bg!r}")

    # Clip to avoid log2(0) = -inf (np.clip returns a new array)
    arr_motif_pwm = np.clip(arr_motif_pwm, eps, 1.0)

    # calculate the log-odds scores relative to background
    arr_motif_lod = np.log2(arr_motif_pwm / arr_bg[None, :])
    return arr_motif_lod

def main(txt_fpath_inp, txt_fpath_out, bg=(0.25,0.25,0.25,0.25)):
    """
    Read a MEME motif file, derive log-odds matrices, and save both to an .npz.

    Parameters
    ----------
    txt_fpath_inp :
        Path of the MEME file handed to ``read_meme``.
    txt_fpath_out :
        Path of the compressed NumPy archive to write.
    bg :
        Background frequencies forwarded to ``pwm_to_logodds`` and stored in
        the archive under ``bg``.

    Notes
    -----
    ``pwms`` and ``lods`` are stored as dicts wrapped in 0-d object arrays, so
    reading them back requires ``np.load(..., allow_pickle=True)`` followed by
    ``.item()``.
    """
    ### import meme: {motif_name: matrix}, matrices arrive as (4, W)
    motifs_4xW = read_meme(txt_fpath_inp)

    ### transpose every motif to (W, 4) ...
    pwms_Wx4 = {
        motif_name: motif_4xW.T.astype(float)
        for motif_name, motif_4xW in motifs_4xW.items()
    }

    ### ... and score each transposed motif against the background
    lods_Wx4 = {
        motif_name: pwm_to_logodds(motif_Wx4, bg=bg)
        for motif_name, motif_Wx4 in pwms_Wx4.items()
    }

    ### assemble the archive fields, then write everything in one call
    archive_fields = dict(
        pwms  = np.array(pwms_Wx4, dtype=object),        # dict of {motif_name: (W,4)}
        lods  = np.array(lods_Wx4, dtype=object),        # dict of {motif_name: (W,4)}
        bg    = np.array(bg, dtype=np.float32),          # background probs (A,C,G,T)
        names = np.array(list(pwms_Wx4), dtype=object),  # motif names
        alphabet="ACGT",
    )
    np.savez_compressed(txt_fpath_out, **archive_fields)

    n_motifs = len(motifs_4xW)
    print(f"Saved {n_motifs} motifs -> {txt_fpath_out}")
    
if __name__ == "__main__":

    ### Define input/output file path
    ### The data root comes from the FD_DATA environment variable (the same
    ### root the shell cell uses for ${FD_DATA}/jaspar2024); when it is unset
    ### or empty, fall back to the original absolute location.
    txt_fdiry_data = os.environ.get("FD_DATA") or "/hpc/group/igvf/kk319/data"
    txt_fdiry = os.path.join(txt_fdiry_data, "jaspar2024")
    
    txt_fname_inp = "JASPAR2024_CORE_vertebrates_non-redundant.meme"
    txt_fpath_inp = os.path.join(txt_fdiry, txt_fname_inp)

    ### Output archive is written next to the input MEME file
    txt_fname_out = "JASPAR2024_CORE_vertebrates_non-redundant.npz"
    txt_fpath_out = os.path.join(txt_fdiry, txt_fname_out)
    
    ### Run main function
    main(txt_fpath_inp, txt_fpath_out)

EOF

Saved 879 motifs -> /hpc/group/igvf/kk319/data/jaspar2024/JASPAR2024_CORE_vertebrates_non-redundant.npz
