This notebook contains example code to load data from DNALongBench. 

In [1]:
import os

# Directory that holds the DNALongBench data. Set the DNALONGBENCH_ROOT
# environment variable to point at a local copy; when it is unset, the
# original shared location is used, so existing runs behave as before.
# os.path.join(path, '') keeps a trailing separator on the value.
# NOTE(review): the trailing separator is kept in case load_data builds
# file paths by plain string concatenation — confirm in utils.load_data.
root = os.path.join(
    os.environ.get('DNALONGBENCH_ROOT', '/work/magroup/shared/DNA_LLM/DNALongBench/'),
    '',
)

In [2]:
# Shell escape: print the current GPU status (driver and CUDA versions,
# memory usage, utilization) as reported by the NVIDIA command-line tool.
!nvidia-smi

Thu Jul  3 01:30:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               On  |   00000000:56:00.0 Off |                  Off |
| 30%   26C    P8             22W /  270W |    5862MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import torch
from utils import load_data

2025-07-03 01:30:25.899439: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-03 01:30:25.913108: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-03 01:30:25.928712: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-03 01:30:25.933791: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-03 01:30:25.946556: I tensorflow/core/platform/cpu_feature_guar

# Regulatory Sequence Activity Prediction

This task has two subsets, selected with the `organism` argument: "human" and "mouse".

In [4]:
# Build the three loaders for the human subset of the regulatory sequence
# activity task; cell_type is left as None for this call.
train_loader, valid_loader, test_loader = load_data(
    root=root,
    task_name='regulatory_sequence_activity',
    organism='human',
    cell_type=None,
    batch_size=16,
)

In [5]:
# Look at the first training batch only: report the sizes of the input (x)
# and target (y) tensors, then stop iterating.
for x, y in train_loader:
    print('x:', x.size())
    print('y:', y.size())
    break


2025-07-03 01:30:34.876140: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 40903 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:56:00.0, compute capability: 8.6
2025-07-03 01:30:35.237066: E tensorflow/core/util/util.cc:131] oneDNN supports DT_HALF only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


x: torch.Size([16, 196608, 4])
y: torch.Size([16, 896, 5313])


# Contact Map Prediction

This task has five cell types: 'HFF', 'H1hESC', 'GM12878', 'IMR90', 'HCT116'.

In [6]:
# Build the three loaders for the contact map prediction task using the
# 'HFF' cell type; organism is left as None for this call.
train_loader, valid_loader, test_loader = load_data(
    root=root,
    task_name='contact_map_prediction',
    organism=None,
    cell_type='HFF',
    batch_size=16,
)

In [7]:
# Pull a single batch from the training loader and print the sizes of its
# two tensors; the break keeps the loop from reading any further batches.
for x, y in train_loader:
    print('x:', x.size())
    print('y:', y.size())
    break


x: torch.Size([16, 1048576, 4])
y: torch.Size([16, 99681])


# Transcription Initiation Signal Prediction

In [8]:
# Build the three loaders for the transcription initiation signal task.
# organism and cell_type are passed as the None object (the original passed
# the string 'None'), matching every other load_data call in this notebook.
# NOTE(review): assumes load_data accepts None for both arguments on this
# task — confirm against utils.load_data.
train_loader, valid_loader, test_loader = load_data(
    root=root,
    task_name='transcription_initiation_signal_prediction',
    organism=None,
    cell_type=None,
    batch_size=16,
)

In [9]:
# Inspect only the first training batch: print the size of the input tensor
# and of the target tensor, then leave the loop.
for x, y in train_loader:
    print('x:', x.size())
    print('y:', y.size())
    break


x: torch.Size([16, 100000, 4])
y: torch.Size([16, 10, 100000])


# Enhancer Target Gene Prediction

In [10]:
# Build the three loaders for the enhancer target gene prediction task;
# neither organism nor cell_type is specified for this call.
train_loader, valid_loader, test_loader = load_data(
    root=root,
    task_name='enhancer_target_gene_prediction',
    organism=None,
    cell_type=None,
    batch_size=16,
)

> load config done
> init fasta extractor done
> Start parsing EPI records to build the dataset train


100%|██████████| 2602/2602 [00:20<00:00, 129.68it/s]


# Finish parsing EPI records
# Total records:  2602
# Skipped records due to different chromosomes:  0
# Skipped records due to distance cutoff:  0
# Skipped records due to unknown strand:  0
# Select records 2066 with subset train 
> load config done
> init fasta extractor done
> Start parsing EPI records to build the dataset valid


100%|██████████| 2602/2602 [00:02<00:00, 1024.43it/s]


# Finish parsing EPI records
# Total records:  2602
# Skipped records due to different chromosomes:  0
# Skipped records due to distance cutoff:  0
# Skipped records due to unknown strand:  0
# Select records 266 with subset valid 
> load config done
> init fasta extractor done
> Start parsing EPI records to build the dataset test


100%|██████████| 2602/2602 [00:02<00:00, 940.73it/s] 

# Finish parsing EPI records
# Total records:  2602
# Skipped records due to different chromosomes:  0
# Skipped records due to distance cutoff:  0
# Skipped records due to unknown strand:  0
# Select records 270 with subset test 





In [11]:
# Take one batch from the training loader, print the sizes of x and y, and
# stop without iterating over the rest of the data.
for x, y in train_loader:
    print('x:', x.size())
    print('y:', y.size())
    break


x: torch.Size([16, 450000, 4])
y: torch.Size([16])


# eQTL Prediction

This task has nine cell types: 'Adipose_Subcutaneous', 'Artery_Tibial', 'Cells_Cultured_fibroblasts', 'Muscle_Skeletal', 'Nerve_Tibial', 'Skin_Not_Sun_Exposed_Suprapubic', 'Skin_Sun_Exposed_Lower_leg', 'Thyroid', 'Whole_Blood'.

In [12]:
# Build the three loaders for the eQTL prediction task using the
# 'Adipose_Subcutaneous' cell type; organism is left as None for this call.
train_loader, valid_loader, test_loader = load_data(
    root=root,
    task_name='eqtl_prediction',
    organism=None,
    cell_type='Adipose_Subcutaneous',
    batch_size=16,
)

> load config done
> init fasta extractor done
> Start parsing eQTL records to build the dataset train


100%|██████████| 2181/2181 [00:22<00:00, 98.67it/s] 


# Finish parsing eQTL records
# Total records:  2181
# Skipped records due to different chromosomes:  0
# Skipped records due to distance cutoff:  1
# Skipped records due to unknown strand:  0
# Select records 1279 with subset train 
> load config done
> init fasta extractor done
> Start parsing eQTL records to build the dataset valid


100%|██████████| 2181/2181 [00:06<00:00, 337.62it/s]


# Finish parsing eQTL records
# Total records:  2181
# Skipped records due to different chromosomes:  0
# Skipped records due to distance cutoff:  1
# Skipped records due to unknown strand:  0
# Select records 565 with subset valid 
> load config done
> init fasta extractor done
> Start parsing eQTL records to build the dataset test


100%|██████████| 2181/2181 [00:04<00:00, 516.71it/s]

# Finish parsing eQTL records
# Total records:  2181
# Skipped records due to different chromosomes:  0
# Skipped records due to distance cutoff:  3
# Skipped records due to unknown strand:  0
# Select records 332 with subset test 





In [13]:
# Inspect only the first training batch. Here the batch is indexed by key
# ('x_ref', 'x_alt', 'y') rather than unpacked as an (x, y) pair.
# The 'x_alt:' label now carries the same trailing colon as the other two
# labels so the printed lines are formatted consistently.
for batch in train_loader:
    print('x_ref:', batch['x_ref'].size())
    print('x_alt:', batch['x_alt'].size())
    print('y:', batch['y'].size())
    break


x_ref: torch.Size([16, 450000, 4])
x_alt torch.Size([16, 450000, 4])
y: torch.Size([16])
