In [1]:
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Initialize paths and variables

In [2]:
# --- Data locations (relative to the notebook's working directory) ---
TRAIN_DATA_PATH = "data/sample_train.txt"
VALID_DATA_PATH = "data/sample_val.txt"
TEST_DATA_PATH = "data/sample_test.txt"
MODEL_LOG_DIR = "model_weight"

# --- Loader settings ---
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
N_PROCS = 8  # passed as the worker count for both the train and validation loaders

# --- Optimisation settings ---
lr = 0.001 #0.005 for DREAM-RNN and DREAM-CNN, 0.001 for DREAM-Attn
NUM_EPOCHS = 5 #80

# Number of full batches available in each split (a trailing partial batch is dropped
# by the floor division). The validation count uses the validation batch size, so the
# two stay correct if the batch sizes are ever set to different values.
BATCH_PER_EPOCH = len(pd.read_csv(TRAIN_DATA_PATH)) // TRAIN_BATCH_SIZE
BATCH_PER_VALIDATION = len(pd.read_csv(VALID_DATA_PATH)) // VALID_BATCH_SIZE

SEQ_SIZE = 249  # sequence length handed to the data processor

# --- Reproducibility and device ---
SEED = 42
CUDA_DEVICE_ID = 0
generator = torch.Generator()
generator.manual_seed(SEED)
# Use the requested GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook can still be run end-to-end on a machine without a GPU.
device = torch.device(f"cuda:{CUDA_DEVICE_ID}" if torch.cuda.is_available() else "cpu")

# Model

### DREAM-RNN

In [11]:
from prixfixe.autosome import AutosomeFinalLayersBlock
from prixfixe.bhi import BHIFirstLayersBlock
from prixfixe.bhi import BHICoreBlock
from prixfixe.prixfixe import PrixFixeNet
from torchinfo import summary

# DREAM-RNN: BHI first-layers block -> BHI core block (configured with LSTM hidden
# channels) -> Autosome final block.
# NOTE: every architecture cell in this notebook rebinds `first`, `core`, `final` and
# `model`; the trainer cell uses whichever `model` was assigned last, so run only the
# architecture you intend to train (and set `lr` accordingly).

# The sequence length comes from SEQ_SIZE so the model and the data processor cannot
# drift apart. The 5 input channels match the 4 one-hot channels plus the extra
# per-position flag assembled in the prediction cell.
first = BHIFirstLayersBlock(
    in_channels=5,
    out_channels=320,
    seqsize=SEQ_SIZE,
    kernel_sizes=[9, 15],
    pool_size=1,
    dropout=0.2,
)

# The core block takes its input width and sequence length from the first block.
core = BHICoreBlock(
    in_channels=first.out_channels,
    out_channels=320,
    seqsize=first.infer_outseqsize(),
    lstm_hidden_channels=320,
    kernel_sizes=[9, 15],
    pool_size=1,
    dropout1=0.2,
    dropout2=0.5,
)

final = AutosomeFinalLayersBlock(in_channels=core.out_channels)

model = PrixFixeNet(
    first=first,
    core=core,
    final=final,
    generator=generator,
)

# Layer-by-layer summary for a single (batch=1, channels=5, length=SEQ_SIZE) input.
print(summary(model, (1, 5, SEQ_SIZE)))

Layer (type:depth-idx)                   Output Shape              Param #
PrixFixeNet                              [1, 1]                    --
├─BHIFirstLayersBlock: 1-1               --                        --
│    └─ModuleList: 2-1                   --                        --
│    │    └─ConvBlock: 3-1               [1, 160, 249]             7,360
│    │    └─ConvBlock: 3-2               [1, 160, 249]             12,160
├─BHICoreBlock: 1-2                      --                        --
│    └─LSTM: 2-2                         [1, 249, 640]             1,643,520
│    └─ModuleList: 2-3                   --                        --
│    │    └─ConvBlock: 3-3               [1, 160, 249]             921,760
│    │    └─ConvBlock: 3-4               [1, 160, 249]             1,536,160
│    └─Dropout: 2-4                      [1, 320, 249]             --
├─AutosomeFinalLayersBlock: 1-3          --                        --
│    └─Conv1d: 2-5                       [1, 256, 249]     

### DREAM-CNN

In [4]:
from prixfixe.autosome import (AutosomeCoreBlock,
                      AutosomeFinalLayersBlock)
from prixfixe.bhi import BHIFirstLayersBlock
from prixfixe.prixfixe import PrixFixeNet
from torchinfo import summary

# DREAM-CNN: BHI first-layers block -> Autosome core block -> Autosome final block.
# NOTE: every architecture cell in this notebook rebinds `first`, `core`, `final` and
# `model`; the trainer cell uses whichever `model` was assigned last, so run only the
# architecture you intend to train (and set `lr` accordingly).

# The sequence length comes from SEQ_SIZE so the model and the data processor cannot
# drift apart.
first = BHIFirstLayersBlock(
    in_channels=5,
    out_channels=320,
    seqsize=SEQ_SIZE,
    kernel_sizes=[9, 15],
    pool_size=1,
    dropout=0.2,
)

# The core block takes its input width and sequence length from the first block.
core = AutosomeCoreBlock(
    in_channels=first.out_channels,
    out_channels=64,
    seqsize=first.infer_outseqsize(),
)

final = AutosomeFinalLayersBlock(in_channels=core.out_channels)

model = PrixFixeNet(
    first=first,
    core=core,
    final=final,
    generator=generator,
)

# Layer-by-layer summary for a single (batch=1, channels=5, length=SEQ_SIZE) input.
print(summary(model, (1, 5, SEQ_SIZE)))

Layer (type:depth-idx)                        Output Shape              Param #
PrixFixeNet                                   [1, 1]                    --
├─BHIFirstLayersBlock: 1-1                    --                        --
│    └─ModuleList: 2-1                        --                        --
│    │    └─ConvBlock: 3-1                    [1, 160, 249]             7,360
│    │    └─ConvBlock: 3-2                    [1, 160, 249]             12,160
├─AutosomeCoreBlock: 1-2                      --                        --
│    └─ModuleDict: 2-2                        --                        --
│    │    └─Sequential: 3-3                   [1, 320, 249]             420,048
│    │    └─Sequential: 3-4                   [1, 128, 249]             573,696
│    │    └─Sequential: 3-5                   [1, 128, 249]             173,856
│    │    └─Sequential: 3-6                   [1, 128, 249]             229,632
│    │    └─Sequential: 3-7                   [1, 128, 249]         

### DREAM-Attn

In [5]:
from prixfixe.autosome import (
                      AutosomeFirstLayersBlock,
                      AutosomeFinalLayersBlock)
from prixfixe.unlockdna import UnlockDNACoreBlock
from prixfixe.prixfixe import PrixFixeNet
from torchinfo import summary

# DREAM-Attn: Autosome first-layers block -> UnlockDNA core block (4 blocks, 8 heads)
# -> Autosome final block.
# NOTE: every architecture cell in this notebook rebinds `first`, `core`, `final` and
# `model`; the trainer cell uses whichever `model` was assigned last, so run only the
# architecture you intend to train (and set `lr` accordingly).

# The sequence length comes from SEQ_SIZE so the model and the data processor cannot
# drift apart.
first = AutosomeFirstLayersBlock(
    in_channels=5,
    out_channels=256,
    seqsize=SEQ_SIZE,
)

# The core keeps the channel width produced by the first block.
core = UnlockDNACoreBlock(
    in_channels=first.out_channels,
    out_channels=first.out_channels,
    seqsize=SEQ_SIZE,
    n_blocks=4,
    kernel_size=15,
    rate=0.1,
    num_heads=8,
)

final = AutosomeFinalLayersBlock(in_channels=core.out_channels)

model = PrixFixeNet(
    first=first,
    core=core,
    final=final,
    generator=generator,
)

# Layer-by-layer summary for a single (batch=1, channels=5, length=SEQ_SIZE) input.
print(summary(model, (1, 5, SEQ_SIZE)))

Layer (type:depth-idx)                        Output Shape              Param #
PrixFixeNet                                   [1, 1]                    --
├─AutosomeFirstLayersBlock: 1-1               --                        --
│    └─Sequential: 2-1                        [1, 256, 249]             --
│    │    └─Conv1d: 3-1                       [1, 256, 249]             8,960
│    │    └─BatchNorm1d: 3-2                  [1, 256, 249]             512
│    │    └─SiLU: 3-3                         [1, 256, 249]             --
├─UnlockDNACoreBlock: 1-2                     --                        --
│    └─Embedding: 2-2                         [1, 249, 256]             63,744
│    └─ModuleList: 2-3                        --                        --
│    │    └─ConformerSASwiGLULayer: 3-4       [1, 256, 249]             1,121,280
│    │    └─ConformerSASwiGLULayer: 3-5       [1, 256, 249]             1,121,280
│    │    └─ConformerSASwiGLULayer: 3-6       [1, 256, 249]             1

# DataProcessor

In [6]:
from prixfixe.autosome import AutosomeDataProcessor

# Keyword arguments grouped by role; all of them are forwarded unchanged to the
# AutosomeDataProcessor constructor below.
_data_paths = dict(
    path_to_training_data=TRAIN_DATA_PATH,
    path_to_validation_data=VALID_DATA_PATH,
)
_train_loader_opts = dict(
    train_batch_size=TRAIN_BATCH_SIZE,
    batch_per_epoch=BATCH_PER_EPOCH,
    train_workers=N_PROCS,
    shuffle_train=True,
)
_valid_loader_opts = dict(
    valid_batch_size=VALID_BATCH_SIZE,
    valid_workers=N_PROCS,
    shuffle_val=False,
)

# The shared `generator` (seeded in the config cell) is passed through as well.
dataprocessor = AutosomeDataProcessor(
    **_data_paths,
    **_train_loader_opts,
    **_valid_loader_opts,
    seqsize=SEQ_SIZE,
    generator=generator,
)

In [7]:
# Sanity check: pull the first batch from the training loader and display it
# as the cell output.
train_batches = dataprocessor.prepare_train_dataloader()
next(train_batches)

{'x': tensor([[[1., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 1., 1., 1.],
          [1., 0., 1.,  ..., 0., 0., 0.],
          [0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [1., 1., 1.,  ..., 1., 1., 1.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [1., 0., 1.,  ..., 0., 1., 1.],
          [0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [1., 1., 1.,  ..., 1., 1., 1.]],
 
         ...,
 
         [[1., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 1., 0.],
          [0., 1., 0.,  ..., 0., 0., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],
 
         [[1., 1., 1.,  ..., 0., 0., 1.],
          [0., 0., 0.,  ..., 0., 1., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
 

# Trainer

In [8]:
from prixfixe.autosome import AutosomeTrainer

# Train whichever `model` is currently bound (the most recently executed architecture
# cell). The shared `device` from the config cell is reused rather than building a
# second torch.device here, so training and prediction always agree on the device.
trainer = AutosomeTrainer(
    model,
    device=device,
    model_dir=MODEL_LOG_DIR,
    dataprocessor=dataprocessor,
    num_epochs=NUM_EPOCHS,
    lr=lr,
)

trainer.fit()

  0%|                                                                                                                                                                                                                                                       | 0/5 [00:00<?, ?it/s]
Train epoch:   0%|                                                                                                                                                                                                                                         | 0/31 [00:00<?, ?it/s][A
Train epoch:   3%|███████▎                                                                                                                                                                                                                         | 1/31 [00:00<00:04,  6.21it/s][A
Train epoch:  13%|█████████████████████████████                                                                                                                          

# Prediction

In [9]:
# Load the tab-separated test table.
test_df = pd.read_csv(TEST_DATA_PATH, sep='\t')

# Flag rows whose ID contains the literal substring "-_" (1 if present, else 0).
# A plain substring match replaces the old pattern '\-_', which relied on an invalid
# string escape ("\-") and triggers a warning on recent Python versions; the set of
# matching IDs is unchanged.
# NOTE(review): presumably this marks reverse-strand entries - confirm against the
# ID naming scheme of the dataset.
test_df['rev'] = test_df['ID'].str.contains('-_', regex=False).astype(int)

# Restore the checkpoint from the model directory. map_location remaps the saved
# tensors onto the active device, so a checkpoint written on one device still loads
# on another.
# SECURITY: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(
    torch.load(f"{MODEL_LOG_DIR}/model_best_MSE.pth", map_location=device)
)
# Switch to inference mode before predicting.
model.eval()

# Base -> one-hot row. Built once at definition time instead of on every call.
# Column order is A, G, C, T; 'N' maps to the all-zero row.
# NOTE(review): this order must match the encoding used for training - confirm
# against the data processor before changing it.
_ONE_HOT_BY_BASE = {
    'A': (1, 0, 0, 0),
    'G': (0, 1, 0, 0),
    'C': (0, 0, 1, 0),
    'T': (0, 0, 0, 1),
    'N': (0, 0, 0, 0),
}


def one_hot_encode(seq):
    """One-hot encode a nucleotide sequence.

    Args:
        seq: Iterable of single-character bases drawn from 'A', 'G', 'C', 'T', 'N'
            (upper case only).

    Returns:
        A list with one 4-element list per base, in column order A, G, C, T.
        'N' is encoded as four zeros. Every row is a fresh list, so mutating one
        row never affects another row or a later call.

    Raises:
        KeyError: If ``seq`` contains a character outside the supported alphabet.
    """
    return [list(_ONE_HOT_BY_BASE[base]) for base in seq]

In [10]:
# One-hot encode every test sequence and append the per-sequence 'rev' flag as a
# fifth value at each position, giving rows of the form [A, G, C, T, rev].
# Iterating over the two columns directly avoids the per-row Series construction of
# DataFrame.iterrows, and `total` lets tqdm render a real progress bar.
encoded_seqs = []
for sequence, rev in tqdm(zip(test_df['Sequence'], test_df['rev']), total=len(test_df)):
    encoded_seqs.append([list(encoded_base) + [rev] for encoded_base in one_hot_encode(sequence)])

# Measured targets, kept in the same order as `encoded_seqs`.
Y_test_dev = test_df['Dev_log2_enrichment'].tolist()
Y_test_hk = test_df['Hk_log2_enrichment'].tolist()

pred_expr_dev = []
pred_expr_hk = []

# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad():
    for seq in tqdm(encoded_seqs):
        # (length, 5) -> (1, 5, length). The length is taken from the data rather
        # than hard-coded, so this layout step does not depend on a fixed size.
        # Can also predict on batches to speed up prediction.
        features = np.asarray(seq, dtype=np.float32).T[np.newaxis]
        pred = model(torch.tensor(features, device=device, dtype=torch.float32))
        # NOTE(review): index 0 is stored as the Dev prediction and index 1 as the
        # Hk prediction, as in the original cell - confirm against the model's
        # output layout.
        pred_expr_dev.append(pred[0].cpu().flatten().tolist())
        pred_expr_hk.append(pred[1].cpu().flatten().tolist())

1000it [00:00, 4750.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 308.39it/s]
