# Import libraries and functions

In [1]:
import pandas as pd
import torch
import os
from prixfixe.autosome import AutosomeDataProcessor, AutosomeFirstLayersBlock, AutosomeCoreBlock, AutosomeFinalLayersBlock, AutosomeTrainer, AutosomePredictor
from prixfixe.bhi import BHIFirstLayersBlock,BHICoreBlock
from prixfixe.unlockdna import UnlockDNACoreBlock
from prixfixe.prixfixe import PrixFixeNet

  from .autonotebook import tqdm as notebook_tqdm


# Initialize paths and variables

In [2]:
# --- Input files -----------------------------------------------------------
# Change these filenames to the actual training / validation data.
TRAIN_DATA_PATH = "data/demo_train.txt"
VALID_DATA_PATH = "data/demo_val.txt"
PLASMID_PATH = "data/plasmid.json"

# --- Batching --------------------------------------------------------------
# Replace with 1024; if 1024 does not fit in GPU memory, halve it (512, 256).
TRAIN_BATCH_SIZE = 512
VALID_BATCH_SIZE = 4096
# Replace with the total number of possible batches in the training data.
BATCH_PER_EPOCH = 10
# Replace with the total number of possible batches in the validation data.
BATCH_PER_VALIDATION = 10
# Worker count handed to both the training and validation loaders.
N_PROCS = 8

# --- Model / training ------------------------------------------------------
SEQ_SIZE = 150
# Demo value; replace with 80.
NUM_EPOCHS = 5
CUDA_DEVICE_ID = 0
# Use 0.001 when the core block uses attention layers.
lr = 0.005

# DataProcessor

In [3]:
# Seeded torch generator; the same object is handed to the data processor here
# and to the model constructors in later cells.
generator = torch.Generator().manual_seed(2147483647)

# Data processor configured entirely from the constants of the configuration cell.
dataprocessor = AutosomeDataProcessor(
    # data sources
    path_to_training_data=TRAIN_DATA_PATH,
    path_to_validation_data=VALID_DATA_PATH,
    plasmid_path=PLASMID_PATH,
    seqsize=SEQ_SIZE,
    # training loader: shuffled
    train_batch_size=TRAIN_BATCH_SIZE,
    batch_per_epoch=BATCH_PER_EPOCH,
    train_workers=N_PROCS,
    shuffle_train=True,
    # validation loader: not shuffled
    valid_batch_size=VALID_BATCH_SIZE,
    valid_workers=N_PROCS,
    shuffle_val=False,
    generator=generator,
)

# Prix-Fixe Model

### DREAM-CNN Model

In [4]:
# First block: BHI first layers, sized from the data processor's channel count
# and sequence length.
first = BHIFirstLayersBlock(
    in_channels=dataprocessor.data_channels(),
    out_channels=320,
    seqsize=dataprocessor.data_seqsize(),
    kernel_sizes=[9, 15],
    pool_size=1,
    dropout=0.2,
)

# Core block: Autosome core, chained onto the first block's output dimensions.
core = AutosomeCoreBlock(
    in_channels=first.out_channels,
    out_channels=64,
    seqsize=first.infer_outseqsize(),
)

# Final block: Autosome final layers, chained onto the core block's output dimensions.
final = AutosomeFinalLayersBlock(
    in_channels=core.out_channels,
    seqsize=core.infer_outseqsize(),
)

# Assemble the three blocks into a single network, passing the seeded generator along.
model = PrixFixeNet(first=first, core=core, final=final, generator=generator)

In [5]:
from torchinfo import summary

# Summarize with a single-sample input whose channel count and length come from
# the data processor, i.e. the same values the model blocks were built from,
# instead of a hard-coded (1, 6, 150) that goes stale when SEQ_SIZE changes.
summary(model, (1, dataprocessor.data_channels(), dataprocessor.data_seqsize()))

Layer (type:depth-idx)                        Output Shape              Param #
PrixFixeNet                                   [1]                       --
├─BHIFirstLayersBlock: 1-1                    --                        --
│    └─ModuleList: 2-1                        --                        --
│    │    └─ConvBlock: 3-1                    [1, 160, 150]             8,800
│    │    └─ConvBlock: 3-2                    [1, 160, 150]             14,560
├─AutosomeCoreBlock: 1-2                      --                        --
│    └─ModuleDict: 2-2                        --                        --
│    │    └─Sequential: 3-3                   [1, 320, 150]             420,048
│    │    └─Sequential: 3-4                   [1, 128, 150]             573,696
│    │    └─Sequential: 3-5                   [1, 128, 150]             173,856
│    │    └─Sequential: 3-6                   [1, 128, 150]             229,632
│    │    └─Sequential: 3-7                   [1, 128, 150]         

In [6]:
# Directory expected to contain the DREAM-CNN checkpoint `model_best.pth`.
MODEL_LOG_DIR = "prix_fixe_model_weights/0_1_0_0"
# NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
# map_location="cpu" keeps loading from failing on a machine that lacks the device the
# checkpoint tensors were saved on; load_state_dict then copies them into the model.
model.load_state_dict(
    torch.load(os.path.join(MODEL_LOG_DIR, "model_best.pth"), map_location="cpu")
)

<All keys matched successfully>

### DREAM-RNN Model

In [7]:
# First block: BHI first layers, sized from the data processor's channel count
# and sequence length.
first = BHIFirstLayersBlock(
    in_channels=dataprocessor.data_channels(),
    out_channels=320,
    seqsize=dataprocessor.data_seqsize(),
    kernel_sizes=[9, 15],
    pool_size=1,
    dropout=0.2,
)

# Core block: BHI core (configured with LSTM hidden channels and two dropout rates),
# chained onto the first block's output dimensions.
core = BHICoreBlock(
    in_channels=first.out_channels,
    out_channels=320,
    seqsize=first.infer_outseqsize(),
    lstm_hidden_channels=320,
    kernel_sizes=[9, 15],
    pool_size=1,
    dropout1=0.2,
    dropout2=0.5,
)

# Final block: Autosome final layers, chained onto the core block's output dimensions.
final = AutosomeFinalLayersBlock(
    in_channels=core.out_channels,
    seqsize=core.infer_outseqsize(),
)

# Assemble the three blocks into a single network, passing the seeded generator along.
model = PrixFixeNet(first=first, core=core, final=final, generator=generator)

In [8]:
from torchinfo import summary

# Summarize with a single-sample input whose channel count and length come from
# the data processor, i.e. the same values the model blocks were built from,
# instead of a hard-coded (1, 6, 150) that goes stale when SEQ_SIZE changes.
summary(model, (1, dataprocessor.data_channels(), dataprocessor.data_seqsize()))

Layer (type:depth-idx)                   Output Shape              Param #
PrixFixeNet                              [1]                       --
├─BHIFirstLayersBlock: 1-1               --                        --
│    └─ModuleList: 2-1                   --                        --
│    │    └─ConvBlock: 3-1               [1, 160, 150]             8,800
│    │    └─ConvBlock: 3-2               [1, 160, 150]             14,560
├─BHICoreBlock: 1-2                      --                        --
│    └─LSTM: 2-2                         [1, 150, 640]             1,643,520
│    └─ModuleList: 2-3                   --                        --
│    │    └─ConvBlock: 3-3               [1, 160, 150]             921,760
│    │    └─ConvBlock: 3-4               [1, 160, 150]             1,536,160
│    └─Dropout: 2-4                      [1, 320, 150]             --
├─AutosomeFinalLayersBlock: 1-3          --                        --
│    └─Sequential: 2-5                   [1, 18, 150]      

In [9]:
# Directory expected to contain the DREAM-RNN checkpoint `model_best.pth`.
MODEL_LOG_DIR = "prix_fixe_model_weights/0_1_1_0"
# NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
# map_location="cpu" keeps loading from failing on a machine that lacks the device the
# checkpoint tensors were saved on; load_state_dict then copies them into the model.
model.load_state_dict(
    torch.load(os.path.join(MODEL_LOG_DIR, "model_best.pth"), map_location="cpu")
)

<All keys matched successfully>

### DREAM-Attn Model

In [10]:
# First block: Autosome first layers, sized from the data processor's channel
# count and sequence length.
first = AutosomeFirstLayersBlock(
    in_channels=dataprocessor.data_channels(),
    out_channels=256,
    seqsize=dataprocessor.data_seqsize(),
)

# Core block: UnlockDNA core; input and output widths both equal the first
# block's output channels, and the sequence length is taken from the data processor.
core = UnlockDNACoreBlock(
    in_channels=first.out_channels,
    out_channels=first.out_channels,
    seqsize=dataprocessor.data_seqsize(),
    n_blocks=4,
    kernel_size=15,
    rate=0.1,
    num_heads=8,
)

# Final block: Autosome final layers, chained onto the core block's output dimensions.
final = AutosomeFinalLayersBlock(
    in_channels=core.out_channels,
    seqsize=core.infer_outseqsize(),
)

# Assemble the three blocks into a single network, passing the seeded generator along.
model = PrixFixeNet(first=first, core=core, final=final, generator=generator)

In [11]:
# NOTE(review): this cell repeats the DREAM-Attn construction of the preceding cell
# verbatim; running it builds the blocks and the network a second time with the
# same shared generator. Consider dropping one of the two cells.

# First block: Autosome first layers, sized from the data processor.
first = AutosomeFirstLayersBlock(
    in_channels=dataprocessor.data_channels(),
    out_channels=256,
    seqsize=dataprocessor.data_seqsize(),
)

# Core block: UnlockDNA core with matching input/output widths.
core = UnlockDNACoreBlock(
    in_channels=first.out_channels,
    out_channels=first.out_channels,
    seqsize=dataprocessor.data_seqsize(),
    n_blocks=4,
    kernel_size=15,
    rate=0.1,
    num_heads=8,
)

# Final block: Autosome final layers on top of the core block.
final = AutosomeFinalLayersBlock(
    in_channels=core.out_channels,
    seqsize=core.infer_outseqsize(),
)

# Assemble the network from the three blocks.
model = PrixFixeNet(first=first, core=core, final=final, generator=generator)

In [12]:
from torchinfo import summary

# Summarize with a single-sample input whose channel count and length come from
# the data processor, i.e. the same values the model blocks were built from,
# instead of a hard-coded (1, 6, 150) that goes stale when SEQ_SIZE changes.
summary(model, (1, dataprocessor.data_channels(), dataprocessor.data_seqsize()))

Layer (type:depth-idx)                        Output Shape              Param #
PrixFixeNet                                   [1]                       --
├─AutosomeFirstLayersBlock: 1-1               --                        --
│    └─Sequential: 2-1                        [1, 256, 150]             --
│    │    └─Conv1d: 3-1                       [1, 256, 150]             10,752
│    │    └─BatchNorm1d: 3-2                  [1, 256, 150]             512
│    │    └─SiLU: 3-3                         [1, 256, 150]             --
├─UnlockDNACoreBlock: 1-2                     --                        --
│    └─Embedding: 2-2                         [1, 150, 256]             38,400
│    └─ModuleList: 2-3                        --                        --
│    │    └─ConformerSASwiGLULayer: 3-4       [1, 256, 150]             1,121,280
│    │    └─ConformerSASwiGLULayer: 3-5       [1, 256, 150]             1,121,280
│    │    └─ConformerSASwiGLULayer: 3-6       [1, 256, 150]             

In [None]:
# You need to download the model weights before running this cell.
# Directory expected to contain the DREAM-Attn checkpoint `model_best.pth`.
MODEL_LOG_DIR = "prix_fixe_model_weights/0_0_2_0"
# NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
# map_location="cpu" keeps loading from failing on a machine that lacks the device the
# checkpoint tensors were saved on; load_state_dict then copies them into the model.
model.load_state_dict(
    torch.load(os.path.join(MODEL_LOG_DIR, "model_best.pth"), map_location="cpu")
)

# Trainer

In [14]:
# Trainer for whichever `model` was built last, on the CUDA device selected in the
# configuration cell. `model_dir` is the same directory the prediction cell later
# reads `model_best.pth` from.
trainer = AutosomeTrainer(
    model,
    device=torch.device("cuda", CUDA_DEVICE_ID),
    model_dir="data/model_weights",
    dataprocessor=dataprocessor,
    num_epochs=NUM_EPOCHS,
    lr=lr,
)

In [15]:
# Start training with the settings given to the trainer above
# (NUM_EPOCHS epochs; progress bars are printed below).
trainer.fit()

  0%|                                                     | 0/5 [00:00<?, ?it/s]
Train epoch:   0%|                                       | 0/10 [00:00<?, ?it/s][A
Train epoch:  10%|███                            | 1/10 [00:00<00:07,  1.27it/s][A
Train epoch:  20%|██████▏                        | 2/10 [00:01<00:03,  2.08it/s][A
Train epoch:  30%|█████████▎                     | 3/10 [00:01<00:02,  2.60it/s][A
Train epoch:  40%|████████████▍                  | 4/10 [00:01<00:02,  2.95it/s][A
Train epoch:  50%|███████████████▌               | 5/10 [00:01<00:01,  3.19it/s][A
Train epoch:  60%|██████████████████▌            | 6/10 [00:02<00:01,  3.35it/s][A
Train epoch:  70%|█████████████████████▋         | 7/10 [00:02<00:00,  3.47it/s][A
Train epoch:  80%|████████████████████████▊      | 8/10 [00:02<00:00,  3.54it/s][A
Train epoch:  90%|███████████████████████████▉   | 9/10 [00:02<00:00,  3.58it/s][A
Train epoch: 100%|██████████████████████████████| 10/10 [00:03<00:00,  3.62it/s

# Predict

In [16]:
import random

# Load the checkpoint written to the trainer's model directory. The device is
# derived from CUDA_DEVICE_ID, as for the trainer, instead of a hard-coded "cuda:0".
predictor = AutosomePredictor(
    model=model,
    model_pth='data/model_weights/model_best.pth',
    device=torch.device(f"cuda:{CUDA_DEVICE_ID}"),
)

# Demo sequence: 80 random bases between two fixed flanks. A locally seeded RNG
# makes the same sequence come out on every run, so the cell is reproducible and
# the global `random` state is left untouched.
dna = "TGCATTTTTTTCACATC" + ''.join(random.Random(0).choices('ACGT', k=80)) + "GGTTACGGCTGTT"
predictor.predict(dna)

14.352845668792725

# Prediction on the test dataset

In [None]:
from tqdm import tqdm

# Tab-separated test file without a header row; column 0 is what gets fed to the predictor.
test_df = pd.read_csv(
    'data/filtered_test_data_with_MAUDE_expression.txt',
    sep='\t',
    header=None,
)

# One prediction per row, in file order, with a progress bar over the sequences.
pred_expr = [predictor.predict(sequence) for sequence in tqdm(test_df.iloc[:, 0])]

In [1]:
from scipy.stats import pearsonr, spearmanr

# Column 1 of the test frame is compared against the predictions; it is extracted
# once and reused for both correlation measures.
measured_expr = list(test_df.iloc[:, 1])
print(pearsonr(pred_expr, measured_expr), spearmanr(pred_expr, measured_expr))

# Score your submission on DREAM Challenge test dataset

In [3]:
from prixfixe.evaluation import evaluate_predictions

# Read the tab-separated, header-less submission file and keep its second column.
# Note that this rebinds `pred_expr` to a pandas Series.
pred_expr = pd.read_csv('data/sample_submission.txt', sep='\t', header=None).iloc[:, 1]

# Print the evaluation report for the submitted values.
evaluate_predictions(pred_expr)

******************************************************
Pearson Score: 0.7657255844881551

Spearman Score: 0.8228750904214907

******************************************************
all r: 0.957144539749361

all r²: 0.916125669972016

all ρ: 0.961451653086994

******************************************************
high r: 0.6200899915391505

low r: 0.6211738513565918

yeast r: 0.8382821111688279

random r: 0.9677444394489736

challenging r: 0.9354983554787447

SNVs r: 0.8227819183935022

motif perturbation r: 0.9671482009080143

motif tiling r: 0.9449999831802987

******************************************************
high ρ: 0.5754373259429003

low ρ: 0.596033541641311

yeast ρ: 0.839060331461191

random ρ: 0.970287191964816

challenging ρ: 0.9289802083256298

SNVs ρ: 0.6775184531537061

motif perturbation ρ: 0.9611406141596464

motif tiling ρ: 0.9273541130425778

******************************************************
