# RNA folding prediction
Competition page: https://www.kaggle.com/competitions/stanford-ribonanza-rna-folding/overview

In [1]:
# Directory that holds every CSV used by this notebook; relocate the data by
# editing this single constant instead of six separate literals.
DATA_DIR = '../data/RNA folding'

# Training inputs: the quick-start table and the extracted-structure table.
TRAIN_DATA_PATH = f'{DATA_DIR}/train_data_QUICK_START.csv'
TRAIN_DATA_EXT_PATH = f'{DATA_DIR}/train_extracted.csv'

# Test inputs, mirroring the two training files.
TEST_DATA_PATH = f'{DATA_DIR}/test_sequences.csv'
TEST_DATA_EXT_PATH = f'{DATA_DIR}/test_extracted.csv'

# Submission template to read and the submission file to write.
SUBMISSION_SAMPLE_FILE_PATH = f'{DATA_DIR}/sample_submission.csv'
SUBMISSION_FILE_PATH = f'{DATA_DIR}/submission.csv'

### Import data

In [2]:
import pandas as pd

# Load the quick-start training table from TRAIN_DATA_PATH and preview its first rows.
train_data_pd = pd.read_csv(TRAIN_DATA_PATH)
train_data_pd.head()

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reactivity_0001,reactivity_0002,reactivity_0003,reactivity_0004,reactivity_0005,reactivity_0006,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,2A3_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_2A3,,,,,,,...,,,,,,,,,,
1,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,DMS_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_DMS,,,,,,,...,,,,,,,,,,
2,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,2A3_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_2A3,,,,,,,...,,,,,,,,,,
3,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,DMS_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_DMS,,,,,,,...,,,,,,,,,,
4,00021f968267,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,2A3_MaP,DasLabBigLib_OneMil_Replicates_from_previous_l...,,,,,,,...,,,,,,,,,,


In [10]:
# Load the extracted training table from TRAIN_DATA_EXT_PATH and preview its first rows.
train_extracted_pd = pd.read_csv(TRAIN_DATA_EXT_PATH)
train_extracted_pd.head()

Unnamed: 0,sequence,sequence_ext
0,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,.....((((((.....)))))).....((((((((((((((....)...
1,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,.....((((((.....))))))........(((((..(.....).....
2,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,.....((((((.....))))))........(((((.((((.........
3,GGGAACGACUCGAGUAGAGUCGAAAAGGAGAUCGAAGACGACUUAC...,.....((((((.....))))))....((((((((.....(.........
4,GGGAACGACUCGAGUAGAGUCGAAAAGAUAUGGACUGACGAAGUCG...,.....((((((.....))))))....(((..(((((((((..((((...


### Make dataset class

In [4]:
import sys
sys.path.append('..')

import numpy as np

from python_scripts.transformers.dataset import MaskedDataset, RNADataset_2

# Masked-token dataset over the first 1000 rows of the training table.
masked_dataset = MaskedDataset(
    max_len=512,
    vocab=pd.read_csv('../data/RNA folding/vocab.csv'),
    data=train_data_pd[:1000],
)

# Reactivity dataset: 2000 training rows paired with 1000 extracted rows.
# NOTE(review): presumably the 2:1 ratio is because each sequence appears once
# per experiment type in the training table -- confirm against RNADataset_2.
rna_dataset = RNADataset_2(
    max_len=512,
    vocab=pd.read_csv('../data/RNA folding/vocab.csv'),
    data_ext=train_extracted_pd[:1000],
    data=train_data_pd[:2000],
)

In [5]:
# Sanity check: number of samples exposed by each dataset.
len(masked_dataset), len(rna_dataset)

(1000, 1000)

### Make models

In [None]:
import torch
from torchinfo import summary

import sys
sys.path.append('..')

from python_scripts.transformers.model import BERTCustomMasked, BERTCustom
from torch.utils.data import DataLoader

# Small encoder backbone whose vocabulary size follows the masked dataset's vocab.
bertmodel = BERTCustom(
    dim_k=4,
    hidden=32,
    vocab_size=len(masked_dataset.vocab),
)

# Wrap the backbone for the masked-token task.
masked_model = BERTCustomMasked(bertmodel)

# Display the layer / parameter summary.
summary(masked_model)

In [7]:
# Smoke test: run one batch of 3 samples through the masked model and show the output shape.
masked_model(next(iter(DataLoader(masked_dataset, 3)))).shape

torch.Size([3, 512, 23])

In [5]:
import torch
from torchinfo import summary

import sys
sys.path.append('..')

from python_scripts.transformers.model import BERTCustomRNAReactivity_2, BERTCustom
from torch.utils.data import DataLoader

# Larger backbone for the reactivity task. This rebinds `bertmodel`;
# `masked_model` keeps its own reference to the earlier backbone object.
bertmodel = BERTCustom(
    num_attn_head=12,
    num_layer=12,
    dim_k=16,
    hidden=128,
    vocab_size=len(masked_dataset.vocab),
)

# Wrap the backbone for reactivity prediction.
RNA_model = BERTCustomRNAReactivity_2(bertmodel)

# Display the layer / parameter summary.
summary(RNA_model)

Layer (type:depth-idx)                             Param #
BERTCustomRNAReactivity_2                          --
├─BERTCustom: 1-1                                  --
│    └─CombEmbedding: 2-1                          --
│    │    └─TokenEmbedding: 3-1                    2,944
│    │    └─PositionEmbedding: 3-2                 --
│    │    └─Dropout: 3-3                           --
│    └─ModuleList: 2-2                             --
│    │    └─EncoderBlock: 3-4                      231,232
│    │    └─EncoderBlock: 3-5                      231,232
│    │    └─EncoderBlock: 3-6                      231,232
│    │    └─EncoderBlock: 3-7                      231,232
│    │    └─EncoderBlock: 3-8                      231,232
│    │    └─EncoderBlock: 3-9                      231,232
│    │    └─EncoderBlock: 3-10                     231,232
│    │    └─EncoderBlock: 3-11                     231,232
│    │    └─EncoderBlock: 3-12                     231,232
│    │    └─EncoderBlock: 3-1

In [9]:
# Smoke test: feed element 0 of one batch of 3 to RNA_model and show the output shape.
RNA_model(next(iter(DataLoader(rna_dataset, 3)))[0]).shape

torch.Size([3, 2, 512])

In [10]:
# Same forward pass as the shape check, displaying the raw output values of the untrained model.
RNA_model(next(iter(DataLoader(rna_dataset, 3)))[0])

tensor([[[ 1.0453e+00, -1.0082e-01, -8.5790e-02,  ...,  6.9567e-01,
           9.0658e-01,  3.9749e-01],
         [-9.6738e-02, -8.4663e-01,  1.2376e-01,  ...,  5.4337e-01,
           9.8046e-01,  8.4537e-01]],

        [[-6.1251e-04,  6.8470e-01, -2.9411e-01,  ...,  1.5148e+00,
          -4.5277e-03,  8.0242e-01],
         [ 8.4973e-02, -2.8476e-01, -8.3920e-01,  ...,  3.9015e-02,
           4.3316e-01,  3.0112e-01]],

        [[-1.0954e+00,  8.2157e-01, -3.0471e-01,  ..., -4.3983e-02,
           1.2286e+00,  1.2809e+00],
         [-7.6830e-01, -1.9092e-01, -1.8542e+00,  ...,  8.1878e-03,
           6.4648e-02,  1.1038e+00]]], grad_fn=<TransposeBackward0>)

In [11]:
# Show element 1 of one batch of 3.
# NOTE(review): presumably the reactivity targets, with -100 marking positions to ignore -- confirm against RNADataset_2.
next(iter(DataLoader(rna_dataset, 3)))[1]

tensor([[[-100., -100., -100.,  ..., -100., -100., -100.],
         [-100., -100., -100.,  ..., -100., -100., -100.]],

        [[-100., -100., -100.,  ..., -100., -100., -100.],
         [-100., -100., -100.,  ..., -100., -100., -100.]],

        [[-100., -100., -100.,  ..., -100., -100., -100.],
         [-100., -100., -100.,  ..., -100., -100., -100.]]],
       dtype=torch.float64)

In [12]:
# Show the dtypes of batch elements 0 and 1 (each expression builds its own DataLoader).
next(iter(DataLoader(rna_dataset, 3)))[0].dtype, next(iter(DataLoader(rna_dataset, 3)))[1].dtype

(torch.int64, torch.float64)

### Train model by masking tokens

In [10]:
from torchmetrics import Accuracy
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

import sys
sys.path.append('..')

from python_scripts.transformers.dataset import MaskedDataModule
from python_scripts.transformers.task import MaskingTask

# Data module serving the masked-token dataset in batches of 8.
masked_datamodule = MaskedDataModule(masked_dataset, batch_size=8)

# Adam at 1e-3 with cosine annealing; verbose=True prints every LR adjustment.
masked_optimizer = torch.optim.Adam(masked_model.parameters(), lr=1e-3)
masked_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    masked_optimizer, T_max=5, verbose=True
)

# Task bundling the model with its loss, optimizer, scheduler and accuracy metric.
# -100 is excluded from the accuracy computation.
maskingtask = MaskingTask(
    model=masked_model,
    loss_fn=torch.nn.CrossEntropyLoss(),
    optimizer=masked_optimizer,
    scheduler=masked_scheduler,
    acc_fn=Accuracy(
        task='multiclass',
        num_classes=len(masked_dataset.vocab),
        ignore_index=-100,
    ),
)

# Keep the two best checkpoints by validation accuracy, and stop early after
# 3 checks without an improvement of at least 0.1.
# NOTE(review): min_delta=0.1 looks large next to the per-epoch accuracy gains
# reported by this run -- confirm it is intended.
callbacks = [
    ModelCheckpoint(monitor='val_avg_accuracy', mode='max', save_top_k=2),
    EarlyStopping(
        monitor='val_avg_accuracy',
        mode='max',
        min_delta=0.1,
        patience=3,
        verbose=False,
    ),
]

trainer = pl.Trainer(max_epochs=5, callbacks=callbacks)

# To continue from a saved run instead of the freshly built task, create it with
# MaskingTask.load_from_checkpoint(
#     './lightning_logs/version_0/checkpoints/epoch=0-step=33562.ckpt', ...)
# passing the same model / loss_fn / optimizer / scheduler / acc_fn keywords as above.

trainer.fit(maskingtask, datamodule=masked_datamodule)
trainer.test(maskingtask, datamodule=masked_datamodule)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type               | Params
-----------------------------------------------
0 | model   | BERTCustomMasked   | 11.1 M
1 | loss_fn | CrossEntropyLoss   | 0     
2 | acc_fn  | MulticlassAccuracy | 0     
-----------------------------------------------
11.1 M    Trainable params
0         Non-trainable params
11.1 M    Total params
44.241    Total estimated model params size (MB)


Adjusting learning rate of group 0 to 1.0000e-03.
Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  tp = tp.sum(dim=0 if multidim_average == "global" else 1)
  rank_zero_warn(


Epoch 0:  99%|█████████▉| 99/100 [00:40<00:00,  2.47it/s, v_num=7, train_loss=0.0441, train_accuracy=0.341]Adjusting learning rate of group 0 to 9.0451e-04.
Epoch 0: 100%|██████████| 100/100 [00:40<00:00,  2.47it/s, v_num=7, train_loss=0.0441, train_accuracy=0.323]
Epoch 0, Avg. Training Loss: 0.047 Avg. Training Accuracy: 0.301 Avg. Validation Loss: 0.049 Avg. Validation Accuracy: 0.280
Epoch 1:  99%|█████████▉| 99/100 [00:41<00:00,  2.41it/s, v_num=7, train_loss=0.044, train_accuracy=0.333, val_loss=0.0446, val_accuracy=0.313]  Adjusting learning rate of group 0 to 6.5451e-04.
Epoch 1: 100%|██████████| 100/100 [00:41<00:00,  2.41it/s, v_num=7, train_loss=0.0428, train_accuracy=0.422, val_loss=0.0446, val_accuracy=0.313]
Epoch 1, Avg. Training Loss: 0.045 Avg. Training Accuracy: 0.314 Avg. Validation Loss: 0.045 Avg. Validation Accuracy: 0.325
Epoch 2:  99%|█████████▉| 99/100 [00:41<00:00,  2.39it/s, v_num=7, train_loss=0.0448, train_accuracy=0.318, val_loss=0.0447, val_accuracy=0.322

  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 13/13 [00:01<00:00,  9.13it/s]


[{'test_loss': 0.04438428953289986, 'test_accuracy': 0.323074072599411}]

In [13]:
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

import sys
sys.path.append('..')

from python_scripts.transformers.dataset import RNADataModule
from python_scripts.transformers.task import RNATask

# Data module serving the reactivity dataset in batches of 8.
rna_datamodule = RNADataModule(rna_dataset, batch_size=8)

def rna_rmse_loss(x: torch.Tensor, y: torch.Tensor, ignore_index=-100) -> torch.Tensor:
    """Root-mean-squared error over the positions where ``y != ignore_index``.

    Args:
        x: Predictions; must have the same shape as ``y``.
        y: Targets; entries equal to ``ignore_index`` are excluded.
        ignore_index: Sentinel target value marking positions to skip.

    Returns:
        Scalar tensor with the RMSE of the kept positions. When every position
        is ignored, a zero that is still connected to ``x`` is returned instead
        of the NaN that a mean over an empty selection would give.
    """
    not_ignore = y != ignore_index
    if not not_ignore.any():
        # Nothing to score: keep the autograd graph attached but contribute no loss.
        return x.sum() * 0.0
    return torch.sqrt(torch.square(x[not_ignore] - y[not_ignore]).mean())

def rna_mse_loss(x: torch.Tensor, y: torch.Tensor, ignore_index=-100) -> torch.Tensor:
    """Mean squared error over the positions where ``y != ignore_index``.

    Args:
        x: Predictions; must have the same shape as ``y``.
        y: Targets; entries equal to ``ignore_index`` are excluded.
        ignore_index: Sentinel target value marking positions to skip.

    Returns:
        Scalar tensor with the MSE of the kept positions. When every position
        is ignored, a zero that is still connected to ``x`` is returned instead
        of the NaN that a mean over an empty selection would give.
    """
    not_ignore = y != ignore_index
    if not not_ignore.any():
        # Nothing to score: keep the autograd graph attached but contribute no loss.
        return x.sum() * 0.0
    return torch.square(x[not_ignore] - y[not_ignore]).mean()

def rna_mae_loss(x: torch.Tensor, y: torch.Tensor, ignore_index=-100) -> torch.Tensor:
    """Mean absolute error over the positions where ``y != ignore_index``.

    Args:
        x: Predictions; must have the same shape as ``y``.
        y: Targets; entries equal to ``ignore_index`` are excluded.
        ignore_index: Sentinel target value marking positions to skip.

    Returns:
        Scalar tensor with the MAE of the kept positions. When every position
        is ignored, a zero that is still connected to ``x`` is returned instead
        of the NaN that a mean over an empty selection would give.
    """
    not_ignore = y != ignore_index
    if not not_ignore.any():
        # Nothing to score: keep the autograd graph attached but contribute no loss.
        return x.sum() * 0.0
    return torch.abs(x[not_ignore] - y[not_ignore]).mean()

# Run length shared by the LR schedule and the Trainer. OneCycleLR is built for
# exactly epochs * steps_per_epoch steps, so both must be driven by the same numbers.
RNA_MAX_EPOCHS = 5
# NOTE(review): 100 matches the "100/100" batches per epoch reported by this run
# with batch_size=8; update it whenever the dataset size or batch size changes.
RNA_STEPS_PER_EPOCH = 100

# SGD with momentum; OneCycleLR drives the learning rate up to max_lr and back down.
rna_optimizer = torch.optim.SGD(RNA_model.parameters(), lr=1e-3, momentum=0.9)
rna_scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer=rna_optimizer,
    max_lr=1e-3,
    steps_per_epoch=RNA_STEPS_PER_EPOCH,
    epochs=RNA_MAX_EPOCHS,
    div_factor=1e2,
)

# Task bundling the model with the masked MAE loss, optimizer and scheduler.
rna_task = RNATask(
    model=RNA_model,
    loss_fn=rna_mae_loss,
    optimizer=rna_optimizer,
    scheduler=rna_scheduler,
)

# Keep the three checkpoints with the lowest average validation loss.
callbacks = [
    ModelCheckpoint(
        monitor='val_avg_loss',
        save_top_k=3,
        mode='min',
    ),
]

trainer = pl.Trainer(
    max_epochs=RNA_MAX_EPOCHS,
    callbacks=callbacks,
)

trainer.fit(rna_task, datamodule=rna_datamodule)
trainer.test(rna_task, datamodule=rna_datamodule)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                      | Params
----------------------------------------------------
0 | model | BERTCustomRNAReactivity_2 | 2.8 M 
----------------------------------------------------
2.8 M     Trainable params
0         Non-trainable params
2.8 M     Total params
11.112    Total estimated model params size (MB)


Sanity Checking:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0:   1%|          | 1/100 [00:02<04:39,  2.83s/it, v_num=7, train_loss=0.636]

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 0: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s, v_num=7, train_loss=0.365]
Epoch 0, Avg. Training Loss: 0.4895 Avg. Validation Loss: 0.3142
0.0007585010606532027
Epoch 1: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s, v_num=7, train_loss=0.270, val_loss=0.276]
Epoch 1, Avg. Training Loss: 0.3200 Avg. Validation Loss: 0.2309
0.0009485190986744373
Epoch 2: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s, v_num=7, train_loss=0.260, val_loss=0.231]
Epoch 2, Avg. Training Loss: 0.2758 Avg. Validation Loss: 0.2234
0.0006068809706154799
Epoch 3: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s, v_num=7, train_loss=0.269, val_loss=0.223]
Epoch 3, Avg. Training Loss: 0.2609 Avg. Validation Loss: 0.2188
0.00018475966821879009
Epoch 4: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s, v_num=7, train_loss=0.267, val_loss=0.219]
Epoch 4, Avg. Training Loss: 0.2553 Avg. Validation Loss: 0.2199
2.114189442253868e-08
Epoch 4: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s, v_num=7, tr

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 100/100 [00:31<00:00,  3.15it/s, v_num=7, train_loss=0.267, val_loss=0.220]


  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 13/13 [00:01<00:00, 11.31it/s]


[{'test_loss': 0.21816213428974152}]

In [6]:
import pandas as pd

# Load the test sequences table from TEST_DATA_PATH and preview its first rows.
test_data_pd = pd.read_csv(TEST_DATA_PATH)
test_data_pd.head()

Unnamed: 0,id_min,id_max,sequence_id,sequence,future
0,0,176,eee73c1836bc,GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGG...,0
1,177,353,d2a929af7a97,GGGAACGACUCGAGUAGAGUCGAAAAUGUAAUCAGAUUGCUUCUCC...,0
2,354,530,d39a4425ff45,GGGAACGACUCGAGUAGAGUCGAAAAAACACAUGAAUUUGAGGGUU...,0
3,531,707,1fc41e92d553,GGGAACGACUCGAGUAGAGUCGAAAAUCAGAGCUGGCAAAUGGAUG...,0
4,708,884,1d0826fb892f,GGGAACGACUCGAGUAGAGUCGAAAAUUUGGUAUUUGAUGCAUUAA...,0


In [7]:
# Load the extracted test table from TEST_DATA_EXT_PATH and preview its first rows.
test_extracted_pd = pd.read_csv(TEST_DATA_EXT_PATH)
test_extracted_pd.head()

Unnamed: 0,sequence,sequence_ext
0,GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGG...,(((((((((((.....)))))).....)))))......(((((((....
1,GGGAACGACUCGAGUAGAGUCGAAAAUGUAAUCAGAUUGCUUCUCC...,.....((((((.....))))))................((..((((...
2,GGGAACGACUCGAGUAGAGUCGAAAAAACACAUGAAUUUGAGGGUU...,.....((((((.....))))))...............((((((.(....
3,GGGAACGACUCGAGUAGAGUCGAAAAUCAGAGCUGGCAAAUGGAUG...,.....((((((.....))))))....((.(((..((((((.((.((...
4,GGGAACGACUCGAGUAGAGUCGAAAAUUUGGUAUUUGAUGCAUUAA...,.....((((((.....)))))).................((........


In [9]:
import lightning.pytorch as pl

import sys
sys.path.append('..')

from python_scripts.transformers.dataset import RNADataset_3
from python_scripts.transformers.dataset import RNADataModule
from python_scripts.transformers.task import RNATask

# Prediction dataset built from the full extracted test table.
rna_extracted_dataset = RNADataset_3(
    max_len=512,
    vocab=pd.read_csv('../data/RNA folding/vocab.csv'),
    data_ext=test_extracted_pd,
)

# Rebuild the data module around the training dataset, now with a prediction split attached.
rna_datamodule = RNADataModule(
    rna_dataset,
    batch_size=8,
    predict_dataset=rna_extracted_dataset,
)

def rna_mae_loss(x: torch.Tensor, y: torch.Tensor, ignore_index=-100) -> torch.Tensor:
    """Mean absolute error over the positions where ``y != ignore_index``.

    Args:
        x: Predictions; must have the same shape as ``y``.
        y: Targets; entries equal to ``ignore_index`` are excluded.
        ignore_index: Sentinel target value marking positions to skip.

    Returns:
        Scalar tensor with the MAE of the kept positions. When every position
        is ignored, a zero that is still connected to ``x`` is returned instead
        of the NaN that a mean over an empty selection would give.
    """
    not_ignore = y != ignore_index
    if not not_ignore.any():
        # Nothing to score: keep the autograd graph attached but contribute no loss.
        return x.sum() * 0.0
    return torch.abs(x[not_ignore] - y[not_ignore]).mean()

# The optimizer and scheduler are rebuilt here only because they are passed to
# RNATask.load_from_checkpoint below; no training step runs in this cell.
rna_optimizer = torch.optim.SGD(RNA_model.parameters(), lr=1e-3, momentum=0.9)
rna_scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer=rna_optimizer,
    max_lr=1e-3,
    steps_per_epoch=100,
    epochs=5,
    div_factor=1e2,
)

# Restore the task from the checkpoint written after the 5-epoch small-set run.
rna_task = RNATask.load_from_checkpoint(
    checkpoint_path='./lightning_logs/Small_set_5epochs/checkpoints/epoch=4-step=500.ckpt',
    model=RNA_model,
    loss_fn=rna_mae_loss,
    optimizer=rna_optimizer,
    scheduler=rna_scheduler,
)

trainer = pl.Trainer()

# return_predictions=False: nothing is returned from predict().
# NOTE(review): presumably RNATask collects results on `rna_task.predict_outputs`,
# which the submission cell reads -- confirm in python_scripts.transformers.task.
trainer.predict(rna_task, datamodule=rna_datamodule, return_predictions=False)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Predicting DataLoader 0:   0%|          | 0/167978 [00:00<?, ?it/s]



Predicting DataLoader 0:   0%|          | 3/167978 [00:00<8:35:18,  5.43it/s] 



Predicting DataLoader 0:   0%|          | 328/167978 [00:30<4:17:10, 10.87it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
# Start from the sample submission, then overwrite both reactivity columns with
# the outputs stored on the task by the prediction run.
submission = pd.read_csv(SUBMISSION_SAMPLE_FILE_PATH)
for column_name, output_key in (
    ('reactivity_2A3_MaP', '2A3_MaP'),
    ('reactivity_DMS_MaP', 'DMS_MaP'),
):
    submission[column_name] = rna_task.predict_outputs[output_key]

# Write the file without the DataFrame index column.
submission.to_csv(SUBMISSION_FILE_PATH, index=False)

In [23]:
# Inspect the top-level entries of a Lightning checkpoint.
# map_location='cpu' lets the file open on machines that lack the accelerator
# it was saved from (the training run above used MPS).
# NOTE: torch.load unpickles the file -- only open checkpoints you trust.
checkpoint = torch.load(
    './lightning_logs/version_0/checkpoints/epoch=0-step=100.ckpt',
    map_location='cpu',
)
checkpoint.keys()

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])