# RNA folding prediction
https://www.kaggle.com/competitions/stanford-ribonanza-rna-folding/overview

In [1]:
TRAIN_DATA_PATH = '../data/RNA folding/train_data_QUICK_START.csv'
TEST_DATA_PATH = '../data/RNA folding/test_sequences.csv'
SUBMISSION_FILE_PATH = '../data/RNA folding/sample_submission.csv'

### Import data

In [2]:
import pandas as pd

train_data_pd = pd.read_csv(TRAIN_DATA_PATH)

In [3]:
train_data_pd.head()

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reactivity_0001,reactivity_0002,reactivity_0003,reactivity_0004,reactivity_0005,reactivity_0006,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,2A3_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_2A3,,,,,,,...,,,,,,,,,,
1,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,DMS_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_DMS,,,,,,,...,,,,,,,,,,
2,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,2A3_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_2A3,,,,,,,...,,,,,,,,,,
3,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,DMS_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_DMS,,,,,,,...,,,,,,,,,,
4,00021f968267,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,2A3_MaP,DasLabBigLib_OneMil_Replicates_from_previous_l...,,,,,,,...,,,,,,,,,,


### Make dataset class

In [4]:
import sys
sys.path.append('..')

import numpy as np

from python_scripts.transformers.dataset import MaskedDataset, RNADataset

masked_dataset = MaskedDataset(
    data=train_data_pd[:1000],
    vocab=pd.read_csv('../data/RNA folding/vocab.csv'),
    max_len=512
)

rna_dataset = RNADataset(
    data=train_data_pd[:1000],
    vocab=pd.read_csv('../data/RNA folding/vocab.csv'),
    max_len=512
)

In [5]:
len(masked_dataset), len(rna_dataset)

(1000, 1000)

### Make models

In [6]:
import torch
from torchinfo import summary

import sys
sys.path.append('..')

from python_scripts.transformers.model import BERTCustomMasked, BERTCustom
from torch.utils.data import DataLoader

bertmodel = BERTCustom(
    vocab_size=len(masked_dataset.vocab),
    hidden=256,
    dim_k=32,
)
masked_model = BERTCustomMasked(bertmodel)

summary(masked_model)

Layer (type:depth-idx)                             Param #
BERTCustomMasked                                   --
├─BERTCustom: 1-1                                  --
│    └─CombEmbedding: 2-1                          --
│    │    └─TokenEmbedding: 3-1                    2,816
│    │    └─PositionEmbedding: 3-2                 --
│    │    └─Dropout: 3-3                           --
│    └─ModuleList: 2-2                             --
│    │    └─EncoderBlock: 3-4                      921,216
│    │    └─EncoderBlock: 3-5                      921,216
│    │    └─EncoderBlock: 3-6                      921,216
│    │    └─EncoderBlock: 3-7                      921,216
│    │    └─EncoderBlock: 3-8                      921,216
│    │    └─EncoderBlock: 3-9                      921,216
│    │    └─EncoderBlock: 3-10                     921,216
│    │    └─EncoderBlock: 3-11                     921,216
│    │    └─EncoderBlock: 3-12                     921,216
│    │    └─EncoderBlock: 3-1

In [7]:
masked_model(next(iter(DataLoader(masked_dataset, 3)))).shape

torch.Size([3, 512, 11])

In [8]:
import torch
from torchinfo import summary

import sys
sys.path.append('..')

from python_scripts.transformers.model import BERTCustomRNAReactivity, BERTCustom
from torch.utils.data import DataLoader

bertmodel = BERTCustom(
    vocab_size=len(masked_dataset.vocab),
    hidden=256,
    dim_k=32,
)
RNA_model = BERTCustomRNAReactivity(bertmodel)

summary(RNA_model)

Layer (type:depth-idx)                             Param #
BERTCustomRNAReactivity                            --
├─BERTCustom: 1-1                                  --
│    └─CombEmbedding: 2-1                          --
│    │    └─TokenEmbedding: 3-1                    2,816
│    │    └─PositionEmbedding: 3-2                 --
│    │    └─Dropout: 3-3                           --
│    └─ModuleList: 2-2                             --
│    │    └─EncoderBlock: 3-4                      921,216
│    │    └─EncoderBlock: 3-5                      921,216
│    │    └─EncoderBlock: 3-6                      921,216
│    │    └─EncoderBlock: 3-7                      921,216
│    │    └─EncoderBlock: 3-8                      921,216
│    │    └─EncoderBlock: 3-9                      921,216
│    │    └─EncoderBlock: 3-10                     921,216
│    │    └─EncoderBlock: 3-11                     921,216
│    │    └─EncoderBlock: 3-12                     921,216
│    │    └─EncoderBlock: 3-1

In [9]:
RNA_model(next(iter(DataLoader(rna_dataset, 3)))[0]).shape

torch.Size([3, 512])

In [10]:
next(iter(DataLoader(rna_dataset, 3)))[0].dtype, next(iter(DataLoader(rna_dataset, 3)))[1].dtype

(torch.int64, torch.float64)

### Train model by masking tokens

In [10]:
from torchmetrics import Accuracy
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

import sys
sys.path.append('..')

from python_scripts.transformers.dataset import MaskedDataModule
from python_scripts.transformers.task import MaskingTask

masked_datamodule = MaskedDataModule(masked_dataset, batch_size=8)

masked_optimizer = torch.optim.Adam(masked_model.parameters(), 1e-3)
masked_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    masked_optimizer,
    T_max=5,
    verbose=True,
)

maskingtask = MaskingTask(
    model=masked_model,
    loss_fn=torch.nn.CrossEntropyLoss(),
    optimizer=masked_optimizer,
    scheduler=masked_scheduler,
    acc_fn=Accuracy(task='multiclass', num_classes=len(masked_dataset.vocab), ignore_index=-100)
)

callbacks = []
callbacks.append(ModelCheckpoint(
    monitor='val_avg_accuracy',
    save_top_k=2,
    mode='max'
))
callbacks.append(EarlyStopping(
    monitor='val_avg_accuracy',
    min_delta=0.1,
    patience=3,
    verbose=False,
    mode='max'
))

trainer = pl.Trainer(
    max_epochs=5,
    callbacks=callbacks
)

# maskingtask = MaskingTask.load_from_checkpoint(
#     './lightning_logs/version_0/checkpoints/epoch=0-step=33562.ckpt',
#     model=masked_model,
#     loss_fn=torch.nn.CrossEntropyLoss(),
#     optimizer=masked_optimizer,
#     scheduler=masked_scheduler,
#     acc_fn=Accuracy(task='multiclass', num_classes=len(masked_dataset.vocab), ignore_index=-100)
# )

trainer.fit(maskingtask, datamodule=masked_datamodule)
trainer.test(maskingtask, datamodule=masked_datamodule)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type               | Params
-----------------------------------------------
0 | model   | BERTCustomMasked   | 11.1 M
1 | loss_fn | CrossEntropyLoss   | 0     
2 | acc_fn  | MulticlassAccuracy | 0     
-----------------------------------------------
11.1 M    Trainable params
0         Non-trainable params
11.1 M    Total params
44.241    Total estimated model params size (MB)


Adjusting learning rate of group 0 to 1.0000e-03.
Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  tp = tp.sum(dim=0 if multidim_average == "global" else 1)
  rank_zero_warn(


Epoch 0:  99%|█████████▉| 99/100 [00:40<00:00,  2.47it/s, v_num=7, train_loss=0.0441, train_accuracy=0.341]Adjusting learning rate of group 0 to 9.0451e-04.
Epoch 0: 100%|██████████| 100/100 [00:40<00:00,  2.47it/s, v_num=7, train_loss=0.0441, train_accuracy=0.323]
Epoch 0, Avg. Training Loss: 0.047 Avg. Training Accuracy: 0.301 Avg. Validation Loss: 0.049 Avg. Validation Accuracy: 0.280
Epoch 1:  99%|█████████▉| 99/100 [00:41<00:00,  2.41it/s, v_num=7, train_loss=0.044, train_accuracy=0.333, val_loss=0.0446, val_accuracy=0.313]  Adjusting learning rate of group 0 to 6.5451e-04.
Epoch 1: 100%|██████████| 100/100 [00:41<00:00,  2.41it/s, v_num=7, train_loss=0.0428, train_accuracy=0.422, val_loss=0.0446, val_accuracy=0.313]
Epoch 1, Avg. Training Loss: 0.045 Avg. Training Accuracy: 0.314 Avg. Validation Loss: 0.045 Avg. Validation Accuracy: 0.325
Epoch 2:  99%|█████████▉| 99/100 [00:41<00:00,  2.39it/s, v_num=7, train_loss=0.0448, train_accuracy=0.318, val_loss=0.0447, val_accuracy=0.322

  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 13/13 [00:01<00:00,  9.13it/s]


[{'test_loss': 0.04438428953289986, 'test_accuracy': 0.323074072599411}]

In [11]:
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

import sys
sys.path.append('..')

from python_scripts.transformers.dataset import RNADataModule
from python_scripts.transformers.task import RNATask

rna_datamodule = RNADataModule(rna_dataset, batch_size=8)

rna_optimizer = torch.optim.Adam(masked_model.parameters(), 1e-3)
# rna_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
#     rna_optimizer,
#     T_max=5,
#     verbose=True,
# )
rna_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    rna_optimizer,
    [3, 6, 9]
)

rna_task = RNATask(
    model=RNA_model,
    loss_fn=lambda x, y: torch.sqrt(torch.nn.MSELoss()(x, y)),
    optimizer=rna_optimizer,
    scheduler=rna_scheduler,
)

callbacks = []
callbacks.append(ModelCheckpoint(
    monitor='val_avg_loss',
    save_top_k=2,
    mode='min'
))
callbacks.append(EarlyStopping(
    monitor='val_avg_loss',
    min_delta=0.1,
    patience=3,
    verbose=False,
    mode='min'
))

trainer = pl.Trainer(
    max_epochs=5,
    callbacks=callbacks
)

# rna_task = RNATask.load_from_checkpoint(
#     './lightning_logs/version_0/checkpoints/epoch=0-step=33562.ckpt',
#     model=RNA_model,
#     loss_fn=torch.nn.MSELoss(),
#     optimizer=rna_optimizer,
#     scheduler=rna_scheduler,
# )

trainer.fit(rna_task, datamodule=rna_datamodule)
trainer.test(rna_task, datamodule=rna_datamodule)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                    | Params
--------------------------------------------------
0 | model | BERTCustomRNAReactivity | 11.1 M
--------------------------------------------------
11.1 M    Trainable params
0         Non-trainable params
11.1 M    Total params
44.231    Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 100/100 [00:34<00:00,  2.88it/s, v_num=26, train_loss=90.00]
Epoch 0, Avg. Training Loss: 90.084 Avg. Validation Loss: 90.197
Epoch 1: 100%|██████████| 100/100 [00:35<00:00,  2.84it/s, v_num=26, train_loss=90.10, val_loss=90.20]
Epoch 1, Avg. Training Loss: 90.084 Avg. Validation Loss: 90.189
Epoch 2: 100%|██████████| 100/100 [00:36<00:00,  2.77it/s, v_num=26, train_loss=90.00, val_loss=90.20]
Epoch 2, Avg. Training Loss: 90.084 Avg. Validation Loss: 90.189
Epoch 3: 100%|██████████| 100/100 [00:36<00:00,  2.78it/s, v_num=26, train_loss=90.50, val_loss=90.20]
Epoch 3, Avg. Training Loss: 90.083 Avg. Validation Loss: 90.189
Epoch 3: 100%|██████████| 100/100 [00:37<00:00,  2.67it/s, v_num=26, train_loss=90.50, val_loss=90.20]


  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 13/13 [00:01<00:00, 10.09it/s]


[{'test_loss': 90.1828384399414}]