# RNA folding prediction
Competition page: https://www.kaggle.com/competitions/stanford-ribonanza-rna-folding/overview

In [1]:
# File locations, given relative to the notebook's working directory.
# Training table; read with pandas in the "Import data" cell.
TRAIN_DATA_PATH = '../data/RNA folding/train_data_QUICK_START.csv'
# NOTE(review): not referenced in the visible part of the notebook — presumably the sequences to predict on; confirm.
TEST_DATA_PATH = '../data/RNA folding/test_sequences.csv'
# NOTE(review): points at the sample submission file; whether it is read as a template or overwritten is not shown here.
SUBMISSION_FILE_PATH = '../data/RNA folding/sample_submission.csv'

### Import data

In [2]:
import pandas as pd

# Load the whole training CSV into a DataFrame; the path is set in the first cell.
train_data_pd = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)

In [3]:
# Preview the first five rows of the training table.
train_data_pd.head(n=5)

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reactivity_0001,reactivity_0002,reactivity_0003,reactivity_0004,reactivity_0005,reactivity_0006,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,2A3_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_2A3,,,,,,,...,,,,,,,,,,
1,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,DMS_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_DMS,,,,,,,...,,,,,,,,,,
2,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,2A3_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_2A3,,,,,,,...,,,,,,,,,,
3,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,DMS_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_DMS,,,,,,,...,,,,,,,,,,
4,00021f968267,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,2A3_MaP,DasLabBigLib_OneMil_Replicates_from_previous_l...,,,,,,,...,,,,,,,,,,


### Make dataset class

In [4]:
import sys
sys.path.append('..')

import numpy as np

from python_scripts.transformers.dataset import MaskedDataset, RNADataset

# Dataset for the token-masking task: only the sequences of the first 1000 rows are used.
masking_sequences = train_data_pd['sequence'].iloc[:1000]
masking_vocab = pd.read_csv('../data/RNA folding/vocab.csv')
masked_dataset = MaskedDataset(data=masking_sequences, vocab=masking_vocab, max_len=512)

# Reactivity targets: one Series per column reactivity_0001 … reactivity_0206.
# The `:04d` format spec zero-pads the position to four digits, so the column
# names are exactly those the manual '0' * n padding used to build.
label = [train_data_pd[f"reactivity_{position:04d}"] for position in range(1, 207)]

# NOTE(review): unlike masked_dataset, this dataset is built from every row of
# train_data_pd, not just the first 1000 — confirm that is intended.
rna_dataset = RNADataset(
    data=train_data_pd['sequence'],
    # Stacking the 206 columns yields (206, n_rows); transpose so that each row
    # carries the targets of one sequence.
    label=np.array(label).transpose((1, 0)),
    vocab=pd.read_csv('../data/RNA folding/vocab.csv'),
    max_len=512
)

In [5]:
# Sanity check: length of the first sample returned by the masked dataset.
first_masked_sample = masked_dataset[0]
len(first_masked_sample)

512

### Make models

In [6]:
import torch
from torchinfo import summary

import sys
sys.path.append('..')

from python_scripts.transformers.model import BERTCustomMasked, BERTCustom
from torch.utils.data import DataLoader

# Size the backbone's vocabulary from the dataset it will be trained on.
vocab_size = len(masked_dataset.vocab)
bertmodel = BERTCustom(vocab_size=vocab_size, hidden=256, dim_k=32)

# BERTCustomMasked wraps the backbone; this wrapped model is what gets trained below.
model = BERTCustomMasked(bertmodel)

def collate_fn(batch):
    """Turn the samples gathered by a DataLoader into a single tensor.

    Args:
        batch: the collection of samples for one batch. Assumes every sample
            has the same length and is something ``torch.tensor`` can convert
            (e.g. nested lists of numbers) — TODO confirm against the dataset.

    Returns:
        The tensor produced by ``torch.tensor(batch)``.
    """
    stacked = torch.tensor(batch)
    return stacked

# Show the layer tree and parameter counts of the wrapped model.
summary(model=model)

Layer (type:depth-idx)                             Param #
BERTCustomMasked                                   --
├─BERTCustom: 1-1                                  --
│    └─CombEmbedding: 2-1                          --
│    │    └─TokenEmbedding: 3-1                    2,304
│    │    └─PositionEmbedding: 3-2                 --
│    │    └─Dropout: 3-3                           --
│    └─ModuleList: 2-2                             --
│    │    └─EncoderBlock: 3-4                      921,216
│    │    └─EncoderBlock: 3-5                      921,216
│    │    └─EncoderBlock: 3-6                      921,216
│    │    └─EncoderBlock: 3-7                      921,216
│    │    └─EncoderBlock: 3-8                      921,216
│    │    └─EncoderBlock: 3-9                      921,216
│    │    └─EncoderBlock: 3-10                     921,216
│    │    └─EncoderBlock: 3-11                     921,216
│    │    └─EncoderBlock: 3-12                     921,216
│    │    └─EncoderBlock: 3-1

In [7]:
# Smoke test: push one batch of three samples through the model and look at the output shape.
preview_loader = DataLoader(masked_dataset, batch_size=3, collate_fn=collate_fn)
preview_batch = next(iter(preview_loader))
model(preview_batch).shape

torch.Size([3, 512, 9])

### Train model by masking tokens

In [8]:
from torchmetrics import Accuracy
import lightning.pytorch as pl

import sys
sys.path.append('..')

from python_scripts.transformers.dataset import MaskedDataModule
from python_scripts.transformers.task import MaskingTask

# Data module serving the masked dataset in batches of 16.
masked_datamodule = MaskedDataModule(masked_dataset, batch_size=16)

# Training components, created in the same order the task receives them.
masking_loss = torch.nn.CrossEntropyLoss()
masking_optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# The scheduler is passed as a class, not an instance — presumably MaskingTask
# instantiates it with the optimizer; confirm in python_scripts.transformers.task.
masking_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR
# Targets equal to -100 are left out of the accuracy computation.
masking_accuracy = Accuracy(
    task='multiclass',
    num_classes=len(masked_dataset.vocab),
    ignore_index=-100,
)

maskingtask = MaskingTask(
    model=model,
    loss_fn=masking_loss,
    optimizer=masking_optimizer,
    scheduler=masking_scheduler,
    acc_fn=masking_accuracy,
)

# Train for at most ten epochs, then run the test loop on the same data module.
trainer = pl.Trainer(max_epochs=10)

trainer.fit(model=maskingtask, datamodule=masked_datamodule)
trainer.test(model=maskingtask, datamodule=masked_datamodule)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name    | Type               | Params
-----------------------------------------------
0 | model   | BERTCustomMasked   | 11.1 M
1 | loss_fn | CrossEntropyLoss   | 0     
2 | acc_fn  | MulticlassAccuracy | 0     
-----------------------------------------------
11.1 M    Trainable params
0         Non-trainable params
11.1 M    Total params
44.237    Total estimated model params size (MB)


Adjusting learning rate of group 0 to 1.0000e-03.
Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:02<00:02,  2.11s/it]

  tp = tp.sum(dim=0 if multidim_average == "global" else 1)


                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 50/50 [00:39<00:00,  1.26it/s, v_num=1, train_loss=0.0433, train_accuracy=0.312]{'train_avg_loss': tensor(0.0472, device='mps:0'), 'train_avg_accuracy': tensor(0.2844, device='mps:0'), 'val_avg_loss': tensor(0.0506, device='mps:0'), 'val_avg_accuracy': tensor(0.2801, device='mps:0')}

Epoch 0, Avg. Training Loss: 0.047 Avg. Training Accuracy: 0.284 Avg. Validation Loss: 0.051 Avg. Validation Accuracy: 0.280
Epoch 1: 100%|██████████| 50/50 [00:40<00:00,  1.24it/s, v_num=1, train_loss=0.0438, train_accuracy=0.331, val_loss=0.0439, val_accuracy=0.319]{'train_avg_loss': tensor(0.0439, device='mps:0'), 'train_avg_accuracy': tensor(0.3194, device='mps:0'), 'val_avg_loss': tensor(0.0437, device='mps:0'), 'val_avg_accuracy': tensor(0.3139, device='mps:0')}

Epoch 1, Avg. Training Loss: 0.044 Avg. Training Accuracy: 0.319 Avg. Validation Loss: 0.044 Avg. Validation Accuracy: 0.314
Epoch 2: 100%|██████████| 50/50 [00:40<00:00,  1.24it/s, v_num=1, train_loss=0.0444, trai

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 50/50 [00:39<00:00,  1.27it/s, v_num=1, train_loss=0.0455, train_accuracy=0.321, val_loss=0.0438, val_accuracy=0.342]
