# RNA Reactivity Training
Competition page: https://www.kaggle.com/competitions/stanford-ribonanza-rna-folding/overview

In [1]:
import os

# Root folder holding the competition files. Set the RIBONANZA_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# location is used, so every path below keeps its previous value.
DATA_DIR = os.environ.get('RIBONANZA_DATA_DIR', 'E:/data/Ribonanza_RNA_folding').rstrip('/\\')

# CSV files read with pandas further down.
TRAIN_DATA_PATH = f'{DATA_DIR}/train_data_QUICK_START.csv'
TRAIN_DATA_EXT_PATH = f'{DATA_DIR}/train_extracted.csv'
VOCAB_PATH = f'{DATA_DIR}/vocab.csv'

# Directories under Ribonanza_bpp_files. BPP_PROBS_TRAIN_PATH is handed to the
# dataset as `prob_dir_path`; BPP_DATA_PATH is not referenced by the cells
# visible in this notebook.
BPP_DATA_PATH = f'{DATA_DIR}/Ribonanza_bpp_files/extra_data'
BPP_PROBS_TRAIN_PATH = f'{DATA_DIR}/Ribonanza_bpp_files/extra_data_train'

In [2]:
import pandas as pd

# Load the training table from TRAIN_DATA_PATH and preview its first five rows.
train_data_pd = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)
train_data_pd.head(n=5)

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reactivity_0001,reactivity_0002,reactivity_0003,reactivity_0004,reactivity_0005,reactivity_0006,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,2A3_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_2A3,,,,,,,...,,,,,,,,,,
1,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,DMS_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_DMS,,,,,,,...,,,,,,,,,,
2,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,2A3_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_2A3,,,,,,,...,,,,,,,,,,
3,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,DMS_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_DMS,,,,,,,...,,,,,,,,,,
4,00021f968267,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,2A3_MaP,DasLabBigLib_OneMil_Replicates_from_previous_l...,,,,,,,...,,,,,,,,,,


In [3]:
# Load the companion table (columns `sequence` and `sequence_ext` in the
# preview below) and show its first five rows.
train_extracted_pd = pd.read_csv(filepath_or_buffer=TRAIN_DATA_EXT_PATH)
train_extracted_pd.head(n=5)

Unnamed: 0,sequence,sequence_ext
0,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,.....((((((.....)))))).....((((((((((((((....)...
1,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,.....((((((.....))))))........(((((..(.....).....
2,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,.....((((((.....))))))........(((((.((((.........
3,GGGAACGACUCGAGUAGAGUCGAAAAGGAGAUCGAAGACGACUUAC...,.....((((((.....))))))....((((((((.....(.........
4,GGGAACGACUCGAGUAGAGUCGAAAAGAUAUGGACUGACGAAGUCG...,.....((((((.....))))))....(((..(((((((((..((((...


In [4]:
# Row counts of the two frames, displayed as a (train, extracted) pair.
train_data_pd.shape[0], train_extracted_pd.shape[0]

(335616, 167808)

In [5]:
import sys
sys.path.append('..')

from python_scripts.transformers.dataset import RNADataset_probs_train


# Gather the constructor arguments first, then build the training dataset from
# the two frames loaded above, the BPP_PROBS_TRAIN_PATH directory and the
# vocabulary table read from VOCAB_PATH.
dataset_kwargs = dict(
    data=train_data_pd,
    data_ext=train_extracted_pd,
    prob_dir_path=BPP_PROBS_TRAIN_PATH,
    vocab=pd.read_csv(VOCAB_PATH),
    max_len=210,
)
rna_dataset = RNADataset_probs_train(**dataset_kwargs)

In [6]:
# Number of samples exposed by the dataset. The recorded output below (167808)
# equals len(train_extracted_pd) from the earlier cell, not len(train_data_pd).
len(rna_dataset)

167808

In [7]:
# Fetch the first sample once and reuse it: indexing `rna_dataset[0]` three
# times would run the dataset's item loading three times for the same sample.
first_sample = rna_dataset[0]
first_sample[0].shape, first_sample[1].shape, first_sample[2].shape

(torch.Size([210]), torch.Size([2, 210, 4]), torch.Size([210, 210]))

In [8]:
from torchinfo import summary

import sys
sys.path.append('..')

from python_scripts.transformers.model import BERTCustomRNAReactivity, BERTCustom
from torch.utils.data import DataLoader

# Encoder hyper-parameters collected in one place; the vocabulary size is
# taken from the dataset built earlier.
bert_config = dict(
    vocab_size=len(rna_dataset.vocab),
    hidden=512,
    dim_k=64,
    num_layer=12,
    num_attn_head=8,
)
bertmodel = BERTCustom(**bert_config)

# Wrap the encoder in the reactivity model and print its layer summary.
RNA_model = BERTCustomRNAReactivity(bertmodel)

summary(RNA_model)

Layer (type:depth-idx)                             Param #
BERTCustomRNAReactivity                            --
├─BERTCustom: 1-1                                  --
│    └─CombEmbedding: 2-1                          --
│    │    └─TokenEmbedding: 3-1                    11,776
│    │    └─PositionEmbedding: 3-2                 --
│    │    └─Dropout: 3-3                           --
│    └─ModuleList: 2-2                             --
│    │    └─EncoderBlock: 3-4                      3,152,385
│    │    └─EncoderBlock: 3-5                      3,152,385
│    │    └─EncoderBlock: 3-6                      3,152,385
│    │    └─EncoderBlock: 3-7                      3,152,385
│    │    └─EncoderBlock: 3-8                      3,152,385
│    │    └─EncoderBlock: 3-9                      3,152,385
│    │    └─EncoderBlock: 3-10                     3,152,385
│    │    └─EncoderBlock: 3-11                     3,152,385
│    │    └─EncoderBlock: 3-12                     3,152,385
│    │    

In [9]:
# Draw one batch of three samples and push it through the model to check the
# output shape. The model receives items 0 and 2 of the batch.
sample_loader = DataLoader(rna_dataset, batch_size=3)
sample_data = next(iter(sample_loader))

item_0, item_2 = sample_data[0], sample_data[2]
RNA_model(item_0, item_2).shape

torch.Size([3, 2, 210, 4])

In [10]:
import torch
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

import sys
sys.path.append('..')

from python_scripts.transformers.dataset import RNADataModule
from python_scripts.transformers.task import RNATask

# Data module wrapping the full training dataset; it is handed to the trainer
# for both fit() and test() below.
rna_datamodule = RNADataModule(
    whole_train_dataset=rna_dataset,
    batch_size=16,
    probs_adjusted=True,
    num_workers=2,
)

def rna_rmse_loss(x: torch.Tensor, y: torch.Tensor, ignore_index=-100) -> torch.Tensor:
    """Root-mean-square error between `x` and `y`, skipping masked positions.

    Args:
        x: Tensor compared against `y`; it is indexed with a boolean mask
            derived from `y`, so it must be indexable by that mask.
        y: Reference tensor. Positions where `y == ignore_index` are excluded
            from the error.
        ignore_index: Sentinel value in `y` marking positions to skip.

    Returns:
        A scalar tensor: sqrt(mean((x - y) ** 2)) over the kept positions, or
        a zero that is still connected to `x` when every position is masked.
    """
    not_ignore = y != ignore_index
    if not not_ignore.any():
        # mean() of an empty selection is NaN (and sqrt at 0 has an unbounded
        # gradient), so return a plain zero that still depends on x.
        return x.sum() * 0.0
    return torch.sqrt(torch.square(x[not_ignore] - y[not_ignore]).mean())

def rna_mse_loss(x: torch.Tensor, y: torch.Tensor, ignore_index=-100) -> torch.Tensor:
    """Mean squared error between `x` and `y`, skipping masked positions.

    Args:
        x: Tensor compared against `y`; it is indexed with a boolean mask
            derived from `y`, so it must be indexable by that mask.
        y: Reference tensor. Positions where `y == ignore_index` are excluded
            from the error.
        ignore_index: Sentinel value in `y` marking positions to skip.

    Returns:
        A scalar tensor: mean((x - y) ** 2) over the kept positions, or a zero
        that is still connected to `x` when every position is masked.
    """
    not_ignore = y != ignore_index
    if not not_ignore.any():
        # mean() of an empty selection is NaN; return a zero that still
        # depends on x so a backward pass yields zero gradients instead.
        return x.sum() * 0.0
    return torch.square(x[not_ignore] - y[not_ignore]).mean()

def rna_mae_loss(x: torch.Tensor, y: torch.Tensor, ignore_index=-100) -> torch.Tensor:
    """Mean absolute error between `x` and `y`, skipping masked positions.

    Args:
        x: Tensor compared against `y`; it is indexed with a boolean mask
            derived from `y`, so it must be indexable by that mask.
        y: Reference tensor. Positions where `y == ignore_index` are excluded
            from the error.
        ignore_index: Sentinel value in `y` marking positions to skip.

    Returns:
        A scalar tensor: mean(|x - y|) over the kept positions, or a zero that
        is still connected to `x` when every position is masked.
    """
    not_ignore = y != ignore_index
    if not not_ignore.any():
        # mean() of an empty selection is NaN; return a zero that still
        # depends on x so a backward pass yields zero gradients instead.
        return x.sum() * 0.0
    return torch.abs(x[not_ignore] - y[not_ignore]).mean()

# Adam over every model parameter. The lr passed here is effectively a
# placeholder: constructing OneCycleLR below resets the optimizer's learning
# rate to max_lr / div_factor and drives it from there.
rna_optimizer = torch.optim.Adam(RNA_model.parameters(), lr=1e-3)

# Earlier experiments (SGD with momentum; CosineAnnealingLR, MultiStepLR and
# CyclicLR schedules) were left disabled; OneCycleLR is the active schedule.
rna_scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer=rna_optimizer,
    max_lr=1e-4,
    # NOTE(review): 8391 is hard-coded. It presumably equals the number of
    # training batches per epoch for the current split with batch_size=16;
    # confirm, and update it whenever the batch size or split changes,
    # because OneCycleLR raises once it is stepped past its total step count.
    steps_per_epoch=8391,
    epochs=50,  # keep in sync with Trainer(max_epochs=50)
    div_factor=1e2,  # starting lr = max_lr / div_factor = 1e-6
    pct_start=0.01,  # fraction of all steps spent raising the lr
    # `verbose` is omitted: it is deprecated in recent PyTorch releases and
    # its default matches the previous verbose=False.
)
# NOTE(review): OneCycleLR is designed to be stepped once per batch; confirm
# that RNATask configures the scheduler with a per-step interval.

# Bundle the model with its loss (MAE), optimizer and scheduler into the task
# object that is passed to the trainer.
task_kwargs = dict(
    model=RNA_model,
    loss_fn=rna_mae_loss,
    optimizer=rna_optimizer,
    scheduler=rna_scheduler,
)
rna_task = RNATask(**task_kwargs)

# Keep the three checkpoints with the lowest 'val_avg_loss'.
checkpoint_cb = ModelCheckpoint(
    monitor='val_avg_loss',
    save_top_k=3,
    mode='min',
)

# Stop once 'val_avg_loss' has failed to improve by at least 0.0001 for five
# consecutive checks.
early_stop_cb = EarlyStopping(
    monitor='val_avg_loss',
    min_delta=0.0001,
    patience=5,
    verbose=True,
    mode='min',
)

callbacks = [checkpoint_cb, early_stop_cb]

# Trainer using 16-bit mixed precision, capped at 50 epochs, with the
# callbacks assembled above.
trainer_kwargs = dict(
    precision='16-mixed',
    max_epochs=50,
    callbacks=callbacks,
)
trainer = pl.Trainer(**trainer_kwargs)

# To resume from a saved checkpoint, either rebuild the task from it:
# rna_task = RNATask.load_from_checkpoint(
#     checkpoint_path='./lightning_log/~~',
#     model=RNA_model,
#     loss_fn=rna_mae_loss,
#     optimizer=rna_optimizer,
#     scheduler=rna_scheduler,
# )

# or pass the checkpoint path when fitting / creating the trainer:
# trainer.fit(rna_task, ckpt_path="some/path/to/my_checkpoint.ckpt")
# trainer = pl.Trainer(resume_from_checkpoint='../notebooks/lightning_logs/version_0/checkpoints/epoch=0-step=100.ckpt')

# Train the task, then run the test loop on the same task object and data module.
trainer.fit(model=rna_task, datamodule=rna_datamodule)
trainer.test(model=rna_task, datamodule=rna_datamodule)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\TimJimTangtong\Miniconda3\envs\lightning\lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                    | Params
--------------------------------------------------
0 | model | BERTCustomRNAReactivity | 37.8 M
--------------------------------------------------
37.8 M    Trainab

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\TimJimTangtong\Miniconda3\envs\lightning\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


In [11]:
# List the parameters of each encoder block's inner `attention.attention`
# module. Iterating over `encoder_blocks` directly keeps this cell correct if
# `num_layer` changes, instead of hard-coding 12 a second time.
[list(block.attention.attention.parameters()) for block in RNA_model.bert.encoder_blocks]

[[Parameter containing:
  tensor(1.0066, requires_grad=True)],
 [Parameter containing:
  tensor(1.0059, requires_grad=True)],
 [Parameter containing:
  tensor(0.9988, requires_grad=True)],
 [Parameter containing:
  tensor(1.0101, requires_grad=True)],
 [Parameter containing:
  tensor(1.0060, requires_grad=True)],
 [Parameter containing:
  tensor(1.0035, requires_grad=True)],
 [Parameter containing:
  tensor(1.0019, requires_grad=True)],
 [Parameter containing:
  tensor(1.0060, requires_grad=True)],
 [Parameter containing:
  tensor(1.0050, requires_grad=True)],
 [Parameter containing:
  tensor(0.9990, requires_grad=True)],
 [Parameter containing:
  tensor(0.9999, requires_grad=True)],
 [Parameter containing:
  tensor(0.9965, requires_grad=True)]]