# RNA Reactivity Prediction
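
Inference notebook for the Kaggle competition [Stanford Ribonanza RNA Folding](https://www.kaggle.com/competitions/stanford-ribonanza-rna-folding/overview): a trained checkpoint is loaded, reactivities are predicted for the test sequences, and the result is written to a submission CSV.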

In [1]:
# Input/output file locations, given relative to the notebook's working directory.
# NOTE(review): TEST_DATA_PATH is not referenced by any cell visible in this notebook.
TEST_DATA_PATH = '../data/small_sets/test_sequences.csv'
# CSV that is loaded into `test_extracted_pd`.
TEST_DATA_EXT_PATH = '../data/small_sets/test_extracted.csv'
# Template CSV the submission frame is read from before the prediction columns are filled in.
SUBMISSION_SAMPLE_FILE_PATH = '../data/sample_submission.csv'
# Destination of the final submission CSV.
SUBMISSION_FILE_PATH = '../data/submission.csv'

In [2]:
import pandas as pd

# Read the extracted test set from disk and preview its first five rows.
test_extracted_pd = pd.read_csv(filepath_or_buffer=TEST_DATA_EXT_PATH)
test_extracted_pd.head(n=5)

Unnamed: 0,sequence,sequence_ext
0,GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGG...,(((((((((((.....)))))).....)))))......(((((((....
1,GGGAACGACUCGAGUAGAGUCGAAAAUGUAAUCAGAUUGCUUCUCC...,.....((((((.....))))))................((..((((...
2,GGGAACGACUCGAGUAGAGUCGAAAAAACACAUGAAUUUGAGGGUU...,.....((((((.....))))))...............((((((.(....
3,GGGAACGACUCGAGUAGAGUCGAAAAUCAGAGCUGGCAAAUGGAUG...,.....((((((.....))))))....((.(((..((((((.((.((...
4,GGGAACGACUCGAGUAGAGUCGAAAAUUUGGUAUUUGAUGCAUUAA...,.....((((((.....)))))).................((........


In [3]:
# Number of rows in the extracted test set.
test_extracted_pd.shape[0]

20000

In [4]:
import sys

# Make the parent directory importable so the `python_scripts` package resolves.
# The membership check keeps re-runs of this cell from stacking duplicate
# '..' entries on sys.path.
if '..' not in sys.path:
    sys.path.append('..')

from python_scripts.transformers.dataset import RNADataset_pred

# Prediction dataset built from the extracted test frame and the vocabulary
# table read from ../data/vocab.csv; max_len=512 is passed through unchanged.
rna_extracted_dataset = RNADataset_pred(
    data_ext=test_extracted_pd,
    vocab=pd.read_csv('../data/vocab.csv'),
    max_len=512,
)

In [5]:
from torchinfo import summary

from python_scripts.transformers.model import BERTCustomRNAReactivity, BERTCustom
from torch.utils.data import DataLoader

# The parent directory is already on sys.path: the dataset cell above adds it,
# and that cell has to run first anyway because it defines
# `rna_extracted_dataset`. The path setup is therefore not repeated here.

# Backbone network; the vocabulary size is taken from the dataset's vocab.
# NOTE(review): these hyperparameters presumably have to match the architecture
# stored in the checkpoint that is loaded later — confirm before changing them.
bertmodel = BERTCustom(
    vocab_size=len(rna_extracted_dataset.vocab),
    hidden=128,
    dim_k=16,
    num_layer=12,
    num_attn_head=12,
)

# Reactivity model wrapped around the backbone.
RNA_model = BERTCustomRNAReactivity(bertmodel)

# Last expression of the cell: display the per-layer parameter summary.
summary(RNA_model)

Layer (type:depth-idx)                             Param #
BERTCustomRNAReactivity                            --
├─BERTCustom: 1-1                                  --
│    └─CombEmbedding: 2-1                          --
│    │    └─TokenEmbedding: 3-1                    2,944
│    │    └─PositionEmbedding: 3-2                 --
│    │    └─Dropout: 3-3                           --
│    └─ModuleList: 2-2                             --
│    │    └─EncoderBlock: 3-4                      231,232
│    │    └─EncoderBlock: 3-5                      231,232
│    │    └─EncoderBlock: 3-6                      231,232
│    │    └─EncoderBlock: 3-7                      231,232
│    │    └─EncoderBlock: 3-8                      231,232
│    │    └─EncoderBlock: 3-9                      231,232
│    │    └─EncoderBlock: 3-10                     231,232
│    │    └─EncoderBlock: 3-11                     231,232
│    │    └─EncoderBlock: 3-12                     231,232
│    │    └─EncoderBlock: 3-1

In [6]:
import torch

import lightning.pytorch as pl

import numpy as np

from python_scripts.transformers.dataset import RNADataModule
from python_scripts.transformers.task import RNATask

# The parent directory is already on sys.path (added by the dataset cell, which
# must run before this one because it defines `rna_extracted_dataset`), so the
# path setup is not repeated here.

# Data module that serves the test dataset for prediction in batches of 8.
rna_datamodule = RNADataModule(predict_dataset=rna_extracted_dataset, batch_size=8)

# Restore the task from the saved checkpoint around the model built earlier.
# No loss function, optimizer or scheduler is supplied for inference.
rna_task = RNATask.load_from_checkpoint(
    checkpoint_path='./lightning_logs/test/checkpoints/epoch=0-step=2000.ckpt',
    model=RNA_model,
)

trainer = pl.Trainer()

# Keep the raw per-batch outputs under their own name instead of rebinding
# `predictions` from a list to an array.
batch_predictions = trainer.predict(rna_task, datamodule=rna_datamodule)

# Join the batch outputs along axis 1 and clamp every value to [0, 1].
# NOTE(review): presumably axis 0 indexes the two reactivity targets that the
# submission cell reads as predictions[0] / predictions[1] — confirm against
# RNATask's predict step.
predictions = np.concatenate(batch_predictions, axis=1).clip(0, 1)

# Release the per-batch list; only the concatenated array is used downstream.
del batch_predictions

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Predicting DataLoader 0:   0%|          | 0/2500 [00:00<?, ?it/s]



Predicting DataLoader 0: 100%|██████████| 2500/2500 [03:46<00:00, 11.05it/s]


In [8]:
# Start from the sample submission, fill the two reactivity columns from
# rows 0 and 1 of `predictions`, and write the frame to disk without the index.
submission = pd.read_csv(SUBMISSION_SAMPLE_FILE_PATH).assign(
    reactivity_2A3_MaP=predictions[0],
    reactivity_DMS_MaP=predictions[1],
)
submission.to_csv(SUBMISSION_FILE_PATH, index=False)