# RNA Reactivity Prediction
https://www.kaggle.com/competitions/stanford-ribonanza-rna-folding/overview

In [1]:
TEST_DATA_PATH = '../data/small_sets/test_sequences.csv'
TEST_DATA_EXT_PATH = '../data/small_sets/test_extracted.csv'
SUBMISSION_SAMPLE_FILE_PATH = '../data/sample_submission.csv'
SUBMISSION_FILE_PATH = '../data/submission.csv'
DATA_PATH = '../data'
VOCAB_PATH = '../data/vocab.csv'

In [2]:
import pandas as pd

test_extracted_pd = pd.read_csv(TEST_DATA_EXT_PATH)
test_extracted_pd.head()

Unnamed: 0,sequence,sequence_ext
0,GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGG...,(((((((((((.....)))))).....)))))......(((((((....
1,GGGAACGACUCGAGUAGAGUCGAAAAUGUAAUCAGAUUGCUUCUCC...,.....((((((.....))))))................((..((((...
2,GGGAACGACUCGAGUAGAGUCGAAAAAACACAUGAAUUUGAGGGUU...,.....((((((.....))))))...............((((((.(....
3,GGGAACGACUCGAGUAGAGUCGAAAAUCAGAGCUGGCAAAUGGAUG...,.....((((((.....))))))....((.(((..((((((.((.((...
4,GGGAACGACUCGAGUAGAGUCGAAAAUUUGGUAUUUGAUGCAUUAA...,.....((((((.....)))))).................((........


In [3]:
len(test_extracted_pd)

20000

In [4]:
import sys
sys.path.append('..')

from python_scripts.transformers.dataset import RNADataset_pred

rna_extracted_dataset = RNADataset_pred(
    data_ext = test_extracted_pd,
    vocab=pd.read_csv(VOCAB_PATH),
    max_len=512
)

In [5]:
from torchinfo import summary

import sys
sys.path.append('..')

from python_scripts.transformers.model import BERTCustomRNAReactivity, BERTCustom
from torch.utils.data import DataLoader

bertmodel = BERTCustom(
    vocab_size=len(rna_extracted_dataset.vocab),
    hidden=512,
    dim_k=64,
    num_layer=12,
    num_attn_head=8
)
RNA_model = BERTCustomRNAReactivity(bertmodel)

summary(RNA_model)

Layer (type:depth-idx)                             Param #
BERTCustomRNAReactivity                            --
├─BERTCustom: 1-1                                  --
│    └─CombEmbedding: 2-1                          --
│    │    └─TokenEmbedding: 3-1                    11,776
│    │    └─PositionEmbedding: 3-2                 --
│    │    └─Dropout: 3-3                           --
│    └─ModuleList: 2-2                             --
│    │    └─EncoderBlock: 3-4                      3,152,384
│    │    └─EncoderBlock: 3-5                      3,152,384
│    │    └─EncoderBlock: 3-6                      3,152,384
│    │    └─EncoderBlock: 3-7                      3,152,384
│    │    └─EncoderBlock: 3-8                      3,152,384
│    │    └─EncoderBlock: 3-9                      3,152,384
│    │    └─EncoderBlock: 3-10                     3,152,384
│    │    └─EncoderBlock: 3-11                     3,152,384
│    │    └─EncoderBlock: 3-12                     3,152,384
│    │    

In [6]:
import torch

import lightning.pytorch as pl

import numpy as np

import sys
sys.path.append('..')

from python_scripts.transformers.dataset import RNADataModule
from python_scripts.transformers.task import RNATask

rna_datamodule = RNADataModule(predict_dataset=rna_extracted_dataset, batch_size=8)

# def rna_mae_loss(x: torch.tensor, y: torch.tensor, ignore_index=-100):
#     not_ignore = y != ignore_index
#     return torch.abs(x[not_ignore] - y[not_ignore]).mean()

# rna_optimizer = torch.optim.SGD(RNA_model.parameters(), 1e-3, 0.9)
# rna_scheduler = torch.optim.lr_scheduler.OneCycleLR(
#     optimizer=rna_optimizer,
#     max_lr=1e-3,
#     steps_per_epoch=100,
#     epochs=5,
#     div_factor=1e2,
#     verbose=False
# )

rna_task = RNATask.load_from_checkpoint(
    checkpoint_path='./lightning_logs/version_19/checkpoints/epoch=9-step=1000.ckpt',
    model=RNA_model,
    # loss_fn=rna_mae_loss,
    # optimizer=rna_optimizer,
    # scheduler=rna_scheduler,
)

trainer = pl.Trainer()

predictions = trainer.predict(rna_task, datamodule=rna_datamodule)

predictions = np.concatenate(predictions, axis=1).clip(0, 1)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Predicting DataLoader 0:   0%|          | 0/2500 [00:00<?, ?it/s]



Predicting DataLoader 0: 100%|██████████| 2500/2500 [08:21<00:00,  4.99it/s]


In [9]:
submission = pd.read_csv(SUBMISSION_SAMPLE_FILE_PATH)
submission['reactivity_2A3_MaP'] = predictions[0]
submission['reactivity_DMS_MaP'] = predictions[1]
submission.to_csv(SUBMISSION_FILE_PATH, index=False)

ValueError: Length of values (3540000) does not match length of index (269796671)

In [None]:
# Check length generalization

import polars as pl
import matplotlib.pyplot as plt

df=pl.read_csv(SUBMISSION_FILE_PATH)
#some parameters
font_size=6
id1=269545321
id2=269724007
reshape1=391
reshape2=457
#get predictions
pred_DMS=df[id1:id2+1]['reactivity_DMS_MaP'].to_numpy().reshape(reshape1,reshape2)
pred_2A3=df[id1:id2+1]['reactivity_2A3_MaP'].to_numpy().reshape(reshape1,reshape2)
#plot mutate and map
fig = plt.figure()
plt.subplot(121)
plt.title(f'reactivity_DMS_MaP', fontsize=font_size)
plt.imshow(pred_DMS,vmin=0,vmax=1, cmap='gray_r')
plt.subplot(122)
plt.title(f'reactivity_2A3_MaP', fontsize=font_size)
plt.imshow(pred_2A3,vmin=0,vmax=1, cmap='gray_r')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/plot.png",dpi=500)
plt.clf()
plt.close()
