In [1]:
import os
os.environ['RNETEB_PATH'] = '/home/users/gtully/RNET-EB'
os.environ['RANGER_PATH'] = '/home/users/gtully/Ranger-Deep-Learning-Optimizer/'

In [2]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import numpy as np
import random
import ast
import os
from torch.utils.data import Dataset, DataLoader
from ast import literal_eval
import sklearn
from Bio import SeqIO
from sklearn.model_selection import train_test_split
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
import sys
import yaml

sys.path.append(os.environ['RNETEB_PATH']+ '/tools')
sys.path.append(os.environ['RANGER_PATH'] + '/ranger')
from plotting import *   # from RNETEB tools 
from training import *   # from RNETEB tools

from ranger import Ranger # from Ranger-Deep-Learning-Optimizer cloned

# RNET_EB_000_NPT Training Important Notes

## NPT means "No Pre-Training"

## THIS NOTEBOOK EVALUATES RNET_000 WITHOUT PRETRAINED RNET WEIGHTS! 

1) This model was trained on both the logkd_lig_scaled and the logkd_nolig_scaled, per riboswitch sequence (see the uniquely defined RNA_Dataset in this notebook). 

2) Hyperparameters were chosen from success of fine-tuning RibonanzaNet on secondary structure data. 
    - optimizer: Ranger
    - scheduler: CosineAnnealingLR
    - criterion: L1

3) Before running all cells in the NB, make sure to clone both RNET-EB and the Ranger-Deep-Learning-Optimizer Repository and add paths to : 
    os.environ['RNETEB_PATH'] = '/your/path/to/RNET-EB'
    os.environ['RANGER_PATH'] = '/your/path/to/Ranger-Deep-Learning-Optimizer/'

4) As is, this repository regenerates data in locations where I already have data saved in public RNET-EB repository. If you would like to save results separately, rename the output files. 

   

## Import Cleaned Data from RibonanzaNet_EB_data_prep.ipynb

In [3]:
train_df = pd.read_json(os.environ['RNETEB_PATH'] + '/data/processed_data/RNET_EB_train.json')
val_df = pd.read_json(os.environ['RNETEB_PATH'] + '/data/processed_data/RNET_EB_val.json')
test_df = pd.read_json(os.environ['RNETEB_PATH'] + '/data/processed_data/RNET_EB_test.json')

In [4]:
train_df.columns

Index(['Activation Ratio', 'Design', 'Folding_Subscore', 'KDFMN', 'KDOFF',
       'KDON', 'KDnoFMN', 'Kd_OFF', 'Kd_ON', 'NumberOfClusters', 'Player',
       'Puzzle_Name', 'Round', 'index', 'ligand', 'min_kd_val', 'puzzle',
       'sequence', 'switch', 'MS2_aptamer', 'lig_aptamer', 'MS2_lig_aptamer',
       'constraints_worked', 'logkd_nolig', 'logkd_lig', 'logkd_nolig_scaled',
       'logkd_lig_scaled', 'Dataset', 'passed_CDHIT_filter', 'log_AR', 'id',
       'description'],
      dtype='object')

# Define Dataset

In [5]:
class RNA_Dataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokens = {nt: i for i, nt in enumerate('ACGU')}
        self.label_names = ['logkd_lig_scaled', 'logkd_nolig_scaled']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence=[self.tokens[nt] for nt in (self.data.loc[idx,'sequence'])]
        sequence=np.array(sequence)
        sequence=torch.tensor(sequence)

        labels = np.array([self.data.loc[idx, l] for l in self.label_names])  # Just 1 value per label
        labels = torch.tensor(labels, dtype=torch.float32)  # Ensure labels are of correct float type


        return {'sequence': sequence, 'labels': labels}



In [6]:
# Create the datasets
train_dataset = RNA_Dataset(train_df)
val_dataset = RNA_Dataset(val_df)

# Test the first sample
print(train_dataset[0]['labels'].shape)


torch.Size([2])


In [7]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)

In [8]:
sys.path.append(os.environ['RNETEB_PATH']+'/ribonanzanet2d-final')

from Network import *

class Config:
    def __init__(self, **entries):
        self.__dict__.update(entries)
        self.entries=entries

    def print(self):
        print(self.entries)

def load_config_from_yaml(file_path):
    with open(file_path, 'r') as file:
        config = yaml.safe_load(file)
    return Config(**config)

class finetuned_RibonanzaNet(RibonanzaNet):
    def __init__(self, config, pretrained=False):
        super(finetuned_RibonanzaNet, self).__init__(config)
        if pretrained:
            self.load_state_dict(torch.load(os.environ['RNETEB_PATH']+'/ribonanzanet-weights/RibonanzaNet.pt',map_location='cpu'))

        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.decoder = nn.Linear(64, 2)  # From 64 "pooled values from each channel " to 2 output labels

    def forward(self,src):

        sequence_features, pairwise_features=self.get_embeddings(src, torch.ones_like(src).long().to(src.device))
        pairwise_features = pairwise_features.squeeze(0)  # Remove the batch dimension to make it [H, W, 64]
        pairwise_features = pairwise_features.permute(2, 0, 1)  # Change to [64, H, W] to match pooling expectation (C, H, W)

        # Apply global average pooling, result is [64, 1, 1]
        pairwise_features = self.global_pool(pairwise_features)

        # Flatten the output to [64]
        pairwise_features = pairwise_features.view(pairwise_features.size(0))  # Flatten to [64] (batch size 1, so this will be [64])

        # Pass through the decoder to get the final output [2]
        output = self.decoder(pairwise_features)

        return output

## This is the most important cell for this NB - pretrained = False ! 

In [9]:
config=load_config_from_yaml(os.environ['RNETEB_PATH'] + "/ribonanzanet2d-final/configs/pairwise.yaml")
model=finetuned_RibonanzaNet(config,pretrained=False).cuda()



constructing 9 ConvTransformerEncoderLayers


## Training Loop

Note: Need to clone the Ranger-Deep-Learning-Optimizer and add the path to the ranger folder within the repository. 

### Save the epoch losses as well as weights and model check point. 
### Also save plots of training and validation losses. 

In [10]:
# Initialize Hyperparameters
epochs = 20
cos_epoch = 15

best_loss = np.inf
optimizer = Ranger(model.parameters(), weight_decay=0.001, lr=0.0001)
criterion = torch.nn.L1Loss()
schedule = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(epochs - cos_epoch) * len(train_loader))

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


In [11]:

# Setup directories
checkpoint_dir = os.path.join(os.environ['RNETEB_PATH'], 'results/checkpoints')
figure_dir = os.path.join(os.environ['RNETEB_PATH'], 'results/figures')
Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
Path(figure_dir).mkdir(parents=True, exist_ok=True)

# Track losses
train_losses = []
val_losses = []

for epoch in range(epochs):
    model.train()
    tbar = tqdm(train_loader)
    total_loss = 0
    oom = 0
    for idx, batch in enumerate(tbar):
        sequence = batch['sequence'].cuda()
        labels = batch['labels'].cuda()
        output = model(sequence) 
        labels = labels.view_as(output)
        
        # Compute loss
        loss = criterion(output, labels)
        loss = loss.mean()

        # Backward pass and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        optimizer.zero_grad()

        if (epoch + 1) > cos_epoch:
            schedule.step()

        total_loss += loss.item()
        tbar.set_description(f"Epoch {epoch + 1} Loss: {total_loss / (idx + 1)}")

    # Calculate average training loss
    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation loop
    tbar = tqdm(val_loader)
    model.eval()
    val_preds = []
    val_loss = 0
    for idx, batch in enumerate(tbar):
        sequence = batch['sequence'].cuda()
        labels = batch['labels'].cuda()

        with torch.no_grad():
            output = model(sequence)
            labels = labels.view_as(output)
            #output = output.squeeze() ## Double check this 
            loss = criterion(output, labels)
            loss = loss.mean()

        val_loss += loss.item()
        val_preds.append([labels.cpu().numpy(), output.cpu().numpy()])

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)
    print(f"Epoch {epoch + 1} - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Save latest checkpoint
    scheduler_to_save = schedule if (epoch + 1) > cos_epoch else None
    save_checkpoint(epoch, model, optimizer, scheduler_to_save, 
                   avg_train_loss, val_loss, train_losses, val_losses, 
                   best_loss, checkpoint_dir, 'latest_checkpoint.pt')

    # Save the best model
    if val_loss < best_loss:
        best_loss = val_loss
        best_preds = val_preds
        
        # Save best weights 
        torch.save(model.state_dict(), 
                   os.path.join(os.environ['RNETEB_PATH'], 
                               'results/rnet_eb_weights/RibonanzaNet-EB_000_NPT_log_kds.pt'))
        
        # Save best checkpoint (full)
        save_checkpoint(epoch, model, optimizer, scheduler_to_save, 
                       avg_train_loss, val_loss, train_losses, val_losses, 
                       best_loss, checkpoint_dir, 'RNET_EB_000_NPT_best_checkpoint.pt')
        print(f"✓ New best model saved! Val Loss: {val_loss:.4f}")

    # Save periodic checkpoints every 10 epochs
    if (epoch + 1) % 10 == 0:
        save_checkpoint(epoch, model, optimizer, scheduler_to_save, 
                       avg_train_loss, val_loss, train_losses, val_losses, 
                       best_loss, checkpoint_dir, f'RNET_EB_000_NPT_checkpoint_epoch_{epoch+1}.pt')

    # Plot and save loss curves
    plot_loss_curve(train_losses, val_losses, figure_dir)

# Plot final summary
plot_final_summary(train_losses, val_losses, figure_dir)

print(f"\n{'='*50}")
print(f"Training Complete!")
print(f"Best Validation Loss: {best_loss:.4f}")
print(f"Checkpoints saved to: {checkpoint_dir}")
print(f"Loss curves saved to: {figure_dir}")
print(f"{'='*50}")

  return fn(*args, **kwargs)
	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value = 1) (Triggered internally at /pytorch/torch/csrc/utils/python_arg_parser.cpp:1661.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
Epoch 1 Loss: 0.9246442714300368: 100%|██████████| 1258/1258 [03:10<00:00,  6.59it/s]
100%|██████████| 629/629 [00:17<00:00, 35.72it/s]


Epoch 1 - Train Loss: 0.9246, Val Loss: 0.8676
✓ New best model saved! Val Loss: 0.8676


  return fn(*args, **kwargs)
Epoch 2 Loss: 0.8835163648537877: 100%|██████████| 1258/1258 [03:11<00:00,  6.55it/s]
100%|██████████| 629/629 [00:17<00:00, 35.68it/s]


Epoch 2 - Train Loss: 0.8835, Val Loss: 0.8655
✓ New best model saved! Val Loss: 0.8655


  return fn(*args, **kwargs)
Epoch 3 Loss: 0.8760259621710011: 100%|██████████| 1258/1258 [03:11<00:00,  6.57it/s]
100%|██████████| 629/629 [00:17<00:00, 35.72it/s]


Epoch 3 - Train Loss: 0.8760, Val Loss: 0.8839


  return fn(*args, **kwargs)
Epoch 4 Loss: 0.8702589942465146: 100%|██████████| 1258/1258 [03:11<00:00,  6.57it/s]
100%|██████████| 629/629 [00:17<00:00, 35.82it/s]


Epoch 4 - Train Loss: 0.8703, Val Loss: 0.8881


  return fn(*args, **kwargs)
Epoch 5 Loss: 0.8677642393282753: 100%|██████████| 1258/1258 [03:11<00:00,  6.58it/s]
100%|██████████| 629/629 [00:17<00:00, 35.75it/s]


Epoch 5 - Train Loss: 0.8678, Val Loss: 0.9222


  return fn(*args, **kwargs)
Epoch 7 Loss: 0.8542737276404385: 100%|██████████| 1258/1258 [03:11<00:00,  6.57it/s]
100%|██████████| 629/629 [00:17<00:00, 35.79it/s]


Epoch 7 - Train Loss: 0.8543, Val Loss: 1.0413


  return fn(*args, **kwargs)
Epoch 8 Loss: 0.8347998569919685:  34%|███▍      | 427/1258 [01:05<02:02,  6.78it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Epoch 12 Loss: 0.8183890864703158: 100%|██████████| 1258/1258 [03:12<00:00,  6.54it/s]
100%|██████████| 629/629 [00:17<00:00, 35.60it/s]


Epoch 12 - Train Loss: 0.8184, Val Loss: 0.8745


  return fn(*args, **kwargs)
Epoch 13 Loss: 0.8039874791956858: 100%|██████████| 1258/1258 [03:12<00:00,  6.54it/s]
100%|██████████| 629/629 [00:17<00:00, 35.45it/s]


Epoch 13 - Train Loss: 0.8040, Val Loss: 0.8494
✓ New best model saved! Val Loss: 0.8494


  return fn(*args, **kwargs)
Epoch 14 Loss: 0.7917601759227167: 100%|██████████| 1258/1258 [03:11<00:00,  6.55it/s]
100%|██████████| 629/629 [00:17<00:00, 35.56it/s]


Epoch 14 - Train Loss: 0.7918, Val Loss: 0.8382
✓ New best model saved! Val Loss: 0.8382


  return fn(*args, **kwargs)
Epoch 15 Loss: 0.7753875519967799: 100%|██████████| 1258/1258 [03:11<00:00,  6.56it/s]
100%|██████████| 629/629 [00:17<00:00, 35.29it/s]


Epoch 15 - Train Loss: 0.7754, Val Loss: 0.8429


  return fn(*args, **kwargs)
Epoch 16 Loss: 0.7608440926088248: 100%|██████████| 1258/1258 [03:12<00:00,  6.55it/s]
100%|██████████| 629/629 [00:17<00:00, 35.51it/s]


Epoch 16 - Train Loss: 0.7608, Val Loss: 0.8416


  return fn(*args, **kwargs)
Epoch 17 Loss: 0.7317241672159946: 100%|██████████| 1258/1258 [03:12<00:00,  6.55it/s]
100%|██████████| 629/629 [00:17<00:00, 35.76it/s]


Epoch 17 - Train Loss: 0.7317, Val Loss: 0.9137


  return fn(*args, **kwargs)
Epoch 18 Loss: 0.6938038559805229: 100%|██████████| 1258/1258 [03:11<00:00,  6.58it/s]
100%|██████████| 629/629 [00:17<00:00, 35.79it/s]


Epoch 18 - Train Loss: 0.6938, Val Loss: 0.8710


  return fn(*args, **kwargs)
Epoch 19 Loss: 0.6498583423030017: 100%|██████████| 1258/1258 [03:11<00:00,  6.57it/s]
100%|██████████| 629/629 [00:17<00:00, 35.67it/s]


Epoch 19 - Train Loss: 0.6499, Val Loss: 0.8788


  return fn(*args, **kwargs)
Epoch 20 Loss: 0.6136457161986771: 100%|██████████| 1258/1258 [03:11<00:00,  6.57it/s]
100%|██████████| 629/629 [00:17<00:00, 35.61it/s]


Epoch 20 - Train Loss: 0.6136, Val Loss: 0.8853

Training Complete!
Best Validation Loss: 0.8382
Checkpoints saved to: /home/users/gtully/RNET-EB/results/checkpoints
Loss curves saved to: /home/users/gtully/RNET-EB/results/figures
