In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [None]:
# Constants for the RNA sequence
FILENAME = "data/train_data_QUICK_START.csv"
# Symbol -> one-hot column index, assigned in alphabet order (A, T, G, C, U).
VOCAB = {symbol: column for column, symbol in enumerate("ATGCU")}
VOCAB_SIZE = len(VOCAB)  # width of a one-hot row
SEQ_LEN = 206  # number of rows every encoded sequence is padded to
EXPERIMENT_TYPE = "2A3_MaP"  # other value noted by the author: DMS_MaP

# Constants for the model
TEST_SIZE = 0.2  # fraction of rows held out for the test split
EPOCHS = 5
BATCH_SIZE = 32

In [None]:
# Read the CSV file and split it into train & test sets
def load_csv_data(filename=FILENAME, test_size=TEST_SIZE, max_rows=1000,
                  experiment_type=None, random_state=None):
    """Load one experiment type from the CSV and split it into train/test frames.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    test_size : float
        Forwarded to ``train_test_split`` as the test fraction.
    max_rows : int or None
        Number of leading data rows to read *before* filtering by experiment
        type. The default (1000) matches the previously hard-coded limit;
        ``None`` reads the whole file.
    experiment_type : str or None
        Value of the ``experiment_type`` column to keep. ``None`` uses the
        module-level ``EXPERIMENT_TYPE``.
    random_state : int or None
        Seed forwarded to ``train_test_split``. ``None`` leaves the split
        unseeded (the previous behaviour); pass an int for a repeatable split.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``, each with a ``sequence`` column and a
        ``reactivity`` column holding one list per row, built from the
        columns whose names start with ``reactivity_0``.
    """
    if experiment_type is None:
        experiment_type = EXPERIMENT_TYPE

    # nrows stops parsing after max_rows rows instead of loading the whole
    # file and slicing it afterwards.
    raw_df = pd.read_csv(filename, nrows=max_rows)

    # Only load one experiment type at a time. The explicit copy makes the
    # column assignment below write to an independent frame rather than to a
    # slice of raw_df (which triggers SettingWithCopyWarning).
    experiment_df = raw_df[raw_df['experiment_type'] == experiment_type].copy()

    # Collapse the per-position reactivity columns into one list per row
    reactivity_columns = [col for col in experiment_df.columns
                          if col.startswith('reactivity_0')]
    experiment_df['reactivity'] = experiment_df[reactivity_columns].values.tolist()

    # Select the relevant columns
    clean_df = experiment_df.loc[:, ['sequence', 'reactivity']]

    # Split into train and test sets
    train_df, test_df = train_test_split(clean_df, test_size=test_size,
                                         random_state=random_state)
    return train_df, test_df

# Build the train/test frames from the configured file and hold-out fraction
train_df, test_df = load_csv_data(filename=FILENAME, test_size=TEST_SIZE)

In [None]:
# One hot encode the sequence and pad it to a fixed length
def one_hot_encode(sequence, seq_len=None, vocab=None):
    """One-hot encode a sequence into a zero-padded 2-D float tensor.

    Parameters
    ----------
    sequence : iterable of str
        Symbols to encode, one row per symbol.
    seq_len : int or None
        Number of rows in the result. ``None`` uses the module-level
        ``SEQ_LEN``.
    vocab : dict or None
        Mapping from symbol to column index. ``None`` uses the module-level
        ``VOCAB`` with ``VOCAB_SIZE`` columns; a custom mapping is expected
        to use the indices ``0 .. len(vocab) - 1``.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(seq_len, n_columns)``. Row ``i`` has a single 1 in
        the column of the i-th symbol; rows past the end of ``sequence`` are
        all zeros.

    Raises
    ------
    KeyError
        If a symbol is not in the vocabulary.
    IndexError
        If the sequence has more than ``seq_len`` symbols.
    """
    if vocab is None:
        vocab, n_columns = VOCAB, VOCAB_SIZE
    else:
        n_columns = len(vocab)
    if seq_len is None:
        seq_len = SEQ_LEN

    # Look up every symbol first so the tensor is written once below instead
    # of once per nucleotide.
    token_ids = [vocab[nucleotide] for nucleotide in sequence]
    if len(token_ids) > seq_len:
        raise IndexError(
            f"sequence of length {len(token_ids)} does not fit in {seq_len} positions"
        )

    encoding = torch.zeros(seq_len, n_columns)
    if token_ids:  # nothing to set for an empty sequence
        rows = torch.arange(len(token_ids))
        columns = torch.tensor(token_ids, dtype=torch.long)
        encoding[rows, columns] = 1
    return encoding

# Convert sequences and reactivities to tensors
def df_to_tensor(df):
    """Turn a frame of sequences and reactivities into a ``TensorDataset``.

    ``df`` must provide a ``sequence`` column (encoded with
    ``one_hot_encode``) and a ``reactivity`` column holding one list of
    numbers per row. The returned dataset pairs the stacked one-hot tensors
    with the reactivities as float32, the latter with a trailing singleton
    dimension appended.
    """
    # One padded one-hot matrix per row, stacked along a new leading axis
    one_hot_batch = torch.stack([one_hot_encode(seq) for seq in df['sequence']])

    # Nested lists -> float32 tensor, plus a trailing axis of size 1
    reactivity_rows = df['reactivity'].tolist()
    targets = torch.tensor(reactivity_rows, dtype=torch.float32).unsqueeze(-1)

    return TensorDataset(one_hot_batch, targets)

# Tensor datasets for both splits
train_data, test_data = df_to_tensor(train_df), df_to_tensor(test_df)

# Training batches are reshuffled; test batches keep the dataset order
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Define the Bi-LSTM model
class BiLSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step.

    Parameters
    ----------
    vocab_size : int
        Size of the last dimension of the input (features per time step).
    hidden_dim : int
        Hidden size of each LSTM direction.
    output_dim : int
        Number of values predicted per time step.
    n_layers : int
        Number of stacked LSTM layers.
    bidirectional : bool
        Whether the LSTM runs in both directions (default ``True``).

    The input to ``forward`` is batch-first, ``(batch, seq, vocab_size)``,
    and the output is ``(batch, seq, output_dim)``.
    """

    def __init__(self, vocab_size, hidden_dim, output_dim, n_layers, bidirectional=True):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        # The LSTM concatenates the outputs of its directions, so the width
        # fed to the linear layer depends on `bidirectional` rather than
        # always being hidden_dim * 2.
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(vocab_size, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_dim * self.num_directions, output_dim)

    def forward(self, x):
        """Return per-position predictions for a batch-first input ``x``."""
        # No explicit (h0, c0): nn.LSTM defaults both to zeros, which is what
        # the hand-built initial states provided before.
        out, _ = self.lstm(x)
        return self.fc(out)
    
# Two stacked LSTM layers of 50 hidden units, one predicted value per position
model = BiLSTM(vocab_size=VOCAB_SIZE, hidden_dim=50, output_dim=1, n_layers=2)
loss_fn = nn.MSELoss()  # mean squared error between predictions and targets
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Training loop
for epoch in range(EPOCHS):
    # Training mode only needs to be set once per epoch, not once per batch.
    model.train()
    total_loss = 0.0
    batches_used = 0
    for sequences, reactivities in train_loader:
        # A single NaN target makes the MSE loss (and, after one step, the
        # weights) NaN, so NaN targets are excluded from the loss. When the
        # targets contain no NaN the mask is all True and the loss is the
        # same mean over every element as before.
        valid = ~torch.isnan(reactivities)
        if not valid.any():
            continue  # no usable target in this batch

        optimizer.zero_grad()
        outputs = model(sequences)

        loss = loss_fn(outputs[valid], reactivities[valid])
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batches_used += 1

    # Average over the batches that contributed; NaN (instead of a
    # ZeroDivisionError) when there were none.
    mean_loss = total_loss / batches_used if batches_used else float("nan")
    print(f"Epoch {epoch+1}, Loss: {mean_loss}")