In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

In [None]:
# Nucleotide vocabulary: each symbol maps to its row index in the one-hot encoding
VOCAB = {symbol: index for index, symbol in enumerate('ATGCU')}
VOCAB_SIZE = len(VOCAB)
# Fixed number of positions in every encoded sequence
SEQ_LEN = 10

# Training hyperparameters
EPOCHS = 5

In [None]:
# Loading the data
def load_training_data(filename):
    """Read the training CSV and collapse its reactivity columns into one list column.

    Args:
        filename: Path (or anything else accepted by ``pd.read_csv``) of the CSV to load.

    Returns:
        A DataFrame with the columns ``sequence``, ``experiment_type`` and
        ``reactivity``. Each ``reactivity`` cell is a list holding that row's values
        from every column whose name starts with ``reactivity_0``, in column order.
    """
    table = pd.read_csv(filename)

    # Prefix match on the column name picks out the reactivity columns
    reactivity_names = list(filter(lambda name: name.startswith('reactivity_0'), table.columns))

    # One Python list per row, stored under a single new column
    table = table.assign(reactivity=table[reactivity_names].values.tolist())

    # Keep only the columns of interest
    return table[['sequence', 'experiment_type', 'reactivity']]

# Load the quick-start training table and preview its first five rows
train_df = load_training_data(filename='data/train_data_QUICK_START.csv')
train_df.head(n=5)

In [None]:
# One hot encode the sequence and pad to a fixed length
def one_hot_encode(sequence, vocab=None, seq_len=None):
    """One-hot encode a sequence of nucleotide symbols into a zero-padded tensor.

    Args:
        sequence: String (or other iterable) of nucleotide symbols.
        vocab: Optional mapping from symbol to row index. When omitted, the
            notebook-level ``VOCAB`` is used and the result has ``VOCAB_SIZE`` rows;
            otherwise the row count is the largest index in ``vocab`` plus one.
        seq_len: Optional number of columns in the result. Defaults to the
            notebook-level ``SEQ_LEN``.

    Returns:
        A tensor of shape ``(rows, seq_len)`` in which column ``i`` holds a 1 in the
        row assigned to the ``i``-th symbol. Columns past the end of ``sequence``
        stay all-zero (the padding).

    Raises:
        IndexError: If ``sequence`` has more than ``seq_len`` symbols.
        KeyError: If ``sequence`` contains a symbol missing from the vocabulary.
    """
    if vocab is None:
        vocab, n_rows = VOCAB, VOCAB_SIZE
    else:
        n_rows = max(vocab.values(), default=-1) + 1
    if seq_len is None:
        seq_len = SEQ_LEN

    encoding = torch.zeros(n_rows, seq_len)
    for position, nucleotide in enumerate(sequence):
        if position >= seq_len:
            # Same exception type the unchecked indexing would raise, but with a
            # message that names the actual problem.
            raise IndexError(
                f"sequence is longer than the fixed encoding width of {seq_len} positions"
            )
        encoding[vocab[nucleotide], position] = 1
    return encoding

# Demo input: 7 symbols, fewer than SEQ_LEN, so the trailing columns stay zero
sequence = 'ATTGCUT'
encoded_seq = one_hot_encode(sequence=sequence)

In [None]:
# Define the Bi-LSTM model
class BiLSTM(nn.Module):
    """Stacked LSTM followed by a linear head applied at every sequence position.

    Args:
        vocab_size: Number of input features per position (the LSTM ``input_size``).
        hidden_dim: Hidden-state width of each LSTM direction.
        output_dim: Number of values produced per position by the linear head.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions (default ``True``).
    """

    def __init__(self, vocab_size, hidden_dim, output_dim, n_layers, bidirectional=True):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        # The LSTM output concatenates the hidden states of every direction, so the
        # width fed to the linear head must follow the `bidirectional` flag instead
        # of being hard-coded to two directions.
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(vocab_size, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_dim * self.num_directions, output_dim)

    def forward(self, x):
        """Run the LSTM over ``x`` and project every time step.

        Args:
            x: Tensor laid out as ``(batch, seq_len, vocab_size)`` because the LSTM
                is built with ``batch_first=True``.

        Returns:
            Tensor of shape ``(batch, seq_len, output_dim)``.
        """
        # When no initial state is passed, nn.LSTM starts from zero hidden and cell
        # states with the right shape for the configured layers and directions,
        # which replaces the manually built h0/c0 tensors.
        out, _ = self.lstm(x)
        return self.fc(out)
    
# Two stacked LSTM layers with 50 hidden units per direction and one output per position
model = BiLSTM(vocab_size=VOCAB_SIZE, hidden_dim=50, output_dim=1, n_layers=2)
# Mean-squared-error objective, optimised with Adam
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Train the model
# The model's LSTM uses batch_first=True with input_size=VOCAB_SIZE, so it needs
# (batch, seq_len, features). `encoded_seq` is laid out as (VOCAB_SIZE, SEQ_LEN),
# so swap its two axes before adding the batch dimension.
train_input = encoded_seq.transpose(0, 1).unsqueeze(0)  # (1, SEQ_LEN, VOCAB_SIZE)

# NOTE(review): `target` is not defined in any cell shown in this notebook. It has
# to be provided before this cell runs, as a tensor matching the model output,
# which for this input has shape (1, SEQ_LEN, 1). TODO: build it from real data.

model.train()
for epoch in range(EPOCHS):
    optimizer.zero_grad()

    output = model(train_input)
    loss = loss_fn(output, target)

    loss.backward()
    optimizer.step()

In [None]:
# Make predictions
model.eval()
with torch.no_grad():
    # The LSTM is batch_first with input_size=VOCAB_SIZE, so it needs
    # (batch, seq_len, features). `encoded_seq` is (VOCAB_SIZE, SEQ_LEN), so swap
    # its axes before adding the batch dimension.
    prediction = model(encoded_seq.transpose(0, 1).unsqueeze(0))
    print(prediction.squeeze().tolist())