In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from dataloader import get_dataloaders, MAX_SEQ_LENGTH, vocab_size
from torch.autograd import Variable
import time

class RNAPairLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, device='cpu'):
        super(RNAPairLSTM, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.device = device

        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, output_dim)

    def forward(self, input): 
        h_0 = Variable(torch.zeros(2*self.num_layers, input.size(0), self.hidden_dim, requires_grad=False).to(self.device))
        c_0 = Variable(torch.zeros(2*self.num_layers, input.size(0), self.hidden_dim).to(self.device))

        output, (h_out, _) = self.lstm(input, (h_0, c_0))
        output = self.fc(output)
        
        return output


def train_model(model, train_loader, criterion, optimizer, num_epochs=10, device='cpu'):
    model.train()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        for seq1, seq2 in train_loader:
            seq1, seq2 = seq1.to(device), seq2.to(device)
            outputs = model(seq1)
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), seq2.reshape(-1, seq2.size(-1)))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

def evaluate_model(model, dev_loader, criterion, device):
    def enforce_e_and_p(output):
        E_token_index = 4
        P_token_index = 5

        output_indices = torch.argmax(output, dim=-1)
        for i in range(output.size(0)):  # Iterate over batch
            e_found = False
            for j in range(output.size(1)):  # Iterate over sequence length
                if e_found:
                    output_indices[i, j] = P_token_index
                elif output_indices[i, j] == E_token_index:
                    e_found = True
            if not e_found:
                output_indices[i, -1] = E_token_index

        # Convert indices back to one-hot encoding
        output_one_hot = torch.zeros_like(output).scatter_(-1, output_indices.unsqueeze(-1), 1.0)
        return output_one_hot
    
    model.eval()
    with torch.no_grad():
        total_loss = 0
        for seq1, seq2 in dev_loader:
            seq1, seq2 = seq1.to(device), seq2.to(device)
            outputs = model(seq1)
            outputs = enforce_e_and_p(outputs)  # Apply post-processing
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), seq2.reshape(-1, seq2.size(-1)))
            total_loss += loss.item()
        print(f'Dev Loss: {total_loss / len(dev_loader):.4f}')

In [16]:
# Hyperparameters
input_dim = vocab_size  # One-hot encoded input size
hidden_dim = 128
output_dim = vocab_size  # One-hot encoded output size
num_layers = 2
num_epochs = 30
learning_rate = 1e-2
batch_size = 32

# Device configuration
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print('Using ' + device)

# Load data
train_loader, dev_loader, test_loader = get_dataloaders(batch_size=batch_size)

# Initialize model, criterion and optimizer
model = RNAPairLSTM(input_dim, hidden_dim, output_dim, num_layers, device).to(device)
weight = torch.tensor([1,1,1,1,1,0.01,1],dtype=torch.float32,requires_grad=False).to(device)
criterion = nn.CrossEntropyLoss(weight=weight)  # Use CrossEntropyLoss for classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.9 ** epoch)

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs, device)
# save model using dd/mm-hh:mm
path = time.strftime("%d-%m-%H:%M") + '.pth'
# torch.save(model.state_dict(), 'model_test.pth')
torch.save(model.state_dict(), 'model_test.pth')

# Evaluate the model
evaluate_model(model, dev_loader, criterion, device)

Using mps
Epoch 1/30
Epoch [1/30], Loss: 0.4937
Epoch 2/30
Epoch [2/30], Loss: 0.3978
Epoch 3/30
Epoch [3/30], Loss: 0.3760
Epoch 4/30
Epoch [4/30], Loss: 0.3440
Epoch 5/30
Epoch [5/30], Loss: 0.3256
Epoch 6/30
Epoch [6/30], Loss: 0.3022
Epoch 7/30
Epoch [7/30], Loss: 0.3165
Epoch 8/30
Epoch [8/30], Loss: 0.4658
Epoch 9/30
Epoch [9/30], Loss: 0.4865
Epoch 10/30
Epoch [10/30], Loss: 0.3924
Epoch 11/30
Epoch [11/30], Loss: 0.3884
Epoch 12/30
Epoch [12/30], Loss: 0.5550
Epoch 13/30
Epoch [13/30], Loss: 0.3202
Epoch 14/30
Epoch [14/30], Loss: 0.4354
Epoch 15/30
Epoch [15/30], Loss: 0.3606
Epoch 16/30
Epoch [16/30], Loss: 0.4685
Epoch 17/30
Epoch [17/30], Loss: 0.3050
Epoch 18/30
Epoch [18/30], Loss: 0.3041
Epoch 19/30
Epoch [19/30], Loss: 0.4928
Epoch 20/30
Epoch [20/30], Loss: 0.2942
Epoch 21/30
Epoch [21/30], Loss: 0.3686
Epoch 22/30
Epoch [22/30], Loss: 0.3971
Epoch 23/30
Epoch [23/30], Loss: 0.4006
Epoch 24/30
Epoch [24/30], Loss: 0.2873
Epoch 25/30
Epoch [25/30], Loss: 0.2532
Epoch 26

In [None]:
from dataloader import get_dataloaders, MAX_SEQ_LENGTH, vocab_size, vocabulary
import torch.nn as nn
import random
batch_size = 1
train_loader, dev_loader, test_loader = get_dataloaders(batch_size=batch_size)
# random select 5 training samples
random.seed(0)
train_samples = random.sample(list(train_loader), 10)
# random select 5 dev samples
dev_samples = random.sample(list(dev_loader), 10)
# random select 5 test samples
test_samples = random.sample(list(test_loader), 5)

vocab = list(vocabulary.keys())
def outputs_to_seq(outputs):
    outputs = outputs.argmax(dim=-1)
    # print(outputs)
    # 取vocab中的token
    outputs = [vocab[i] for i in outputs]
    # 去掉padding
    if 'P' in outputs:
        outputs = outputs[:outputs.index('P')]
    if 'E' in outputs:
        outputs = outputs[:outputs.index('E')]
    return outputs

model.eval()

# 输出原来的seq1和seq2，还有预测的seq2
for i in range(10):
    seq1, seq2 = train_samples[i]
    seq1 = seq1.to(device)
    seq2 = seq2.to(device)
    outputs = model(seq1.to(device))
    criterion = nn.CrossEntropyLoss()
    loss = criterion(outputs.reshape(MAX_SEQ_LENGTH, -1), seq2.reshape(MAX_SEQ_LENGTH, -1))
    print("train loss: ", loss.item())
    print("seq1: ", outputs_to_seq(seq1[0]))
    print("seq2: ", outputs_to_seq(seq2[0]))
    outputs = outputs_to_seq(outputs.reshape(MAX_SEQ_LENGTH, -1))
    print("pred: ", outputs)

print("dev samples")
for i in range(10):
    seq1, seq2 = dev_samples[i]
    seq1 = seq1.to(device)
    seq2 = seq2.to(device)
    outputs = model(seq1.to(device))
    criterion = nn.CrossEntropyLoss()
    loss = criterion(outputs.reshape(MAX_SEQ_LENGTH, -1), seq2.reshape(MAX_SEQ_LENGTH, -1))
    print("train loss: ", loss.item())
    print("seq1: ", outputs_to_seq(seq1[0]))
    print("seq2: ", outputs_to_seq(seq2[0]))
    outputs = outputs_to_seq(outputs.reshape(MAX_SEQ_LENGTH, -1))
    print("pred: ", outputs)

train loss:  0.5934857726097107
seq1:  ['T', 'G', 'A', 'G', 'A', 'T', 'G', 'G', 'A', 'G', 'T', 'C', 'T', 'C', 'G', 'C', 'T', 'C', 'T', 'G', 'A', 'C', 'G', 'C', 'C', 'A', 'G', 'G', 'C', 'T', 'G', 'G', 'A', 'G', 'T', 'G', 'C', 'A', 'G', 'T', 'G', 'G', 'T', 'G', 'C', 'G', 'A', 'T', 'C', 'T', 'C']
seq2:  ['G', 'A', 'G', 'T', 'C', 'T', 'T', 'G', 'C', 'T', 'C', 'T', 'G', 'T', 'C', 'G', 'C', 'C', 'T', 'A', 'G', 'G', 'C', 'T', 'G', 'G', 'A', 'G', 'T', 'G', 'C', 'A', 'G', 'T', 'G', 'G', 'C', 'G', 'C', 'G', 'A', 'T', 'C', 'T', 'C', 'G', 'G', 'C', 'T', 'C', 'A']
pred:  ['G', 'A', 'G', 'T', 'C', 'T', 'T', 'G', 'C', 'T', 'T', 'T', 'G', 'T', 'C', 'T', 'C', 'G', 'T', 'G', 'C', 'T', 'C', 'T', 'G', 'G', 'G', 'G', 'T', 'G', 'T', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'A', 'T', 'A', 'T', 'C', 'G', 'G', 'C']
train loss:  0.8530638217926025
seq1:  ['G', 'G', 'G', 'T', 'T', 'T', 'T', 'C', 'C', 'T', 'C', 'G', 'G', 'G', 'C', 'C', 'G', 'C', 'C', 'C', 'T', 'C', 'C', 'G', 'G', 'G', 'T']
seq2:  ['G', 'C', '