### Generate Data

In [142]:
import random
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
def generate_custom_sequence(sequence_length = 10):
    # Define rules for your desired properties (e.g., hydrophobicity, charge)
    # Implement logic to generate sequences accordingly
    # ...
# Define the amino acids (you can customize this list)

    # For demonstration purposes, let's create a simple custom sequence
    # Generate a random peptide sequence of length 10

    random_sequence = "".join(random.choice(amino_acids) for _ in range(sequence_length))

    return random_sequence

def generate_peptides(n=100, sequence_length=10):
    data=[]
    for _ in range(n):
        data.append(generate_custom_sequence(sequence_length = sequence_length))
    return data

custom_peptides = []
vocab_size = len(set(amino_acids))
print("Vocab Size:", vocab_size)
for seq_len in [6,8,10,12,15,20]:
    [custom_peptides.append(i) for i in generate_peptides(n=10000, sequence_length=seq_len)]
custom_peptides = list(set(custom_peptides))
print("Random Peptides:", len(custom_peptides))

Vocab Size: 20
Random Peptides: 59999


### Tokenize dataset

In [143]:
from torch.nn.utils.rnn import pad_sequence
import torch

class CharTokenizer:
    def __init__(self, amino_acids):
        self.char_to_index = {char: idx for idx, char in enumerate(amino_acids, start=1)}
        self.index_to_char = {idx: char for char, idx in self.char_to_index.items()}
        self.index_to_char[0] = ""
        
    def encode_many(self, sequences):
        D = []
        for seq in sequences:
            D.append(torch.tensor([self.char_to_index[char] for char in seq]))

        D_pad = pad_sequence(D, batch_first=True, padding_value=0)
        return D_pad
    def decode_many(self, indices):
        D = []
        for enc in indices:
            D.append("".join(self.index_to_char[idx.item()] for idx in enc))
        return D
        
    def encode(self, sequence):
        return [self.char_to_index[char] for char in sequence]

    def decode(self, indices):
        return "".join(self.index_to_char[idx] for idx in indices)

# Example usage
tokenizer = CharTokenizer(amino_acids)

custom_peptides_encoded = tokenizer.encode_many(custom_peptides)
# custom_peptides_decoded = tokenizer.decode_many(custom_peptides_encoded)
print(custom_peptides_encoded.shape)


torch.Size([59999, 20])


### LSTM Model

In [144]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Define a simple LSTM model
class PeptideLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embedding_dim):
        super(PeptideLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, hidden_state):
        embedding_seq = self.embedding(input_seq)
        lstm_out, hidden_state = self.lstm(embedding_seq, hidden_state)
        output = self.fc(lstm_out)
        output_probs = F.softmax(output, dim=-1)
        return output_probs, hidden_state

# Example usage
input_size = vocab_size+1  # Number of unique amino acids
hidden_size = 64
output_size = vocab_size+1  # Same as input size for character-level prediction
embedding_dim = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeptideLSTM(input_size, hidden_size, output_size, embedding_dim)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop (you'll need to adapt this to your dataset)
batch_size = 32
num_epochs = 100
N = len(custom_peptides_encoded)
num_batches = N // batch_size
for epoch in range(num_epochs):
    total_loss = 0.0
    random_indices = torch.rand
    
    # Shuffle the dataset (optional but recommended)
    random_indices = torch.randperm(len(custom_peptides_encoded))
    shuffled_sequences = [custom_peptides_encoded[i] for i in random_indices]

    for batch_start in range(0, len(shuffled_sequences), batch_size):
        batch = shuffled_sequences[batch_start : batch_start + batch_size]
        # Convert batch to PyTorch tensors
        batch_tensors = torch.stack(batch, dim=0).long().to(device) # Assuming your sequences are integer indices

        # Initialize hidden state (if using stateful LSTM)
        hidden_state = None

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs, hidden_state = model(batch_tensors, hidden_state)

        # Compute loss (assuming your targets are the next characters in the sequences)
        targets = batch_tensors[:, 1:]  # Shift targets by one position
        loss = criterion(outputs[:, :-1].reshape(-1, output_size), targets.reshape(-1))

        # Backpropagation
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / num_batches
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {average_loss:.4f}")

print("Training completed!")


Epoch 1/100 - Loss: 2.6876
Epoch 2/100 - Loss: 2.6795
Epoch 3/100 - Loss: 2.6794
Epoch 4/100 - Loss: 2.6794
Epoch 5/100 - Loss: 2.6794
Epoch 6/100 - Loss: 2.6793
Epoch 7/100 - Loss: 2.6794
Epoch 8/100 - Loss: 2.6793
Epoch 9/100 - Loss: 2.6794
Epoch 10/100 - Loss: 2.6794
Epoch 11/100 - Loss: 2.6794
Epoch 12/100 - Loss: 2.6793
Epoch 13/100 - Loss: 2.6793
Epoch 14/100 - Loss: 2.6792
Epoch 15/100 - Loss: 2.6793
Epoch 16/100 - Loss: 2.6793
Epoch 17/100 - Loss: 2.6792
Epoch 18/100 - Loss: 2.6793
Epoch 19/100 - Loss: 2.6793
Epoch 20/100 - Loss: 2.6789
Epoch 21/100 - Loss: 2.6784
Epoch 22/100 - Loss: 2.6782
Epoch 23/100 - Loss: 2.6782
Epoch 24/100 - Loss: 2.6782
Epoch 25/100 - Loss: 2.6782
Epoch 26/100 - Loss: 2.6782
Epoch 27/100 - Loss: 2.6783
Epoch 28/100 - Loss: 2.6782
Epoch 29/100 - Loss: 2.6781
Epoch 30/100 - Loss: 2.6782
Epoch 31/100 - Loss: 2.6782
Epoch 32/100 - Loss: 2.6782
Epoch 33/100 - Loss: 2.6782
Epoch 34/100 - Loss: 2.6782
Epoch 35/100 - Loss: 2.6782
Epoch 36/100 - Loss: 2.6782
E

KeyboardInterrupt: 

### Generate new sequences

In [146]:
model.to('cpu')

# Generating new sequences (after training)
def generate_sequence(model, seed_sequence, length=20):
    hidden_state = None
    output_sequence = seed_sequence

    for _ in range(length):
        input_seq = tokenizer.encode_many([output_sequence[-1]])  # Convert last character to input
        output, hidden_state = model(input_seq, hidden_state)
        predicted_char = tokenizer.decode_many([output[-1].argmax(dim=1)])  # Sample from output distribution
        output_sequence += predicted_char[0]

    return output_sequence

# Example usage
seed_sequence = "KKM"  # Provide a starting sequence
generated_sequence = generate_sequence(model, seed_sequence, length=25)
print(f"Generated sequence: {generated_sequence}")

Generated sequence: KKMVCRLQMVCWLQMVCWLQMVCWLQ
