# Lab 3a solutions

In [None]:

### STEP 2a - encoding of sequences
# Length of a DNA snippet (3 * 6 = 18 symbols) and of the corresponding
# protein snippet, which is one third of the DNA length.
len_dna = 3 * 6
len_prot = round(len_dna / 3)

# Number of sequences (taken from the start of each list) to encode.
encode_nr_samples = 5000

# Truncate every sequence to the snippet length, then encode and pad it.
DNAseq_encoded = [
    encode_and_pad(dna_lang, dna_seq[:len_dna], codon_length, context)
    for dna_seq in DNAseq[:encode_nr_samples]
]
ProtSeq_encoded = [
    encode_and_pad(prot_lang, prot_seq[:len_prot], 1, context)
    for prot_seq in ProtSeq[:encode_nr_samples]
]


In [None]:

### STEP 2a - encoding of sequences
# The boundaries of the individual samples are irrelevant for training: an
# LSTM is not tied to a fixed sample size, so other sample sizes can be used
# once the model is trained.
# Concatenate the first thousand sequences into one long string each.
DNAseq_merged = ''.join(DNAseq[:1000])
ProtSeq_merged = ''.join(ProtSeq[:1000])

# Cut the merged strings into consecutive, non-overlapping snippets of the
# defined length; an incomplete snippet at the very end is dropped.
DNAseq_snippets = [
    DNAseq_merged[start:start + len_dna]
    for start in range(0, len(DNAseq_merged) - len_dna + 1, len_dna)
]
protSeq_snippets = [
    ProtSeq_merged[start:start + len_prot]
    for start in range(0, len(ProtSeq_merged) - len_prot + 1, len_prot)
]

# Encode (at most) the first encode_nr_samples snippets of each kind.
DNAseq_encoded = [
    encode_and_pad(dna_lang, snippet, codon_length, context)
    for snippet in DNAseq_snippets[:encode_nr_samples]
]
ProtSeq_encoded = [
    encode_and_pad(prot_lang, snippet, 1, context)
    for snippet in protSeq_snippets[:encode_nr_samples]
]

# Take a look at the first few snippets.
DNAseq_snippets[:5], protSeq_snippets[:5]


In [None]:

### STEP 2b - dataloader
batch_size = 200

# Training dataloader: the first nr_training_samples encoded pairs.
nr_training_samples = 1000
train_dl = get_dataloader(
    DNAseq_encoded[:nr_training_samples],
    ProtSeq_encoded[:nr_training_samples],
    batch_size,
)

# Validation dataloader: the len_val_set pairs that directly follow the
# training samples; the batch size is set to the size of the validation set.
len_val_set = 100
val_dl = get_dataloader(
    DNAseq_encoded[nr_training_samples:nr_training_samples + len_val_set],
    ProtSeq_encoded[nr_training_samples:nr_training_samples + len_val_set],
    len_val_set,
)


In [None]:

### STEP 3a - model architecture
class MyLSTM(nn.Module):
    """Single-layer LSTM followed by a linear read-out at every time step.

    The input is fed directly into the LSTM (there is no input projection
    layer); the LSTM output of each time step is then mapped to
    ``output_size`` values by a linear layer.

    Args:
        input_size: number of features per time step of the input.
        hidden_size: size of the LSTM hidden state and cell state.
        output_size: number of output values per time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # input parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # define LSTM (batch_first: input is (batch, seq, input_size))
        self.LSTM = nn.LSTM(input_size, hidden_size, batch_first=True)

        # linear read-out layer
        self.lin1 = nn.Linear(hidden_size, output_size)

    def forward(self, inp):
        """Run the model on a batch of sequences.

        Args:
            inp: tensor of shape (batch, seq, input_size) whose dtype matches
                the dtype of the model's weights.

        Returns:
            Tensor of shape (batch, seq, output_size).
        """
        # Move the input to wherever the weights live, so the forward pass
        # does not depend on a global device variable.
        inp1 = inp.to(self.lin1.weight.device)

        # Zero initial hidden and cell states. new_zeros() creates them with
        # the same dtype and device as the input, so they can never be
        # mismatched with the tensor that is fed to the LSTM.
        state_shape = (self.LSTM.num_layers, inp1.size(0), self.hidden_size)
        h0 = inp1.new_zeros(state_shape)
        c0 = inp1.new_zeros(state_shape)

        # run LSTM
        lstm_output, _ = self.LSTM(inp1, (h0, c0))

        # apply the linear read-out to every time step and return the result
        return self.lin1(lstm_output)


In [None]:

### STEP 3b - loss function and optimizer
# lightning module to train the sequence model
class SequenceModelLightning(L.LightningModule):
    """Lightning wrapper that trains a ``MyLSTM`` with cross-entropy loss.

    Args:
        input_size: number of input features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of classes predicted per time step.
        lr: learning rate passed to the Adam optimizer.
    """

    def __init__(self, input_size, hidden_size, output_size, lr=0.1):
        super().__init__()
        # the model runs in double precision
        self.model = MyLSTM(input_size, hidden_size, output_size).double()
        self.lr = lr
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the per-time-step output of the wrapped model for ``x``."""
        return self.model(x)

    def _compute_loss(self, batch):
        """Cross-entropy loss of the model on one (input, target) batch."""
        input_tensor = batch[0].double()
        target_tensor = batch[1]

        output = self.model(input_tensor)
        # Flatten all leading dimensions so that every time step of every
        # sequence counts as one classification sample; the targets are class
        # indices and must be integer typed for CrossEntropyLoss.
        return self.loss(
            output.reshape(-1, output.shape[-1]),
            target_tensor.reshape(-1).long(),
        )

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._compute_loss(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._compute_loss(batch)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with rate ``self.lr``."""
        return optim.Adam(self.parameters(), lr=self.lr)
    

In [None]:

### STEP 3c - model and training loop
# Build the Lightning module: the input size is the number of entries in the
# DNA language, the output size the number of entries in the protein language.
lit_model = SequenceModelLightning(
    input_size=len(dna_lang.word2index),
    hidden_size=24,
    output_size=len(prot_lang.index2word),
    lr=0.1,
)

# Trainer running on a single device for 20 epochs.
trainer = L.Trainer(devices=1, max_epochs=20)

# Learn the weights of the model (training and validation dataloaders).
trainer.fit(lit_model, train_dl, val_dl)


In [None]:

### STEP 4 - encode test sequences of arbitrary length
# max_len is passed to encode_and_pad in place of the training context length.
max_len = 10000
test_DNAseq_encoded = [
    encode_and_pad(dna_lang, dna_seq, codon_length, max_len)
    for dna_seq in test_DNAseq
]
test_ProtSeq_encoded = [
    encode_and_pad(prot_lang, prot_seq, 1, max_len)
    for prot_seq in test_ProtSeq
]


# Lab 3b solutions

In [None]:
    
#### STEP 1
# Create the language for text generation and register the
# whitespace-separated words of the current text with it.
gen_lang = Lang("text_gen")
gen_lang.addSentence(cur_text.split(), 1)

# Build the encoding of the language.
gen_lang.to_encoding()

# Display the number of words in the language.
gen_lang.n_words


In [None]:

#### STEP 2a
class MyLSTM(nn.Module):
    """Linear input layer, single-layer LSTM and linear read-out.

    Every time step of the input is first projected to ``hidden_size``
    features, passed through the LSTM, and finally mapped to ``output_size``
    values by a second linear layer.

    Args:
        input_size: number of features per time step of the input.
        hidden_size: size of the projected input and of the LSTM states.
        output_size: number of output values per time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # input parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # linear input layer
        self.lin0 = nn.Linear(input_size, hidden_size)

        # define LSTM (sequence-first: input is (seq, batch, hidden_size))
        self.LSTM = nn.LSTM(hidden_size, hidden_size)

        # linear read-out layer
        self.lin1 = nn.Linear(hidden_size, output_size)

    def forward(self, inp):
        """Run the model on one sequence or on a batch of sequences.

        Args:
            inp: tensor of shape (seq, batch, input_size), or
                (seq, input_size) for a single unbatched sequence; its dtype
                must match the dtype of the model's weights.

        Returns:
            Tensor of shape (seq, batch, output_size), or
            (seq, output_size) for unbatched input.
        """
        # Move the input to wherever the weights live, so the forward pass
        # does not depend on a global device variable.
        inp1 = inp.to(self.lin0.weight.device)

        # first linear layer
        inp1 = self.lin0(inp1)

        # Zero initial hidden and cell states. Their shape follows the input:
        # (layers, hidden) for an unbatched 2-D input and
        # (layers, batch, hidden) for a 3-D input, so any batch size works.
        if inp1.dim() == 2:
            state_shape = (self.LSTM.num_layers, self.hidden_size)
        else:
            state_shape = (self.LSTM.num_layers, inp1.size(1), self.hidden_size)
        # new_zeros() matches the dtype and device of the LSTM input.
        h0 = inp1.new_zeros(state_shape)
        c0 = inp1.new_zeros(state_shape)

        # run LSTM
        lstm_output, _ = self.LSTM(inp1, (h0, c0))

        # apply the second linear layer to every time step and return it
        return self.lin1(lstm_output)
    

In [None]:

#### STEP 2b
# Adam optimizer over all parameters of the model, learning rate 0.1.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# Cross-entropy loss as the training criterion.
criterion = torch.nn.CrossEntropyLoss()
