# LSTM-arithmetic

## Dataset
- [Arithmetic dataset](https://drive.google.com/file/d/1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-/view?usp=sharing)

In [36]:
# ! pip install seaborn
# ! pip install opencc
# ! pip install -U scikit-learn

import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
import matplotlib.pyplot as plt
import seaborn as sns
import opencc
import os
from sklearn.model_selection import train_test_split

data_path = './data'

In [37]:
df_train = pd.read_csv(os.path.join(data_path, 'arithmetic_train.csv'))
df_eval = pd.read_csv(os.path.join(data_path, 'arithmetic_eval.csv'))
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [38]:
# transform the input data to string
df_train['tgt'] = df_train['tgt'].apply(lambda x: str(x))
df_train['src'] = df_train['src'].add(df_train['tgt'])
df_train['len'] = df_train['src'].apply(lambda x: len(x))

df_eval['tgt'] = df_eval['tgt'].apply(lambda x: str(x))
df_eval['src'] = df_eval['src'].add(df_eval['tgt'])
df_eval['len'] = df_eval['src'].apply(lambda x: len(x))

# Build Dictionary
 - The model cannot perform calculations directly with plain text.
 - Convert all text (numbers/symbols) into numerical representations.
 - Special tokens
    - '&lt;pad&gt;'
        - Each sentence within a batch may have different lengths.
        - The length is padded with '&lt;pad&gt;' to match the longest sentence in the batch.
    - '&lt;eos&gt;'
        - Specifies the end of the generated sequence.
        - Without '&lt;eos&gt;', the model will not know when to stop generating.

In [39]:
char_to_id = {}
id_to_char = {}

# write your code here
# Build a dictionary and give every token in the train dataset an id
# The dictionary should contain <eos> and <pad>
# char_to_id is to conver charactors to ids, while id_to_char is the opposite
all_text = ''.join(df_train['src'].tolist())
unique_chars = sorted(set(all_text))
special_tokens = ['<pad>', '<eos>']
current_id = 0

# Add special tokens
for token in special_tokens:
    char_to_id[token] = current_id
    id_to_char[current_id] = token
    current_id += 1

# Add characters from dataset
for ch in unique_chars:
    char_to_id[ch] = current_id
    id_to_char[current_id] = ch
    current_id += 1

# Add <unk> token for unknown characters
char_to_id['<unk>'] = current_id
id_to_char[current_id] = '<unk>'
current_id += 1


vocab_size = len(char_to_id)
print('Vocab size{}'.format(vocab_size))

Vocab size19


# Data Preprocessing
 - The data is processed into the format required for the model's input and output. (End with \<eos\> token)


In [40]:
df_train

Unnamed: 0.1,Unnamed: 0,src,tgt,len
0,2285313,14*(43+20)=882,882,14
1,317061,(6+1)*5=35,35,10
2,718770,13+32+29=74,74,11
3,170195,31*(3-11)=-248,-248,14
4,2581417,24*49+1=1177,1177,12
...,...,...,...,...
2369245,91786,1+(37*8)=297,297,12
2369246,1974790,37-25-19=-7,-7,11
2369247,410475,7+39-40=6,6,9
2369248,1451302,27-28-12=-13,-13,12


In [41]:
# Write your code here
def preprocess_data(df_input, char_to_id):
    df = df_input.copy()
    # Create a new DataFrame to avoid modifying the original one

    # Map src to char IDs
    df['char_id_list'] = df['src'].map(lambda x: [char_to_id.get(ch, char_to_id['<unk>']) for ch in x] + [char_to_id['<eos>']])

    # Map tgt to char IDs
    df['tgt_tokens'] = df['tgt'].map(lambda x: [char_to_id.get(ch, char_to_id['<unk>']) for ch in x])

    # Compute padding length
    df['pad_len'] = df['len'] - df['tgt'].str.len()

    # Create label_id_list by concatenating pad tokens + tgt tokens + <eos>
    df['label_id_list'] = df.apply(
        lambda row: [char_to_id['<pad>']] * row['pad_len'] + row['tgt_tokens'] + [char_to_id['<eos>']],
        axis=1
    )

    df = df[['src', 'tgt', 'len', 'char_id_list', 'label_id_list']]
    return df
df_train = preprocess_data(df_train, char_to_id)
df_eval = preprocess_data(df_eval, char_to_id)
df = df_train

df.head()

Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,14*(43+20)=882,882,14,"[8, 11, 4, 2, 11, 10, 5, 9, 7, 3, 17, 15, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 15, 9, 1]"
1,(6+1)*5=35,35,10,"[2, 13, 5, 8, 3, 4, 12, 17, 10, 12, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 1]"
2,13+32+29=74,74,11,"[8, 10, 5, 10, 9, 5, 9, 16, 17, 14, 11, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 1]"
3,31*(3-11)=-248,-248,14,"[10, 8, 4, 2, 10, 6, 8, 8, 3, 17, 6, 9, 11, 15...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 9, 11, 15, 1]"
4,24*49+1=1177,1177,12,"[9, 11, 4, 11, 16, 5, 8, 17, 8, 8, 14, 14, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 14, 14, 1]"


# Hyper Parameters

|Hyperparameter|Meaning|Value|
|-|-|-|
|`batch_size`|Number of data samples in a single batch|64|
|`epochs`|Total number of epochs to train|10|
|`embed_dim`|Dimension of the word embeddings|256|
|`hidden_dim`|Dimension of the hidden state in each timestep of the LSTM|256|
|`lr`|Learning Rate|0.001|
|`grad_clip`|To prevent gradient explosion in RNNs, restrict the gradient range|1|

In [42]:
batch_size = 64
epochs = 2
embed_dim = 256
hidden_dim = 256
lr = 0.001
grad_clip = 1

# Data Batching
- Use `torch.utils.data.Dataset` to create a data generation tool called  `dataset`.
- The, use `torch.utils.data.DataLoader` to randomly sample from the `dataset` and group the samples into batches.

- Example: 1+2-3=0
    - Model input: 1 + 2 - 3 = 0
    - Model output: / / / / / 0 &lt;eos&gt;  (the '/' can be replaced with &lt;pad&gt;)
    - The key for the model's output is that the model does not need to predict the next character of the previous part. What matters is that once the model sees '=', it should start generating the answer, which is '0'. After generating the answer, it should also generate&lt;eos&gt;

In [10]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, sequences):
        self.sequences = sequences
    
    def __len__(self):
        # return the amount of data
        return len(self.sequences)
    
    def __getitem__(self, index):
        # Extract the input data x and the ground truth y from the data
        x = self.sequences.iloc[index]['char_id_list']
        y = self.sequences.iloc[index]['label_id_list']
        return x, y

# collate function, used to build dataloader
def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch]
    batch_y = [torch.tensor(data[1]) for data in batch]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])
    
    # Pad the input sequence
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])
    
    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])
    
    return pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens

In [11]:
ds_train = Dataset(df_train[['char_id_list', 'label_id_list']])
ds_eval = Dataset(df_eval[['char_id_list', 'label_id_list']])

In [12]:
# Build dataloader of train set and eval set, collate_fn is the collate function
from torch.utils.data import DataLoader
dl_train =  DataLoader(
    dataset=ds_train, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=collate_fn
)
dl_eval = DataLoader(
    dataset=ds_eval, 
    batch_size=batch_size, 
    shuffle=False, 
    collate_fn=collate_fn
)

# Model Design

## Execution Flow
1. Convert all characters in the sentence into embeddings.
2. Pass the embeddings through an LSTM sequentially.
3. The output of the LSTM is passed into another LSTM, and additional layers can be added.
4. The output from all time steps of the final LSTM is passed through a Fully Connected layer.
5. The character corresponding to the maximum value across all output dimensions is selected as the next character.

## Loss Function
Since this is a classification task, Cross Entropy is used as the loss function.

## Gradient Update
Adam algorithm is used for gradient updates.

In [48]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()
        
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])
        
        self.rnn_layer1 = torch.nn.LSTM(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        
        self.rnn_layer2 = torch.nn.LSTM(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))
        
    def forward(self, batch_x, batch_x_lens):
        return self.encoder(batch_x, batch_x_lens)
    
    # The forward pass of the model
    def encoder(self, batch_x, batch_x_lens):
        batch_x = self.embedding(batch_x)
        
        batch_x = torch.nn.utils.rnn.pack_padded_sequence(batch_x,
                                                          batch_x_lens,
                                                          batch_first=True,
                                                          enforce_sorted=False)
        
        batch_x, _ = self.rnn_layer1(batch_x)
        batch_x, _ = self.rnn_layer2(batch_x)
        
        batch_x, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_x,
                                                            batch_first=True)
        
        batch_x = self.linear(batch_x)
        
        return batch_x
    
    def generator(self, start_char, max_len=200):
        
        char_list = [char_to_id[c] for c in start_char]
        
        next_char = None
        
        while len(char_list) < max_len: 
            # Write your code here 
            # Pack the char_list to tensor
            # Input the tensor to the embedding layer, LSTM layers, linear respectively
            input_tensor = torch.tensor(char_list, dtype=torch.long).unsqueeze(0).to(next(self.parameters()).device)  # shape: (1, seq_len)
            # Move initial sequence to the correct device
            batch_x = self.embedding(input_tensor)
            batch_x, _ = self.rnn_layer1(batch_x)
            batch_x, _ = self.rnn_layer2(batch_x)
            batch_x = self.linear(batch_x)
            
            y = batch_x[0, -1, :] # Obtain the next token prediction y
            
            next_char = torch.argmax(y).item() # Use argmax function to get the next token prediction
            
            if next_char == char_to_id['<eos>']:
                break
            
            char_list.append(next_char)
            
        return [id_to_char[ch_id] for ch_id in char_list]
    

In [44]:
torch.manual_seed(2)


device = 'cuda' # torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu') # Write your code here. Specify a device (cuda or cpu)

model = CharRNN(vocab_size,
                embed_dim,
                hidden_dim)

In [45]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>']) # Write your code here. Cross-entropy loss function. The loss function should ignore <pad>
optimizer = torch.optim.Adam(model.parameters(), lr=lr) # Write your code here. Use Adam or AdamW for Optimizer

# Training
1. The outer `for` loop controls the `epoch`
    1. The inner `for` loop uses `data_loader` to retrieve batches.
        1. Pass the batch to the `model` for training.
        2. Compare the predicted results `batch_pred_y` with the true labels `batch_y` using Cross Entropy to calculate the loss `loss`
        3. Use `loss.backward` to automatically compute the gradients.
        4. Use `torch.nn.utils.clip_grad_value_` to limit the gradient values between `-grad_clip` &lt; and &lt; `grad_clip`.
        5. Use `optimizer.step()` to update the model (backpropagation).
2.  After every `1000` batches, output the current loss to monitor whether it is converging.

In [33]:
from tqdm import tqdm
from copy import deepcopy
model = model.to(device)
model.train()
i = 0
for epoch in range(1, epochs+1):
    # The process bar
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        # Write your code here
        # Clear the gradient
        optimizer.zero_grad()
    
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        

        # Write your code here
        # Input the prediction and ground truths to loss function
        loss = criterion(batch_pred_y.view(-1, vocab_size), batch_y.view(-1).to(device))
        # Back propagation
        loss.backward()

        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip) # gradient clipping

        # Write your code here
        # Optimize parameters in the model
        optimizer.step()

        i+=1
        if i%50==0:
            bar.set_postfix(loss = loss.item())
    
    # Evaluate your model
    bar = tqdm(dl_eval, desc=f"Validation epoch {epoch}")
    matched = 0
    total = 0
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        
        predictions = model(batch_x.to(device), batch_x_lens) # Write your code here. Input the batch_x to the model and generate the predictions
        
        # An example of using generator: model.generator('1+1=')
        # Write your code here.
        decoded_outputs_indices = torch.argmax(predictions, dim=2).cpu()
        target_y_cpu = batch_y.cpu()
        # Check whether the prediction match the ground truths
        for pred_seq, true_seq, true_len in zip(decoded_outputs_indices, target_y_cpu, batch_y_lens):
            true_sequence_slice = true_seq[:true_len].tolist()
            pred_sequence_slice = pred_seq[:true_len].tolist()
            if pred_sequence_slice == true_sequence_slice:
                matched += 1
            total += 1
        # Compute exact match (EM) on the eval dataset
        # EM = correct/total

        
    print(matched/total)

Train epoch 1:  28%|██▊       | 10376/37020 [01:46<04:32, 97.89it/s, loss=3.04e-7] 


KeyboardInterrupt: 

In [46]:
df_train

Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,14*(43+20)=882,882,14,"[8, 11, 4, 2, 11, 10, 5, 9, 7, 3, 17, 15, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 15, 9, 1]"
1,(6+1)*5=35,35,10,"[2, 13, 5, 8, 3, 4, 12, 17, 10, 12, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 1]"
2,13+32+29=74,74,11,"[8, 10, 5, 10, 9, 5, 9, 16, 17, 14, 11, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 1]"
3,31*(3-11)=-248,-248,14,"[10, 8, 4, 2, 10, 6, 8, 8, 3, 17, 6, 9, 11, 15...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 9, 11, 15, 1]"
4,24*49+1=1177,1177,12,"[9, 11, 4, 11, 16, 5, 8, 17, 8, 8, 14, 14, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 14, 14, 1]"
...,...,...,...,...,...
2369245,1+(37*8)=297,297,12,"[8, 5, 2, 10, 14, 4, 15, 3, 17, 9, 16, 14, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 16, 14, 1]"
2369246,37-25-19=-7,-7,11,"[10, 14, 6, 9, 12, 6, 8, 16, 17, 6, 14, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 14, 1]"
2369247,7+39-40=6,6,9,"[14, 5, 10, 16, 6, 11, 7, 17, 13, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 13, 1]"
2369248,27-28-12=-13,-13,12,"[9, 14, 6, 9, 15, 6, 8, 9, 17, 6, 8, 10, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 8, 10, 1]"


In [49]:
def generator(start_char, max_len=200):
    char_list = [char_to_id[c] for c in start_char]
    
    next_char = None
    
    while len(char_list) < max_len: 
        # Write your code here 
        # Pack the char_list to tensor
        # Input the tensor to the embedding layer, LSTM layers, linear respectively
        input_tensor = torch.tensor(char_list, dtype=torch.long).unsqueeze(0).to(device)  # shape: (1, seq_len)
        batch_x = model.embedding(input_tensor)
        batch_x, _ = model.rnn_layer1(batch_x)
        batch_x, _ = model.rnn_layer2(batch_x)
        batch_x = model.linear(batch_x)
        
        y = batch_x[0, -1, :] # Obtain the next token prediction y
        
        next_char = torch.argmax(y).item() # Use argmax function to get the next token prediction
        
        if next_char == char_to_id['<eos>']:
            break
        
        char_list.append(next_char)
        
    return [id_to_char[ch_id] for ch_id in char_list]

In [50]:
''.join(generator('14*(43+20)='))

RuntimeError: Expected all tensors to be on the same device, but got index is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA__index_select)

In [51]:
model.generator('7-(48*33)=')

['7',
 '-',
 '(',
 '4',
 '8',
 '*',
 '3',
 '3',
 ')',
 '=',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 '<pad>',
 ')',
 ')',
 '<pad>',
 '<pad>',
 ')',
 '<pad>',
 ')',
 '<pad>',
 ')',


In [None]:
model.train()
criterion = torch.nn.CrossEntropyLoss() # Write your code here. Cross-entropy loss function. The loss function should ignore <pad>
optimizer = torch.optim.Adam(model.parameters(), lr=lr) # Write your code here. Use


print(df_train)
print()
manual_x = df_train.iloc[0]['char_id_list']
manual_y = df_train.iloc[0]['label_id_list']
model.train()
for epoch in range(1, epochs+1):
    # The process bar
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    # single train no batch
    
        
    print(matched/total)

In [26]:
''.join(generator('7-(48*33)='))

'7-(48*33)=1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111'

In [47]:
generator('7-(48*33)=')

RuntimeError: Expected all tensors to be on the same device, but got index is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA__index_select)

In [None]:
import torch

# Assuming these variables (model, criterion, optimizer, device, char_to_id, vocab_size) 
# are defined and the model is on the device.
def train():
    for data in df_train.iterrows():
        # 1. Define the single example (Input and Target)
        manual_x = data[1]['char_id_list']  # Input sequence as a list of char IDs
        manual_y = data[1]['label_id_list']  # Target sequence as a list

        # 3. Determine Lengths and Pad
        x_len = len(manual_x)
        y_len = len(manual_y)
        pad_id = char_to_id['<pad>']
        max_len = max(x_len, y_len) 

        # Pad sequences and convert to Tensors (Batch Size of 1)
        batch_x = torch.tensor([manual_x + [pad_id] * (max_len - x_len)], 
                            dtype=torch.long).to(device)
        batch_y = torch.tensor([manual_y + [pad_id] * (max_len - y_len)], 
                            dtype=torch.long).to(device)
        batch_x_lens = torch.tensor([x_len], dtype=torch.long) # Length is 11 tokens
        model.to(device)
        # Ensure the model is in training mode
        model.train() 

        # 1. Clear Gradients from previous step
        optimizer.zero_grad()

        # 2. Forward Pass (using the encoder)
        # Outputs shape: (Batch=1, Seq_Len=max_len, Vocab_Size)
        outputs = model(batch_x, batch_x_lens) 

        # 3. Calculate Loss
        # Reshape for CrossEntropyLoss: (1*max_len, Vocab_Size) and (1*max_len)
        loss = criterion(outputs.view(-1, vocab_size), batch_y.view(-1))

        print(f"Manual Training on ''<eos>'\n")
        print(f"Input Length: {x_len}, Target Length: {y_len}")
        print(f"Calculated Loss: {loss.item():.6f}")

        # 4. Backpropagation
        loss.backward()

        # (Optional but recommended) Apply Gradient Clipping
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip) 

        # 5. Update Weights
        optimizer.step()

        print("Single training step complete. Weights for this specific example were updated.")

In [96]:
train()

Manual Training on '14*(43+20)=882<eos>'

Input Length: 15, Target Length: 15
Calculated Loss: 7.970864
Single training step complete. Weights for this specific example were updated.
Manual Training on '14*(43+20)=882<eos>'

Input Length: 11, Target Length: 11
Calculated Loss: 11.418401
Single training step complete. Weights for this specific example were updated.
Manual Training on '14*(43+20)=882<eos>'

Input Length: 12, Target Length: 12
Calculated Loss: 9.609255
Single training step complete. Weights for this specific example were updated.
Manual Training on '14*(43+20)=882<eos>'

Input Length: 15, Target Length: 15
Calculated Loss: 8.296675
Single training step complete. Weights for this specific example were updated.
Manual Training on '14*(43+20)=882<eos>'

Input Length: 13, Target Length: 13
Calculated Loss: 13.443637
Single training step complete. Weights for this specific example were updated.
Manual Training on '14*(43+20)=882<eos>'

Input Length: 14, Target Length: 14
Calcu

KeyboardInterrupt: 

In [97]:
model.to('cpu')

CharRNN(
  (embedding): Embedding(19, 256, padding_idx=0)
  (rnn_layer1): LSTM(256, 256, batch_first=True)
  (rnn_layer2): LSTM(256, 256, batch_first=True)
  (linear): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=19, bias=True)
  )
)

In [108]:
model.generator('14*')

['1', '4', '*']

In [104]:

model.to(device)
for batch_x, batch_y, batch_x_lens, batch_y_lens in DataLoader(
    dataset=ds_train, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=collate_fn
):
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device)
    # batch_x_lens = batch_x_lens.to(device)

    optimizer.zero_grad()
    outputs = model(batch_x, batch_x_lens)
    loss = criterion(outputs.view(-1, vocab_size), batch_y.view(-1))
    loss.backward()
    torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
    optimizer.step()

In [105]:
model.eval()

CharRNN(
  (embedding): Embedding(19, 256, padding_idx=0)
  (rnn_layer1): LSTM(256, 256, batch_first=True)
  (rnn_layer2): LSTM(256, 256, batch_first=True)
  (linear): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=19, bias=True)
  )
)

In [110]:
model.to('cpu')
model.generator('14*(')

['1',
 '4',
 '*',
 '(',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3'