### Seq2Seq model for English to Chinese translation (baseline model)

In [1]:
import json
import numpy as np
import pickle
from collections import Counter
import string
import re
import torch
import torch.nn as nn

### Choosing, loading, and cleaning dataset

- going to use datasets from: https://www.kaggle.com/datasets/qianhuan/translation?resource=download

In [2]:
train_set_path = "dataset/translation2019zh_train.json"

train_set = []
with open(train_set_path) as f:
    for line in f:
        train_set.append(json.loads(line))

print(len(train_set))
print(train_set[0])

5161434
{'english': 'For greater sharpness, but with a slight increase in graininess, you can use a 1:1 dilution of this developer.', 'chinese': '为了更好的锐度，但是附带的会多一些颗粒度，可以使用这个显影剂的1：1稀释液。'}


We want to lower the size of this dataset, for testing purposes.
- right now 5.1M sentences
- let's make it 10,000 sentences...

In [3]:
# get 10,000 random indices 
sampled_indices = np.random.choice(len(train_set), 10000)

train_subset = [train_set[i] for i in sampled_indices]
print(train_subset[0])
with open('dataset/train_set_mini.pkl', 'wb') as f:
    pickle.dump(train_subset, f)

{'english': 'Dual-Satellite Positioning System is a regional positioning system, on the basis of which the paper supplies a method to realize passive positioning by existent tri-satellite on the sea.', 'chinese': '首先，本文根据双星定位系统的工作原理和工作方式，建立了三星无源定位的数学模型。'}


In [4]:
with open('dataset/train_set_mini.pkl', 'rb') as f:
    train_set_mini = pickle.load(f)

print(train_set_mini[0])

{'english': 'Dual-Satellite Positioning System is a regional positioning system, on the basis of which the paper supplies a method to realize passive positioning by existent tri-satellite on the sea.', 'chinese': '首先，本文根据双星定位系统的工作原理和工作方式，建立了三星无源定位的数学模型。'}


Train set made. 
Now to work on the actual architecture

## Preprocessing steps:
- we want to maintain vocabulary for english and chinese. 
    - this is simple, just use a counter() and then limit it to if it appears within 5

- In terms of encoding, we want to use sequence input, so a sentence becomes a list [3, 100, 8, 9], where each number corresponds to the index of the word in the dictionary
    - we want to do this because LSTMs keep track of the word relationships at the sentence level
    - then use nn.Embedding?
        - nn.Embedding allows us to create a matrix representing the vocabulary. It allows us to train some nuance into the words, where instead of typical BoW where each word is just an index, each word is now a vector, which allows us to train some meaning into the word
        - the embedding is a matrix size (vocab length, dim). Length vocab length because each row in the matrix corresponds to a word in the vocab, ( row of index = index of word in vocab :) )
        

# now to work on the vocabulary

In [5]:
## helper functions 
def remove_punctuation(text):
    '''
    Get rid of all punctuation from string text
    '''
    return text.translate(str.maketrans('', '', string.punctuation))

def get_words_from_sentence(s):
    '''
    Gets words from sentence 
    '''
    return s.split(' ')

def clean_en_pair(pair):
    '''
    Cleans the english from the pair 
    '''
    return get_words_from_sentence(remove_punctuation(pair['english']).lower())

def get_en_vocab(train_set):
    '''
    get_en_dict:
        Gets an english vocab from train_set as a dict 
    '''
    # get only the english sentences, list of strings 
    en_sentences = [clean_en_pair(pair) for pair in train_set]
    en_sentences_flattened = [word for sentence in en_sentences for word in sentence]
    print(f"Words pre-clean {len(en_sentences_flattened)}")
    en_sentences_flattened = [word for word in en_sentences_flattened if word != '']
    print(f"Words post-clean {len(en_sentences_flattened)}")
    
    word_counts = Counter(en_sentences_flattened)
    # with word counts, now we limit the vocabulary to words that happen at least 5 times
    en_vocab = {}
    # {word: index}
    idx = 0
    for word in ["<SOS>", "<EOS>", "<UNK>"]:
        en_vocab[word] = idx 
        idx += 1
    for word, occurrences in word_counts.items():
        if occurrences >= 5:
            en_vocab[word] = idx 
            idx += 1
    return en_vocab

def remove_zh_punctuation(text):
    cleaned = re.sub(r'[，。！？【】（）《》“”‘’、]', '', text)
    cleaned = re.sub(r'\s+', '', cleaned)
    return cleaned

def get_zh_vocab(train_set):
    '''
    get_zh_vocab:
        Gets an zh vocab from train_set as a dict 
    '''
    zh_sentences = [list(remove_zh_punctuation(pair['chinese'])) for pair in train_set]
    zh_sentences_flattened = [word for sentence in zh_sentences for word in sentence]
    print(len(zh_sentences_flattened))

    word_counts = Counter(zh_sentences_flattened)
    zh_vocab = {}

    idx = 0 
    for word in ["<SOS>", "<EOS>", "<UNK>"]:
        zh_vocab[word] = idx 
        idx += 1 
    for word, occurrences in word_counts.items():
        if occurrences >= 5: 
            zh_vocab[word] = idx 
            idx += 1 
    return zh_vocab

en_vocab = get_en_vocab(train_set_mini)
print(en_vocab)

zh_vocab = get_zh_vocab(train_set_mini)
print(zh_vocab)

Words pre-clean 191524
Words post-clean 190553
327033
{'<SOS>': 0, '<EOS>': 1, '<UNK>': 2, '首': 3, '先': 4, '本': 5, '文': 6, '根': 7, '据': 8, '双': 9, '星': 10, '定': 11, '位': 12, '系': 13, '统': 14, '的': 15, '工': 16, '作': 17, '原': 18, '理': 19, '和': 20, '方': 21, '式': 22, '建': 23, '立': 24, '了': 25, '三': 26, '无': 27, '源': 28, '数': 29, '学': 30, '模': 31, '型': 32, '愿': 33, '南': 34, '友': 35, '缘': 36, '努': 37, '力': 38, '能': 39, '为': 40, '您': 41, '搭': 42, '世': 43, '界': 44, '沟': 45, '通': 46, '桥': 47, '梁': 48, '目': 49, '：': 50, '观': 51, '察': 52, '视': 53, '频': 54, '显': 55, '示': 56, '终': 57, '端': 58, '引': 59, '起': 60, '眼': 61, '部': 62, '症': 63, '状': 64, '特': 65, '点': 66, '探': 67, '讨': 68, '其': 69, '处': 70, '则': 71, '每': 72, '个': 73, '客': 74, '户': 75, '都': 76, '当': 77, '成': 78, '企': 79, '业': 80, '广': 81, '告': 82, '传': 83, '媒': 84, '一': 85, '项': 86, '程': 87, '自': 88, '己': 89, '样': 90, '板': 91, '而': 92, '赢': 93, '得': 94, '市': 95, '场': 96, '日': 97, '气': 98, '象': 99, '厅': 100, '指': 101, '出': 102, '海': 103, '正'

In [6]:
with open('vocab/en_vocab.pkl', 'wb') as f:
    pickle.dump(en_vocab, f)

with open('vocab/zh_vocab.pkl', 'wb') as f:
    pickle.dump(zh_vocab, f)

In [7]:
with open('vocab/en_vocab.pkl', 'rb') as f:
    en_vocab = pickle.load(f)

with open('vocab/zh_vocab.pkl', 'rb') as f:
    zh_vocab = pickle.load(f)
print(en_vocab)
print(zh_vocab)

{'<SOS>': 0, '<EOS>': 1, '<UNK>': 2, '首': 3, '先': 4, '本': 5, '文': 6, '根': 7, '据': 8, '双': 9, '星': 10, '定': 11, '位': 12, '系': 13, '统': 14, '的': 15, '工': 16, '作': 17, '原': 18, '理': 19, '和': 20, '方': 21, '式': 22, '建': 23, '立': 24, '了': 25, '三': 26, '无': 27, '源': 28, '数': 29, '学': 30, '模': 31, '型': 32, '愿': 33, '南': 34, '友': 35, '缘': 36, '努': 37, '力': 38, '能': 39, '为': 40, '您': 41, '搭': 42, '世': 43, '界': 44, '沟': 45, '通': 46, '桥': 47, '梁': 48, '目': 49, '：': 50, '观': 51, '察': 52, '视': 53, '频': 54, '显': 55, '示': 56, '终': 57, '端': 58, '引': 59, '起': 60, '眼': 61, '部': 62, '症': 63, '状': 64, '特': 65, '点': 66, '探': 67, '讨': 68, '其': 69, '处': 70, '则': 71, '每': 72, '个': 73, '客': 74, '户': 75, '都': 76, '当': 77, '成': 78, '企': 79, '业': 80, '广': 81, '告': 82, '传': 83, '媒': 84, '一': 85, '项': 86, '程': 87, '自': 88, '己': 89, '样': 90, '板': 91, '而': 92, '赢': 93, '得': 94, '市': 95, '场': 96, '日': 97, '气': 98, '象': 99, '厅': 100, '指': 101, '出': 102, '海': 103, '正': 104, '以': 105, '倍': 106, '于': 107, '全': 108, '球': 10

### Model architecture building
- 2 LSTM's are the backbone
- also build a higher level Seq2Seq model as abstraction of the entire model 
- nn.Embedding() as a variable for both Encoder and Decoder 
    - use vocab_size as row length, by the embedding dim as the column length
- Encoder will be english, decoder will be chinese 

### nn.LSTM

- sequence models are central to NLPl they are models where there is some sort of dependence through teime between inputs. 
- a recurrent neural network is a network that maintains some kind of state.
- for example its output could be used as part of the next input, so that information c an propagate along as the network passes over the sequence.
- In the case of an LSTM, for each element in the sequence, there is a corresponding hidden state ht, which in principle contains infromation from arbitrary points earlier in the sequence. 
- we can use the hidden state to predict words in a language model, pos, and a myriad of oether things.

LSTMs in pytorch:
- pytorch LSTM expects all of its inputs to be 3D tensors. The semantics of the axes of these tensors is important. The first axis is the sequence itself, the second indixes instances in the mini batch, and the third indexes elements of the input. 
- ignore mini batching, we will always just have 1 dimension on th second axis.
- If we want to run the sequence model over the sentence "The cow jumped" our input should look like:

[
    q (the)
    q (cow)
    q (jumped)
]
Except remember there is an additional 2nd dimension with size 1, (this dimension )

Initializing an LSTM:
```python
lstm = nn.LSTM(3, 3) #input dim is 3, output dim is 3 
```

input_size = 3: this means each input vector at a time step is of length 3. All inputs must have 3 columns, (n x 3). 
- each sequence = a list of input vectors (one per timestemp)
- each input vector = size input_size 

input_size = 3, then your input tensor shape for 1 batch would be:
    (seq_len, batch_size, 3)

- What does this mean for the embedding layer?
    - it must also be dimension (vocab_size, 3), since each token is mapped to a vecotr using the embedding. This embedding becomes the input at each timestep for the LSTM, and the LSTM accepts vector of dimensions (3).

- using nn.LSTM example:
    
```python
self.lstm = nn.LSTM(embedding_dim, hidden_dim)  #(embedding_dim) is the dimension of the embedding dim, and then hidden_dim is also essentially a hyperparameter, it's the dimension of the hidden state 
```

**For most LSTM applications we will need a linear layer to learn the mapping from hidden state to the tag space. but in the case of the encoder, you don't need it! Since the linear layer is essentially the classifier layer, that learns to 
interpret the hidden layer.**


### forward:
- in forward we will have
```
lstm_out, _ = self.LSTM(embeds.view(len(sentence), 1, -1))
```
- what exactly is going on here? Well, basically embeds is a tensor of dimensions (number of tokens in sentence, embedding dim) (**since rememember that the embedding layer takes each word index, and pulls up the corresponding row from the vocabulary)
- we want to reshape it to (sequence length, batch_size, input_size)
    - reshaping is: sequence length( the length of the sentence )
    - batch size is 1, since its one sentence at a time 
    - input_size = -1: it assumes that the length will be embedding_dim 
- **embeds.view**: is a way to reshape in LSTM

- forward in the encoder should only return the hidden state and cell state, since its what matters 


## backpropagation
- pytorch under the hood tracks operations on tensors with require_grad = True. All nn.modules like nn.Linear and nn.LSTM already register their parameters with requires_grad=True, so as long as its connected correctly in the forward pass, pytorch will handle the gradients during backprop. 

In [8]:
## Encoder: English layer 

class Encoder(nn.Module):
    def __init__(self, embedding_dim, vocab_size, hidden_dim):
        super(Encoder, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim) # initialize an LSTM, with embedding_dim, and hidden_dim hyperparameters 
    
    def forward(self, sentence):
        embeds = self.embeddings(sentence)  # remember that sentence has to the in [word_index0, word_index1, word_index2] form
        _, (h_n, c_n) = self.LSTM(embeds.view(len(sentence), 1, -1)) # one timestep at a tiem 
        return h_n, c_n
        

In [9]:
## Test example pass through the encoder 
encoder = Encoder(embedding_dim=3, vocab_size= len(en_vocab), hidden_dim=5)
# now remember that for forward we pass a sentence as the list of words mapped to the indices they show up in the vocab, such as [45, 18, 28]
sentence = "I love bread."

input_words = get_words_from_sentence(remove_punctuation(sentence).lower())
# now map the inputs to the vocab 
input_indices = [en_vocab[word] for word in input_words] 
# now that I think about it, we probably want a function that does this, so that we don't get hit with a KeyError and actually use our <unk> token lul
input_indices

[159, 490, 2567]

In [10]:
# with the input_indices, we can now throw it through the encoder?
# oh wait yopu need tensor first lul
input_indices_tensor = torch.tensor(input_indices, dtype=torch.long)
output = encoder.forward(input_indices_tensor)
output # this makes sense, we have both hidden and cell states :)

(tensor([[[ 0.1199,  0.1956, -0.1796,  0.3297,  0.0197]]],
        grad_fn=<StackBackward0>),
 tensor([[[ 0.1810,  0.3782, -0.2318,  0.7538,  0.0513]]],
        grad_fn=<StackBackward0>))

### Decoder
- so the decoder is another LSTM, taking as input "a large fixed-dimensional vector representation", and then use another LSTM to extract the output sequence from that vector 
- we can just pass in h_n and c_n in the decoder LSTM as parameter for this! 
- and then for forward we just run the linear layer and then run the log_softmax to get the logits?

- What about the Embedding layer?
    - we also need an embedding layer ( used in both training and inference )
    - training phase we have "ground-truth" tokens, we need the "ground-truth" tokens we need the embedding layer to make them tensors and to feed each token through the decoder

### Teacher forcing and backpropagation
- At a time step t, the input is the actual target sequence from t - 1 !. This makes sense, we give it the "correct" input from the time before, and have it try to predict the input now.  the t-1 token is called the "ground truth" token, is passed through the embedding layer specifically trained for the target language vocabulary
- the output is the predicted timestep t token, and what you use to compare as loss is the actual t token. 

**forward step in the decoder**
- when making a prediction, you either use the correct previous token (teacher forcing during training), or you use the previous prediction (inference)
- during prediction we will use nn.LSTM. You want to pass the token at t-1's tensor, AND the previous c_n and h_n from the decoder! it's the recurrent aspect of the RNN

- we also do feed in the encoder h_n, and c_n, in the first step :).
- don't need to worry about the rest, because it already does it implicitly by nn.LSTM

- also to enforce teacher forcing, we have to do one time step at a time, instead of all at once 
    - so instead, we go through every single input one at time 

- in the no teacher YOU USE THE PREVIOUS PREDICTION OF THE TIME(after it is )

In [11]:
## Configurations 
MAX_RESPONSE_LENGTH = 10

In [85]:
class Decoder(nn.Module):
    def __init__(self, embedding_dim, vocab_size, hidden_dim):
        super(Decoder, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, vocab_size) # define the linear layer 
    
    def word_to_tensor(self, word):
        '''
        takes a single wrod and gets the corresponding tensor
        '''
        word_lst = get_words_from_sentence(remove_zh_punctuation(word))
        indices = [zh_vocab[word] for word in word_lst]
        # get tensor 
        return torch.tensor(indices, dtype=torch.long)

    def tensor_outputs_to_sentence(self, output_tensor):
        s = ''
        zh_vocab_lst = list(zh_vocab.keys())
        for word_tensor in output_tensor:
            pred_idx = torch.argmax(word_tensor, dim=-1).item()
            s += zh_vocab_lst[pred_idx]
        return s 

    def forward(self, hidden, sentence=None):
        # if we pass in a sentence, then we are trying teacher forcing.
        all_outputs = []
        if sentence is not None:
            embeds_lst = self.embeddings(sentence)
            teacher_forcing_inputs = embeds_lst[:-1]
            targets = embeds_lst[1:]
            for i in range(len(teacher_forcing_inputs)):
                # use embed to actually make the prediction
                input_tensor = teacher_forcing_inputs[i]
                out, hidden = self.LSTM(input_tensor.view(1, 1, -1), hidden)
                logits = self.linear(out)
                all_outputs.append(logits)
        else:
            # just generate some tokens, starting from the <sos> token
            start_token = self.word_to_tensor('<SOS>')
            # run through embedding layer
            prev_char = start_token
            for i in range(MAX_RESPONSE_LENGTH):
                embeds = self.embeddings(prev_char)
                out, hidden = self.LSTM(embeds.view(1, 1, -1), hidden)
                logits = self.linear(out)
                all_outputs.append(logits)
                pred_idx = torch.argmax(logits, dim=2).item()
                prev_char = torch.tensor(pred_idx, dtype=torch.long)
            
        return torch.cat(all_outputs, dim=0)

In [86]:
## functions to take a sentence and turn it into a tensor, adding <sos> and <eos>
def sequence_to_tensor_en(sequence):
    '''
    takes sequence and converts to tensor 
    '''
    # add "<SOS> and <EOS>"
    words = get_words_from_sentence("<SOS> " + remove_punctuation(sequence).lower() + " <EOS>")
    
    # convert to indices, reverting to <UNK> token
    word_indices = [ en_vocab[word] if word in en_vocab else en_vocab["<UNK>"] for word in words ]
    return torch.tensor(word_indices, dtype=torch.long)
    

def sequence_to_tensor_zh(sequence):
    '''
    takes sequence and converts to chinese tensor 
    '''
    words = (["<SOS>"] + list(remove_zh_punctuation(sequence)))
    words.append("<EOS>")
    
    word_indices = [ zh_vocab[word] if word in zh_vocab else zh_vocab["<UNK>"] for word in words ]
    return torch.tensor(word_indices, dtype=torch.long)

In [95]:
# a full run through both the Encoder and the decoder 

encoder = Encoder(embedding_dim=3, vocab_size=len(en_vocab), hidden_dim=5)
en_sentence = "I love bread."
zh_sentence = "我爱面包"


h_n, c_n = encoder.forward(sequence_to_tensor_en(en_sentence))

In [96]:
## create the decoder
decoder = Decoder(embedding_dim=3, vocab_size=len(zh_vocab), hidden_dim=5)
predicted_sentence_tensor = decoder.forward((h_n, c_n))
predicted_sentence_tensor.shape
predicted_sentence = decoder.tensor_outputs_to_sentence(predicted_sentence_tensor)
predicted_sentence

'淋淋淋县淋淋淋淋县淋'

In [97]:
predicted_sentence_tensor = decoder.forward((h_n, c_n))
predicted_sentence_tensor.shape
predicted_sentence = decoder.tensor_outputs_to_sentence(predicted_sentence_tensor)
predicted_sentence

'淋淋淋县淋淋淋淋县淋'

In [98]:
## its the same character, but once we do teacher forcing, it should be different 

predicted_sentence_tensor = decoder.forward((h_n, c_n), sequence_to_tensor_zh(zh_sentence))
predicted_sentence_tensor.shape
predicted_sentence = decoder.tensor_outputs_to_sentence(predicted_sentence_tensor)
predicted_sentence

'淋淋赏淋淋'