### Seq2Seq model for English to Chinese translation (baseline model)

In [1]:
import json
import numpy as np
import pickle
from collections import Counter
import string
import re
import torch
import torch.nn as nn

### Choosing, loading, and cleaning dataset

- going to use datasets from: https://www.kaggle.com/datasets/qianhuan/translation?resource=download

In [2]:
train_set_path = "dataset/translation2019zh_train.json"

train_set = []
with open(train_set_path) as f:
    for line in f:
        train_set.append(json.loads(line))

print(len(train_set))
print(train_set[0])

5161434
{'english': 'For greater sharpness, but with a slight increase in graininess, you can use a 1:1 dilution of this developer.', 'chinese': '为了更好的锐度，但是附带的会多一些颗粒度，可以使用这个显影剂的1：1稀释液。'}


We want to lower the size of this dataset, for testing purposes.
- right now 5.1M sentences
- let's make it 10,000 sentences...

In [3]:
# get 10,000 random indices 
sampled_indices = np.random.choice(len(train_set), 10000)

train_subset = [train_set[i] for i in sampled_indices]
print(train_subset[0])
with open('dataset/train_set_mini.pkl', 'wb') as f:
    pickle.dump(train_subset, f)

{'english': 'Draw a little square on the floor with chalk and announce to the other passengers, "This is my personal space!"', 'chinese': '用粉笔在地上划出个小方块，向其他人宣布：“这是我的地盘！”'}


In [4]:
with open('dataset/train_set_mini.pkl', 'rb') as f:
    train_set_mini = pickle.load(f)

print(train_set_mini[0])

{'english': 'Draw a little square on the floor with chalk and announce to the other passengers, "This is my personal space!"', 'chinese': '用粉笔在地上划出个小方块，向其他人宣布：“这是我的地盘！”'}


Train set made. 
Now to work on the actual architecture

## Preprocessing steps:
- we want to maintain vocabulary for english and chinese. 
    - this is simple, just use a counter() and then limit it to if it appears within 5

- In terms of encoding, we want to use sequence input, so a sentence becomes a list [3, 100, 8, 9], where each number corresponds to the index of the word in the dictionary
    - we want to do this because LSTMs keep track of the word relationships at the sentence level
    - then use nn.Embedding?
        - nn.Embedding allows us to create a matrix representing the vocabulary. It allows us to train some nuance into the words, where instead of typical BoW where each word is just an index, each word is now a vector, which allows us to train some meaning into the word
        - the embedding is a matrix size (vocab length, dim). Length vocab length because each row in the matrix corresponds to a word in the vocab, ( row of index = index of word in vocab :) )
        

# now to work on the vocabulary

In [5]:
## helper functions 
def remove_punctuation(text):
    '''
    Get rid of all punctuation from string text
    '''
    return text.translate(str.maketrans('', '', string.punctuation))

def get_words_from_sentence(s):
    '''
    Gets words from sentence 
    '''
    return s.split(' ')

def clean_en_pair(pair):
    '''
    Cleans the english from the pair 
    '''
    return get_words_from_sentence(remove_punctuation(pair['english']).lower())

def get_en_vocab(train_set):
    '''
    get_en_dict:
        Gets an english vocab from train_set as a dict 
    '''
    # get only the english sentences, list of strings 
    en_sentences = [clean_en_pair(pair) for pair in train_set]
    en_sentences_flattened = [word for sentence in en_sentences for word in sentence]
    print(f"Words pre-clean {len(en_sentences_flattened)}")
    en_sentences_flattened = [word for word in en_sentences_flattened if word != '']
    print(f"Words post-clean {len(en_sentences_flattened)}")
    
    word_counts = Counter(en_sentences_flattened)
    # with word counts, now we limit the vocabulary to words that happen at least 5 times
    en_vocab = {}
    # {word: index}
    idx = 0
    for word in ["<SOS>", "<EOS>", "<UNK>"]:
        en_vocab[word] = idx 
        idx += 1
    for word, occurrences in word_counts.items():
        if occurrences >= 5:
            en_vocab[word] = idx 
            idx += 1
    return en_vocab

def remove_zh_punctuation(text):
    cleaned = re.sub(r'[，。！？【】（）《》“”‘’、]', '', text)
    cleaned = re.sub(r'\s+', '', cleaned)
    return cleaned

def get_zh_vocab(train_set):
    '''
    get_zh_vocab:
        Gets an zh vocab from train_set as a dict 
    '''
    zh_sentences = [list(remove_zh_punctuation(pair['chinese'])) for pair in train_set]
    zh_sentences_flattened = [word for sentence in zh_sentences for word in sentence]
    print(len(zh_sentences_flattened))

    word_counts = Counter(zh_sentences_flattened)
    zh_vocab = {}

    idx = 0 
    for word in ["<SOS>", "<EOS>", "<UNK>"]:
        zh_vocab[word] = idx 
        idx += 1 
    for word, occurrences in word_counts.items():
        if occurrences >= 5: 
            zh_vocab[word] = idx 
            idx += 1 
    return zh_vocab

en_vocab = get_en_vocab(train_set_mini)
print(en_vocab)

zh_vocab = get_zh_vocab(train_set_mini)
print(zh_vocab)

Words pre-clean 191423
Words post-clean 190432
327725
{'<SOS>': 0, '<EOS>': 1, '<UNK>': 2, '用': 3, '粉': 4, '笔': 5, '在': 6, '地': 7, '上': 8, '划': 9, '出': 10, '个': 11, '小': 12, '方': 13, '块': 14, '向': 15, '其': 16, '他': 17, '人': 18, '宣': 19, '布': 20, '：': 21, '这': 22, '是': 23, '我': 24, '的': 25, '盘': 26, '另': 27, '一': 28, '种': 29, '要': 30, '素': 31, '—': 32, '音': 33, '频': 34, '包': 35, '括': 36, '广': 37, '告': 38, '解': 39, '说': 40, '词': 41, '乐': 42, '和': 43, '效': 44, '她': 45, '没': 46, '有': 47, '立': 48, '即': 49, '作': 50, '答': 51, '现': 52, '埃': 53, '多': 54, '里': 55, '安': 56, '内': 57, '塔': 58, '尼': 59, '亚': 60, '胡': 61, '执': 62, '政': 63, '支': 64, '持': 65, '基': 66, '础': 67, '更': 68, '宗': 69, '教': 70, '化': 71, '来': 72, '自': 73, '乡': 74, '村': 75, '育': 76, '程': 77, '度': 78, '较': 79, '低': 80, '口': 81, '们': 82, '看': 83, '荣': 84, '誉': 85, '民': 86, '族': 87, '至': 88, '关': 89, '重': 90, '传': 91, '言': 92, '组': 93, '织': 94, '神': 95, '鬼': 96, '不': 97, '过': 98, '并': 99, '可': 100, '靠': 101, '证': 102, '据': 103, '财'

In [6]:
with open('vocab/en_vocab.pkl', 'wb') as f:
    pickle.dump(en_vocab, f)

with open('vocab/zh_vocab.pkl', 'wb') as f:
    pickle.dump(zh_vocab, f)

In [7]:
with open('vocab/en_vocab.pkl', 'rb') as f:
    en_vocab = pickle.load(f)

with open('vocab/zh_vocab.pkl', 'rb') as f:
    zh_vocab = pickle.load(f)
print(en_vocab)
print(zh_vocab)

{'<SOS>': 0, '<EOS>': 1, '<UNK>': 2, '用': 3, '粉': 4, '笔': 5, '在': 6, '地': 7, '上': 8, '划': 9, '出': 10, '个': 11, '小': 12, '方': 13, '块': 14, '向': 15, '其': 16, '他': 17, '人': 18, '宣': 19, '布': 20, '：': 21, '这': 22, '是': 23, '我': 24, '的': 25, '盘': 26, '另': 27, '一': 28, '种': 29, '要': 30, '素': 31, '—': 32, '音': 33, '频': 34, '包': 35, '括': 36, '广': 37, '告': 38, '解': 39, '说': 40, '词': 41, '乐': 42, '和': 43, '效': 44, '她': 45, '没': 46, '有': 47, '立': 48, '即': 49, '作': 50, '答': 51, '现': 52, '埃': 53, '多': 54, '里': 55, '安': 56, '内': 57, '塔': 58, '尼': 59, '亚': 60, '胡': 61, '执': 62, '政': 63, '支': 64, '持': 65, '基': 66, '础': 67, '更': 68, '宗': 69, '教': 70, '化': 71, '来': 72, '自': 73, '乡': 74, '村': 75, '育': 76, '程': 77, '度': 78, '较': 79, '低': 80, '口': 81, '们': 82, '看': 83, '荣': 84, '誉': 85, '民': 86, '族': 87, '至': 88, '关': 89, '重': 90, '传': 91, '言': 92, '组': 93, '织': 94, '神': 95, '鬼': 96, '不': 97, '过': 98, '并': 99, '可': 100, '靠': 101, '证': 102, '据': 103, '财': 104, '部': 105, '长': 106, '决': 107, '定': 108, '会': 10

### Model architecture building
- 2 LSTM's are the backbone
- also build a higher level Seq2Seq model as abstraction of the entire model 
- nn.Embedding() as a variable for both Encoder and Decoder 
    - use vocab_size as row length, by the embedding dim as the column length
- Encoder will be english, decoder will be chinese 

### nn.LSTM

- sequence models are central to NLPl they are models where there is some sort of dependence through teime between inputs. 
- a recurrent neural network is a network that maintains some kind of state.
- for example its output could be used as part of the next input, so that information c an propagate along as the network passes over the sequence.
- In the case of an LSTM, for each element in the sequence, there is a corresponding hidden state ht, which in principle contains infromation from arbitrary points earlier in the sequence. 
- we can use the hidden state to predict words in a language model, pos, and a myriad of oether things.

LSTMs in pytorch:
- pytorch LSTM expects all of its inputs to be 3D tensors. The semantics of the axes of these tensors is important. The first axis is the sequence itself, the second indixes instances in the mini batch, and the third indexes elements of the input. 
- ignore mini batching, we will always just have 1 dimension on th second axis.
- If we want to run the sequence model over the sentence "The cow jumped" our input should look like:

[
    q (the)
    q (cow)
    q (jumped)
]
Except remember there is an additional 2nd dimension with size 1, (this dimension )

Initializing an LSTM:
```python
lstm = nn.LSTM(3, 3) #input dim is 3, output dim is 3 
```

input_size = 3: this means each input vector at a time step is of length 3. All inputs must have 3 columns, (n x 3). 
- each sequence = a list of input vectors (one per timestemp)
- each input vector = size input_size 

input_size = 3, then your input tensor shape for 1 batch would be:
    (seq_len, batch_size, 3)

- What does this mean for the embedding layer?
    - it must also be dimension (vocab_size, 3), since each token is mapped to a vecotr using the embedding. This embedding becomes the input at each timestep for the LSTM, and the LSTM accepts vector of dimensions (3).

- using nn.LSTM example:
    
```python
self.lstm = nn.LSTM(embedding_dim, hidden_dim)  #(embedding_dim) is the dimension of the embedding dim, and then hidden_dim is also essentially a hyperparameter, it's the dimension of the hidden state 
```

**For most LSTM applications we will need a linear layer to learn the mapping from hidden state to the tag space. but in the case of the encoder, you don't need it! Since the linear layer is essentially the classifier layer, that learns to 
interpret the hidden layer.**


### forward:
- in forward we will have
```
lstm_out, _ = self.LSTM(embeds.view(len(sentence), 1, -1))
```
- what exactly is going on here? Well, basically embeds is a tensor of dimensions (number of tokens in sentence, embedding dim) (**since rememember that the embedding layer takes each word index, and pulls up the corresponding row from the vocabulary)
- we want to reshape it to (sequence length, batch_size, input_size)
    - reshaping is: sequence length( the length of the sentence )
    - batch size is 1, since its one sentence at a time 
    - input_size = -1: it assumes that the length will be embedding_dim 
- **embeds.view**: is a way to reshape in LSTM

- forward in the encoder should only return the hidden state and cell state, since its what matters 


## backpropagation
- pytorch under the hood tracks operations on tensors with require_grad = True. All nn.modules like nn.Linear and nn.LSTM already register their parameters with requires_grad=True, so as long as its connected correctly in the forward pass, pytorch will handle the gradients during backprop. 

In [None]:
## Encoder: English layer 

class Encoder(nn.Module):
    def __init__(self, embedding_dim, vocab_size, hidden_dim):
        super(Encoder, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim) # initialize an LSTM, with embedding_dim, and hidden_dim hyperparameters 
    
    def forward(self, sentence):
        embeds = self.embeddings(sentence)  # remember that sentence has to the in [word_index0, word_index1, word_index2] form
        _, (h_n, c_n) = self.LSTM(embeds.view(len(sentence), 1, -1)) # one timestep at a tiem 
        return h_n, c_n
        

In [19]:
## Test example pass through the encoder 
encoder = Encoder(embedding_dim=3, vocab_size= len(en_vocab), hidden_dim=5)
# now remember that for forward we pass a sentence as the list of words mapped to the indices they show up in the vocab, such as [45, 18, 28]
sentence = "I love bread."

input_words = get_words_from_sentence(remove_punctuation(sentence).lower())
# now map the inputs to the vocab 
input_indices = [en_vocab[word] for word in input_words] 
# now that I think about it, we probably want a function that does this, so that we don't get hit with a KeyError and actually use our <unk> token lul
input_indices

[98, 659, 3942]

In [22]:
# with the input_indices, we can now throw it through the encoder?
# oh wait yopu need tensor first lul
input_indices_tensor = torch.tensor(input_indices, dtype=torch.long)
output = encoder.forward(input_indices_tensor)
output # this makes sense, we have both hidden and cell states :)

(tensor([[[0.2249, 0.0802, 0.1633, 0.0565, 0.1078]]], grad_fn=<StackBackward0>),
 tensor([[[0.3253, 0.1661, 0.2645, 0.1570, 0.2795]]], grad_fn=<StackBackward0>))

### Decoder
- so the decoder is another LSTM, taking as input "a large fixed-dimensional vector representation", and then use another LSTM to extract the output sequence from that vector 
- we can just pass in h_n and c_n in the decoder LSTM as parameter for this! 
- and then for forward we just run the linear layer and then run the log_softmax to get the logits? haha 

In [None]:
class Decoder(nn.Module):
    def __init__(self, embedding_dim, vocab_size, hidden_dim):
        super(Decoder, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, )