### Seq2Seq model for English to Chinese translation (baseline model)

In [38]:
import json
import numpy as np
import pickle
from collections import Counter
import string
import re 

### Choosing, loading, and cleaning dataset

- going to use datasets from: https://www.kaggle.com/datasets/qianhuan/translation?resource=download

In [None]:
train_set_path = "dataset/translation2019zh_train.json"

train_set = []
with open(train_set_path) as f:
    for line in f:
        train_set.append(json.loads(line))

print(len(train_set))
print(train_set[0])

5161434
{'english': 'For greater sharpness, but with a slight increase in graininess, you can use a 1:1 dilution of this developer.', 'chinese': '为了更好的锐度，但是附带的会多一些颗粒度，可以使用这个显影剂的1：1稀释液。'}


We want to lower the size of this dataset, for testing purposes.
- right now 5.1M sentences
- let's make it 10,000 sentences...

In [12]:
# get 10,000 random indices 
sampled_indices = np.random.choice(len(train_set), 10000)

train_subset = [train_set[i] for i in sampled_indices]
print(train_subset[0])
with open('dataset/train_set_mini.pkl', 'wb') as f:
    pickle.dump(train_subset, f)

{'english': 'The amphibian & reptile hall, the marine animal hall, the tropic monkey hall, the bird garden are to be built on the flat ground, and the Garden will welcome Chinese and foreign guests with new looks.', 'chinese': '两栖——爬行动物馆、海兽馆、热带猴馆、鸟园等新型建筑将平地而起，以新的容貌迎接中外宾客。'}


In [5]:
with open('dataset/train_set_mini.pkl', 'rb') as f:
    train_set_mini = pickle.load(f)

print(train_set_mini[0])

{'english': 'The amphibian & reptile hall, the marine animal hall, the tropic monkey hall, the bird garden are to be built on the flat ground, and the Garden will welcome Chinese and foreign guests with new looks.', 'chinese': '两栖——爬行动物馆、海兽馆、热带猴馆、鸟园等新型建筑将平地而起，以新的容貌迎接中外宾客。'}


Train set made. 
Now to work on the actual architecture

## Preprocessing steps:
- we want to maintain vocabulary for english and chinese. 
    - this is simple, just use a counter() and then limit it to if it appears within 5

- In terms of encoding, we want to use sequence input, so a sentence becomes a list [3, 100, 8, 9], where each number corresponds to the index of the word in the dictionary
    - we want to do this because LSTMs keep track of the word relationships at the sentence level
    - then use nn.Embedding?
        - nn.Embedding allows us to create a matrix representing the vocabulary. It allows us to train some nuance into the words, where instead of typical BoW where each word is just an index, each word is now a vector, which allows us to train some meaning into the word
        - the embedding is a matrix size (vocab length, dim). Length vocab length because each row in the matrix corresponds to a word in the vocab, ( row of index = index of word in vocab :) )
        

# now to work on the vocabulary

In [47]:
## helper functions 
def remove_punctuation(text):
    '''
    Get rid of all punctuation from string text
    '''
    return text.translate(str.maketrans('', '', string.punctuation))

def get_words_from_sentence(s):
    '''
    Gets words from sentence 
    '''
    return s.split(' ')

def clean_en_pair(pair):
    '''
    Cleans the english from the pair 
    '''
    return get_words_from_sentence(remove_punctuation(pair['english']).lower())

def get_en_vocab(train_set):
    '''
    get_en_dict:
        Gets an english vocab from train_set as a dict 
    '''
    # get only the english sentences, list of strings 
    en_sentences = [clean_en_pair(pair) for pair in train_set]
    en_sentences_flattened = [word for sentence in en_sentences for word in sentence]
    print(f"Words pre-clean {len(en_sentences_flattened)}")
    en_sentences_flattened = [word for word in en_sentences_flattened if word != '']
    print(f"Words post-clean {len(en_sentences_flattened)}")
    
    word_counts = Counter(en_sentences_flattened)
    # with word counts, now we limit the vocabulary to words that happen at least 5 times
    en_vocab = {}
    # {word: index}
    idx = 0
    for word in ["<SOS>", "<EOS>", "<UNK>"]:
        en_vocab[word] = idx 
        idx += 1
    for word, occurrences in word_counts.items():
        if occurrences >= 5:
            en_vocab[word] = idx 
            idx += 1
    return en_vocab

def remove_zh_punctuation(text):
    cleaned = re.sub(r'[，。！？【】（）《》“”‘’、]', '', text)
    cleaned = re.sub(r'\s+', '', cleaned)
    return cleaned

def get_zh_vocab(train_set):
    '''
    get_zh_vocab:
        Gets an zh vocab from train_set as a dict 
    '''
    zh_sentences = [list(remove_zh_punctuation(pair['chinese'])) for pair in train_set]
    zh_sentences_flattened = [word for sentence in zh_sentences for word in sentence]
    print(len(zh_sentences_flattened))

    word_counts = Counter(zh_sentences_flattened)
    zh_vocab = {}

    idx = 0 
    for word in ["<SOS>", "<EOS>", "<UNK>"]:
        zh_vocab[word] = idx 
        idx += 1 
    for word, occurrences in word_counts.items():
        if occurrences >= 5: 
            zh_vocab[word] = idx 
            idx += 1 
    return zh_vocab

en_vocab = get_en_vocab(train_set_mini)
print(en_vocab)

zh_vocab = get_zh_vocab(train_set_mini)
print(zh_vocab)

Words pre-clean 192167
Words post-clean 191139
329481
{'<SOS>': 0, '<EOS>': 1, '<UNK>': 2, '两': 3, '栖': 4, '—': 5, '爬': 6, '行': 7, '动': 8, '物': 9, '馆': 10, '海': 11, '兽': 12, '热': 13, '带': 14, '鸟': 15, '园': 16, '等': 17, '新': 18, '型': 19, '建': 20, '筑': 21, '将': 22, '平': 23, '地': 24, '而': 25, '起': 26, '以': 27, '的': 28, '容': 29, '貌': 30, '迎': 31, '接': 32, '中': 33, '外': 34, '宾': 35, '客': 36, '最': 37, '重': 38, '要': 39, '就': 40, '是': 41, '当': 42, '你': 43, '踢': 44, '得': 45, '不': 46, '错': 47, '但': 48, '我': 49, '们': 50, '仍': 51, '然': 52, '取': 53, '漂': 54, '亮': 55, '结': 56, '果': 57, '为': 58, '俱': 59, '乐': 60, '部': 61, '奋': 62, '战': 63, '到': 64, '底': 65, '同': 66, '时': 67, '授': 68, '权': 69, '广': 70, '西': 71, '和': 72, '富': 73, '餐': 74, '饮': 75, '公': 76, '司': 77, '调': 78, '查': 79, '一': 80, '切': 81, '情': 82, '况': 83, '妻': 84, '子': 85, '拉': 86, '着': 87, '他': 88, '去': 89, '舞': 90, '池': 91, '在': 92, '些': 93, '特': 94, '定': 95, '条': 96, '件': 97, '下': 98, '某': 99, '闸': 100, '门': 101, '曾': 102, '产': 103, '生'

In [51]:
with open('vocab/en_vocab.pkl', 'wb') as f:
    pickle.dump(en_vocab, f)

with open('vocab/zh_vocab.pkl', 'wb') as f:
    pickle.dump(zh_vocab, f)

In [53]:
with open('vocab/en_vocab.pkl', 'rb') as f:
    en_vocab = pickle.load(f)

with open('vocab/zh_vocab.pkl', 'rb') as f:
    zh_vocab = pickle.load(f)
print(en_vocab)
print(zh_vocab)

{'<SOS>': 0, '<EOS>': 1, '<UNK>': 2, '两': 3, '栖': 4, '—': 5, '爬': 6, '行': 7, '动': 8, '物': 9, '馆': 10, '海': 11, '兽': 12, '热': 13, '带': 14, '鸟': 15, '园': 16, '等': 17, '新': 18, '型': 19, '建': 20, '筑': 21, '将': 22, '平': 23, '地': 24, '而': 25, '起': 26, '以': 27, '的': 28, '容': 29, '貌': 30, '迎': 31, '接': 32, '中': 33, '外': 34, '宾': 35, '客': 36, '最': 37, '重': 38, '要': 39, '就': 40, '是': 41, '当': 42, '你': 43, '踢': 44, '得': 45, '不': 46, '错': 47, '但': 48, '我': 49, '们': 50, '仍': 51, '然': 52, '取': 53, '漂': 54, '亮': 55, '结': 56, '果': 57, '为': 58, '俱': 59, '乐': 60, '部': 61, '奋': 62, '战': 63, '到': 64, '底': 65, '同': 66, '时': 67, '授': 68, '权': 69, '广': 70, '西': 71, '和': 72, '富': 73, '餐': 74, '饮': 75, '公': 76, '司': 77, '调': 78, '查': 79, '一': 80, '切': 81, '情': 82, '况': 83, '妻': 84, '子': 85, '拉': 86, '着': 87, '他': 88, '去': 89, '舞': 90, '池': 91, '在': 92, '些': 93, '特': 94, '定': 95, '条': 96, '件': 97, '下': 98, '某': 99, '闸': 100, '门': 101, '曾': 102, '产': 103, '生': 104, '较': 105, '强': 106, '烈': 107, '振': 108, '少': 10