In [83]:
# This notebook loads the dataset and does data preprocessing.
# We convert the sentence pairs into 2 instances of a custom PyTorch dataset class for train and validation.

# Possible extensions: We could institute an <unknown> token.
#     We could trim vocabulary differently to ensure both the (trimmed) input and output vocabularies have equal size. 
#

In [84]:
!pwd

/Users/sr_old/Desktop/attention_seq2seq


In [85]:
import sys
print(sys.executable)
print(sys.version)

/Users/sr_old/Desktop/attention_seq2seq/p3.10_attention_seq2seq/bin/python
3.10.13 (main, Aug 24 2023, 22:36:46) [Clang 14.0.3 (clang-1403.0.22.14.1)]


## Imports

In [86]:
import re

In [87]:

import random
import numpy as np

In [88]:
import torch

In [89]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [90]:
import pickle

In [91]:
from dataprep_functions import *

## Seed

In [92]:
seed = 42

In [93]:
# Seed every global random number generator this notebook imports, so that
# "Restart Kernel -> Run All" reproduces the same results.
# Python's built-in `random` module:
random.seed(seed)
# NumPy's legacy global RNG (numpy is imported above but was never seeded):
np.random.seed(seed)
# PyTorch; the trailing `;` keeps the returned Generator repr out of the cell output:
torch.manual_seed(seed);

<torch._C.Generator at 0x107f9b0d0>

## Variables

In [94]:
# Name of the folder where data is accessed and saved
path = "datasets"

In [95]:
# Lower and upper bounds on sentence length (in tokens); they are passed to
# prepare_data, which forwards them to filter_pairs to filter the sentence pairs.
# NOTE(review): whether the bounds are inclusive is decided inside filter_pairs,
# which is not visible in this notebook — confirm there.
MIN_LENGTH = 2
MAX_LENGTH = 5

In [96]:
# Minimum number of times a word should appear in the data to stay in the vocabulary.
# It is passed to the vocabulary trim step; sentence pairs that contain a rarer
# (trimmed) word are thrown out of the dataset afterwards.
MIN_COUNT = 15

In [97]:
# Indices of the special tokens. These are not to be changed.
# NOTE(review): presumably they have to agree with the indices used for padding /
# start-of-sequence / end-of-sequence inside dataprep_functions — confirm there.
PAD_token = 0
SOS_token = 1
EOS_token = 2

## 1. Data preprocessing steps

In [98]:
def prepare_data(lang1_name, lang2_name, MIN_LENGTH, MAX_LENGTH, path = "datasets", reverse=False):
    """Read, filter and index a corpus of sentence pairs.

    High-level driver around the text-processing helpers ``read_langs`` and
    ``filter_pairs``. Progress is reported on stdout via ``print``.

    Args:
        lang1_name: Name of the first language, forwarded to ``read_langs``.
        lang2_name: Name of the second language, forwarded to ``read_langs``.
        MIN_LENGTH: Lower length bound, forwarded to ``filter_pairs``.
        MAX_LENGTH: Upper length bound, forwarded to ``filter_pairs``.
        path: Data folder, forwarded to ``read_langs``.
        reverse: Forwarded to ``read_langs`` (presumably swaps which language
            is the input and which is the output — confirm in that helper).

    Returns:
        A tuple ``(input_lang, output_lang, pairs)``: the two language objects,
        after ``index_words`` has been called on them for every surviving pair,
        and the filtered list of sentence pairs.
    """
    source_lang, target_lang, all_pairs = read_langs(lang1_name, lang2_name, path, reverse)
    print("Read %d sentence pairs" % len(all_pairs))

    kept_pairs = filter_pairs(all_pairs, MIN_LENGTH, MAX_LENGTH)
    print("Filtered to %d pairs" % len(kept_pairs))

    # Build both vocabularies from the surviving pairs only:
    # element 0 feeds the input language, element 1 the output language.
    print("Indexing words...")
    for sentence_pair in kept_pairs:
        source_lang.index_words(sentence_pair[0])
        target_lang.index_words(sentence_pair[1])

    vocab_sizes = (source_lang.n_words, target_lang.n_words)
    print('Indexed %d words in input language, %d words in output language.' % vocab_sizes)

    return source_lang, target_lang, kept_pairs

In [99]:
# Below we choose to "reverse" the translation task (reverse = True), i.e. we translate from French (input) to English (output).

In [100]:
input_lang, output_lang, pairs = prepare_data('eng', 'fra', MIN_LENGTH, MAX_LENGTH, reverse = True)

Reading file...
Read 192341 sentence pairs
Filtered to 27282 pairs
Indexing words...
Indexed 8360 words in input language, 5112 words in output language.


In [101]:
pairs[1230]

['lachez moi !', 'let me go !']

In [102]:
# Trim both vocabularies with the MIN_COUNT threshold (minimum number of times a word
# should appear in the data); each call prints how many words were kept.
# The return values are not used, so the language objects are presumably updated in place.
input_lang.trim(MIN_COUNT)
output_lang.trim(MIN_COUNT)

Words kept: 652 out of 8360, a fraction of 0.08
Words kept: 702 out of 5112, a fraction of 0.14


In [103]:
# Throw out every sentence pair that contains a word which is too rare, i.e. a word that is
# no longer a key of the corresponding language's `word2index` mapping.
keep_pairs = []

for candidate in pairs:
    # Sentences are split on single spaces; `all` stops at the first unknown word.
    input_known = all(token in input_lang.word2index for token in candidate[0].split(' '))
    output_known = all(token in output_lang.word2index for token in candidate[1].split(' '))

    # A single unknown word on either side disqualifies the whole pair.
    if not (input_known and output_known):
        continue
    keep_pairs.append(candidate)

kept_fraction = len(keep_pairs) / len(pairs)
print("Trimmed from %d pairs to %d, %.2f of total" % (len(pairs), len(keep_pairs), kept_fraction))
# From here on `pairs` refers to the reduced dataset.
pairs = keep_pairs

Trimmed from 27282 pairs to 7370, 0.27 of total


In [104]:
pairs[225:229]

[['j ai echoue .', 'i failed .'],
 ['j ai compris .', 'i get it .'],
 ['j ai compris .', 'i got it .'],
 ['j ai aide .', 'i helped .']]

In [105]:
# Dataprep steps in summary:
#  Read the original text file and split into lines
#  Split lines into pairs and do RegEx cleaning
#  Filter to pairs of a certain length only
#  Throw out pairs with rare words

In [106]:
len(pairs)

7370

## 2. Now turning data into PyTorch tensors

In [107]:
# Fraction of the sentence pairs that goes into the training set; the rest is used for validation.
frac = 0.7
# int() truncates, so the training set gets the integer part of frac * len(pairs) pairs.
train_size = int(frac * len(pairs))

In [108]:
# Shuffle the pairs in place with a dedicated generator seeded with `seed`; the global
# `random` state is not touched. Because `pairs` itself is mutated, re-running only this
# cell shuffles the already-shuffled list and gives a different order than a fresh run.
random.Random(seed).shuffle(pairs)

In [109]:
# The first `train_size` shuffled pairs form the training set, the remaining pairs the validation set.
# NOTE(review): `seed` is forwarded to LanguageDataset; what it is used for there is not
# visible in this notebook — confirm in dataprep_functions.
train_dataset = LanguageDataset(pairs[:train_size], input_lang, output_lang, seed)
val_dataset = LanguageDataset(pairs[train_size:], input_lang, output_lang, seed)

In [110]:
# Two small datasets of 100 pairs each (shuffled pairs 0-99 and 100-199), presumably for quick trial runs.
# Both slices lie inside pairs[:train_size] whenever train_size >= 200, so they overlap with train_dataset.
trial_dataset1 = LanguageDataset(pairs[:100], input_lang, output_lang, seed)
trial_dataset2 = LanguageDataset(pairs[100:200], input_lang, output_lang, seed)

In [111]:
# Write the training and validation dataset objects to <path>/train.pt and <path>/val.pt.
# NOTE(review): torch.save pickles the whole LanguageDataset objects, so loading them
# presumably requires the LanguageDataset class to be importable — confirm on the loading side.
torch.save(train_dataset, path + '/train.pt')
torch.save(val_dataset, path + '/val.pt')

In [112]:
# Write the two 100-pair trial dataset objects to <path>/trial1.pt and <path>/trial2.pt.
torch.save(trial_dataset1, path + '/trial1.pt')
torch.save(trial_dataset2, path + '/trial2.pt')

In [113]:
# Pickle both language (vocabulary) objects into the data folder.
# NOTE: despite the ".txt" extension these files are binary pickles (written in "wb" mode).
for lang_object, file_suffix in ((input_lang, "/input_lang.txt"), (output_lang, "/output_lang.txt")):
    with open(path + file_suffix, "wb") as pickle_file:
        pickle.dump(lang_object, pickle_file)

In [114]:
# Here the dimensionality is length x batch for the input tokens

In [115]:
trial_dataset1[1:5]

((tensor([[ 10,   7, 154, 357],
          [117,   4,  50,  88],
          [  3,   2, 646,   9],
          [ 62,   0, 450,   2],
          [  6,   0,   6,   0],
          [  2,   0,   2,   0]]),
  [6, 3, 6, 4]),
 (tensor([[113,  24,  92, 162],
          [611,  60, 178, 430],
          [194,   6, 701,  42],
          [ 76,   2, 410, 191],
          [  4,   0,   4,   8],
          [  2,   0,   2,   2]]),
  [6, 4, 6, 6]))

In [116]:
vars(input_lang);

In [117]:
input_lang.name, input_lang.n_words

('fra', 655)

In [118]:
output_lang.name, output_lang.n_words

('eng', 705)

In [119]:
# We have included EOS_tokens at the end of each sequence, and our bespoke PyTorch class LanguageDataset(Dataset) pads to the longest sequence 
#    in the batch.