# Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.ops import sigmoid_focal_loss
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader, Sampler, RandomSampler, SubsetRandomSampler, random_split

import os
import random
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

# Random Seed and Batch Size

In [3]:
# Set random seed and batch size for all random processes and datasets. Seed for saved datasets is 100.

seed = 100
batch_size = 64

# Save / Load Datasets

In [4]:
# Each language is a (language tag, display name) pair. The tag is compared against
# entry[2] of the dataset entries; the display name is used to build pickle file names.
languages = [('cs', 'Czech'), ('nl', 'Dutch'), ('en', 'English'), ('fr', 'French'),
             ('de', 'German'), ('el', 'Greek'), ('it', 'Italian'), ('ko', 'Korean'),
             ('no', 'Norwegian'), ('es', 'Spanish'), ('sv', 'Swedish'), ('tr', 'Turkish')]

# Subsets of `languages`, grouped by language family.
germanic = [('nl', 'Dutch'), ('en', 'English'), ('de', 'German')]
romantic = [('fr', 'French'), ('it', 'Italian'), ('es', 'Spanish')]
nordic = [('no', 'Norwegian'), ('sv', 'Swedish')]

# (family label, member languages). The label is used as the prefix of the
# family dataset file names, so changing it changes which files are read/written.
families = [('Germanic', germanic), ('Romantic', romantic), ('Nordic', nordic)]

## Original Size Datasets

In [5]:
#@title Original All Lang Datasets

# Root folder holding every saved train/val/test pickle used in this notebook.
data_folder = '/content/drive/MyDrive/CompLing Projects/Voynich/Final/Datasets'

# The triple-quoted block below is disabled code kept for reference: it shows how the
# stratified train/val/test split was created from All_Langs.pickle and saved.
# It is a bare string expression, so nothing inside it runs.
'''
# Load Pickle Dataset
lang_folder = '/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/New Datasets (from SunPoeppleDatabases)'
with open(os.path.join(lang_folder, 'Pickles', 'All_Langs.pickle'), 'rb') as f:
  lang_pickle = pickle.load(f)

# Train / Val / Test split of 80/10/10
test_size = int(len(lang_pickle) * 0.1)

# Stratify split using language tags.
lang_stratify = [entry[2] for entry in lang_pickle]

# First split into train_pool (train + val) and test_ds.
train_pool, test_ds = train_test_split(lang_pickle, test_size=test_size, random_state=seed,
                                       shuffle=True, stratify=lang_stratify)

# Stratify split using language tags.
lang_stratify = [entry[2] for entry in train_pool]

# Then split train_pool into train_ds and val_ds
train_ds, val_ds = train_test_split(train_pool, test_size=test_size, random_state=seed,
                                    shuffle=True, stratify=lang_stratify)

with open(os.path.join(data_folder, 'Original', 'All_Langs_Train_Original.pickle'), 'wb') as f:
  pickle.dump(train_ds, f)
with open(os.path.join(data_folder, 'Original', 'All_Langs_Val_Original.pickle'), 'wb') as f:
  pickle.dump(val_ds, f)
with open(os.path.join(data_folder, 'Original', 'All_Langs_Test_Original.pickle'), 'wb') as f:
  pickle.dump(test_ds, f)
'''
# Load the previously saved splits instead of regenerating them.
with open(os.path.join(data_folder, 'Original', 'All_Langs_Train_Original.pickle'), 'rb') as f:
  train_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Original', 'All_Langs_Val_Original.pickle'), 'rb') as f:
  val_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Original', 'All_Langs_Test_Original.pickle'), 'rb') as f:
  test_ds = pickle.load(f)

# Train and val entries combined (test is excluded); list concatenation, so the
# loaded splits are expected to be lists.
train_pool = train_ds + val_ds

In [6]:
#@title Original LOO Datasets

# To load a dataset, use the loading code after the disabled block below with the desired language selected.
# The triple-quoted block is disabled code kept for reference: for every language it builds a
# leave-one-out train set (all other languages) and a val set (only that language) and saves both.
'''
for lang in languages:
  loo_train = [entry for entry in train_pool if entry[2] != lang[0]]
  lang_stratify = [entry[2] for entry in loo_train]
  if len(loo_train) > len(train_ds):
    loo_train_ds, _ = train_test_split(loo_train, train_size=len(train_ds), random_state=seed,
                                    shuffle=True, stratify=lang_stratify)
  else:
    loo_train_ds = shuffle(loo_train, random_state=seed)

  loo_val_ds = [entry for entry in train_pool if entry[2] == lang[0]]

  with open(os.path.join(data_folder, 'Original', 'LOO', f'LOO_{lang[1]}_Train_Original.pickle'), 'wb') as f:
    pickle.dump(loo_train_ds, f)
  with open(os.path.join(data_folder, 'Original', 'LOO', f'LOO_{lang[1]}_Val_Original.pickle'), 'wb') as f:
    pickle.dump(loo_val_ds, f)
'''
# Held-out language to load: index 2 of `languages`, i.e. ('en', 'English').
lang=languages[2]
# To load:
with open(os.path.join(data_folder, 'Original', 'LOO', f'LOO_{lang[1]}_Train_Original.pickle'), 'rb') as f:
  loo_train_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Original', 'LOO', f'LOO_{lang[1]}_Val_Original.pickle'), 'rb') as f:
  loo_val_ds = pickle.load(f)

# Prints a blank line only.
print()




In [7]:
# Sanity check: for every held-out language, reload its leave-one-out pickles and
# print which language tags appear in the train set and in the val set.
for lang in languages:
  # Re-seed every RNG at the start of each iteration.
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

  # Load the train / val pickles for this held-out language.
  loo_dir = os.path.join(data_folder, 'Original', 'LOO')
  with open(os.path.join(loo_dir, f'LOO_{lang[1]}_Train_Original.pickle'), 'rb') as f:
    loo_train_ds = pickle.load(f)
  with open(os.path.join(loo_dir, f'LOO_{lang[1]}_Val_Original.pickle'), 'rb') as f:
    loo_val_ds = pickle.load(f)

  # entry[2] is the language tag of a dataset entry.
  train_tags = {entry[2] for entry in loo_train_ds}
  val_tags = {entry[2] for entry in loo_val_ds}

  print(lang)
  print('Train:', train_tags)
  print('Val:', val_tags)
  print()

('cs', 'Czech')
Train: {'el', 'tr', 'es', 'ko', 'fr', 'no', 'it', 'nl', 'de', 'en', 'sv'}
Val: {'cs'}

('nl', 'Dutch')
Train: {'el', 'es', 'tr', 'ko', 'fr', 'cs', 'no', 'it', 'de', 'en', 'sv'}
Val: {'nl'}

('en', 'English')
Train: {'es', 'el', 'tr', 'ko', 'fr', 'cs', 'no', 'it', 'nl', 'de', 'sv'}
Val: {'en'}

('fr', 'French')
Train: {'el', 'es', 'tr', 'ko', 'cs', 'no', 'it', 'nl', 'de', 'en', 'sv'}
Val: {'fr'}

('de', 'German')
Train: {'el', 'es', 'ko', 'fr', 'cs', 'no', 'it', 'nl', 'tr', 'en', 'sv'}
Val: {'de'}

('el', 'Greek')
Train: {'es', 'tr', 'ko', 'fr', 'cs', 'no', 'it', 'nl', 'de', 'en', 'sv'}
Val: {'el'}

('it', 'Italian')
Train: {'es', 'tr', 'el', 'ko', 'fr', 'cs', 'no', 'nl', 'de', 'en', 'sv'}
Val: {'it'}

('ko', 'Korean')
Train: {'el', 'tr', 'es', 'fr', 'cs', 'no', 'it', 'nl', 'de', 'en', 'sv'}
Val: {'ko'}

('no', 'Norwegian')
Train: {'el', 'es', 'ko', 'fr', 'cs', 'de', 'it', 'nl', 'tr', 'en', 'sv'}
Val: {'no'}

('es', 'Spanish')
Train: {'el', 'tr', 'ko', 'fr', 'cs', 'no', 

In [8]:
with open(os.path.join(data_folder, 'Original', 'LOO', f'LOO_Dutch_Train_Original.pickle'), 'rb') as f:
  loo_train_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Original', 'LOO', f'LOO_Dutch_Val_Original.pickle'), 'rb') as f:
  loo_val_ds = pickle.load(f)

In [9]:
loo_train_ds[:20]

[(array([3.68616148, 2.13868702, 1.87839558, 1.99691402, 3.169925  ,
         0.        , 0.53051472, 0.        , 0.        , 0.        ,
         2.169925  , 1.        , 0.        , 0.        , 0.        ]),
  array([1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0]),
  'sv',
  'f9ZT2relsevApen'),
 (array([2.67677992, 3.67203057, 0.36881448, 2.95311361, 3.42626475,
         2.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        ]),
  array([1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]),
  'el',
  'adikleptikos'),
 (array([4.65988009, 3.14577114, 1.87005383, 5.30985526, 0.5849625 ,
         0.        , 1.        ]),
  array([1, 0, 0, 1, 0, 0, 0]),
  'sv',
  'vAlvInd'),
 (array([6.41576115, 3.30875271, 1.1793237 , 0.13326653, 2.95419631,
         1.        , 1.        , 0.        ]),
  array([1, 0, 0, 0, 1, 0, 1, 0]),
  'no',
  'C9nsb6IN'),
 (array([3.86169538, 4.58271055, 2.84645474, 2.11547722, 1.        ,
         0.        , 0.5849625 , 1.        , 0.        , 

In [10]:
loo_val_ds[:20]

[(array([3.83713252, 3.59892389, 1.91919625, 7.48381578, 0.        ]),
  array([1, 0, 0, 1, 0]),
  'nl',
  'vrKAf'),
 (array([5.17417676, 3.86399177, 3.10046018, 1.37851162, 0.73696559,
         1.5849625 , 1.        ]),
  array([1, 0, 1, 0, 1, 0, 0]),
  'nl',
  'zal@G@r'),
 (array([4.65915934, 3.53245066, 2.60306431, 0.87446912, 0.        ,
         0.46948528, 0.        , 0.05658353, 0.        , 4.64385619,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        ]),
  array([1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0]),
  'nl',
  'lit@ratyrkritik}s'),
 (array([3.70737221, 1.79073346, 3.04634202, 1.63640927, 0.26589406,
         0.03476542, 3.7725895 , 1.5849625 , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        ]),
  array([1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]),
  'nl',
  'b@lKtskOnflIkt'),
 (array([3.83713252, 4.28083672, 3.67670507, 2.45943162, 0.5849625 ]),
  array([1, 0, 0, 0, 0]),
  'nl',
  'vA

In [None]:
#@title Original Family Datasets

# To load a dataset, use the last lines of the commented code below with the family and language filled in.
# The whole triple-quoted block is disabled code: within each family, every language in turn is
# held out as the val set and the remaining languages of that family form the train set.
'''
for family in families:
  for lang in family[1]:
    train_langs = set([entry[0] for entry in family[1] if entry != lang])
    family_train_ds = shuffle([entry for entry in train_pool if entry[2] in train_langs], random_state=seed)
    family_val_ds = shuffle([entry for entry in train_pool if entry[2] == lang[0]], random_state=seed)
    with open(os.path.join(data_folder, 'Original', 'Family', f'{family[0]}_{lang[1]}_Train_Original.pickle'), 'wb') as f:
      pickle.dump(family_train_ds, f)
    with open(os.path.join(data_folder, 'Original', 'Family', f'{family[0]}_{lang[1]}_Val_Original.pickle'), 'wb') as f:
      pickle.dump(family_val_ds, f)

# To load:
with open(os.path.join(data_folder, 'Original', 'Family', f'{family[0]}_{lang[1]}_Train_Original.pickle'), 'rb') as f:
  family_train_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Original', 'Family', f'{family[0]}_{lang[1]}_Val_Original.pickle'), 'rb') as f:
  family_val_ds = pickle.load(f)
'''
# Prints a blank line only; nothing is loaded by this cell as written.
print()




## Uniform Size Datasets

In [None]:
#@title Uniform All Lang Datasets
data_folder = '/content/drive/MyDrive/CompLing Projects/Voynich/Final/Datasets'

# The triple-quoted block below is disabled code kept for reference: it draws a fixed number
# of entries per language (12000 train / 1500 val / 1500 test) from the original splits and
# saves the result.
# NOTE(review): in that code the final val and test shuffles are called without random_state,
# so their order is not tied to `seed`; load the saved pickles rather than regenerating them
# if the exact ordering matters.
'''
uniform_train = []
uniform_val = []
uniform_test = []

for lang in languages:
  uniform_train += shuffle([entry for entry in train_ds if entry[2]==lang[0]], random_state=seed, n_samples=12000)
  uniform_val += shuffle([entry for entry in val_ds if entry[2]==lang[0]], random_state=seed, n_samples=1500)
  uniform_test += shuffle([entry for entry in test_ds if entry[2]==lang[0]], random_state=seed, n_samples=1500)

uniform_train_ds = shuffle(uniform_train, random_state=seed)
uniform_val_ds = shuffle(uniform_val)
uniform_test_ds = shuffle(uniform_test)

with open(os.path.join(data_folder, 'Uniform', 'All_Langs_Train_Uniform.pickle'), 'wb') as f:
  pickle.dump(uniform_train_ds, f)
with open(os.path.join(data_folder, 'Uniform', 'All_Langs_Val_Uniform.pickle'), 'wb') as f:
  pickle.dump(uniform_val_ds, f)
with open(os.path.join(data_folder, 'Uniform', 'All_Langs_Test_Uniform.pickle'), 'wb') as f:
  pickle.dump(uniform_test_ds, f)
'''
# Load the previously saved uniform splits.
with open(os.path.join(data_folder, 'Uniform', 'All_Langs_Train_Uniform.pickle'), 'rb') as f:
  uniform_train_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Uniform', 'All_Langs_Val_Uniform.pickle'), 'rb') as f:
  uniform_val_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Uniform', 'All_Langs_Test_Uniform.pickle'), 'rb') as f:
  uniform_test_ds = pickle.load(f)

# Uniform train and val entries combined (test is excluded).
uniform_train_pool = uniform_train_ds + uniform_val_ds

In [None]:
#@title Uniform LOO Datasets

# To load a dataset, use the last lines of the commented code below with the language name filled in.
# The whole triple-quoted block is disabled code: it builds and saves leave-one-out
# train/val sets from the uniform train pool.
# NOTE(review): train_size=144001 in the disabled code is a hard-coded count whose origin
# is not shown here -- confirm it before re-running the generation code.
'''
for lang in languages:
  loo_train = [entry for entry in uniform_train_pool if entry[2] != lang[0]]
  lang_stratify = [entry[2] for entry in loo_train]
  loo_train_ds, _ = train_test_split(loo_train, train_size=144001, random_state=seed,
                                     shuffle=True, stratify=lang_stratify)
  loo_val_ds = [entry for entry in uniform_train_pool if entry[2] == lang[0]]

  with open(os.path.join(data_folder, 'Uniform', 'LOO', f'LOO_{lang[1]}_Train_Uniform.pickle'), 'wb') as f:
    pickle.dump(loo_train_ds, f)
  with open(os.path.join(data_folder, 'Uniform', 'LOO', f'LOO_{lang[1]}_Val_Uniform.pickle'), 'wb') as f:
    pickle.dump(loo_val_ds, f)

# To load:
with open(os.path.join(data_folder, 'Uniform', 'LOO', f'LOO_{lang[1]}_Train_Uniform.pickle'), 'rb') as f:
  loo_train_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Uniform', 'LOO', f'LOO_{lang[1]}_Val_Uniform.pickle'), 'rb') as f:
  loo_val_ds = pickle.load(f)
'''
# Prints a blank line only; nothing is loaded by this cell as written.
print()




In [None]:
#@title Uniform Family Datasets

# To load a dataset, use the last lines of the commented code below with the family and language filled in.
# The whole triple-quoted block is disabled code: within each family, every language in turn is
# held out as the val set and the remaining languages of that family form the train set,
# all drawn from the uniform train pool.
'''
for family in families:
  for lang in family[1]:
    train_langs = set([entry[0] for entry in family[1] if entry != lang])
    family_train_ds = shuffle([entry for entry in uniform_train_pool if entry[2] in train_langs], random_state=seed)
    family_val_ds = shuffle([entry for entry in uniform_train_pool if entry[2] == lang[0]], random_state=seed)
    with open(os.path.join(data_folder, 'Uniform', 'Family', f'{family[0]}_{lang[1]}_Train_Uniform.pickle'), 'wb') as f:
      pickle.dump(family_train_ds, f)
    with open(os.path.join(data_folder, 'Uniform', 'Family', f'{family[0]}_{lang[1]}_Val_Uniform.pickle'), 'wb') as f:
      pickle.dump(family_val_ds, f)

# To load:
with open(os.path.join(data_folder, 'Uniform', 'Family', f'{family[0]}_{lang[1]}_Train_Uniform.pickle'), 'rb') as f:
  family_train_ds = pickle.load(f)
with open(os.path.join(data_folder, 'Uniform', 'Family', f'{family[0]}_{lang[1]}_Val_Uniform.pickle'), 'rb') as f:
  family_val_ds = pickle.load(f)
'''
# Prints a blank line only; nothing is loaded by this cell as written.
print()




# Load and Check Datasets

## Original Size Datasets

In [None]:
#@title Number of Segments and Syllables in All Lang Datasets
lang_folder = '/content/drive/MyDrive/CompLing Projects/Voynich/Data and scripts/New Datasets (from SunPoeppleDatabases)'

lang_df = pd.read_csv(os.path.join(lang_folder, 'All_Langs.csv'), index_col=0)

# Per-split totals, appended in the order Train, Val, Test.
# entry[0] holds one value per segment; entry[1] is the 0/1 syllable-break array.
num_segments = []
num_syllables = []
for split_ds in (train_ds, val_ds, test_ds):
  num_segments.append(sum(len(entry[0]) for entry in split_ds))
  num_syllables.append(sum(entry[1].sum() for entry in split_ds))

# Overall: one row of lang_df per segment.
total_segments = len(lang_df)
total_syllables = lang_df['syllable'].sum()
num_segments.append(total_segments)
num_syllables.append(total_syllables)

# W/out Head, i.e. without the first position (which is always a syllable break).
# The largest word_id is used as the number of words -- presumably ids run 1..N; verify.
num_words = lang_df['word_id'].max()
num_segments.append(total_segments - num_words)
num_syllables.append(total_syllables - num_words)

datasets = ['Train', 'Val', 'Test', 'Overall', 'W/out Head']

count_df = pd.DataFrame({'Dataset': datasets,
                         'Num Segments': num_segments,
                         'Num Syllables': num_syllables})
count_df['% Syllables'] = count_df['Num Syllables'] / count_df['Num Segments']
'''
count_df.to_csv(os.path.join(lang_folder, 'Stats', 'All_Languages.csv'), index=False)

count_df = pd.read_csv(os.path.join(lang_folder, 'Stats', 'All_Languages.csv'))
'''
# Train + Val + Test (rows 0-2) must add up to the Overall row (row 3).
for column in ('Num Segments', 'Num Syllables'):
  assert count_df[column].iloc[:3].sum() == count_df.loc[3, column]

pd.options.display.precision = 2
count_df

Unnamed: 0,Dataset,Num Segments,Num Syllables,% Syllables
0,Train,4215314,1655769,0.39
1,Val,527295,207086,0.39
2,Test,528941,207872,0.39
3,Overall,5271550,2070727,0.39
4,W/out Head,4642808,1441985,0.31


## Uniform Size Datasets

In [None]:
#@title Number of Segments and Syllables in All Lang Datasets

# Per-split totals, appended in the order Train, Val, Test.
# entry[0] holds one value per segment; entry[1] is the 0/1 syllable-break array.
num_segments = []
num_syllables = []
for split_ds in (uniform_train_ds, uniform_val_ds, uniform_test_ds):
  num_segments.append(sum(len(entry[0]) for entry in split_ds))
  num_syllables.append(sum(entry[1].sum() for entry in split_ds))

# Overall: here simply the sum of the three splits computed above.
overall_segments = sum(num_segments)
overall_syllables = sum(num_syllables)
num_segments.append(overall_segments)
num_syllables.append(overall_syllables)

datasets = ['Train', 'Val', 'Test', 'Overall']

count_df = pd.DataFrame({'Dataset': datasets,
                         'Num Segments': num_segments,
                         'Num Syllables': num_syllables})
count_df['% Syllables'] = count_df['Num Syllables'] / count_df['Num Segments']
'''
count_df.to_csv(os.path.join(lang_folder, 'Stats', 'All_Languages.csv'), index=False)

count_df = pd.read_csv(os.path.join(lang_folder, 'Stats', 'All_Languages.csv'))
'''
# Train + Val + Test (rows 0-2) must add up to the Overall row (row 3).
for column in ('Num Segments', 'Num Syllables'):
  assert count_df[column].iloc[:3].sum() == count_df.loc[3, column]

pd.options.display.precision = 2
count_df

Unnamed: 0,Dataset,Num Segments,Num Syllables,% Syllables
0,Train,1159651,468765,0.4
1,Val,144906,58490,0.4
2,Test,145195,58688,0.4
3,Overall,1449752,585943,0.4


# Batch Sampler

The EqualLengthsBatchSampler groups samples into batches based on sample-length, e.g. all 8-letter words appear only with other 8-letter words. This eliminates the need for padding.

It also makes the batch sizes as close to equal as possible while not overstepping the specified `batch_size`. For example, if `batch_size` is 64 and there are 65 samples, rather than create batches of size [64, 1], it will create batches of size [33, 32]. This also means that, as long as there are more samples than the specified `batch_size`, the smallest batch will always be at least `batch_size // 2`.

Based on this, I have decided to delete any sample lengths that have fewer than `batch_size // 2` samples. This is because some sample-lengths have very few or even just one sample in the dataset. If these small groups of samples are passed along as their own batches, they will be given equal weight to groups of samples up to the full `batch_size`, e.g. a single sample of length 31 would be given the same weight as 64 samples of length 8, assuming `batch_size = 64`.

This results in the loss of the longest samples in our dataset, which could introduce some bias into our model (e.g. making it worse at predicting syllable breaks for long words), but the overall number of samples lost is not significant. You can see the lengths and numbers of samples dropped using the method `show_dropped_samples()`, and you can return a dictionary of the dropped samples themselves using the method `dropped_samples()`. Conversely, you can show the lengths and numbers of samples retained in the dataset after equal length batch sampling using the method `show_batches()`.

**Note:** To get deterministic behavior from the EqualLengthsBatchSampler, you need to initialize it with a seed each time you initialize the corresponding DataLoader. If you initialize a new DataLoader without re-initializing the Sampler, the RNG inside the sampler is not reset, so the new DataLoader will produce a different sequence of batches. Example code is shown below:

```
train_sampler = EqualLengthsBatchSampler(train_ds, batch_size, seed)
train_dl = DataLoader(train_ds, batch_sampler=train_sampler)
fnn_model = NgramFNN(n_gram=7, d_hidden=15, n_layers=5).to(device)
fnn_model.fit_wandb(train_dl, val_dl, epochs=10, loss_fn='bce')
```
Each of the 10 epochs will use differently shuffled batches, but if you run the code block again it will reproduce the same sequence of batches, epoch by epoch, as the first time you ran it. If you don't want this behavior, pass `seed=None`.

In [None]:
# Code copied (with edits and additional functions) from
# https://discuss.pytorch.org/t/tensorflow-esque-bucket-by-sequence-length/41284/27

class EqualLengthsBatchSampler(Sampler):
    '''
    Batch sampler that only ever groups samples of the same input length into a batch,
    so no padding is needed.

    For every input length, the sample indices are shuffled and split into
    ceil(n / batch_size) batches of near-equal size. Lengths with fewer than
    batch_size // 2 samples are dropped entirely (see dropped_samples()).

    Args:
        dataset: indexable collection; len(dataset[i][0]) is taken as the length of sample i.
        batch_size: maximum number of samples per batch (must be >= 1).
        seed: seed for the sampler's own numpy Generator; None gives non-deterministic batches.

    Raises:
        ValueError: if batch_size is smaller than 1.
    '''

    def __init__(self, dataset, batch_size, seed=None):
        if batch_size < 1:
            raise ValueError(f'batch_size must be at least 1, got {batch_size}')

        # Private RNG, so that re-creating the sampler with the same seed replays the same batches
        self.rng = np.random.default_rng(seed)

        # Remember batch size
        self.batch_size = batch_size

        self.unique_lengths = set()
        self.samples = defaultdict(list)

        for i in range(0, len(dataset)):
            len_input = len(dataset[i][0])

            # Add length to set of all seen lengths
            self.unique_lengths.add(len_input)

            # For each length, keep track of the sample indices with this length
            # E.g.: self.samples = { 4: [3,5,11], 5: [1,2,9], ...}
            self.samples[len_input].append(i)

        # Mark lengths that have fewer than batch_size // 2 samples; they are never batched.
        # Their indices stay in self.samples so dropped_samples() can report them.
        self.small_samples = set()
        for length in self.unique_lengths:
            if len(self.samples[length]) < batch_size // 2:
                self.small_samples.add(length)

        self.unique_lengths = self.unique_lengths - self.small_samples

        # Convert set of unique lengths to a list so we can shuffle it later
        self.unique_lengths = list(self.unique_lengths)

    def _num_batches(self, num_samples):
        '''Number of batches needed for num_samples samples: ceil(num_samples / batch_size), as an int.'''
        return int(-(-num_samples // self.batch_size))

    def __len__(self):
        '''Total number of batches produced by one pass over the sampler.'''
        return sum(self._num_batches(len(self.samples[length])) for length in self.unique_lengths)

    def __iter__(self):
        '''Yield one numpy array of dataset indices per batch, in a freshly shuffled order.'''

        # Make list to store all batches of any length
        all_batches = []

        # Shuffle the list of retained lengths (in place, so the order carries over between epochs)
        self.rng.shuffle(self.unique_lengths)

        # Iterate over all retained word lengths
        for length in self.unique_lengths:

            # Get indices of all samples of the current length, e.g. all indices with a length of 8.
            # np.array makes a copy, so the stored index lists are never reordered.
            sequence_indices = np.array(self.samples[length])

            # Shuffle array of sequence indices
            self.rng.shuffle(sequence_indices)

            # Split into near-equal batches, none larger than batch_size
            num_batches = self._num_batches(len(sequence_indices))
            all_batches += np.array_split(sequence_indices, num_batches)

        # Shuffle list of all batches; this shuffles the order of batches but keeps their internal structure the same
        self.rng.shuffle(all_batches)
        for batch in all_batches:
            yield np.asarray(batch)

    def show_batches(self):
        '''
        Print, for each retained word length in ascending order: the number of samples
        with that length, the number of batches they are split into, and the resulting
        average batch size.
        '''
        print(f'Length    # Samples    # Batches    Avg Batch Size')
        # Sort for display only; self.unique_lengths itself is left in its current (shuffled) order.
        for length in sorted(self.unique_lengths):
            num_samples = len(self.samples[length])
            num_batches = self._num_batches(num_samples)
            average = num_samples / num_batches
            print(f'{length:>6} {num_samples:>12} {num_batches:>12.0f} {average:>12.1f}')

    def dropped_samples(self):
        '''
        Return a dictionary mapping each dropped length to the list of dataset indices
        with that length, ordered by ascending length.
        '''
        return {length: self.samples[length] for length in sorted(self.small_samples)}

    def show_dropped_samples(self):
        '''
        Print the word lengths that were dropped from the dataset, and the total number of words of each length.
        '''
        dropped_samples = self.dropped_samples()
        print('Dropped Samples \n')
        print('Length    # Samples')
        for key, value in dropped_samples.items():
            print(f'{key:>6} {len(value):>12}')

In [None]:
train_sampler = EqualLengthsBatchSampler(train_ds, batch_size, seed)
val_sampler = EqualLengthsBatchSampler(val_ds, batch_size, seed)
uniform_train_sampler = EqualLengthsBatchSampler(uniform_train_ds, batch_size, seed)
uniform_val_sampler = EqualLengthsBatchSampler(uniform_val_ds, batch_size, seed)

In [None]:
train_sampler.show_batches()

Length    # Samples    # Batches    Avg Batch Size
     1           72            2         36.0
     2         1268           20         63.4
     3         8924          140         63.7
     4        23710          371         63.9
     5        44038          689         63.9
     6        59124          924         64.0
     7        71245         1114         64.0
     8        75507         1180         64.0
     9        65352         1022         63.9
    10        50168          784         64.0
    11        34973          547         63.9
    12        24382          381         64.0
    13        15996          250         64.0
    14        10412          163         63.9
    15         6930          109         63.6
    16         4409           69         63.9
    17         2812           44         63.9
    18         1651           26         63.5
    19          972           16         60.8
    20          488            8         61.0
    21          291          

In [None]:
train_sampler.show_dropped_samples()

Dropped Samples 

Length    # Samples
    24           29
    25           19
    26            6
    27            4
    31            1


In [None]:
uniform_train_sampler.show_batches()

Length    # Samples    # Batches    Avg Batch Size
     2          386            7         55.1
     3         2890           46         62.8
     4         7774          122         63.7
     5        14917          234         63.7
     6        18638          292         63.8
     7        21184          331         64.0
     8        21619          338         64.0
     9        18055          283         63.8
    10        13649          214         63.8
    11         9308          146         63.8
    12         6137           96         63.9
    13         3632           57         63.7
    14         2292           36         63.7
    15         1417           23         61.6
    16          894           14         63.9
    17          530            9         58.9
    18          293            5         58.6
    19          199            4         49.8
    20           77            2         38.5
    21           48            1         48.0


In [None]:
uniform_train_sampler.show_dropped_samples()

Dropped Samples 

Length    # Samples
     1           20
    22           20
    23           10
    24            4
    25            4
    27            3


In [None]:
train_sampler = EqualLengthsBatchSampler(train_ds, batch_size, seed)
val_sampler = EqualLengthsBatchSampler(val_ds, batch_size, seed)

In [None]:
train_dl = DataLoader(train_ds, batch_sampler=train_sampler)
val_dl = DataLoader(val_ds, batch_sampler=val_sampler)
test_dl = DataLoader(test_ds, batch_size=1, shuffle=False)

In [None]:
# Demo of unseeded sampling: with seed=None the batches differ on every run.
# NOTE(review): this rebinds val_sampler / val_dl to the unseeded versions, so any later
# cell using val_dl gets non-deterministic batches unless they are re-created with a seed.
val_sampler = EqualLengthsBatchSampler(val_ds, batch_size, seed=None)
val_dl = DataLoader(val_ds, batch_sampler=val_sampler)
for epoch in range(3):
  print(epoch)
  for i, batch in enumerate(val_dl):
    # Only show every 100th batch
    if i % 100 != 0:
      continue
    # batch[3] is the fourth field of the entries; print its first 10 items
    print(batch[3][:10])
  print()

0
('Oljebyte', 'b{lIsk3t', 'Dimotici', 't5t{l@tI', 'St&firUN', 'qhqudrnd', 'h6grAvid', 'orlqnemf', 'kaJmakam', 'parasito')
('hvjtkd', 'spr5@r', '=&p@ln', 'glfRma', 'bOll4k', 'fOl*m&', ')3s7iT', 'wjarja', 'BftB@n', 'p3rk@t')
('fOXkINsInstIt3t', 'dir@tLndirEkt|r', 'pOstseG@lMtomat', 'eGalizatsifOnts', 'kOntraIndikatsi', 'pOstseG@lhAnd@l', 'apUt4kstEknIker', 'xElvf9rgl9melse', 'EktenskApsanb}d', 'v@rniwINsidejal')
('kOnUkO', 'p#sw3d', 'krEwso', 'kOyOtE', 'kORTEL', 'rhwleh', 'vudqkd', 'vlffma', 'jok@r@', 'dr5be9')
('EnxAbOnAdO', 'qlrudwowjr', 't4tragOnAl', 'xl1xolivje', 'vEkstmetud', 'eksomalino', 'krOsk}ntri', 'fluttuante', 'Opdrag@ls@', 'xUstAmEntE')
('eoqhxkd', 'br}dg0m', 'peretti', 'SmEl=@n', 'rAmsvAT', 'aNnblAd', 'g@Sv&=@', 'YpIxkWt', 'santone', 'sepjat3')
('wjsle3ek', 'sapunaDa', 'kvElsb9n', 'vormal@x', 'Sl]khAls', 'kled@bun', 'sxElpfIs', 'hoSteSka', 'BsrXx@rn', 'p3g@t@rI')
('akupunKt5ra', 'aftoelehxos', 'tOpres0ltAt', '3ventyZfIlm', 'agresIvIt4t', 'n1raz73kov3', 'roSpO9)edlo', 'suxo

In [None]:
print(f'Batch Size: {batch_size}')
print(f'Train Samples: {len(train_ds):<8}  Val Samples: {len(val_ds):<6}  Test Samples: {len(test_ds):<6}')
print(f'Train Batches: {len(train_dl):<8}  Val Batches: {len(val_dl):<6}  Test Batches: {len(test_dl):<6}')

Batch Size: 64
Train Samples: 502994    Val Samples: 62874   Test Samples: 62874 
Train Batches: 7870      Val Batches: 991     Test Batches: 62874 


In [None]:
# For every held-out language, report which sample lengths the sampler drops
# from the corresponding leave-one-out train set, and how many samples in total.
for lang in languages:
  print(lang[1])
  loo_train_path = os.path.join(data_folder, 'Original', 'LOO', f'LOO_{lang[1]}_Train_Original.pickle')
  with open(loo_train_path, 'rb') as f:
    loo_train_ds = pickle.load(f)

  loo_train_sampler = EqualLengthsBatchSampler(loo_train_ds, batch_size, seed)
  loo_train_sampler.show_dropped_samples()

  # dropped_samples() maps each dropped length to its list of sample indices
  drops = loo_train_sampler.dropped_samples()
  print('Total:', sum(map(len, drops.values())))
  print()

Czech
Dropped Samples 

Length    # Samples
    25           19
    26            5
    27            4
    31            1
Total: 29

Dutch
Dropped Samples 

Length    # Samples
    23           25
    24           13
    25            5
    26            2
    27            1
Total: 46

English
Dropped Samples 

Length    # Samples
    25           19
    26            6
    27            3
    31            1
Total: 29

French
Dropped Samples 

Length    # Samples
    25           19
    26            6
    27            3
    31            1
Total: 29

German
Dropped Samples 

Length    # Samples
    24           31
    25           15
    26            6
    27            3
    31            1
Total: 56

Greek
Dropped Samples 

Length    # Samples
    25           19
    26            6
    27            4
    31            1
Total: 30

Italian
Dropped Samples 

Length    # Samples
    25           19
    26            4
    27            3
    31            1
Total: 27

Korean
Dr

In [None]:
# For every held-out language, report which sample lengths the sampler drops
# from the corresponding leave-one-out val set, and how many samples in total.
for lang in languages:
  print(lang[1])
  loo_val_path = os.path.join(data_folder, 'Original', 'LOO', f'LOO_{lang[1]}_Val_Original.pickle')
  with open(loo_val_path, 'rb') as f:
    loo_val_ds = pickle.load(f)

  loo_val_sampler = EqualLengthsBatchSampler(loo_val_ds, batch_size, seed)
  loo_val_sampler.show_dropped_samples()

  # dropped_samples() maps each dropped length to its list of sample indices
  drops = loo_val_sampler.dropped_samples()
  print('Total:', sum(map(len, drops.values())))
  print()

Czech
Dropped Samples 

Length    # Samples
     1            6
    18           27
    19           19
    20            9
    21            6
    22            1
    24            1
Total: 69

Dutch
Dropped Samples 

Length    # Samples
     1           10
    24           21
    25           14
    26            4
    27            3
    31            1
Total: 53

English
Dropped Samples 

Length    # Samples
     1            8
    18            3
    19            4
    17           15
Total: 30

French
Dropped Samples 

Length    # Samples
     1           12
    17           17
    18            7
    19            3
    21            1
Total: 40

German
Dropped Samples 

Length    # Samples
     1           10
    20           23
    21           19
    22            7
    23            1
    24            1
    25            2
    27            1
Total: 64

Greek
Dropped Samples 

Length    # Samples
     1            3
    18           25
    19           15
    20           

In [None]:
# Measure what the DataLoader actually yields: per sample length, the number of samples,
# the number of batches and the average batch size. This is the empirical counterpart of
# EqualLengthsBatchSampler.show_batches().
# NOTE(review): loo_train_ds is whatever an earlier cell loaded last -- confirm it is the
# dataset you mean to inspect.
loo_train_sampler = EqualLengthsBatchSampler(loo_train_ds, batch_size, seed)
loo_train_dl = DataLoader(loo_train_ds, batch_sampler=loo_train_sampler)

# Collect the size of every batch, keyed by the sample length of that batch.
# No fixed upper bound on the length is assumed, so longer words cannot raise a KeyError.
# presumably batch[0] is collated to (batch, length), so len(batch[0]) is the batch size
# and len(batch[0][0]) the sample length -- TODO confirm
batch_sizes_by_length = defaultdict(list)
for batch in loo_train_dl:
  batch_sizes_by_length[len(batch[0][0])].append(len(batch[0]))

# Summarise in ascending order of length; only lengths that actually occurred are present.
batches = {
    length: {'# Samples': sum(sizes),
             '# Batches': len(sizes),
             'Avg Batch Size': sum(sizes) / len(sizes)}
    for length, sizes in sorted(batch_sizes_by_length.items())
}

In [None]:
# Print the per-length statistics held in `batches` as a right-aligned table,
# preceded by the display name of the current `lang`.
print(lang[1])
print('Length    # Samples    # Batches    Avg Batch Size')
for length, stats in batches.items():
    n_samples = stats['# Samples']
    n_batches = stats['# Batches']
    avg_size = stats['Avg Batch Size']
    print(f"{length:>6} {n_samples:>12} {n_batches:>12.0f} {avg_size:>12.1f}")

Turkish
Length    # Samples    # Batches    Avg Batch Size
     1           71            2         35.5
     2         1251           20         62.5
     3         8618          135         63.8
     4        22989          360         63.9
     5        42226          660         64.0
     6        58253          911         63.9
     7        71046         1111         63.9
     8        75661         1183         64.0
     9        66163         1034         64.0
    10        50936          796         64.0
    11        35714          559         63.9
    12        24983          391         63.9
    13        16311          255         64.0
    14        10622          166         64.0
    15         7025          110         63.9
    16         4468           70         63.8
    17         2887           46         62.8
    18         1697           27         62.9
    19          982           16         61.4
    20          518            9         57.6
    21          297  

In [None]:
# Print the current language's display name, then let a freshly built sampler
# print its own Length / # Samples / # Batches / Avg Batch Size table.
# NOTE(review): `lang` and `loo_train_ds` are not defined in this cell; they
# must already exist in the kernel from earlier cells.
print(lang[1])
loo_train_sampler = EqualLengthsBatchSampler(loo_train_ds, batch_size, seed)
loo_train_sampler.show_batches()

Turkish
Length    # Samples    # Batches    Avg Batch Size
     1           71            2         35.5
     2         1251           20         62.5
     3         8618          135         63.8
     4        22989          360         63.9
     5        42226          660         64.0
     6        58253          911         63.9
     7        71046         1111         63.9
     8        75661         1183         64.0
     9        66163         1034         64.0
    10        50936          796         64.0
    11        35714          559         63.9
    12        24983          391         63.9
    13        16311          255         64.0
    14        10622          166         64.0
    15         7025          110         63.9
    16         4468           70         63.8
    17         2887           46         62.8
    18         1697           27         62.9
    19          982           16         61.4
    20          518            9         57.6
    21          297  

In [None]:
# Sum the '# Samples' entry of every length in `batches`, then show the
# language name and the resulting total.
total = 0
for length, stats in batches.items():
  total += stats['# Samples']
print(lang[1])
total

Turkish


502935

In [None]:
# For every language in each family, load that language's family-level
# training pickle and report the samples EqualLengthsBatchSampler drops,
# followed by the total number dropped.
for family in families:
  family_name, family_langs = family
  for lang in family_langs:
    print(lang[1])

    # File name pattern: <Family>_<Language>_Train_Original.pickle
    family_train_path = os.path.join(data_folder, 'Original', 'Family',
                                     f'{family_name}_{lang[1]}_Train_Original.pickle')
    with open(family_train_path, 'rb') as f:
      family_train_ds = pickle.load(f)

    family_train_sampler = EqualLengthsBatchSampler(family_train_ds, batch_size, seed)
    family_train_sampler.show_dropped_samples()

    # The total is the summed size of every value returned by dropped_samples().
    drops = family_train_sampler.dropped_samples()
    print('Total:', sum(len(val) for val in drops.values()))
    print()

Dutch
Dropped Samples 

Length    # Samples
     1           18
    20           23
    21           19
    22            7
    23            1
    24            1
    25            2
    27            1
Total: 72

English
Dropped Samples 

Length    # Samples
     1           20
    24           22
    25           16
    26            4
    27            4
    31            1
Total: 67

German
Dropped Samples 

Length    # Samples
     1           18
    24           21
    25           14
    26            4
    27            3
    31            1
Total: 61

French
Dropped Samples 

Length    # Samples
     1            4
    20           16
    21            9
    22            6
    23            1
    24            1
    26            2
Total: 39

Italian
Dropped Samples 

Length    # Samples
     1           12
    17           17
    18            7
    19            3
    21            1
Total: 40

Spanish
Dropped Samples 

Length    # Samples
     1           16
    20       

In [None]:
# Validation counterpart of the family report: for every language in each
# family, load its family-level validation pickle and print the samples that
# EqualLengthsBatchSampler drops, plus how many were dropped overall.
for family in families:
  for lang in family[1]:
    print(lang[1])

    # File name pattern: <Family>_<Language>_Val_Original.pickle
    family_val_file = f'{family[0]}_{lang[1]}_Val_Original.pickle'
    family_val_path = os.path.join(data_folder, 'Original', 'Family', family_val_file)
    with open(family_val_path, 'rb') as f:
      family_val_ds = pickle.load(f)

    family_val_sampler = EqualLengthsBatchSampler(family_val_ds, batch_size, seed)
    family_val_sampler.show_dropped_samples()

    # Count the entries of each value returned by dropped_samples() and add them up.
    drops = family_val_sampler.dropped_samples()
    dropped_counts = [len(val) for val in drops.values()]
    print('Total:', sum(dropped_counts))
    print()

Dutch
Dropped Samples 

Length    # Samples
     1           10
    24           21
    25           14
    26            4
    27            3
    31            1
Total: 53

English
Dropped Samples 

Length    # Samples
     1            8
    18            3
    19            4
    17           15
Total: 30

German
Dropped Samples 

Length    # Samples
     1           10
    20           23
    21           19
    22            7
    23            1
    24            1
    25            2
    27            1
Total: 64

French
Dropped Samples 

Length    # Samples
     1           12
    17           17
    18            7
    19            3
    21            1
Total: 40

Italian
Dropped Samples 

Length    # Samples
     1            4
    20           16
    21            9
    22            6
    23            1
    24            1
    26            2
Total: 39

Spanish
Dropped Samples 

Length    # Samples
     2            9
    14           20
    15           13
Total: 42

No