In [25]:
import numpy as np
from wordfreq import iter_wordlist, word_frequency
import time

class FastWordSampler:
    """Draw random words with probability proportional to their frequency.

    The constructor walks the first ``max_words`` entries yielded by
    ``iter_wordlist(language)``, looks up each one with ``word_frequency``
    and stores the running frequency total, normalised so the last entry is
    1.0. Sampling is then a binary search of uniform draws in that table.

    Attributes:
        words: the words that were loaded, in the order they were yielded.
        language: the language code passed to the constructor.
        cumulative_probs: NumPy array, same length as ``words``, holding the
            normalised running total of the word frequencies.
    """

    def __init__(self, language='en', max_words=100000):
        """Load the word list and build the cumulative probability table.

        Args:
            language: language code forwarded to ``iter_wordlist`` and
                ``word_frequency``.
            max_words: upper bound on how many words are loaded.

        Raises:
            ValueError: if no words were loaded or their frequencies do not
                add up to a positive number, because no distribution can be
                built from that.
        """
        self.words = []
        self.language = language

        running_totals = []
        total_freq = 0
        for i, word in enumerate(iter_wordlist(language)):
            if i >= max_words:
                break
            total_freq += word_frequency(word, language)
            self.words.append(word)
            running_totals.append(total_freq)

        # Without this guard the division below yields an empty or NaN table
        # and sample() later fails with an IndexError or silently returns the
        # first word every time.
        if not self.words or not total_freq > 0:
            raise ValueError(
                f"Cannot build a sampler for language {language!r} with "
                f"max_words={max_words!r}: no words with positive frequency"
            )

        self.cumulative_probs = np.array(running_totals) / total_freq

    def sample(self, n_samples, rng=None):
        """Return ``n_samples`` words drawn according to their frequencies.

        Args:
            n_samples: number of words to draw.
            rng: optional ``numpy.random.Generator`` (any object exposing
                ``random(n)``) used for the uniform draws. When omitted the
                global ``np.random`` state is used, so ``np.random.seed``
                still controls the result.

        Returns:
            A list of ``n_samples`` words (repeats are possible).
        """
        if rng is None:
            random_values = np.random.random(n_samples)
        else:
            random_values = rng.random(n_samples)
        # The draws lie in [0, 1) and the table ends at exactly 1.0, so every
        # index returned by the search is a valid position in self.words.
        indices = np.searchsorted(self.cumulative_probs, random_values)
        return [self.words[i] for i in indices]

# Seed NumPy's global generator, which FastWordSampler.sample draws from by
# default, so that a fresh "Restart & Run All" reproduces the same sample and
# therefore the same downstream phoneme / image outputs.
np.random.seed(0)

sampler = FastWordSampler()
samples = sampler.sample(500)

# Optional: Check frequencies of sampled words
# from collections import Counter

# word_counts = Counter(samples)
# print("\nTop 10 most frequently sampled words:")
# for word, count in word_counts.most_common(10):
#     print(f"{word}: {count} times (true frequency: {word_frequency(word, 'en'):.6f})")

Time taken to sample 50,000 words: 0.0004 seconds
First 10 sampled words: ['the', 'the', 'whereas', 'play', 'the', 'more', 'to', 'possible', 'inbound', 'at']

Top 10 most frequently sampled words:
the: 29 times (true frequency: 0.053700)
and: 19 times (true frequency: 0.025700)
to: 17 times (true frequency: 0.026900)
of: 13 times (true frequency: 0.025100)
is: 7 times (true frequency: 0.011700)
in: 7 times (true frequency: 0.018600)
a: 7 times (true frequency: 0.022900)
on: 7 times (true frequency: 0.008130)
so: 6 times (true frequency: 0.003310)
for: 5 times (true frequency: 0.010200)


In [26]:
from g2p_en import G2p

# Grapheme-to-phoneme converter, applied to each sampled word in order
g2p = G2p()
words_and_phonemes = list(zip(samples, map(g2p, samples)))

# Preview: the first ten words with their phonemes separated by spaces
print("\nFirst 10 sampled words and their phonemes:")
for word, phonemes in words_and_phonemes[:10]:
    phoneme_text = ' '.join(phonemes)
    print(f"{word}: {phoneme_text}")


First 10 sampled words and their phonemes:
the: DH AH0
the: DH AH0
whereas: W EH0 R AE1 Z
play: P L EY1
the: DH AH0
more: M AO1 R
to: T UW1
possible: P AA1 S AH0 B AH0 L
inbound: IH0 N B AW1 N D
at: AE1 T


In [28]:
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict

def _next_phoneme_index():
    """Return the index for a phoneme that has not been seen yet.

    Indices start at 1: the factory runs before the new key is inserted, so
    the first unseen phoneme sees an empty mapping and gets 0 + 1.
    """
    return len(phoneme_to_index) + 1


# Phoneme -> integer index, growing on first lookup of each new phoneme
phoneme_to_index = defaultdict(_next_phoneme_index)


def encode_word(phonemes):
    """Map a sequence of phonemes to their integer indices.

    Phonemes not yet present in ``phoneme_to_index`` are added to it as a
    side effect of the lookup.
    """
    indices = []
    for phoneme in phonemes:
        indices.append(phoneme_to_index[phoneme])
    return indices

# Encode all phonemes from each word in the data.
# The dtype is pinned to int64 because torch.tensor([]) would otherwise infer
# float32 for a word with no phonemes; pad_sequence could then return a float
# tensor, which one_hot rejects.
encoded_phonemes = [
    torch.tensor(encode_word(phonemes), dtype=torch.long)
    for _, phonemes in words_and_phonemes
]

# Pad sequences to the same length (index 0 is never assigned to a phoneme,
# so it is free to act as the padding value)
padded_sequences = pad_sequence(encoded_phonemes, batch_first=True, padding_value=0)

# Create one-hot encodings; the +1 accounts for the padding class 0
vocab_size = len(phoneme_to_index) + 1
one_hot_encoded = torch.nn.functional.one_hot(padded_sequences, num_classes=vocab_size).float()

# Create a list of (word, one-hot encoded tensor) pairs
encoded_data = [(word, one_hot) for (word, _), one_hot in zip(words_and_phonemes, one_hot_encoded)]

# Print the result for one example word: index 100, or the last word when
# fewer than 101 words were encoded
example_index = min(100, len(encoded_data) - 1)
example_word, example_one_hot = encoded_data[example_index]
print(f"Word: {example_word}")
print(f"Shape of one-hot encoded tensor: {example_one_hot.shape}")
print(f"One-hot encoded tensor:\n{example_one_hot}")

Word: websites
Shape of one-hot encoded tensor: torch.Size([14, 60])
One-hot encoded tensor:
tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,

In [None]:
# Create DataLoader
from torch.utils.data import DataLoader, TensorDataset

# Split the (word, tensor) pairs back into two parallel tuples
words, encodings = zip(*encoded_data)

# Stack the per-word encodings into one tensor, wrap it in a dataset and
# serve it in shuffled mini-batches
batch_size = 10
dataset = TensorDataset(torch.stack(encodings))
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Example of iterating through the DataLoader
# for batch in dataloader:
#     inputs = batch[0]
#     print(f"Batch shape: {inputs.shape}")
#     break

In [29]:
from PIL import Image, ImageDraw, ImageFont

def text_to_image_tensor(
        words: list=["text"], savepath=None, index=1, mirror=False,
        fontname='Arial', W = 64, H = 64, size=10, spacing=0,
        xshift=0, yshift=-3, upper=False, invert=False, show=None
    ):
    """Render each word onto its own grayscale canvas and return the tensors.

    Every word is drawn letter by letter in white on a ``W`` x ``H`` mode
    "L" image filled with gray level 10, roughly centred and then offset by
    ``xshift`` / ``yshift``.

    Args:
        words: iterable of strings to render.
        savepath: if truthy, each image is also saved as
            ``{savepath}/{word}.jpg`` (using the transformed word).
        index, mirror, show: accepted but not used by this function.
        fontname: font name; ``'.ttf'`` is appended before loading.
        W, H: canvas width and height in pixels.
        size: font size passed to ``ImageFont.truetype``.
        spacing: extra horizontal gap added after each letter.
        xshift, yshift: offsets applied to the centred starting position.
        upper: upper-case the word before rendering.
        invert: reverse the character order of the word before rendering.

    Returns:
        A list of ``(word, tensor)`` pairs, one per input word, where
        ``word`` is the string after the ``upper`` / ``invert`` transforms
        and ``tensor`` is the image converted through ``np.array`` and
        ``torch.from_numpy``.

    Raises:
        ValueError: if the rendered text does not fit the canvas width.
    """
    words = list(words)
    if not words:
        # Nothing to render, so do not require the font file to exist.
        return []

    # The font depends only on the arguments, so load it once, not per word.
    fnt = ImageFont.truetype(fontname+'.ttf', size)

    tensors = []

    for word in words:
        if upper: word = word.upper()
        if invert: word = word[::-1]

        img = Image.new("L", (W,H), color=10)
        draw = ImageDraw.Draw(img)

        # Measure every glyph once; widths are reused when advancing x below.
        boxes = [fnt.getbbox(l) for l in word]
        letter_widths = [box[2] - box[0] for box in boxes]
        letter_heights = [box[3] - box[1] for box in boxes]

        # Starting word anchor
        if word:
            w = sum(letter_widths) + spacing * (len(word) - 1)
            h = sum(letter_heights) / len(word)  # mean glyph height
        else:
            # An empty word has no glyphs: leave the canvas blank instead of
            # dividing by zero when averaging the glyph heights.
            w = h = 0
        h_anchor = (W - w) / 2
        v_anchor = (H - h) / 2

        x, y = (xshift + h_anchor, yshift + v_anchor)

        for l, letter_w in zip(word, letter_widths):
            draw.text((x,y), l, font=fnt, fill="white")
            x += letter_w + spacing

        # x now sits one `spacing` past the right edge of the last letter.
        if x > (W + spacing + 2) or (xshift + h_anchor) < -1:
            raise ValueError(f"Text width is bigger than image. Failed on size:{size}")

        if savepath:
            img.save(f"{savepath}/{word}.jpg")

        img_np = np.array(img)
        img_tensor = torch.from_numpy(img_np)
        tensors.append((word, img_tensor))

    return tensors

# Render the first ten sampled words with the default font and canvas settings
tensors = text_to_image_tensor(words=samples[:10])

In [32]:
# Report the shape of the image tensor rendered for each word
print("\nFirst 10 sampled words and their image tensors:")
for label, image in tensors:
    print(f"{label}: {image.shape}")


First 10 sampled words and their image tensors:
the: torch.Size([64, 64])
the: torch.Size([64, 64])
whereas: torch.Size([64, 64])
play: torch.Size([64, 64])
the: torch.Size([64, 64])
more: torch.Size([64, 64])
to: torch.Size([64, 64])
possible: torch.Size([64, 64])
inbound: torch.Size([64, 64])
at: torch.Size([64, 64])
