In [1]:
import numpy as np
from wordfreq import iter_wordlist, word_frequency

class FastWordSampler:
    """Draw random words with probability proportional to their frequency.

    The constructor walks the first ``max_words`` entries yielded by
    ``iter_wordlist(language)``, accumulates their ``word_frequency`` values
    and stores the running totals normalised by the grand total. ``sample``
    then performs inverse-CDF lookups on that array.
    """

    def __init__(self, language='en', max_words=50000):
        self.language = language

        vocabulary = []
        running_totals = []
        mass = 0
        for position, token in enumerate(iter_wordlist(language)):
            # Stop once the requested number of entries has been collected.
            if position >= max_words:
                break
            mass += word_frequency(token, language)
            vocabulary.append(token)
            running_totals.append(mass)

        self.words = vocabulary
        # Normalise by the total mass so the values act as a cumulative distribution.
        self.cumulative_probs = np.array(running_totals) / mass

    def sample(self, n_samples):
        """Return a list of ``n_samples`` words drawn from the distribution."""
        draws = np.random.random(n_samples)
        # Each uniform draw maps to the first cumulative value that is >= it.
        positions = np.searchsorted(self.cumulative_probs, draws)
        return [self.words[pos] for pos in positions]

# Build the sampler with its defaults (language 'en', first 50000 word-list
# entries) and draw 500 words. No random seed is set in this cell, so
# `samples` changes from run to run.
sampler = FastWordSampler()
samples = sampler.sample(500)

# Optional: Check frequencies of sampled words
# from collections import Counter

# word_counts = Counter(samples)
# print("\nTop 10 most frequently sampled words:")
# for word, count in word_counts.most_common(10):
#     print(f"{word}: {count} times (true frequency: {word_frequency(word, 'en'):.6f})")

In [2]:
from g2p_en import G2p

# Instantiate the grapheme-to-phoneme converter and pair every sampled word
# with the phoneme sequence it returns. `samples` comes from the sampling cell.
g2p = G2p()
words_and_phonemes = [(word, g2p(word)) for word in samples]

# Preview: print the first ten words with their phonemes joined by spaces.
print("\nFirst 10 sampled words and their phonemes:")
for word, phonemes in words_and_phonemes[:10]:
    print(f"{word}: {' '.join(phonemes)}")


First 10 sampled words and their phonemes:
single: S IH1 NG G AH0 L
it: IH1 T
corporations: K AO2 R P ER0 EY1 SH AH0 N Z
posted: P OW1 S T IH0 D
fifth: F IH1 F TH
this: DH IH1 S
had: HH AE1 D
in: IH0 N
quasi: K W AA1 S IY0
to: T UW1


In [3]:
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import defaultdict

# Phoneme -> integer id lookup. A phoneme that has not been seen yet receives
# the next free id on first access, starting at 1.
phoneme_to_index = defaultdict(lambda: 1 + len(phoneme_to_index))


def encode_word(phonemes):
    """Translate one word's phoneme sequence into a list of integer ids.

    Looking up a phoneme for the first time registers it in the module-level
    ``phoneme_to_index`` mapping as a side effect.
    """
    ids = []
    for symbol in phonemes:
        ids.append(phoneme_to_index[symbol])
    return ids

# Encode all phonemes from each word in the data. The dtype is pinned to long
# because one_hot() only accepts integer index tensors, and an empty phoneme
# list would otherwise become a float32 tensor.
encoded_phonemes = [
    torch.tensor(encode_word(phonemes), dtype=torch.long)
    for _, phonemes in words_and_phonemes
]

# Pad sequences to the same length (index 0 is the padding value)
padded_sequences = pad_sequence(encoded_phonemes, batch_first=True, padding_value=0)

# Create one-hot encodings; the +1 makes room for the padding class 0
vocab_size = len(phoneme_to_index) + 1
one_hot_encoded = torch.nn.functional.one_hot(padded_sequences, num_classes=vocab_size).float()

# Create a list of (word, one-hot encoded tensor) pairs
encoded_data = [(word, one_hot) for (word, _), one_hot in zip(words_and_phonemes, one_hot_encoded)]

# Print the result for one example word: index 100, or the last word when
# fewer than 101 words were sampled.
example_index = min(100, len(encoded_data) - 1)
print(f"Word: {encoded_data[example_index][0]}")
print(f"Shape of one-hot encoded tensor: {encoded_data[example_index][1].shape}")
print(f"One-hot encoded tensor:\n{encoded_data[example_index][1]}")

Word: valley
Shape of one-hot encoded tensor: torch.Size([14, 58])
One-hot encoded tensor:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 

In [4]:
# Create DataLoader
from torch.utils.data import DataLoader, TensorDataset

# Split the (word, tensor) pairs into two parallel tuples
words, encodings = zip(*encoded_data)

# Stack the per-word tensors along a new leading dimension and wrap them
# in a single-tensor TensorDataset
dataset = TensorDataset(torch.stack(list(encodings)))

# Serve the dataset in shuffled mini-batches of `batch_size` items
batch_size = 10
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Example of iterating through the DataLoader
# for batch in dataloader:
#     inputs = batch[0]
#     print(f"Batch shape: {inputs.shape}")
#     break

In [5]:
from PIL import Image, ImageDraw, ImageFont

def text_to_image_tensor(
        words: list=["text"], savepath=None, index=1, mirror=False,
        fontname='Arial', W = 64, H = 64, size=10, spacing=0,
        xshift=0, yshift=-3, upper=False, invert=False, show=None
    ):
    """Render each word as a centred greyscale image and return it as a tensor.

    Args:
        words: Words to render; one image is produced per word.
        savepath: Optional directory. When truthy, each image is also written
            to ``{savepath}/{word}.jpg``.
        index, mirror, show: Accepted for interface compatibility; unused.
        fontname: Font name; ``fontname + '.ttf'`` is passed to
            ``ImageFont.truetype``.
        W, H: Image width and height in pixels.
        size: Font size passed to ``ImageFont.truetype``.
        spacing: Extra horizontal pixels inserted between letters.
        xshift, yshift: Offsets added to the centred starting position.
        upper: Upper-case the word before rendering.
        invert: Reverse the character order before rendering.

    Returns:
        A list of ``(word, tensor)`` pairs. ``word`` is the text as rendered
        (after ``upper``/``invert``); ``tensor`` is the mode-"L" image
        converted with ``np.array`` and ``torch.from_numpy``.

    Raises:
        ValueError: If any word is empty, or if the rendered text does not
            fit within the image width.
    """
    words = list(words)

    # Validate up front: an empty word would otherwise divide by zero when the
    # mean letter height is computed, possibly after earlier images were saved.
    if any(len(word) == 0 for word in words):
        raise ValueError("Cannot render an empty word")

    if not words:
        return []

    # The font is the same for every word, so load it once.
    fnt = ImageFont.truetype(fontname+'.ttf', size)

    tensors = []

    for word in words:
        if upper: word = word.upper()
        if invert: word = word[::-1]

        img = Image.new("L", (W,H), color=10)
        draw = ImageDraw.Draw(img)

        # Measure each letter once; the widths are reused when drawing.
        boxes = [fnt.getbbox(l) for l in word]
        letter_widths = [box[2] - box[0] for box in boxes]
        letter_heights = [box[3] - box[1] for box in boxes]

        # Starting word anchor: total width (incl. spacing) and mean letter
        # height are used to centre the word on the canvas.
        w = sum(letter_widths) + spacing * (len(word) - 1)
        h = sum(letter_heights) / len(word)
        h_anchor = (W - w) / 2
        v_anchor = (H - h) / 2

        x, y = (xshift + h_anchor, yshift + v_anchor)

        # Draw letter by letter so `spacing` can be applied between glyphs.
        for l, letter_w in zip(word, letter_widths):
            draw.text((x,y), l, font=fnt, fill="white")
            x += letter_w + spacing

        # Reject text that runs past the right edge or starts left of the canvas.
        if x > (W + spacing + 2) or (xshift + h_anchor) < -1:
            raise ValueError(f"Text width is bigger than image. Failed on size:{size}")

        if savepath:
            img.save(f"{savepath}/{word}.jpg")

        img_np = np.array(img)
        img_tensor = torch.from_numpy(img_np)
        tensors.append((word, img_tensor))

    return tensors

# Render the first ten words with the function's default font, size and canvas.
tensors = text_to_image_tensor(words[:10])

In [6]:
# List each rendered word next to the shape of its image tensor.
print("\nFirst 10 sampled words and their image tensors:")
for word, tensor in tensors:
    print(f"{word}: {tensor.shape}")


First 10 sampled words and their image tensors:
but: torch.Size([64, 64])
neighbors: torch.Size([64, 64])
new: torch.Size([64, 64])
hats: torch.Size([64, 64])
0.0: torch.Size([64, 64])
to: torch.Size([64, 64])
can: torch.Size([64, 64])
the: torch.Size([64, 64])
supervising: torch.Size([64, 64])
this: torch.Size([64, 64])


In [34]:
import numpy as np
from collections import defaultdict

from g2p_en import G2p
from PIL import Image, ImageDraw, ImageFont
from wordfreq import iter_wordlist, word_frequency

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

""" Notes on naming:

"phoneme tensors" are one-hot tensors of a list of phonemes for a single word
"grapheme tensors" are 2D (64x64) single-channel image tensors, one per word

"""

""" Sample words based on frequency """

class WordSampler:
    """Frequency-weighted random word sampler with a fixed sample size.

    On construction the first ``max_words`` entries of
    ``iter_wordlist(language)`` are read and their ``word_frequency`` values
    are accumulated into a normalised cumulative array. ``sample`` draws
    ``word_count`` words from it.
    """

    def __init__(self, word_count, language='en', max_words=100000):
        self.word_count = word_count
        self.language = language

        vocabulary, running_totals = [], []
        mass = 0
        for position, token in enumerate(iter_wordlist(language)):
            # Only the first `max_words` entries are kept.
            if position >= max_words:
                break
            mass += word_frequency(token, language)
            vocabulary.append(token)
            running_totals.append(mass)

        self.words = vocabulary
        # Dividing by the total turns the running sums into cumulative probabilities.
        self.cumulative_probs = np.array(running_totals) / mass

    def sample(self):
        """Return a list of ``self.word_count`` words drawn by inverse-CDF lookup."""
        draws = np.random.random(self.word_count)
        positions = np.searchsorted(self.cumulative_probs, draws)
        return [self.words[pos] for pos in positions]


""" Generate phonemes for each word """

def text_to_phoneme(words: list, g2p: G2p):
    """Convert words into a padded, one-hot encoded phoneme tensor.

    Args:
        words: Words to transcribe.
        g2p: Callable that maps a word to its sequence of phoneme symbols.

    Returns:
        Float tensor of shape ``(len(words), longest_sequence, vocab_size)``.
        Class 0 marks padding; the other class ids are assigned in order of
        first appearance, so the mapping is specific to this call.
    """
    # Get list of phonemes for each word
    phonemes = [g2p(word) for word in words]

    # Map each phoneme to the next free id (starting at 1) on first lookup
    phoneme_to_index = defaultdict(lambda: len(phoneme_to_index) + 1)
    encoded_phonemes = [[phoneme_to_index[p] for p in phoneme] for phoneme in phonemes]

    # Pad sequences to the same length. dtype is pinned to long because
    # one_hot() needs integer indices and an empty phoneme list would
    # otherwise produce a float32 tensor.
    encoded_phonemes = [torch.tensor(lst, dtype=torch.long) for lst in encoded_phonemes]
    padded_sequences = pad_sequence(encoded_phonemes, batch_first=True, padding_value=0)

    # Create one-hot encodings for each phoneme (+1 for the padding class 0)
    vocab_size = len(phoneme_to_index) + 1
    phoneme_tensors = torch.nn.functional.one_hot(padded_sequences, num_classes=vocab_size).float()

    return phoneme_tensors


""" Create grapheme tensors for each word """

def text_to_grapheme(
        words: list=["text"], savepath=None, index=1, mirror=False,
        fontname='Arial', W = 64, H = 64, size=10, spacing=0,
        xshift=0, yshift=-3, upper=False, invert=False, show=None
    ):
    """Render each word as a centred greyscale image tensor.

    Args:
        words: Words to render; one image is produced per word.
        savepath: Optional directory. When truthy, each image is also written
            to ``{savepath}/{word}.jpg``.
        index, mirror, show: Accepted for interface compatibility; unused.
        fontname: Font name; ``fontname + '.ttf'`` is passed to
            ``ImageFont.truetype``.
        W, H: Image width and height in pixels.
        size: Font size passed to ``ImageFont.truetype``.
        spacing: Extra horizontal pixels inserted between letters.
        xshift, yshift: Offsets added to the centred starting position.
        upper: Upper-case the word before rendering.
        invert: Reverse the character order before rendering.

    Returns:
        A list with one tensor per word, each the mode-"L" image converted
        with ``np.array`` and ``torch.from_numpy``.

    Raises:
        ValueError: If any word is empty, or if the rendered text does not
            fit within the image width.
    """
    words = list(words)

    # Validate up front: an empty word would otherwise divide by zero when the
    # mean letter height is computed, possibly after earlier images were saved.
    if any(len(word) == 0 for word in words):
        raise ValueError("Cannot render an empty word")

    if not words:
        return []

    # The font is the same for every word, so load it once.
    fnt = ImageFont.truetype(fontname+'.ttf', size)

    tensors = []

    for word in words:
        if upper: word = word.upper()
        if invert: word = word[::-1]

        img = Image.new("L", (W,H), color=10)
        draw = ImageDraw.Draw(img)

        # Measure each letter once; the widths are reused when drawing.
        boxes = [fnt.getbbox(l) for l in word]
        letter_widths = [box[2] - box[0] for box in boxes]
        letter_heights = [box[3] - box[1] for box in boxes]

        # Starting word anchor: centre using total width and mean letter height.
        w = sum(letter_widths) + spacing * (len(word) - 1)
        h = sum(letter_heights) / len(word)
        h_anchor = (W - w) / 2
        v_anchor = (H - h) / 2

        x, y = (xshift + h_anchor, yshift + v_anchor)

        # Draw the word letter by letter
        for l, letter_w in zip(word, letter_widths):
            draw.text((x,y), l, font=fnt, fill="white")
            x += letter_w + spacing

        # Reject text that runs past the right edge or starts left of the canvas.
        if x > (W + spacing + 2) or (xshift + h_anchor) < -1:
            raise ValueError(f"Text width is bigger than image. Failed on size:{size}")

        if savepath:
            img.save(f"{savepath}/{word}.jpg")

        # Convert images to tensors
        img_np = np.array(img)
        img_tensor = torch.from_numpy(img_np)
        tensors.append(img_tensor)

    return tensors



class DataGenerator():
    """Sample words once and expose them as phoneme / grapheme DataLoaders.

    Attributes set by ``__init__``:
        word_count: Number of words sampled.
        batch_size: Batch size used by every DataLoader this object creates.
        savepath: Default directory for saving rendered images (may be None).
        sampler: The ``WordSampler`` used to draw the words.
        words: The sampled words.
        g2p: The ``G2p`` converter used for phoneme transcription.
    """

    def __init__(self, word_count=500, batch_size=10, savepath=None):
        self.word_count = word_count
        self.batch_size = batch_size
        # Kept so generate_graphemes() can fall back to it; this argument was
        # previously accepted but never stored.
        self.savepath = savepath
        self.sampler = WordSampler(word_count)
        self.words = self.sampler.sample()
        self.g2p = G2p()

    @staticmethod
    def text_to_phoneme(words: list, g2p: G2p):
        """Convert words into a padded, one-hot encoded phoneme tensor.

        Args:
            words: Words to transcribe.
            g2p: Callable that maps a word to its sequence of phoneme symbols.

        Returns:
            Float tensor of shape ``(len(words), longest_sequence, vocab_size)``.
            Class 0 marks padding; other ids are assigned in order of first
            appearance, so the mapping is specific to this call.
        """
        # Get list of phonemes for each word
        phonemes = [g2p(word) for word in words]

        # Map each phoneme to the next free id (starting at 1) on first lookup
        phoneme_to_index = defaultdict(lambda: len(phoneme_to_index) + 1)
        encoded_phonemes = [[phoneme_to_index[p] for p in phoneme] for phoneme in phonemes]

        # Pad sequences to the same length. dtype is pinned to long because
        # one_hot() needs integer indices and an empty phoneme list would
        # otherwise produce a float32 tensor.
        encoded_phonemes = [torch.tensor(lst, dtype=torch.long) for lst in encoded_phonemes]
        padded_sequences = pad_sequence(encoded_phonemes, batch_first=True, padding_value=0)

        # Create one-hot encodings for each phoneme (+1 for the padding class 0)
        vocab_size = len(phoneme_to_index) + 1
        phoneme_tensors = torch.nn.functional.one_hot(padded_sequences, num_classes=vocab_size).float()

        return phoneme_tensors

    @staticmethod
    def text_to_grapheme(
            words: list=["text"], savepath=None, index=1, mirror=False,
            fontname='Arial', W = 64, H = 64, size=10, spacing=0,
            xshift=0, yshift=-3, upper=False, invert=False, show=None
        ):
        """Render each word as a centred greyscale image tensor.

        ``index``, ``mirror`` and ``show`` are accepted but unused. When
        ``savepath`` is truthy each image is also written to
        ``{savepath}/{word}.jpg``.

        Returns:
            A list with one tensor per word.

        Raises:
            ValueError: If any word is empty, or if the rendered text does
                not fit within the image width.
        """
        words = list(words)

        # Fail fast: an empty word would otherwise divide by zero below.
        if any(len(word) == 0 for word in words):
            raise ValueError("Cannot render an empty word")

        if not words:
            return []

        # The font is the same for every word, so load it once.
        fnt = ImageFont.truetype(fontname+'.ttf', size)

        tensors = []
        for word in words:
            if upper: word = word.upper()
            if invert: word = word[::-1]

            img = Image.new("L", (W,H), color=10)
            draw = ImageDraw.Draw(img)

            # Measure each letter once; the widths are reused when drawing.
            boxes = [fnt.getbbox(l) for l in word]
            letter_widths = [box[2] - box[0] for box in boxes]
            letter_heights = [box[3] - box[1] for box in boxes]

            # Starting word anchor: centre using total width and mean letter height.
            w = sum(letter_widths) + spacing * (len(word) - 1)
            h = sum(letter_heights) / len(word)
            h_anchor = (W - w) / 2
            v_anchor = (H - h) / 2

            x, y = (xshift + h_anchor, yshift + v_anchor)

            # Draw the word letter by letter
            for l, letter_w in zip(word, letter_widths):
                draw.text((x,y), l, font=fnt, fill="white")
                x += letter_w + spacing

            if x > (W + spacing + 2) or (xshift + h_anchor) < -1:
                raise ValueError(f"Text width is bigger than image. Failed on size:{size}")

            if savepath:
                img.save(f"{savepath}/{word}.jpg")

            # Convert images to tensors
            img_np = np.array(img)
            img_tensor = torch.from_numpy(img_np)
            tensors.append(img_tensor)

        return tensors

    def generate_phonemes(self):
        """Build a shuffled DataLoader over the one-hot phoneme tensor.

        Prints the number of batches and the shape of one batch, then returns
        the DataLoader.
        """
        # Use the class's own implementation instead of a module-level global.
        phoneme_tensors = self.text_to_phoneme(self.words, self.g2p)
        phoneme_dataset = TensorDataset(phoneme_tensors)
        phoneme_dataloader = DataLoader(phoneme_dataset, batch_size=self.batch_size, shuffle=True)

        # analyse the dataloader
        print(f"Dataloader length: {len(phoneme_dataloader)}")
        for batch in phoneme_dataloader:
            print(f"Batch shape: {batch[0].shape}")
            break

        return phoneme_dataloader

    def generate_graphemes(self, savepath=None):
        """Build a shuffled DataLoader over the rendered word images.

        Args:
            savepath: Directory for saving the images; falls back to the
                ``savepath`` given to the constructor when None.

        Prints the shape of the first image tensor and returns the DataLoader.
        """
        if savepath is None:
            savepath = self.savepath

        grapheme_tensors = self.text_to_grapheme(self.words, savepath)
        # Stack into one tensor with a leading word dimension. Unpacking the
        # list into TensorDataset would treat every image as a separate
        # dataset field and index along image rows instead of words.
        grapheme_dataset = TensorDataset(torch.stack(grapheme_tensors))
        grapheme_dataloader = DataLoader(grapheme_dataset, batch_size=self.batch_size, shuffle=True)

        print(grapheme_tensors[0].shape)

        return grapheme_dataloader



# Smoke test: sample 50 words, batch them in tens and build the phoneme
# DataLoader, which prints its length and the shape of one batch.
if __name__ == "__main__":
    gen = DataGenerator(word_count=50, batch_size=10)
    phoneme_dataloader = gen.generate_phonemes()

Dataloader length: 5
Batch shape: torch.Size([10, 9, 43])


In [35]:
import numpy as np
from collections import defaultdict

from g2p_en import G2p
from PIL import Image, ImageDraw, ImageFont
from wordfreq import iter_wordlist, word_frequency

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

""" Notes on naming:

"phoneme tensors" are one-hot tensors of a list of phonemes for a single word
"grapheme tensors" are 2D (64x64) single-channel image tensors, one per word

"""

""" Sample words based on frequency """

class WordSampler:
    """Sample a fixed number of words, weighted by word frequency.

    The constructor reads up to ``max_words`` entries from
    ``iter_wordlist(language)`` and keeps, for each one, the running sum of
    ``word_frequency`` divided by the overall sum. ``sample`` looks up
    uniform random draws in that cumulative array.
    """

    def __init__(self, word_count, language='en', max_words=100000):
        self.word_count = word_count
        self.language = language

        kept_words = []
        partial_sums = []
        freq_sum = 0
        for rank, entry in enumerate(iter_wordlist(language)):
            # Entries beyond the first `max_words` are ignored.
            if rank >= max_words:
                break
            freq_sum += word_frequency(entry, language)
            kept_words.append(entry)
            partial_sums.append(freq_sum)

        self.words = kept_words
        # Normalise the running sums by the overall sum.
        self.cumulative_probs = np.array(partial_sums) / freq_sum

    def sample(self):
        """Return ``self.word_count`` words chosen via ``np.searchsorted``."""
        uniform_draws = np.random.random(self.word_count)
        chosen = np.searchsorted(self.cumulative_probs, uniform_draws)
        return [self.words[k] for k in chosen]


class DataGenerator():
    """Sample words once and expose them as phoneme / grapheme DataLoaders.

    Attributes set by ``__init__``:
        word_count: Number of words sampled.
        batch_size: Batch size used by every DataLoader this object creates.
        savepath: Default directory for saving rendered images (may be None).
        sampler: The ``WordSampler`` used to draw the words.
        words: The sampled words.
        g2p: The ``G2p`` converter used for phoneme transcription.
    """

    def __init__(self, word_count=500, batch_size=10, savepath=None):
        self.word_count = word_count
        self.batch_size = batch_size
        # Kept so generate_graphemes() can fall back to it; this argument was
        # previously accepted but never stored.
        self.savepath = savepath
        self.sampler = WordSampler(word_count)
        self.words = self.sampler.sample()
        self.g2p = G2p()

    @staticmethod
    def text_to_phoneme(words: list, g2p: G2p):
        """Convert words into a padded, one-hot encoded phoneme tensor.

        Args:
            words: Words to transcribe.
            g2p: Callable that maps a word to its sequence of phoneme symbols.

        Returns:
            Float tensor of shape ``(len(words), longest_sequence, vocab_size)``.
            Class 0 marks padding; other ids are assigned in order of first
            appearance, so the mapping is specific to this call.
        """
        # Get list of phonemes for each word
        phonemes = [g2p(word) for word in words]

        # Map each phoneme to the next free id (starting at 1) on first lookup
        phoneme_to_index = defaultdict(lambda: len(phoneme_to_index) + 1)
        encoded_phonemes = [[phoneme_to_index[p] for p in phoneme] for phoneme in phonemes]

        # Pad sequences to the same length. dtype is pinned to long because
        # one_hot() needs integer indices and an empty phoneme list would
        # otherwise produce a float32 tensor.
        encoded_phonemes = [torch.tensor(lst, dtype=torch.long) for lst in encoded_phonemes]
        padded_sequences = pad_sequence(encoded_phonemes, batch_first=True, padding_value=0)

        # Create one-hot encodings for each phoneme (+1 for the padding class 0)
        vocab_size = len(phoneme_to_index) + 1
        phoneme_tensors = torch.nn.functional.one_hot(padded_sequences, num_classes=vocab_size).float()

        return phoneme_tensors

    @staticmethod
    def text_to_grapheme(
            words: list=["text"], savepath=None, index=1, mirror=False,
            fontname='Arial', W = 64, H = 64, size=10, spacing=0,
            xshift=0, yshift=-3, upper=False, invert=False, show=None
        ):
        """Render each word as a centred greyscale image tensor.

        ``index``, ``mirror`` and ``show`` are accepted but unused. When
        ``savepath`` is truthy each image is also written to
        ``{savepath}/{word}.jpg``.

        Returns:
            A list with one tensor per word.

        Raises:
            ValueError: If any word is empty, or if the rendered text does
                not fit within the image width.
        """
        words = list(words)

        # Fail fast: an empty word would otherwise divide by zero below.
        if any(len(word) == 0 for word in words):
            raise ValueError("Cannot render an empty word")

        if not words:
            return []

        # The font is the same for every word, so load it once.
        fnt = ImageFont.truetype(fontname+'.ttf', size)

        tensors = []
        for word in words:
            if upper: word = word.upper()
            if invert: word = word[::-1]

            img = Image.new("L", (W,H), color=10)
            draw = ImageDraw.Draw(img)

            # Measure each letter once; the widths are reused when drawing.
            boxes = [fnt.getbbox(l) for l in word]
            letter_widths = [box[2] - box[0] for box in boxes]
            letter_heights = [box[3] - box[1] for box in boxes]

            # Starting word anchor: centre using total width and mean letter height.
            w = sum(letter_widths) + spacing * (len(word) - 1)
            h = sum(letter_heights) / len(word)
            h_anchor = (W - w) / 2
            v_anchor = (H - h) / 2

            x, y = (xshift + h_anchor, yshift + v_anchor)

            # Draw the word letter by letter
            for l, letter_w in zip(word, letter_widths):
                draw.text((x,y), l, font=fnt, fill="white")
                x += letter_w + spacing

            if x > (W + spacing + 2) or (xshift + h_anchor) < -1:
                raise ValueError(f"Text width is bigger than image. Failed on size:{size}")

            if savepath:
                img.save(f"{savepath}/{word}.jpg")

            # Convert images to tensors
            img_np = np.array(img)
            img_tensor = torch.from_numpy(img_np)
            tensors.append(img_tensor)

        return tensors

    def generate_phonemes(self):
        """Build a shuffled DataLoader over the one-hot phoneme tensor.

        Prints the number of batches and the shape of one batch, then returns
        the DataLoader.
        """
        # Use the class's own implementation so this cell does not depend on
        # a module-level function left over from another cell.
        phoneme_tensors = self.text_to_phoneme(self.words, self.g2p)
        phoneme_dataset = TensorDataset(phoneme_tensors)
        phoneme_dataloader = DataLoader(phoneme_dataset, batch_size=self.batch_size, shuffle=True)

        # analyse the dataloader
        print(f"Dataloader length: {len(phoneme_dataloader)}")
        for batch in phoneme_dataloader:
            print(f"Batch shape: {batch[0].shape}")
            break

        return phoneme_dataloader

    def generate_graphemes(self, savepath=None):
        """Build a shuffled DataLoader over the rendered word images.

        Args:
            savepath: Directory for saving the images; falls back to the
                ``savepath`` given to the constructor when None.

        Prints the shape of the first image tensor and returns the DataLoader.
        """
        if savepath is None:
            savepath = self.savepath

        grapheme_tensors = self.text_to_grapheme(self.words, savepath)
        # Stack into one tensor with a leading word dimension. Unpacking the
        # list into TensorDataset would treat every image as a separate
        # dataset field and index along image rows instead of words.
        grapheme_dataset = TensorDataset(torch.stack(grapheme_tensors))
        grapheme_dataloader = DataLoader(grapheme_dataset, batch_size=self.batch_size, shuffle=True)

        print(grapheme_tensors[0].shape)

        return grapheme_dataloader


# Smoke test: sample 50 words, batch them in tens and build the phoneme
# DataLoader, which prints its length and the shape of one batch.
if __name__ == "__main__":
    gen = DataGenerator(word_count=50, batch_size=10)
    phoneme_dataloader = gen.generate_phonemes()

Dataloader length: 5
Batch shape: torch.Size([10, 10, 46])


In [24]:
# NOTE(review): generate_phonemes() as defined in this notebook returns a
# DataLoader, so the name `phoneme_tensors` is misleading. The torch.Size
# output recorded for this cell presumably came from an earlier version of
# the method - confirm before relying on it.
phoneme_tensors = gen.generate_phonemes()

torch.Size([500, 18, 58])


In [19]:
def text_to_phoneme(words: list, g2p: G2p):
    """Debug variant: one-hot encode the phonemes of ``words`` and print the steps.

    Prints the per-word integer encodings and the padded id tensor before
    returning the result.

    Args:
        words: Words to transcribe.
        g2p: Callable that maps a word to its sequence of phoneme symbols.

    Returns:
        Float tensor of shape ``(len(words), longest_sequence, vocab_size)``.
        Class 0 marks padding; other ids are assigned in order of first
        appearance within this call.
    """
    # Get list of phonemes for each word
    phonemes = [g2p(word) for word in words]

    # Map each phoneme to the next free id (starting at 1) on first lookup
    phoneme_to_index = defaultdict(lambda: len(phoneme_to_index) + 1)
    encoded_phonemes = [[phoneme_to_index[p] for p in phoneme] for phoneme in phonemes]
    print(encoded_phonemes)

    # Pad sequences to the same length. dtype is pinned to long because
    # one_hot() needs integer indices and an empty phoneme list would
    # otherwise produce a float32 tensor.
    encoded_phonemes = [torch.tensor(lst, dtype=torch.long) for lst in encoded_phonemes]
    padded_sequences = pad_sequence(encoded_phonemes, batch_first=True, padding_value=0)
    print(padded_sequences)

    # Create one-hot encodings for each phoneme (+1 for the padding class 0)
    vocab_size = len(phoneme_to_index) + 1
    phoneme_tensors = torch.nn.functional.one_hot(padded_sequences, num_classes=vocab_size).float()

    return phoneme_tensors

# Create the grapheme-to-phoneme converter used by the check in the next cell.
g2p = G2p()

In [20]:
# Quick check on three words: the function prints the integer encodings and
# the padded id tensor, and the cell displays the returned one-hot tensor.
text_to_phoneme(["hello", "world", "TensorDataset"], g2p)

[[1, 2, 3, 4], [5, 6, 3, 7], [8, 9, 10, 11, 12, 13, 8, 2, 11, 14, 8, 11]]
tensor([[ 1,  2,  3,  4,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 5,  6,  3,  7,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 8,  9, 10, 11, 12, 13,  8,  2, 11, 14,  8, 11]])
tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0.,

tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
    