In [78]:
!"E:\python\python312\python.exe" -m pip install torchdata==0.7.1
!"E:\python\python312\python.exe" -m pip install portalocker
!"E:\python\python312\python.exe" -m pip install pandas
!"E:\python\python312\python.exe" -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
     ----------------------- --------------- 10.0/16.3 MB 56.4 MB/s eta 0:00:01
     ------------------------- ------------- 10.7/16.3 MB 26.8 MB/s eta 0:00:01
     ----------------------------------- --- 14.7/16.3 MB 24.3 MB/s eta 0:00:01
     ----------------------------------- --- 14.7/16.3 MB 24.3 MB/s eta 0:00:01
     ----------------------------------- --- 14.7/16.3 MB 24.3 MB/s eta 0:00:01
     --------------------------------------- 16.3/16.3 MB 13.5 MB/s eta 0:00:00
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [1]:
import torchtext
import torch
print(torchtext.__version__)
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence

0.17.2+cpu


In [2]:
sentences = ["An asteroid set to swing by Earth in 2032 has a one in sixty three chance of smashing into our planet.",
             "Porches, verandas, porticoes, and other types of outdoor coverings connected to a building have existed in various forms across the globe for centuries.",
             "The Commander of the International Space Station interacted with students during an outreach activity.",
             "You are awesome!",
             "Hence, nil tax slab means tax is applicable, but the rebate brings the final tax liability to zero.",
             "People who are emotionally immature often won’t take responsibility for their own actions when something goes wrong.",    
]

In [3]:
class CustomDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __getitem__(self, ix):
        return self.sentences[ix]

    def __len__(self):
        return len(self.sentences)

In [6]:
custom_dataset = CustomDataset(sentences)

batch_size=2

dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

for itr in dataloader:
    print(itr)

['The Commander of the International Space Station interacted with students during an outreach activity.', 'Porches, verandas, porticoes, and other types of outdoor coverings connected to a building have existed in various forms across the globe for centuries.']
['An asteroid set to swing by Earth in 2032 has a one in sixty three chance of smashing into our planet.', 'You are awesome!']
['People who are emotionally immature often won’t take responsibility for their own actions when something goes wrong.', 'Hence, nil tax slab means tax is applicable, but the rebate brings the final tax liability to zero.']


Creating tensors:
- Tokenize sentences
- Convert them into tensors of token indices

In [17]:
class CustomTokenizedDataset(Dataset):
    def __init__(self, sentences, tokenizer, vocab):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, ix):
        tokens = self.tokenizer(self.sentences[ix])
        tensor_indices = [self.vocab[token] for token in tokens]
        return torch.tensor(tensor_indices)

tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(map(tokenizer, sentences))

In [18]:
custom_tokenized_dataset = CustomTokenizedDataset(sentences, tokenizer, vocab)

In [33]:
print(f"length: {len(custom_tokenized_dataset)}")
print(f"ix[3]: {custom_tokenized_dataset[3]}")

length: 6
ix[3]: tensor([81,  9, 19, 11])


In [30]:
data_loader_tokenized = DataLoader(custom_tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=None)

In [31]:
# This will give error since tensors are not of same size
for batch in data_loader_tokenized:
    print(batch)

RuntimeError: stack expects each tensor to be equal size, but got [4] at entry 0 and [18] at entry 1

In [48]:
def collate_fn(batch):
    return pad_sequence(batch, batch_first=True, padding_value=0)

def collate_fn_bfFALSE(batch):
    return pad_sequence(batch, padding_value=0)

In [39]:
dataloader_tokenized_padded = DataLoader(custom_tokenized_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [41]:
for batch in dataloader_tokenized_padded:
    print(batch)

tensor([[ 2, 26,  4,  2, 42, 66, 67, 41, 78, 68, 29,  8, 53, 15,  1,  0,  0,  0,
          0,  0,  0,  0],
        [ 8, 18, 61,  6, 69, 23, 30,  3, 12, 37,  7, 49,  3, 62, 72, 25,  4, 64,
         43, 51, 56,  1]])
tensor([[39,  0, 47,  5, 63, 46,  5, 44, 17,  0, 22,  2, 59, 20,  2, 33,  5, 45,
          6, 82,  1],
        [81,  9, 19, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0]])
tensor([[57,  0, 75,  0, 58,  0, 16, 50, 73,  4, 52, 28, 27,  6,  7, 21, 38, 32,
          3, 74, 34, 13,  2, 35, 10, 24,  1],
        [55, 77,  9, 31, 40, 48, 79, 70, 60, 10, 71, 54, 14, 76, 65, 36, 80,  1,
          0,  0,  0,  0,  0,  0,  0,  0,  0]])


In [54]:
for batch in dataloader_tokenized_padded:
    for row in batch:
        print([vocab.get_itos()[ix] for ix in row])
        print("##")

['hence', ',', 'nil', 'tax', 'slab', 'means', 'tax', 'is', 'applicable', ',', 'but', 'the', 'rebate', 'brings', 'the', 'final', 'tax', 'liability', 'to', 'zero', '.', ',', ',', ',', ',', ',', ',']
##
['porches', ',', 'verandas', ',', 'porticoes', ',', 'and', 'other', 'types', 'of', 'outdoor', 'coverings', 'connected', 'to', 'a', 'building', 'have', 'existed', 'in', 'various', 'forms', 'across', 'the', 'globe', 'for', 'centuries', '.']
##
['you', 'are', 'awesome', '!', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',']
##
['an', 'asteroid', 'set', 'to', 'swing', 'by', 'earth', 'in', '2032', 'has', 'a', 'one', 'in', 'sixty', 'three', 'chance', 'of', 'smashing', 'into', 'our', 'planet', '.']
##
['people', 'who', 'are', 'emotionally', 'immature', 'often', 'won’t', 'take', 'responsibility', 'for', 'their', 'own', 'actions', 'when', 'something', 'goes', 'wrong', '.']
##
['the', 'commander', 'of', 'the', 'international', 'space', 'station', 'interacted',

In [55]:
dataloader_tokenized_padded_bfFalse = DataLoader(
    custom_tokenized_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn_bfFALSE)

In [57]:
for batch in dataloader_tokenized_padded_bfFalse:
    print(batch)

tensor([[55, 81],
        [77,  9],
        [ 9, 19],
        [31, 11],
        [40,  0],
        [48,  0],
        [79,  0],
        [70,  0],
        [60,  0],
        [10,  0],
        [71,  0],
        [54,  0],
        [14,  0],
        [76,  0],
        [65,  0],
        [36,  0],
        [80,  0],
        [ 1,  0]])
tensor([[ 2, 39],
        [26,  0],
        [ 4, 47],
        [ 2,  5],
        [42, 63],
        [66, 46],
        [67,  5],
        [41, 44],
        [78, 17],
        [68,  0],
        [29, 22],
        [ 8,  2],
        [53, 59],
        [15, 20],
        [ 1,  2],
        [ 0, 33],
        [ 0,  5],
        [ 0, 45],
        [ 0,  6],
        [ 0, 82],
        [ 0,  1]])
tensor([[57,  8],
        [ 0, 18],
        [75, 61],
        [ 0,  6],
        [58, 69],
        [ 0, 23],
        [16, 30],
        [50,  3],
        [73, 12],
        [ 4, 37],
        [52,  7],
        [28, 49],
        [27,  3],
        [ 6, 62],
        [ 7, 72],
        [21, 25],
        

In [56]:
for batch in dataloader_tokenized_padded_bfFalse:
    for row in batch:
        print([vocab.get_itos()[ix] for ix in row])
        print("##")

['porches', 'hence']
##
[',', ',']
##
['verandas', 'nil']
##
[',', 'tax']
##
['porticoes', 'slab']
##
[',', 'means']
##
['and', 'tax']
##
['other', 'is']
##
['types', 'applicable']
##
['of', ',']
##
['outdoor', 'but']
##
['coverings', 'the']
##
['connected', 'rebate']
##
['to', 'brings']
##
['a', 'the']
##
['building', 'final']
##
['have', 'tax']
##
['existed', 'liability']
##
['in', 'to']
##
['various', 'zero']
##
['forms', '.']
##
['across', ',']
##
['the', ',']
##
['globe', ',']
##
['for', ',']
##
['centuries', ',']
##
['.', ',']
##
['people', 'an']
##
['who', 'asteroid']
##
['are', 'set']
##
['emotionally', 'to']
##
['immature', 'swing']
##
['often', 'by']
##
['won’t', 'earth']
##
['take', 'in']
##
['responsibility', '2032']
##
['for', 'has']
##
['their', 'a']
##
['own', 'one']
##
['actions', 'in']
##
['when', 'sixty']
##
['something', 'three']
##
['goes', 'chance']
##
['wrong', 'of']
##
['.', 'smashing']
##
[',', 'into']
##
[',', 'our']
##
[',', 'planet']
##
[',', '.']
##
['the', 

In [68]:
corpus = [
    "Ceci est une phrase.",
    "C'est un autre exemple de phrase.",
    "Voici une troisième phrase.",
    "Il fait beau aujourd'hui.",
    "J'aime beaucoup la cuisine française.",
    "Quel est ton plat préféré ?",
    "Je t'adore.",
    "Bon appétit !",
    "Je suis en train d'apprendre le français.",
    "Nous devons partir tôt demain matin.",
    "Je suis heureux.",
    "Le film était vraiment captivant !",
    "Je suis là.",
    "Je ne sais pas.",
    "Je suis fatigué après une longue journée de travail.",
    "Est-ce que tu as des projets pour le week-end ?",
    "Je vais chez le médecin cet après-midi.",
    "La musique adoucit les mœurs.",
    "Je dois acheter du pain et du lait.",
    "Il y a beaucoup de monde dans cette ville.",
    "Merci beaucoup !",
    "Au revoir !",
    "Je suis ravi de vous rencontrer enfin !",
    "Les vacances sont toujours trop courtes.",
    "Je suis en retard.",
    "Félicitations pour ton nouveau travail !",
    "Je suis désolé, je ne peux pas venir à la réunion.",
    "À quelle heure est le prochain train ?",
    "Bonjour !",
    "C'est génial !"
]

In [74]:
corpus_sorted = sorted(corpus, key=lambda x: len(x))
corpus_sorted

['Bonjour !',
 "Je t'adore.",
 'Je suis là.',
 'Au revoir !',
 'Bon appétit !',
 "C'est génial !",
 'Je ne sais pas.',
 'Je suis heureux.',
 'Merci beaucoup !',
 'Je suis en retard.',
 'Ceci est une phrase.',
 "Il fait beau aujourd'hui.",
 'Voici une troisième phrase.',
 'Quel est ton plat préféré ?',
 'La musique adoucit les mœurs.',
 "C'est un autre exemple de phrase.",
 'Le film était vraiment captivant !',
 'Je dois acheter du pain et du lait.',
 'Nous devons partir tôt demain matin.',
 "J'aime beaucoup la cuisine française.",
 'À quelle heure est le prochain train ?',
 'Je vais chez le médecin cet après-midi.',
 'Je suis ravi de vous rencontrer enfin !',
 'Les vacances sont toujours trop courtes.',
 'Félicitations pour ton nouveau travail !',
 "Je suis en train d'apprendre le français.",
 'Il y a beaucoup de monde dans cette ville.',
 'Est-ce que tu as des projets pour le week-end ?',
 'Je suis désolé, je ne peux pas venir à la réunion.',
 'Je suis fatigué après une longue journée

In [110]:
class FrenchDataset(Dataset):
    def __init__(self, sentences, tokenizer, vocab):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, ix):
        tokens = self.tokenizer(self.sentences[ix])
        token_indices = [self.vocab[token] for token in tokens]
        return torch.tensor(token_indices)

french_tokenizer = get_tokenizer("spacy", language='fr_core_news_sm')
french_vocab = build_vocab_from_iterator(map(french_tokenizer, corpus))

def collate_french(batch):
    return pad_sequence(batch, batch_first=True, padding_value=0)
    
def collate_fn_fr(batch):
    # Pad sequences within the batch to have equal lengths
    tensor_batch=[]
    for sample in batch:
        tokens = french_tokenizer(sample)
        tensor_batch.append(torch.tensor([french_vocab[token] for token in tokens]))
         
    padded_batch = pad_sequence(tensor_batch,batch_first=True)
    return padded_batch





In [115]:
french_batch_size = 4
dataloader_french_basic = DataLoader(
    corpus_sorted,
    batch_size=french_batch_size,
    shuffle=False,
    collate_fn=collate_fn_fr)

In [117]:
for itr in dataloader_french_basic:
    print(itr)

tensor([[ 27,   2,   0,   0],
        [  1, 105,  41,   0],
        [  1,   3,  82,   0],
        [ 25, 101,   2,   0]])
tensor([[ 26,  45,   2,   0,   0],
        [ 11,   4,  74,   2,   0],
        [  1,  16, 103,  17,   0],
        [  1,   3,  76,   0,   0]])
tensor([[ 35,   8,   2,   0,   0],
        [  1,   3,  14, 100,   0],
        [ 28,   4,  10,   9,   0],
        [ 12,  69,  51,  49,   0]])
tensor([[ 38,  10, 107,   9,   0,   0,   0,   0],
        [ 37,   4,  19,  92,  95,   7,   0,   0],
        [ 32,  85,  42,  80,  87,   0,   0,   0],
        [ 11,   4, 111,  50,  68,   5,   9,   0]])
tensor([[ 33,  71, 122, 117,  52,   2,   0,   0,   0],
        [  1,  63,  40,  13,  89,  67,  13,  79,   0],
        [ 36,  62,  90, 110,  60,  83,   0,   0,   0],
        [ 31,  43,   8,  15,  57,  73,   0,   0,   0]])
tensor([[120,  97,  75,   4,   6,  93,  20,   7],
        [  1, 113,  55,   6,  86,  53,  47,   0],
        [  1,   3,  98,   5, 116,  99,  66,   2],
        [ 34, 112, 104, 1

In [118]:
french_dataset = FrenchDataset(corpus_sorted, french_tokenizer, french_vocab)
french_batch_size = 4
dataloader_french = DataLoader(
    french_dataset,
    batch_size=french_batch_size,
    shuffle=False,
    collate_fn=collate_french)

In [119]:
for itr in dataloader_french:
    print(itr)

tensor([[ 27,   2,   0,   0],
        [  1, 105,  41,   0],
        [  1,   3,  82,   0],
        [ 25, 101,   2,   0]])
tensor([[ 26,  45,   2,   0,   0],
        [ 11,   4,  74,   2,   0],
        [  1,  16, 103,  17,   0],
        [  1,   3,  76,   0,   0]])
tensor([[ 35,   8,   2,   0,   0],
        [  1,   3,  14, 100,   0],
        [ 28,   4,  10,   9,   0],
        [ 12,  69,  51,  49,   0]])
tensor([[ 38,  10, 107,   9,   0,   0,   0,   0],
        [ 37,   4,  19,  92,  95,   7,   0,   0],
        [ 32,  85,  42,  80,  87,   0,   0,   0],
        [ 11,   4, 111,  50,  68,   5,   9,   0]])
tensor([[ 33,  71, 122, 117,  52,   2,   0,   0,   0],
        [  1,  63,  40,  13,  89,  67,  13,  79,   0],
        [ 36,  62,  90, 110,  60,  83,   0,   0,   0],
        [ 31,  43,   8,  15,  57,  73,   0,   0,   0]])
tensor([[120,  97,  75,   4,   6,  93,  20,   7],
        [  1, 113,  55,   6,  86,  53,  47,   0],
        [  1,   3,  98,   5, 116,  99,  66,   2],
        [ 34, 112, 104, 1