In [1]:
from datasets import load_dataset
from nltk.tokenize import word_tokenize

# Load the training split of the English-Uzbek SlimOrca corpus and keep only
# the rows whose "from" field is 'gpt'.
dataset = load_dataset(
    'MLDataScientist/SlimOrca-Dedup-English-Uzbek',
    split='train',
).filter(lambda row: row["from"] == 'gpt')

def strip_fields(example):
    """Normalise the two text columns of a single record.

    Both ``example["translations"]`` and ``example["value"]`` have their
    leading/trailing whitespace removed and are converted to lowercase.

    Args:
        example: Mapping with string values under "translations" and "value".

    Returns:
        The same ``example`` object, modified in place.
    """
    for field in ("translations", "value"):
        example[field] = example[field].strip().lower()
    return example

# Trim and lowercase both text columns of every row before tokenising.
dataset = dataset.map(function=strip_fields)

def count_word_count(example):
    """Attach token counts for both sides of a translation pair.

    The counts are the number of tokens returned by ``word_tokenize`` for the
    text, stored under new keys:

    * "uz_word_count" - tokens in ``example["translations"]``
    * "en_word_count" - tokens in ``example["value"]``

    Args:
        example: Mapping with string values under "translations" and "value".

    Returns:
        The same ``example`` object with the two count keys added.
    """
    sources = (
        ('translations', 'uz_word_count'),
        ('value', 'en_word_count'),
    )
    for text_key, count_key in sources:
        example[count_key] = len(word_tokenize(example[text_key]))
    return example

# Largest number of tokens a text may have on either side of a pair.
MAX_TOKENS_PER_TEXT = 50


def _within_length_limits(example):
    """Return True when both token counts lie in 1..MAX_TOKENS_PER_TEXT inclusive."""
    return (
        1 <= example['uz_word_count'] <= MAX_TOKENS_PER_TEXT
        and 1 <= example['en_word_count'] <= MAX_TOKENS_PER_TEXT
    )


# Add per-language token counts, then drop pairs where either side is empty
# or longer than the cap.
dataset = dataset.map(count_word_count)
dataset = dataset.filter(_within_length_limits)

print("Number of samples in the dataset:", len(dataset))

Map:   0%|          | 0/363491 [00:00<?, ? examples/s]

Map:   0%|          | 0/363491 [00:00<?, ? examples/s]

Filter:   0%|          | 0/363491 [00:00<?, ? examples/s]

Number of samples in the dataset: 135311


In [None]:
# Show the dataset summary (feature names and row count).
# NOTE(review): the saved output under this cell (1000 rows, no *_word_count
# columns) does not match the 135311 rows printed by the preceding cell - it
# looks stale; re-run the notebook top to bottom to refresh it.
dataset

Dataset({
    features: ['conv_id', 'msg_id', 'from', 'value', 'translations', 'avg_len'],
    num_rows: 1000
})

In [2]:
# Persist the filtered dataset to a local directory so later cells can reload it
# with load_from_disk instead of repeating the preprocessing.
dataset.save_to_disk(dataset_path="SlimOrca_Dedup_English_Uzbek_preprocessed")

Saving the dataset (0/1 shards):   0%|          | 0/135311 [00:00<?, ? examples/s]

In [1]:
from collections import Counter

from datasets import load_from_disk
from nltk.tokenize import word_tokenize

# Reload the preprocessed dataset from its local directory.
dataset = load_from_disk("SlimOrca_Dedup_English_Uzbek_preprocessed")

# Pull out the Uzbek ('translations') and English ('value') text columns.
uz_texts, en_texts = dataset['translations'], dataset['value']

def _token_frequencies(texts):
    """Count how often each token occurs across ``texts``.

    Args:
        texts: Iterable of strings; each one is split with ``word_tokenize``.

    Returns:
        A list of ``(token, count)`` pairs ordered from most to least
        frequent. Tokens with equal counts keep the order in which they were
        first seen.
    """
    counts = Counter()
    for text in texts:
        counts.update(word_tokenize(text))
    # With no argument, most_common() returns every entry sorted by count, descending.
    return counts.most_common()


en_frequency = _token_frequencies(en_texts)
uz_frequency = _token_frequencies(uz_texts)

print("English vocabulary size:", len(en_frequency))
print("Uzbek vocabulary size:", len(uz_frequency))

# Special tokens take the first ids (0, 1, 2); corpus words are numbered after them.
SPECIAL_TOKENS = ('<bos>', '<eos>', '<unk>')
# Only this many of the most frequent words per language receive an id.
MAX_VOCAB_WORDS = 50000


def _build_vocab(frequency, max_words=MAX_VOCAB_WORDS):
    """Build token->id and id->token tables from a frequency-ranked word list.

    Args:
        frequency: List of ``(word, count)`` pairs, most frequent first.
        max_words: Number of leading entries of ``frequency`` to keep.

    Returns:
        ``(encoding, decoding)`` where ``encoding`` maps each token to its
        integer id and ``decoding`` maps the id back to the token. The special
        tokens come first; the kept words follow in rank order.
    """
    encoding = {token: idx for idx, token in enumerate(SPECIAL_TOKENS)}
    decoding = {idx: token for idx, token in enumerate(SPECIAL_TOKENS)}
    first_word_id = len(SPECIAL_TOKENS)
    # Slice first so the loop never walks the discarded tail of the ranking.
    for word_id, (word, _count) in enumerate(frequency[:max_words], start=first_word_id):
        encoding[word] = word_id
        decoding[word_id] = word
    return encoding, decoding


en_encoding_vocab, en_decoding_vocab = _build_vocab(en_frequency)
uz_encoding_vocab, uz_decoding_vocab = _build_vocab(uz_frequency)

English vocabulary size: 117194
Uzbek vocabulary size: 143266


In [2]:
import json

# Write the four lookup tables as pretty-printed UTF-8 JSON, one file each.
# NOTE: JSON object keys are always strings, so the integer ids used as keys in
# the decoding vocabularies are written as strings; convert them back with
# int() after loading.
vocab_files = (
    ('en_encoding_vocab.json', en_encoding_vocab),
    ('uz_encoding_vocab.json', uz_encoding_vocab),
    ('en_decoding_vocab.json', en_decoding_vocab),
    ('uz_decoding_vocab.json', uz_decoding_vocab),
)
for filename, vocab in vocab_files:
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)