In [1]:
import sentencepiece as spm

In [4]:
import unicodedata

# Load the entire raw Hindi corpus into memory as UTF-8 text.
with open("hindi_corpus.txt", "r", encoding="utf-8") as raw_file:
    text = raw_file.read()

# Apply Unicode NFC normalization so canonically equivalent character
# sequences share one composed representation.
normalized_text = unicodedata.normalize("NFC", text)

# Persist the normalized text to its own file; the raw corpus is left untouched.
with open("hindi_normalized.txt", "w", encoding="utf-8") as normalized_file:
    normalized_file.write(normalized_text)

print("Hindi corpus normalized successfully.")

Hindi corpus normalized successfully.


In [6]:
import sentencepiece as spm

# Train a BPE SentencePiece model on the normalized corpus. The trainer writes
# its artifacts using the "hindi_bpe" prefix (the model is loaded later from
# "hindi_bpe.model").
trainer_options = dict(
    input="hindi_normalized.txt",
    model_prefix="hindi_bpe",
    model_type="bpe",
    # NOTE(review): the original note stated the requirement as "> 5000", but
    # the value is exactly 5000 -- confirm whether the bound is inclusive.
    vocab_size=5000,
    character_coverage=0.9995,
    # Reserve the first four ids for the special tokens.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
spm.SentencePieceTrainer.train(**trainer_options)


In [8]:
import sentencepiece as spm

# Create a processor and load the trained BPE model from disk.
sp = spm.SentencePieceProcessor()
sp.load("hindi_bpe.model")

# Report how many pieces the loaded model contains.
vocab_size = sp.get_piece_size()
print("Vocabulary size:", vocab_size)


Vocabulary size: 5000


In [9]:
# Measure how many characters each token covers on average (characters / tokens).
with open("hindi_corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

# The model was trained on the NFC-normalized corpus, so measure the same text
# form here; counting characters of the raw text would mix two representations.
eval_text = unicodedata.normalize("NFC", text)

num_characters = len(eval_text)

tokens = sp.encode(eval_text, out_type=int)
num_tokens = len(tokens)

# An empty corpus yields no tokens; fail with a clear message instead of a
# bare ZeroDivisionError.
if num_tokens == 0:
    raise ValueError("Corpus produced no tokens; cannot compute a compression ratio.")

compression_ratio = num_characters / num_tokens

print("Characters:", num_characters)
print("Tokens:", num_tokens)
print("Compression Ratio:", compression_ratio)


Characters: 17724
Tokens: 4016
Compression Ratio: 4.413346613545817


In [10]:
# Tokenize one Hindi passage into subword pieces (strings rather than ids) to
# inspect how the trained model segments running text.
sample = "अयोध्या सूर्यवंशी राजाओं की राजधानी है। इस राजवंश में विचित्रता यह है कि और जितने राजवंश भारत में हुये उनमें यह सबसे लम्बा है।"
sample_pieces = sp.encode(sample, out_type=str)
print(sample_pieces)


['▁अयोध्या', '▁सूर्यवंशी', '▁राजाओं', '▁की', '▁राजधानी', '▁है', '।', '▁इस', '▁राजवंश', '▁में', '▁विचित्रता', '▁यह', '▁है', '▁कि', '▁और', '▁जितने', '▁राजवंश', '▁भारत', '▁में', '▁हुये', '▁उनमें', '▁यह', '▁सबसे', '▁लम्बा', '▁है', '।']
