In [5]:
from datasets import load_dataset
import re
import json

# Load Gujarati dataset
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="guj_Gujr", streaming=True)

In [6]:
def gujarati_sentence_tokenizer(text):
    # Split on Gujarati sentence endings (purna viram = .) and common punctuation
    return re.split(r'(?<=[.!?])\s+', text)

def gujarati_word_tokenizer(text):
    # Gujarati Unicode range: 0A80-0AFF, plus URLs, emails, dates, decimals
    patterns = [
        r'https?://\S+',           # URLs
        r'\w+@\w+\.\w+',           # Emails  
        r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # Dates
        r'\d+\.\d+',               # Decimals
        r'[\u0A80-\u0AFF]+',       # Gujarati characters (including matras)
        r'[a-zA-Z]+',              # English words
        r'\d+',                    # Numbers
        r'[.,!?;:()\'""-]'        # Punctuation (purna viram = .)
    ]
    return re.findall('|'.join(patterns), text)

In [12]:
all_sentences = []
all_words = []

# Process limited dataset to avoid memory issues
for i, example in enumerate(dataset):
    if i >= 5000:  # Limit to first 5000 examples
        break
        
    text = example['text']
    sentences = gujarati_sentence_tokenizer(text)
    
    for sentence in sentences:
        if sentence.strip():  # Skip empty sentences
            all_sentences.append(sentence.strip())
            words = gujarati_word_tokenizer(sentence)
            all_words.extend(words)

print(f"Processing completed! Total examples processed: {i + 1}")

Processing completed! Total examples processed: 5001


In [13]:
# Save tokenized sentences (each line = one sentence with space-separated tokens)
with open('gujarati_tokenized_sentences.txt', 'w', encoding='utf-8') as f:
    for sentence in all_sentences:
        tokenized_words = gujarati_word_tokenizer(sentence)
        tokenized_sentence = ' '.join(tokenized_words)
        f.write(tokenized_sentence + '\n')

# Also save individual words for reference
with open('gujarati_words.txt', 'w', encoding='utf-8') as f:
    for word in all_words:
        f.write(word + '\n')

In [15]:
total_sentences = len(all_sentences)
total_words = len(all_words)
total_characters = sum(len(word) for word in all_words)

# Calculate averages
avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
avg_word_length = total_characters / total_words if total_words > 0 else 0

# Calculate Type/Token Ratio
unique_words = len(set(all_words))
ttr = unique_words / total_words if total_words > 0 else 0

In [16]:
print("GUJARATI CORPUS STATISTICS")
print(f"i.   Total sentences: {total_sentences:,}")
print(f"ii.  Total words: {total_words:,}")
print(f"iii. Total characters: {total_characters:,}")
print(f"iv.  Avg sentence length: {avg_sentence_length:.2f}")
print(f"v.   Avg word length: {avg_word_length:.2f}")
print(f"vi.  Type/Token Ratio: {ttr:.4f}")

GUJARATI CORPUS STATISTICS
i.   Total sentences: 6,880
ii.  Total words: 115,577
iii. Total characters: 512,764
iv.  Avg sentence length: 16.80
v.   Avg word length: 4.44
vi.  Type/Token Ratio: 0.2184
