In [6]:
import re
import hashlib

INPUT_FILE = "alpaca_data_cleaned_bhojpuri.csv"
OUTPUT_FILE = "bhojpuri_cleaned.txt"

# Characters deleted from every line:
#   - ASCII letters (A-Z, a-z) and ASCII digits (0-9)
#   - Devanagari digits (U+0966-U+096F)
#   - ASCII punctuation and symbols, including the pipe "|"
# The Devanagari danda (U+0964) is not part of the class, so it is kept.
CLEAN_RE = re.compile(
    r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]"
)

# Dedup bookkeeping used by the processing loop in this cell.
saved_count = 0
seen_hashes = set()


def normalize_line(s):
    """Strip unwanted characters from a line and normalize its whitespace.

    Every character matched by CLEAN_RE is deleted, each run of whitespace
    becomes a single space, and leading/trailing whitespace is dropped.

    Args:
        s: One raw line of text (a trailing newline is fine).

    Returns:
        The cleaned line, or "" when nothing but whitespace is left.
    """
    without_noise = CLEAN_RE.sub("", s)
    # split() with no argument discards leading/trailing whitespace and treats
    # any whitespace run as one separator, so re-joining with a single space
    # trims and collapses in one step.
    return " ".join(without_noise.split())

# Stream the input line by line and write every distinct, non-empty
# normalized line to the output file exactly once.
with open(INPUT_FILE, "r", encoding="utf-8") as src, \
     open(OUTPUT_FILE, "w", encoding="utf-8") as dst:

    for raw_line in src:
        cleaned = normalize_line(raw_line)
        if cleaned:
            # MD5 serves only as a fingerprint for duplicate detection; the
            # hex digest is stored instead of the line text itself.
            digest = hashlib.md5(cleaned.encode("utf-8")).hexdigest()
            if digest not in seen_hashes:
                seen_hashes.add(digest)
                dst.write(cleaned + "\n")
                saved_count += 1

print("Done!")
print(f"Total unique sentences saved: {saved_count}")
print(f"Output file: {OUTPUT_FILE}")


Done!
Total unique sentences saved: 198619
Output file: bhojpuri_cleaned.txt


In [7]:
# Count the whitespace-separated words in the cleaned corpus.
# Reads the file written by the cleaning cell directly above
# ("bhojpuri_cleaned.txt") so that a fresh top-to-bottom run does not depend
# on "bhojpuri_cleaned1.txt", which is only created by a later cell.
with open("bhojpuri_cleaned.txt", "r", encoding="utf-8") as f:
    text = f.read()

words = text.split()
print("Total words:", len(words))

Total words: 6882382


In [11]:
import re
import hashlib

INPUT_FILE = "alpaca_data_cleaned_bhojpuri.csv"
OUTPUT_FILE = "bhojpuri_cleaned1.txt"

# Characters deleted from every line:
#   - ASCII letters (A-Z, a-z) and ASCII digits (0-9)
#   - Devanagari digits (U+0966-U+096F)
#   - ASCII punctuation and symbols, including the pipe "|"
# The Devanagari danda (U+0964) is not part of the class, so it is kept.
CLEAN_RE = re.compile(
    r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]"
)

# Dedup bookkeeping used by the processing loop in this cell.
saved_count = 0
seen_hashes = set()


def normalize_line(s):
    """Strip unwanted characters from a line and normalize its whitespace.

    Every character matched by CLEAN_RE is deleted, each run of whitespace
    becomes a single space, and leading/trailing whitespace is dropped.

    Args:
        s: One raw line of text (a trailing newline is fine).

    Returns:
        The cleaned line, or "" when nothing but whitespace is left.
    """
    without_noise = CLEAN_RE.sub("", s)
    # split() with no argument discards leading/trailing whitespace and treats
    # any whitespace run as one separator, so re-joining with a single space
    # trims and collapses in one step.
    return " ".join(without_noise.split())

# Stream the input line by line and write every distinct, non-empty
# normalized line to the output file exactly once.
with open(INPUT_FILE, "r", encoding="utf-8") as src, \
     open(OUTPUT_FILE, "w", encoding="utf-8") as dst:

    for raw_line in src:
        cleaned = normalize_line(raw_line)
        if cleaned:
            # MD5 serves only as a fingerprint for duplicate detection; the
            # hex digest is stored instead of the line text itself.
            digest = hashlib.md5(cleaned.encode("utf-8")).hexdigest()
            if digest not in seen_hashes:
                seen_hashes.add(digest)
                dst.write(cleaned + "\n")
                saved_count += 1

print("Done!")
print(f"Total unique sentences saved: {saved_count}")
print(f"Output file: {OUTPUT_FILE}")


Done!
Total unique sentences saved: 198619
Output file: bhojpuri_cleaned1.txt


In [16]:
#Downsampling data
import random
import re

# Target word count for the balanced corpus. Per the original note this is the
# lowest word count among the languages (Tamang) -- TODO confirm the figure.
target_size = 179000

# Load the cleaned corpus and split it into whitespace-separated words.
lang_file = "bhojpuri_cleaned1.txt"
with open(lang_file, "r", encoding="utf-8") as corpus_fh:
    words = corpus_fh.read().split()

# Matches any character outside the Devanagari block (U+0900-U+097F).
_NON_DEVANAGARI_RE = re.compile(r'[^ऀ-ॿ]')


def clean_word(word):
    """Return `word` with every non-Devanagari character removed.

    Characters in U+0900-U+097F are kept (this range also contains the danda
    and the Devanagari digits); everything else is deleted. The result may be
    an empty string when the word has no Devanagari characters.
    """
    devanagari_only = _NON_DEVANAGARI_RE.sub('', word)
    return devanagari_only.strip()

#Clean all words: run clean_word once per word and drop the ones that end up empty
cleaned_words = [cleaned for cleaned in map(clean_word, words) if cleaned]

# Without at least one word the up-sampling branch below would divide by zero,
# so fail early with an explicit message instead.
if not cleaned_words:
    raise ValueError("No Devanagari words left after cleaning; cannot resize to target_size")

# Dedicated, seeded generator so the sample is the same on every run and the
# global `random` state is left untouched.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

#Resize to the exact target size
if len(cleaned_words) > target_size:
    # Down-sample without replacement. The result is in selection order, so
    # the original word order is not preserved.
    cleaned_words = sample_rng.sample(cleaned_words, target_size)
elif len(cleaned_words) < target_size:
    # Up-sample by repeating the whole list, then topping up with a prefix.
    full_repeats, remainder = divmod(target_size, len(cleaned_words))
    cleaned_words = cleaned_words * full_repeats + cleaned_words[:remainder]

#Save the balanced words as a single space-separated line
out_file = "bhojpuri_balanced.txt"
balanced_text = " ".join(cleaned_words)
with open(out_file, "w", encoding="utf-8") as sink:
    sink.write(balanced_text)

print(f"Bhojpuri done: {len(cleaned_words)} words saved as {out_file}")

Bhojpuri done: 179000 words -> saved as bhojpuri_balanced.txt


In [17]:
# Sanity check: re-read the balanced file and report its
# whitespace-separated word count.
with open("bhojpuri_balanced.txt", "r", encoding="utf-8") as balanced_fh:
    text = balanced_fh.read()

words = text.split()
word_total = len(words)
print("Total words:", word_total)

Total words: 179000
