In [7]:
import re
import hashlib

# --- Configuration -------------------------------------------------------
INPUT_FILE = "alpaca_data_cleaned_bhojpuri.csv"
OUTPUT_FILE = "bhojpuri_cleaned.txt"
MAX_SENTENCES = 30_000  # stop once this many unique sentences are written

# --- Deduplication state -------------------------------------------------
saved_count = 0        # number of sentences written so far
seen_hashes = set()    # MD5 hex digests of sentences already written

# Characters stripped from every sentence:
#   * ASCII letters (A-Z, a-z)
#   * ASCII digits (0-9)
#   * Devanagari digits (U+0966 through U+096F)
#   * ASCII punctuation / symbols, including the pipe character "|"
CLEAN_RE = re.compile(
    r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]"
)

def normalize_sentence(s):
    """Return *s* cleaned up for writing to the output corpus.

    Steps, in order:
      1. Trim leading/trailing whitespace.
      2. Delete every character matched by ``CLEAN_RE``.
      3. Collapse each run of whitespace into a single space.
      4. Trim again, since step 2 can expose new leading/trailing spaces.
    """
    without_unwanted = CLEAN_RE.sub("", s.strip())
    single_spaced = re.sub(r"\s+", " ", without_unwanted)
    return single_spaced.strip()

# Sentence terminators: Devanagari danda (U+0964) and double danda (U+0965).
# They are written as \u escapes so the pattern cannot be corrupted by an
# encoding round-trip of this source file (a mis-decoded literal would make
# the split match unrelated letters and never match a danda).
SENTENCE_SPLIT_RE = re.compile(r"[\u0964\u0965]")

# Stream the input line by line, split each line into sentences, clean them,
# and write every previously unseen, non-empty sentence to the output file
# (one per line) until MAX_SENTENCES have been written.
with open(INPUT_FILE, "r", encoding="utf-8") as infile, \
     open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:

    for line in infile:
        # Stop reading as soon as the quota is reached.
        if saved_count >= MAX_SENTENCES:
            break

        for sent in SENTENCE_SPLIT_RE.split(line):
            sent = normalize_sentence(sent)

            # Fragments that are empty after cleaning (e.g. the text after a
            # trailing danda, or a line made only of stripped characters)
            # must not be written or counted.
            if not sent:
                continue

            # Hash-based deduplication: only the digest is kept in memory.
            h = hashlib.md5(sent.encode("utf-8")).hexdigest()
            if h in seen_hashes:
                continue

            seen_hashes.add(h)
            outfile.write(sent + "\n")
            saved_count += 1

            if saved_count >= MAX_SENTENCES:
                break

print("Done!")
print(f"Total unique sentences saved: {saved_count}")
print(f"Output file: {OUTPUT_FILE}")


Done!
Total unique sentences saved: 30000
Output file: bhojpuri_cleaned.txt
