In [4]:
import re

INPUT_FILE = "nepali_rawsentences.csv"
OUTPUT_FILE = "nepali_cleaned.txt"
MAX_SENTENCES = 50000

# Sentences shorter than this (after cleaning) are treated as junk.
MIN_SENTENCE_LENGTH = 3

# Characters deleted from every sentence:
#   - English letters
#   - English (ASCII) digits
#   - Devanagari digits (U+0966-U+096F)
#   - Special characters / punctuation
CLEAN_RE = re.compile(
    r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]"
)

# Sentence terminators: danda (U+0964) and double danda (U+0965).
# Written as escapes on purpose: the literal characters are easily corrupted
# when the file is re-encoded, which silently turns the split into a split
# on unrelated letters.
SENTENCE_SPLIT_RE = re.compile(r"[\u0964\u0965]")

# Any run of whitespace; collapsed to a single space during normalization.
WHITESPACE_RE = re.compile(r"\s+")


def clean_sentence(sentence: str) -> str:
    """Return *sentence* with unwanted characters removed and spaces normalized.

    Every character matched by ``CLEAN_RE`` is deleted, each run of
    whitespace is collapsed to one space, and leading/trailing whitespace
    is stripped. The result may be an empty string.
    """
    stripped = CLEAN_RE.sub("", sentence)
    return WHITESPACE_RE.sub(" ", stripped).strip()


def collect_clean_sentences(lines, max_sentences: int = MAX_SENTENCES):
    """Split *lines* into sentences and return the cleaned, unique ones.

    Args:
        lines: Iterable of text lines (for example an open text file).
        max_sentences: Collection stops as soon as this many sentences
            have been gathered.

    Returns:
        A list of cleaned sentences in first-seen order, without
        duplicates, each at least ``MIN_SENTENCE_LENGTH`` characters long.
    """
    seen = set()
    collected = []
    for line in lines:
        for raw in SENTENCE_SPLIT_RE.split(line):
            cleaned = clean_sentence(raw)

            # Skip short junk (also drops the empty pieces produced by the
            # split, e.g. the remainder after a trailing danda).
            if len(cleaned) < MIN_SENTENCE_LENGTH:
                continue

            # Deduplicate while preserving first-seen order.
            if cleaned in seen:
                continue
            seen.add(cleaned)
            collected.append(cleaned)

            # Stop reading as soon as the cap is reached.
            if len(collected) >= max_sentences:
                return collected
    return collected


# Guarded so that importing the helpers does not trigger file I/O. In a
# notebook cell or when run as a script, __name__ is "__main__", so the
# pipeline still executes exactly as before.
if __name__ == "__main__":
    with open(INPUT_FILE, "r", encoding="utf-8") as infile:
        clean_sentences = collect_clean_sentences(infile, MAX_SENTENCES)
    unique_sentences = set(clean_sentences)

    # Write cleaned corpus, one sentence per line.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.writelines(s + "\n" for s in clean_sentences)

    print("Cleaning complete!")
    print(f"Total clean sentences: {len(clean_sentences):,}")
    print(f"Saved to: {OUTPUT_FILE}")


Cleaning complete!
Total clean sentences: 50,000
Saved to: nepali_cleaned.txt
