In [4]:
import re

# Corpus locations and the cap on how many sentences are kept.
INPUT_FILE = "nepali_rawsentences.csv"
OUTPUT_FILE = "nepali_cleaned.txt"
MAX_SENTENCES = 50_000

# Single-character matcher for everything that gets deleted from a sentence:
#   - English letters
#   - English digits
#   - Devanagari digits
#   - special characters (ASCII punctuation and symbols)
CLEAN_RE = re.compile(r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]")

unique_sentences = set()   # every sentence accepted so far, for fast duplicate checks
clean_sentences = []       # accepted sentences, in first-seen order

with open(INPUT_FILE, "r", encoding="utf-8") as infile:
    for raw_line in infile:
        # Danda / double danda end a sentence, so one line may yield several.
        for fragment in re.split(r"[।॥]", raw_line):
            candidate = fragment.strip().rstrip("|")                  # trim, then drop trailing |
            candidate = CLEAN_RE.sub("", candidate)                   # delete unwanted chars
            candidate = re.sub(r"\s+", " ", candidate).strip()        # normalize spaces

            # Skip short junk (under 3 characters) and anything already collected.
            if len(candidate) < 3 or candidate in unique_sentences:
                continue

            unique_sentences.add(candidate)
            clean_sentences.append(candidate)

            # Quota reached: leave the per-sentence loop.
            if len(clean_sentences) >= MAX_SENTENCES:
                break

        # Checked again here so the per-line loop stops as well.
        if len(clean_sentences) >= MAX_SENTENCES:
            break

# Persist the cleaned corpus, one sentence per line.
with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
    out.writelines(f"{sentence}\n" for sentence in clean_sentences)

# Report how many sentences were kept and where they were written.
print("Cleaning complete!")
print(f"Total clean sentences: {len(clean_sentences):,}")
print(f"Saved to: {OUTPUT_FILE}")


Cleaning complete!
Total clean sentences: 50,000
Saved to: nepali_cleaned.txt


In [1]:
import re

input_file = "nepali_corpus.txt"
output_file = "nepali_cleaned1.txt"

# Load the entire corpus as one string.
with open(input_file, 'r', encoding='utf-8') as src:
    text = src.read()

# Substitutions applied in order; each pattern is replaced by the string beside it.
for pattern, replacement in (
    (r'[A-Za-z0-9]', ''),       # English letters and digits
    (r'[\u0966-\u096F]', ''),   # Devanagari digits (०-९)
    (r'\s+', ' '),              # any whitespace run, newlines included -> one space
):
    text = re.sub(pattern, replacement, text)
text = text.strip()

# Write the result back out; after the whitespace pass it is a single line.
with open(output_file, 'w', encoding='utf-8') as dst:
    dst.write(text)

print("Cleaning complete!")
print("Saved to:", output_file)

Cleaning complete!
Saved to: nepali_cleaned1.txt


In [2]:
# Re-read the cleaned corpus from disk and count its whitespace-separated tokens.
with open("nepali_cleaned1.txt", "r", encoding="utf-8") as corpus:
    text = corpus.read()
words = text.split()

word_total = len(words)
print("Total words:", word_total)

Total words: 2005826


In [6]:
#Downsampling data
import random
import re

# Number of words the balanced output must contain.
target_size = 179000

# Load the cleaned corpus and tokenize it on whitespace.
lang_file = "nepali_cleaned1.txt"
with open(lang_file, "r", encoding="utf-8") as corpus:
    corpus_text = corpus.read()
words = corpus_text.split()

# Clean words: keep only Devanagari
def clean_word(word):
    """Return `word` with every character outside the ऀ-ॿ range deleted.

    The result is the empty string when `word` contains no character in that
    range.
    """
    devanagari_only = re.sub(r'[^ऀ-ॿ]', '', word)  # remove non-Devanagari
    trimmed = devanagari_only.strip()
    return trimmed

# Clean all words. clean_word runs once per token (the original called it twice),
# and tokens that come back empty are dropped.
cleaned_words = [cleaned for cleaned in map(clean_word, words) if cleaned]

# Fixed seed so that re-running this cell writes the same sample every time.
RANDOM_SEED = 42

# Resample to exactly target_size words.
if len(cleaned_words) > target_size:
    # Too many words: draw target_size of them without replacement.
    # A private Random instance keeps the global random state untouched.
    cleaned_words = random.Random(RANDOM_SEED).sample(cleaned_words, target_size)
elif len(cleaned_words) < target_size:
    # Too few words: repeat the list whole, then top up with a prefix.
    if not cleaned_words:
        # Without this guard the division below raises a bare ZeroDivisionError.
        raise ValueError(
            "No words left after cleaning; cannot pad the corpus up to target_size"
        )
    full_repeats, remainder = divmod(target_size, len(cleaned_words))
    cleaned_words = cleaned_words * full_repeats + cleaned_words[:remainder]

# Every branch above must leave exactly target_size words.
assert len(cleaned_words) == target_size

# Save as a single space-separated line.
out_file = "nepali_balanced.txt"
with open(out_file, "w", encoding="utf-8") as f:
    f.write(" ".join(cleaned_words))

print(f"Nepali done: {len(cleaned_words)} words saved as {out_file}")

Nepali done: 179000 words saved as nepali_balanced.txt
