In [8]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import itertools
import re

# Vowel / sibilant tokens and the spellings each one may be expanded into.
# generate_variations() scans a word one or two characters at a time and, for
# every token found here, tries each listed alternative.
# NOTE(review): the scanner only looks up 1- and 2-character slices, so the
# three-character key "shh" does not appear to be reachable -- confirm.
variation_map = {
    "aa": ["a", "aa"],
    "a": ["a", "aa"],
    "ah": ["a", "ah", "aa"],
    "w": ["a", "w"],
    "e": ["e", "ee"],
    "ee": ["e", "ee"],
    "i": ["i", "ii"],
    "o": ["o"],
    "oo": ["u", "uu", "oo"],
    "u": ["u", "uu"],
    "uu": ["u", "uu"],
    "ae": ["ai"],
    "s": ["sh", "shh"],
    "sh": ["s", "shh"],
    "shh": ["s", "sh"],
}

# Canonical spelling -> informal spellings that should be rewritten to it.
# generate_variations() applies this table in reverse (listed spelling -> key),
# walking the entries in the order written here and feeding each replacement
# into the next, so the key order is significant and must not be changed.
consonant_map = {
    "chh": ["x", "ch", "xw", "xah"],
    "e": ["ye", "ya"],
    "pha": ["fa"],
    "a": ["w"],
    "T": ["t", "tt"],
    "Th": ["Th"],
    "D": ["d"],
    "Dh": ["dh"],
    "v": ["b", "bh"],
    "b": ["v"],
    "bh": ["v"],
}

# Tokens that generate_variations() only expands when they sit close to the
# end of the word; earlier occurrences are handled as plain characters.
last_word_check_list = ["w", "ah"]

def handle_repeated_chars(roman_word):
    """Collapse runs of three or more identical letters down to two.

    Example: ``'aaaa'`` -> ``'aa'``, ``'raaaamro'`` -> ``'raamro'``.

    Runs of one or two letters are left untouched, so a long vowel written as
    ``'aa'`` keeps its length and an exaggerated ``'aaaa'`` is normalised to
    that same form. Only ASCII letters are considered, and matching is
    case-sensitive (``'aAa'`` is not a run).

    :param roman_word: Romanized (Latin-script) word.
    :return: The word with every over-long letter run shortened to two letters.
    """
    # Keep the letter plus one repetition. Previously a single letter was
    # emitted, which made 'aaa' shorter than an untouched 'aa'.
    return re.sub(r"([a-zA-Z])\1{2,}", r"\1\1", roman_word)



def generate_variations(word):
    """Build the set of alternative romanized spellings for ``word``.

    Steps:
      1. Normalise letter runs with ``handle_repeated_chars``.
      2. Derive a second, canonicalised spelling by rewriting every informal
         spelling listed in ``consonant_map`` to its key.
      3. For both spellings, split the word into 1- or 2-character tokens,
         look each token up in ``variation_map`` and take the cartesian
         product of all the alternatives.

    :param word: Romanized (Latin-script) word.
    :return: Set of spellings; it always contains the normalised word and its
        canonicalised form.
    """
    collapsed = handle_repeated_chars(word)

    # Rewrite informal spellings to their canonical key. Replacements are
    # cumulative: each one is applied to the result of the previous ones.
    canonical = collapsed
    for target, aliases in consonant_map.items():
        for alias in aliases:
            # A rewrite is skipped when the target already occurs in the word,
            # except for "e", which is always rewritten.
            target_allowed = target == "e" or target not in canonical
            if alias in canonical and target_allowed:
                canonical = canonical.replace(alias, target)

    seeds = [collapsed, canonical]
    all_spellings = list(seeds)

    for seed in seeds:
        length = len(seed)
        slots = []  # one list of alternatives per token
        pos = 0
        while pos < length:
            single = seed[pos]
            pair = seed[pos:pos + 2]
            before_tail = pos + 2 < length

            if pos + 1 < length and pair in variation_map:
                if before_tail and pair in last_word_check_list:
                    # An end-only pair found before the end of the word:
                    # consume just its first character instead.
                    slots.append(variation_map.get(single, [single]))
                    pos += 1
                else:
                    slots.append(variation_map[pair])
                    pos += 2
            elif single in variation_map:
                # NOTE(review): for a one-character token this guard also lets
                # the second-to-last position expand; confirm whether
                # `pos + 1 < length` was intended.
                if before_tail and single in last_word_check_list:
                    slots.append([single])
                else:
                    slots.append(variation_map[single])
                pos += 1
            else:
                slots.append([single])
                pos += 1

        all_spellings.extend("".join(combo) for combo in itertools.product(*slots))

    return set(all_spellings)

# Example usage

def generate_devnagri_variations(variant):
    """Transliterate every romanized variation of ``variant`` to Devanagari.

    The romanized variations come from ``generate_variations`` and are printed
    before conversion. Each one is passed to ``transliterate`` as ITRANS input.

    :param variant: Romanized (Latin-script) word.
    :return: List of Devanagari strings, one per romanized variation. The
        variations are held in a set, so the order of the list is not fixed.
    """
    romanized_candidates = generate_variations(variant)
    print('Romanized variations', list(romanized_candidates))
    return [
        transliterate(candidate, sanscript.ITRANS, sanscript.DEVANAGARI)
        for candidate in romanized_candidates
    ]


# Demo: expand one romanized word and show its Devanagari candidates.
variant = "bigriyako"
# Alternative sample input: variant = "vahirw"
print('word:', variant)
# The call also prints the romanized variations as a side effect.
devanagari_variations = generate_devnagri_variations(variant)
print('Devanagari variations:', devanagari_variations)

word: bigriyako
Romanized variations ['biigriieko', 'biigrieko', 'biigriyako', 'bigrieko', 'biigriiyaako', 'bigriyaako', 'biigrieeko', 'biigriiyako', 'bigriieeko', 'biigriieeko', 'bigriiyaako', 'bigriyako', 'bigrieeko', 'bigriiyako', 'biigriyaako', 'bigriieko']
Devanagari variations: ['बीग्रीएको', 'बीग्रिएको', 'बीग्रियको', 'बिग्रिएको', 'बीग्रीयाको', 'बिग्रियाको', 'बीग्रिईको', 'बीग्रीयको', 'बिग्रीईको', 'बीग्रीईको', 'बिग्रीयाको', 'बिग्रियको', 'बिग्रिईको', 'बिग्रीयको', 'बीग्रियाको', 'बिग्रीएको']


In [14]:
import pickle
import os

PICKLE_FILE = "dictionary/word_count.pkl"  # Location of the pickled word-count dictionary


def load_saved_word_count():
    """Return the object unpickled from ``PICKLE_FILE``.

    If the file does not exist, a notice is printed and an empty dict is
    returned instead.
    """
    if not os.path.exists(PICKLE_FILE):
        print("No saved word count dictionary found.")
        return {}  # Missing file: fall back to an empty dictionary

    # pickle.load can execute arbitrary code; only point this at trusted files.
    with open(PICKLE_FILE, "rb") as pickle_handle:
        return pickle.load(pickle_handle)


# Load once so that later cells can share the result.
saved_word_count = load_saved_word_count()

In [15]:
# Number of entries loaded from the pickle (0 when the file was missing).
len(saved_word_count)

240683

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def rank_variants_by_similarity(variants, nepali_dictionary):
    """
    Rank the generated variants by their similarity to words in the Nepali dictionary using TF-IDF.

    Every variant is compared with every dictionary word using cosine
    similarity over character n-gram (2 to 4) TF-IDF vectors; the closest
    dictionary word and its score are reported for each variant.

    Note: the TF-IDF model is fitted again on every call, over the whole
    dictionary plus the variants, so the cost of one call grows with the size
    of the dictionary.

    :param variants: Iterable of transliterated Nepali variants.
    :param nepali_dictionary: Iterable of dictionary words. A dict is accepted;
        only its keys are used.
    :return: List of ``(variant, matched_word, similarity_score)`` tuples
        sorted by score, highest first. Empty when there are no variants or no
        dictionary words.
    """
    # Materialise BOTH inputs: they are concatenated and indexed below, which
    # needs real lists (a dict, set, tuple or generator would otherwise fail).
    nepali_dictionary = list(nepali_dictionary)
    variants = list(variants)

    # Nothing to rank, or nothing to rank against. Without this guard the
    # similarity computation raises on a zero-row matrix.
    if not variants or not nepali_dictionary:
        return []

    # Fit on dictionary + variants so both share one vocabulary.
    corpus = nepali_dictionary + variants
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # The first `dictionary_size` rows are dictionary words, the rest variants.
    dictionary_size = len(nepali_dictionary)
    dictionary_vectors = tfidf_matrix[:dictionary_size]
    variant_vectors = tfidf_matrix[dictionary_size:]

    # Shape: (number of variants, number of dictionary words).
    similarities = cosine_similarity(variant_vectors, dictionary_vectors)

    # Best-matching dictionary word for each variant.
    best_indices = similarities.argmax(axis=1)
    ranked_variants = [
        (variant, nepali_dictionary[best], similarities[row, best])
        for row, (variant, best) in enumerate(zip(variants, best_indices))
    ]

    ranked_variants.sort(key=lambda x: x[2], reverse=True)

    return ranked_variants

In [18]:

def generate_devnagri_variations(variant):
    """Return the Devanagari transliteration of each romanized variation.

    NOTE(review): this re-defines the function of the same name from an
    earlier cell; whichever cell runs last wins. Consider keeping one copy.

    The romanized variations (from ``generate_variations``) are printed first,
    then each is transliterated from ITRANS to Devanagari.

    :param variant: Romanized (Latin-script) word.
    :return: List of Devanagari strings; the order follows set iteration and
        is therefore not fixed.
    """
    romanized_set = generate_variations(variant)
    print('Romanized variations', list(romanized_set))

    converted = []
    for spelling in romanized_set:
        converted.append(transliterate(spelling, sanscript.ITRANS, sanscript.DEVANAGARI))
    return converted



# Demo: rank the Devanagari candidates of one word against the dictionary.
variant = "bigreko"  # another sample to try: "choto"
devanagari_variations = generate_devnagri_variations(variant)
print('Devanagari variations:', devanagari_variations)

# Passing the word-count dict is fine: only its keys (the words) are used.
NEPALI_DICTIONARY = saved_word_count
top_matched_words = rank_variants_by_similarity(devanagari_variations, NEPALI_DICTIONARY)

print("\nTop 5 Matched Words:")
# Show at most five rows so the output matches its heading; the full ranking
# stays available in `top_matched_words`.
top_matched_words[:5]

Romanized variations ['biigreko', 'bigreeko', 'bigreko', 'biigreeko']
Devanagari variations: ['बीग्रेको', 'बिग्रीको', 'बिग्रेको', 'बीग्रीको']

Top 5 Matched Words:


[('बीग्रेको', 'बीग्रेको', np.float64(1.0)),
 ('बिग्रेको', 'बिग्रेको', np.float64(1.0)),
 ('बिग्रीको', 'बिग्री', np.float64(0.8490152233079645)),
 ('बीग्रीको', 'बीग्रेड', np.float64(0.6655647416719088))]

In [19]:
# Re-runs the ranking on `devanagari_variations` left behind by an earlier
# cell, so this cell only works after that cell has been executed.
NEPALI_DICTIONARY = saved_word_count
top_matched_words = rank_variants_by_similarity(devanagari_variations, NEPALI_DICTIONARY)

print("\nTop 5 Matched Words:")
# Show at most five rows so the output matches its heading.
top_matched_words[:5]


Top 5 Matched Words:


[('बीग्रेको', 'बीग्रेको', np.float64(1.0)),
 ('बिग्रेको', 'बिग्रेको', np.float64(1.0)),
 ('बिग्रीको', 'बिग्री', np.float64(0.8490152233079645)),
 ('बीग्रीको', 'बीग्रेड', np.float64(0.6655647416719088))]

In [21]:
from collections import defaultdict
import pandas as pd
import unicodedata
from metaphone import doublemetaphone

# Load the evaluation data. The preview below shows the columns Words
# (romanized spelling), Devanagari and Root, plus an "Unnamed: 0" column
# (presumably an index written out when the CSV was saved -- confirm).
df = pd.read_csv("files/multi_words_devnagri_root.csv")

In [22]:
# Preview the first rows of the loaded CSV.
df.head()

Unnamed: 0,Words,Devanagari,Root
0,raamrai,राम्रै,राम्रा
1,raamro,राम्रो,राम्रा
2,fohor,फोहोर,फोहोर
3,xodyo,छोड्यो,छोड्यो
4,sidhai,सिधै,सिधा


In [23]:
# Collect, for every Devanagari word, the list of romanized spellings seen for it.
grouped = df.groupby('Devanagari')['Words'].apply(list).reset_index()

# Keep only the Devanagari words that have more than one romanized spelling;
# these are the groups used to evaluate the variant matching below.
filtered = grouped[grouped['Words'].apply(len) > 1]

In [24]:
# Display the groups that have several romanized spellings.
filtered

Unnamed: 0,Devanagari,Words
0,खुलेको,"[khuuleko, khuleko]"
1,छैन,"['xhaina', 'xiana', 'xaena', chhaina]"
2,टिक्छ,"[tikcha, tikxa]"
3,परियो,"[pariyo, pareyo]"
4,पर्यो,"[paryoo, paryo]"
...,...,...
113,हुने,"[hune, hunay]"
114,हुनेछ,"[hunexa, hunexw, hunexa]"
115,हुन्छ,"[hunchha, hunxa]"
116,है,"[haii, hae]"


In [25]:
from collections import Counter

# Evaluation: for every Devanagari word with several romanized spellings, map
# each spelling to its best dictionary match and measure how often the
# spellings of one word agree with each other.
#
# NOTE(review): `accurate` counts spellings that agree with the group's most
# frequent mapping, not spellings that map to the expected `nepali_word`.
# Mismatches against the expected word are tallied in `not_accurate`, which is
# never printed. Confirm which of the two is the intended accuracy figure.
total = 0
accurate = 0
not_accurate = 0
NEPALI_DICTIONARY = saved_word_count

for nepali_word, variants in zip(filtered['Devanagari'], filtered['Words']):
    nepali_variations = []  # (best dictionary match, romanized spelling)
    for variant in variants:
        total += 1
        latin_variations = generate_variations(variant)
        devanagari_variations = [
            transliterate(new_variant, sanscript.ITRANS, sanscript.DEVANAGARI)
            for new_variant in latin_variations
        ]
        # The TF-IDF model is refitted over the whole dictionary on each call,
        # which dominates the running time of this loop.
        top_matched_words = rank_variants_by_similarity(devanagari_variations, NEPALI_DICTIONARY)
        nepali = top_matched_words[0][1] if top_matched_words else None
        nepali_variations.append((nepali, variant))  # keep the spelling for debugging

    if not nepali_variations:
        # A group without spellings has nothing to count.
        continue

    # Size of the largest set of spellings that agree on one mapping. The same
    # amount is added whether or not the whole group agrees.
    counter = Counter(mapped_word for mapped_word, _ in nepali_variations)
    most_common_word, count = counter.most_common(1)[0]
    accurate += count

    # Spellings whose mapping differs from the expected Devanagari word.
    not_accurate += sum(
        1 for mapped_word, _ in nepali_variations if mapped_word != nepali_word
    )

# Guard the ratio so an empty evaluation set reports 0.00 instead of raising.
percent = accurate / total if total else 0.0
print(f'accurate: {accurate}, total: {total}, percent: {percent:.2f}')


accurate: 220, total: 302, percent: 0.73
