In [35]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import itertools
import re

# Define phonetic variations
variation_map = {
    "aa": ["a", "aa"],
    "a": ["a", "aa"],
    "ah": ["a", "ah", "aa"],
    "w": ["a", "w"],
    "e": ["e", "ee"],
    "ee": ["e", "ee"],
    "i": ["i", "ii"],
    "o": ["o"],
    "oo": ["u", "uu", "oo"],
    "u": ["u", "uu"],
    "uu": ["u", "uu"],
    "ae": ["ai"],
    "ye": ["e", "y"],
    "s": ["sh", "shh"],
    "sh": ["s", "shh"],
    "shh": ["s", "sh"]
}

# Additional consonant variations
consonant_map = {
    "chh": ["x", "ch", "xw", "xah"],
    "e": ["ye", "ya"],
    "pha": ["fa"],
    "a": ["w",],
    'T': ["t", "tt"],
    'Th': ["Th"],
    'D': ["d"],
    'Dh': ["dh"],
    'v': ['b', 'bh'],
    # 'b': ['v'],
    'bh': ['v']
}

last_word_check_list = ['w','ah']

def handle_repeated_chars(roman_word):
        """Reduce long repeated vowels (e.g., 'aaaa' тЖТ 'aa')."""
        optimized_word = []
        i = 0
        while i < len(roman_word):
            match = re.match(r"([a-zA-Z])\1{2,}", roman_word[i:])
            if match:
                char = match.group(1)
                optimized_word.append(char)  # Keep one repetition (aa, ee, etc.)
                i += len(match.group(0))
            else:
                optimized_word.append(roman_word[i])
                i += 1
        return "".join(optimized_word)



def generate_variations(word):
    normalize_word = handle_repeated_chars(word)

    new_words = [normalize_word]

    for key, variations in consonant_map.items():
        for variation in variations:
            if variation in normalize_word and (key not in normalize_word or key in ["e"]):
                normalize_word = normalize_word.replace(variation, key)
    new_words.append(normalize_word)
    # print('new_words', new_words)

    new_variations = list(new_words)
    # print('new_variations', new_variations)
    for roman_word in new_words:
        tokens = []
        i = 0
        while i < len(roman_word):
            # print('roman_word', roman_word)
            if i + 1 < len(roman_word) and roman_word[i:i+2] in variation_map:
                if (i + 2 < len(roman_word)) and roman_word[i:i+2] in last_word_check_list:
                    if roman_word[i] in variation_map:
                        tokens.append(variation_map[roman_word[i]])
                    else:
                        tokens.append([roman_word[i]])
                    i += 1
                else:
                    tokens.append(variation_map[roman_word[i:i+2]])
                    i += 2
            elif roman_word[i] in variation_map:
                if (i + 2 < len(roman_word)) and roman_word[i] in last_word_check_list:
                    tokens.append([roman_word[i]])
                else:
                    tokens.append(variation_map[roman_word[i]])
                i += 1
            else:
                tokens.append([roman_word[i]])
                i += 1
            # print('tokens', tokens)

        variations = ["".join(variant) for variant in itertools.product(*tokens)]
        new_variations.extend(variations)

    return set(new_variations)

# Example usage

def generate_devnagri_variations(variant):
    latin_variations = generate_variations(variant)
    # print('latin_variations', latin_variations)
    devanagari_variations = []
    print('Romanized variations', list(latin_variations))
    for variant in latin_variations:
        nepali = transliterate(variant, sanscript.ITRANS, sanscript.DEVANAGARI)
        devanagari_variations.append(nepali)
    return devanagari_variations


variant = "bhayera"
# variant = "vahirw"
print('word:', variant)
devanagari_variations = generate_devnagri_variations(variant)
print('Devanagari variations:', devanagari_variations)

word: bhayera
Romanized variations ['bhhairaa', 'bhayera', 'bhaeraa', 'bhayraa', 'bhhaira', 'bhayra', 'bhaaera', 'bhaaeraa', 'bhaera', 'bhaayraa', 'bhaayra', 'bhhaera']
Devanagari variations: ['рднреНрд╣реИрд░рд╛', 'рднрдпреЗрд░', 'рднрдПрд░рд╛', 'рднрдпреНрд░рд╛', 'рднреНрд╣реИрд░', 'рднрдпреНрд░', 'рднрд╛рдПрд░', 'рднрд╛рдПрд░рд╛', 'рднрдПрд░', 'рднрд╛рдпреНрд░рд╛', 'рднрд╛рдпреНрд░', 'рднреНрд╣рдПрд░']


In [36]:
import pickle
import os

PICKLE_FILE = "dictionary/word_count.pkl"  # Path to the saved pickle file

def load_saved_word_count():
    """Load and return the word frequency dictionary from the pickle file."""
    if os.path.exists(PICKLE_FILE):
        with open(PICKLE_FILE, "rb") as f:
            return pickle.load(f)
    else:
        print("No saved word count dictionary found.")
        return {}  # Return an empty dictionary if the file doesn't exist

# Usage
saved_word_count = load_saved_word_count()

In [29]:
len(saved_word_count)

240683

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def rank_variants_by_similarity(variants, nepali_dictionary):
    """
    Rank the generated variants by their similarity to words in the Nepali dictionary using TF-IDF.

    :param variants: List of transliterated Nepali variants.
    :param nepali_dictionary: List of words in the Nepali dictionary.
    :return: List of variants ordered by similarity along with matched dictionary word and similarity score.
    """
    # Ensure both dictionary and variants are lists
    nepali_dictionary = list(nepali_dictionary)

    # Combine variants and dictionary into a single corpus
    corpus = nepali_dictionary + variants

    # Compute TF-IDF vectors
    # vectorizer = TfidfVectorizer(analyzer='char')
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Calculate similarity of each variant to the dictionary words
    dictionary_size = len(nepali_dictionary)
    variant_vectors = tfidf_matrix[dictionary_size:]
    dictionary_vectors = tfidf_matrix[:dictionary_size]

    similarities = cosine_similarity(variant_vectors, dictionary_vectors)

    # Rank variants by maximum similarity to any dictionary word
    ranked_variants = []
    for i, variant in enumerate(variants):
        max_sim_index = similarities[i].argmax()
        matched_word = nepali_dictionary[max_sim_index]
        similarity_score = similarities[i, max_sim_index]
        ranked_variants.append((variant, matched_word, similarity_score))

    ranked_variants.sort(key=lambda x: x[2], reverse=True)

    return ranked_variants

In [37]:

def generate_devnagri_variations(variant):
    latin_variations = generate_variations(variant)
    devanagari_variations = []
    print('Romanized variations', list(latin_variations))
    for variant in latin_variations:
        nepali = transliterate(variant, sanscript.ITRANS, sanscript.DEVANAGARI)
        devanagari_variations.append(nepali)
    return devanagari_variations



variant = "bhayera" # "choto"
devanagari_variations = generate_devnagri_variations(variant)
print('Devanagari variations:', devanagari_variations)

Romanized variations ['bhhairaa', 'bhayera', 'bhaeraa', 'bhayraa', 'bhhaira', 'bhayra', 'bhaaera', 'bhaaeraa', 'bhaera', 'bhaayraa', 'bhaayra', 'bhhaera']
Devanagari variations: ['рднреНрд╣реИрд░рд╛', 'рднрдпреЗрд░', 'рднрдПрд░рд╛', 'рднрдпреНрд░рд╛', 'рднреНрд╣реИрд░', 'рднрдпреНрд░', 'рднрд╛рдПрд░', 'рднрд╛рдПрд░рд╛', 'рднрдПрд░', 'рднрд╛рдпреНрд░рд╛', 'рднрд╛рдпреНрд░', 'рднреНрд╣рдПрд░']


In [38]:
NEPALI_DICTIONARY= saved_word_count
top_matched_words = rank_variants_by_similarity(devanagari_variations, NEPALI_DICTIONARY)

print("\nTop 5 Matched Words:")
top_matched_words


Top 5 Matched Words:


[('рднрдпреЗрд░', 'рднрдпреЗрд░', np.float64(1.0000000000000002)),
 ('рднрдПрд░', 'рднрдПрд░', np.float64(1.0)),
 ('рднрдпреНрд░', 'рднрдпреНрд░рд░', np.float64(0.8047247769238031)),
 ('рднрд╛рдПрд░', 'рдирд┐рднрд╛рдПрд░', np.float64(0.7061244569651446)),
 ('рднрдпреНрд░рд╛', 'рднрдпреНрд░рд░', np.float64(0.7044708057411735)),
 ('рднрд╛рдпреНрд░', 'рдорд╛рдпреНрд░', np.float64(0.6456314729709742)),
 ('рднрдПрд░рд╛', 'рднрдПрд░', np.float64(0.6434916170887707)),
 ('рднрд╛рдпреНрд░рд╛', 'рдорд╛рдпреНрд░', np.float64(0.5848289707228022)),
 ('рднрд╛рдПрд░рд╛', 'рдирд┐рднрд╛рдПрд░', np.float64(0.5561902315551083)),
 ('рднреНрд╣реИрд░рд╛', 'рд╣реИрд░рд╛рди', np.float64(0.5516259195722237)),
 ('рднреНрд╣реИрд░', 'рд╣реИрд░реА', np.float64(0.36008991294924636)),
 ('рднреНрд╣рдПрд░', 'рд░рд╣рдП', np.float64(0.23631770944209657))]

In [8]:
from collections import defaultdict
import pandas as pd
import unicodedata
from metaphone import doublemetaphone

# Load the CSV file
df = pd.read_csv("files/words_devnagri_root.csv")

In [9]:
df.head()

Unnamed: 0,Words,Devanagari,Root
0,xata,рдЫрдЯ,рдЫрдЯ
1,xatw,рдЫрдЯ,рдЫрдЯ
2,xodyo,рдЫреЛрдбреНрдпреЛ,рдЫреЛрдбреНрдпреЛ
3,xoto,рдЫреЛрдЯреЛ,рдЫреЛрдЯреЛ
4,sidhai,рд╕рд┐рдзреИ,рд╕рд┐рдзрд╛


In [10]:
grouped = df.groupby('Devanagari')['Words'].apply(list).reset_index()

# Filter to only include entries with more than one word
filtered = grouped[grouped['Words'].apply(len) > 1]

In [11]:
filtered

Unnamed: 0,Devanagari,Words
3,рдкрд░рд┐рдпреЛ,"[pariyo, pareyo]"
4,рдкрд░реНрдпреЛ,"[paryoo, paryo]"
5,рдкреБрд░реИ,"[paurai, purae]"
7,рднрдПрд░,"[vayerw, vaera, vayeraw, bhaera]"
16,рднрдПрд░,"[vayera, vayeraw, vayara, vayerw, bhayera]"
18,рд╕рд┐рдзреИ,"[sidhai, siddai]"


In [12]:
# from collections import Counter
# total = 0
# accurate = 0
# not_accurate = 0
# NEPALI_DICTIONARY = saved_word_count

# for nepali_word, variants in zip(filtered['Devanagari'], filtered['Words']):
#     # print("nepali_word", nepali_word, 'variants', variants)
#     nepali_variations = []
#     for variant in variants:
#       total +=1
#       latin_variations = generate_variations(variant)
#       devanagari_variations = []
#       for new_variant in latin_variations:
#           nepali = transliterate(new_variant, sanscript.ITRANS, sanscript.DEVANAGARI)
#           devanagari_variations.append(nepali)
#       top_matched_words = rank_variants_by_similarity(devanagari_variations, NEPALI_DICTIONARY)
#       nepali = top_matched_words[0][0] if top_matched_words else None
#       nepali_variations.append((nepali,))
#     # Count occurrences of each mapped Nepali word
#     counter = Counter(nepali_variations)
#     most_common_word, count = counter.most_common(1)[0]

#     if len(counter) == 1:
#         print(f"тЬЕ All variants map to the same Nepali word: '{most_common_word}' тАФ Adding {count}")
#         accurate += count
#     else:
#         print(f"тЭМ Multiple mappings found: {dict(counter)}")
#         print(f"тЬЕ Keeping only the most frequent one: '{most_common_word}' тАФ Adding {count}")
#         accurate += count
# print(f'accurate: {accurate} total: {total} percent{accurate/total}')

тЭМ Multiple mappings found: {('рдкрд╛рд░рд┐рдпреЛ',): 1, ('рдкрд╛рд░реАрдпреЛ',): 1}
тЬЕ Keeping only the most frequent one: '('рдкрд╛рд░рд┐рдпреЛ',)' тАФ Adding 1
тЭМ Multiple mappings found: {('рдкрд╛рд░реНрдпреБ',): 1, ('рдкрд░реНрдпреЛ',): 1}
тЬЕ Keeping only the most frequent one: '('рдкрд╛рд░реНрдпреБ',)' тАФ Adding 1
тЭМ Multiple mappings found: {('рдкреМрд░рд╛рдИ',): 1, ('рдкреБрд░реИ',): 1}
тЬЕ Keeping only the most frequent one: '('рдкреМрд░рд╛рдИ',)' тАФ Adding 1
тЭМ Multiple mappings found: {('рд╡рдпреЗрд░',): 1, ('рдмреИрд░рд╛',): 2, ('рднрдПрд░',): 1}
тЬЕ Keeping only the most frequent one: '('рдмреИрд░рд╛',)' тАФ Adding 2
тЭМ Multiple mappings found: {('рдмреИрд░рд╛',): 2, ('рд╡рд╛рдпрд░',): 1, ('рд╡рдпреЗрд░',): 1, ('рднрдпреЗрд░',): 1}
тЬЕ Keeping only the most frequent one: '('рдмреИрд░рд╛',)' тАФ Adding 2
тЭМ Multiple mappings found: {('рд╕рд┐рдзреИ',): 1, ('рд╕рд┐рджреНрджреИ',): 1}
тЬЕ Keeping only the most frequent one: '('рд╕рд┐рдзреИ',)' тАФ Adding 1
accurate: 

In [14]:
from collections import Counter

total = 0
accurate = 0
not_accurate = 0
NEPALI_DICTIONARY = saved_word_count

for nepali_word, variants in zip(filtered['Devanagari'], filtered['Words']):
    nepali_variations = []
    for variant in variants:
        total += 1
        latin_variations = generate_variations(variant)
        devanagari_variations = []
        for new_variant in latin_variations:
            nepali = transliterate(new_variant, sanscript.ITRANS, sanscript.DEVANAGARI)
            devanagari_variations.append(nepali)
        top_matched_words = rank_variants_by_similarity(devanagari_variations, NEPALI_DICTIONARY)
        nepali = top_matched_words[0][0] if top_matched_words else None
        nepali_variations.append((nepali, variant))  # store variant too for debugging

    # Find the most common mapped word
    mapped_words_only = [x[0] for x in nepali_variations]
    counter = Counter(mapped_words_only)
    most_common_word, count = counter.most_common(1)[0]

    # Now keep only those variants where mapped word == most_common_word
    filtered_nepali_variations = [
        (mapped_word, variant) for mapped_word, variant in nepali_variations if mapped_word == most_common_word
    ]

    if len(counter) == 1:
        print(f"тЬЕ All variants map to the same Nepali word: '{most_common_word}' тАФ Adding {count}")
        accurate += count
    else:
        print(f"тЭМ Multiple mappings found: {dict(counter)}")
        print(f"тЬЕ Keeping only the most frequent one: '{most_common_word}' тАФ Adding {count}")
        accurate += count

    # Now check mismatches based on filtered mappings
    for mapped_word, variant in nepali_variations:
        if mapped_word != nepali_word:
            print(f"ЁЯФБ Mismatch: Variant '{variant}' тЖТ Mapped '{mapped_word}', Expected '{nepali_word}'")
            not_accurate += 1


print(f'accurate: {accurate}, total: {total}, percent: {accurate / total:.2f}')


ЁЯФБ Mismatch: Variant 'pariyo' тЖТ Mapped 'рдкрд╛рд░рд┐рдпреЛ', Expected ' рдкрд░рд┐рдпреЛ'
ЁЯФБ Mismatch: Variant 'pareyo' тЖТ Mapped 'рдкрд╛рд░реАрдпреЛ', Expected ' рдкрд░рд┐рдпреЛ'
ЁЯФБ Mismatch: Variant 'paryoo' тЖТ Mapped 'рдкрд╛рд░реНрдпреБ', Expected ' рдкрд░реНрдпреЛ'
ЁЯФБ Mismatch: Variant 'paryo' тЖТ Mapped 'рдкрд░реНрдпреЛ', Expected ' рдкрд░реНрдпреЛ'
ЁЯФБ Mismatch: Variant 'paurai' тЖТ Mapped 'рдкреМрд░рд╛рдИ', Expected ' рдкреБрд░реИ'
ЁЯФБ Mismatch: Variant 'purae' тЖТ Mapped 'рдкреБрд░реИ', Expected ' рдкреБрд░реИ'
ЁЯФБ Mismatch: Variant 'vayerw' тЖТ Mapped 'рд╡рдпреЗрд░', Expected ' рднрдПрд░'
ЁЯФБ Mismatch: Variant 'vaera' тЖТ Mapped 'рдмреИрд░рд╛', Expected ' рднрдПрд░'
ЁЯФБ Mismatch: Variant 'vayeraw' тЖТ Mapped 'рдмреИрд░рд╛', Expected ' рднрдПрд░'
ЁЯФБ Mismatch: Variant 'bhaera' тЖТ Mapped 'рднрдПрд░', Expected ' рднрдПрд░'
ЁЯФБ Mismatch: Variant 'vayera' тЖТ Mapped 'рдмреИрд░рд╛', Expected 'рднрдПрд░'
ЁЯФБ Mismatch: Variant 'vayeraw' тЖТ Mapped 'рдмреИрд░рд╛', E

тЭМ Multiple mappings found: {'рд╕рд┐рдзреИ': 1, 'рд╕рд┐рджреНрджреИ': 1}
тЬЕ Keeping only the most frequent one: 'рд╕рд┐рдзреИ' тАФ Adding 1
ЁЯФБ Mismatch: Variant 'siddai' тЖТ Mapped 'рд╕рд┐рджреНрджреИ', Expected 'рд╕рд┐рдзреИ'
