In [7]:
from collections import defaultdict
import pandas as pd
import unicodedata
from metaphone import doublemetaphone

# Load the CSV file
df = pd.read_csv("files/words_devnagri_root.csv")



In [8]:
df.head()

Unnamed: 0,Words,Devanagari,Root
0,xata,छट,छट
1,xatw,छट,छट
2,xodyo,छोड्यो,छोड्यो
3,xoto,छोटो,छोटो
4,sidhai,सिधै,सिधा


In [9]:
# Function to normalize text to ASCII
def normalize_to_ascii(word):
    return ''.join(
        (c for c in unicodedata.normalize('NFD', word) if unicodedata.category(c) != 'Mn')
    ).encode('ascii', 'ignore').decode('ascii')

# Dictionary to group both Romanized and Devanagari words
nepali_phonetic_groups = defaultdict(lambda: {"Words": set(), "Root": set()})

# Iterate over rows to group by Double Metaphone
for _, row in df.iterrows():
    roman = row["Words"]
    dev = row["Root"]
    normalized_word = normalize_to_ascii(roman)
    primary_code, secondary_code = doublemetaphone(normalized_word)

    # Choose primary code if available, otherwise fallback to secondary
    phonetic_code = primary_code or secondary_code

    if phonetic_code and len(phonetic_code) > 0:
        nepali_phonetic_groups[phonetic_code]["Words"].add(roman)
        nepali_phonetic_groups[phonetic_code]["Root"].add(dev)

# Convert sets to lists and print the results
for phonetic_code, groups in nepali_phonetic_groups.items():
    romanized_list = list(groups["Words"])
    devnagari_list = list(groups["Root"])
    print(f"Phonetic Code: {phonetic_code} -> Romanized: {romanized_list}, Root: {devnagari_list}")


Phonetic Code: ST -> Romanized: ['xodyo', 'sayad', 'swaad', 'xoto', 'xito', 'xatw', 'sidhai', 'xata', 'siddai', 'sutae', 'sidai', 'sidha'], Root: [' सुते', ' छट', ' सिधा', ' छोड्यो', ' सिधा ', ' छिटो', 'स्वाद', ' छोटो', ' सायद']
Phonetic Code: KT -> Romanized: ['chhito'], Root: [' छिटो']
Phonetic Code: FR -> Romanized: ['feri', 'vaera', 'pheri', 'vayeraw', 'vayerw', 'vayera', 'vayara'], Root: [' फेरी', ' भएर']
Phonetic Code: PR -> Romanized: ['puraa', 'purae', 'pareyo', 'paurai', 'pariyo', 'bhayera', 'paryo', 'paryoo', 'bhaera'], Root: [' पुरा', ' पर्यो', ' भएर']


In [11]:
# Accuracy Evaluation
total_groups = len(nepali_phonetic_groups)
passed_groups = sum(1 for group in nepali_phonetic_groups.values() if len(group["Root"]) == 1)
failed_groups = total_groups - passed_groups
accuracy = (passed_groups / total_groups) * 100 if total_groups else 0

print(f"\nTotal Groups: {total_groups}")
print(f"Passed Groups (One Unique Devanagari): {passed_groups}")
print(f"Failed Groups (Multiple Devanagari): {failed_groups}")
print(f"Phonetic Accuracy (Double Metaphone): {accuracy:.2f}%")



Total Groups: 4
Passed Groups (One Unique Devanagari): 1
Failed Groups (Multiple Devanagari): 3
Phonetic Accuracy (Double Metaphone): 25.00%
