In [35]:
from collections import defaultdict
import pandas as pd
import unicodedata
from metaphone import doublemetaphone

# Load the CSV file
df = pd.read_csv("files/multi_words_devnagri_root.csv")



In [36]:
df.head()

Unnamed: 0,Words,Devanagari,Root
0,raamrai,राम्रै,राम्रा
1,raamro,राम्रो,राम्रा
2,fohor,फोहोर,फोहोर
3,xodyo,छोड्यो,छोड्यो
4,sidhai,सिधै,सिधा


In [37]:
# Function to normalize text to ASCII
def normalize_to_ascii(word):
    return ''.join(
        (c for c in unicodedata.normalize('NFD', word) if unicodedata.category(c) != 'Mn')
    ).encode('ascii', 'ignore').decode('ascii')

# Dictionary to group both Romanized and Devanagari words
nepali_phonetic_groups = defaultdict(lambda: {"Words": set(), "Root": set()})

# Iterate over rows to group by Double Metaphone
for _, row in df.iterrows():
    roman = row["Words"]
    dev = row["Root"]
    normalized_word = normalize_to_ascii(roman)
    primary_code, secondary_code = doublemetaphone(normalized_word)

    # Choose primary code if available, otherwise fallback to secondary
    phonetic_code = primary_code or secondary_code

    if phonetic_code and len(phonetic_code) > 0:
        nepali_phonetic_groups[phonetic_code]["Words"].add(roman)
        nepali_phonetic_groups[phonetic_code]["Root"].add(dev)

# Convert sets to lists and print the results
for phonetic_code, groups in nepali_phonetic_groups.items():
    romanized_list = list(groups["Words"])
    devnagari_list = list(groups["Root"])
    print(f"Phonetic Code: {phonetic_code} -> Romanized: {romanized_list}, Root: {devnagari_list}")


Phonetic Code: RMR -> Romanized: ['raamrai', 'ramro', 'raamro', 'ramroi', 'ramroo'], Root: ['राम्रो', 'राम्रा', ' राम्रै', ' राम्रा', ' राम्रो']
Phonetic Code: FHR -> Romanized: ['fohor'], Root: ['फोहोर']
Phonetic Code: ST -> Romanized: ['xodyo', 'sadaie', 'sadhai', 'sidhai', 'siddai'], Root: [' छोड्यो', ' सिधा ', 'सधै', ' सिधा']
Phonetic Code: TKNKS -> Romanized: ['dekhinx', 'dekinxa'], Root: ['देखिन्छ']
Phonetic Code: FR -> Romanized: ['vayera', 'vayeraw', 'vaera', 'vayerw', 'vayara'], Root: [' भएर']
Phonetic Code: PR -> Romanized: ['paryoo', 'paurai', 'pariyo', 'purae', 'pareyo', 'bhayera', 'bhaera', 'paryo', 'puraie'], Root: [' भएर', ' पर्यो', ' पुरा', 'पुरै']
Phonetic Code: MLTN -> Romanized: ['mildaina'], Root: ['मिल्दैन']
Phonetic Code: A0 -> Romanized: ['authyo', 'aauthyo'], Root: ['आउथ्यो']
Phonetic Code: KRK -> Romanized: ['koreko', 'gariyeko', 'garieko'], Root: ['कोरेको', 'गरीएको']
Phonetic Code: PN -> Romanized: ['bhannuu', 'paaunu', 'bhannu', 'pugne', 'paunu'], Root: [' पु

In [38]:
# Accuracy Evaluation
total_groups = len(nepali_phonetic_groups)
passed_groups = sum(1 for group in nepali_phonetic_groups.values() if len(group["Root"]) == 1)
failed_groups = total_groups - passed_groups
accuracy = (passed_groups / total_groups) * 100 if total_groups else 0

print(f"\nTotal Groups: {total_groups}")
print(f"Passed Groups (One Unique Devanagari): {passed_groups}")
print(f"Failed Groups (Multiple Devanagari): {failed_groups}")
print(f"Phonetic Accuracy (Double Metaphone): {accuracy:.2f}%")



Total Groups: 120
Passed Groups (One Unique Devanagari): 78
Failed Groups (Multiple Devanagari): 42
Phonetic Accuracy (Double Metaphone): 65.00%


In [34]:
import pandas as pd

data = []
for phonetic_code, groups in nepali_phonetic_groups.items():
    romanized_list = list(groups["Words"])
    devnagari_list = list(groups["Root"])
    data.append({
        "Phonetic Code": phonetic_code,
        "Romanized": romanized_list,
        "Devanagari": devnagari_list
    })

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("results/metaphone_phonetic_groups.csv", index=False, encoding="utf-8")
