In [9]:
from collections import defaultdict
from fuzzy import Soundex
import pandas as pd
import unicodedata

# Load the CSV file
df = pd.read_csv("files/words_devnagri_root.csv")




In [10]:
df.head()

Unnamed: 0,Words,Devanagari,Root
0,xata,छट,छट
1,xatw,छट,छट
2,xodyo,छोड्यो,छोड्यो
3,xoto,छोटो,छोटो
4,sidhai,सिधै,सिधा


In [11]:
# Initialize Soundex instance
soundex = Soundex(4)

# Function to normalize text to ASCII
def normalize_to_ascii(word):
    return ''.join(
        (c for c in unicodedata.normalize('NFD', word) if unicodedata.category(c) != 'Mn')
    ).encode('ascii', 'ignore').decode('ascii')

# Dictionary to group both Romanized and Devanagari words
nepali_phonetic_groups = defaultdict(lambda: {"Words": set(), "Devanagari": set()})

# Iterate over rows to group by Soundex
for _, row in df.iterrows():
    roman = row["Words"]
    dev = row["Devanagari"]
    normalized_word = normalize_to_ascii(roman)
    phonetic_code = soundex(normalized_word)
    if phonetic_code and len(phonetic_code) > 0:
        nepali_phonetic_groups[phonetic_code]["Words"].add(roman)
        nepali_phonetic_groups[phonetic_code]["Devanagari"].add(dev)

# Convert sets to lists and print the results
for phonetic_code, groups in nepali_phonetic_groups.items():
    romanized_list = list(groups["Words"])
    devnagari_list = list(groups["Devanagari"])
    print(f"Phonetic Code: {phonetic_code} -> Romanized: {romanized_list}, Devanagari: {devnagari_list}")

Phonetic Code: X3 -> Romanized: ['xata', 'xoto', 'xatw', 'xito', 'xodyo'], Devanagari: [' छोटो', 'छोड्यो', ' छट', ' छिटो', 'छट']
Phonetic Code: S3 -> Romanized: ['sidha', 'sutae', 'swaad', 'sidhai', 'sidai', 'sayad', 'siddai'], Devanagari: [' सिधै', 'सिधै', 'सुते', 'सायद', ' स्वाद', ' सिधा']
Phonetic Code: C3 -> Romanized: ['chhito'], Devanagari: ['छिटो']
Phonetic Code: F6 -> Romanized: ['feri'], Devanagari: ['फेरी']
Phonetic Code: P6 -> Romanized: ['pareyo', 'paurai', 'puraa', 'pheri', 'purae', 'paryo', 'pariyo', 'paryoo'], Devanagari: [' पुरै', ' पर्यो', ' फेरी', ' परियो', 'पुरा']
Phonetic Code: V6 -> Romanized: ['vayera', 'vayara', 'vayerw', 'vayeraw', 'vaera'], Devanagari: ['भएर', ' भएर']
Phonetic Code: B6 -> Romanized: ['bhayera', 'bhaera'], Devanagari: ['भएर', ' भएर']


In [12]:
# Filter out groups with empty 'Words'
valid_groups = {k: v for k, v in nepali_phonetic_groups.items() if v["Words"]}

# Initialize counters
total_groups = len(valid_groups)
passed_groups = 0
failed_groups = 0

# Evaluate each valid group
for code, group in valid_groups.items():
    if len(group["Devanagari"]) == 1:
        passed_groups += 1
    else:
        failed_groups += 1

# Calculate accuracy
accuracy = (passed_groups / total_groups) * 100 if total_groups > 0 else 0

# Print results
print(f"\nTotal Valid Groups: {total_groups}")
print(f"Passed Groups (One Unique Devanagari): {passed_groups}")
print(f"Failed Groups (Multiple Devanagari): {failed_groups}")
print(f"Phonetic Accuracy: {accuracy:.2f}%")


Total Valid Groups: 7
Passed Groups (One Unique Devanagari): 2
Failed Groups (Multiple Devanagari): 5
Phonetic Accuracy: 28.57%


In [13]:
import pandas as pd

data = []
for phonetic_code, groups in nepali_phonetic_groups.items():
    romanized_list = list(groups["Words"])
    devnagari_list = list(groups["Devanagari"])
    data.append({
        "Phonetic Code": phonetic_code,
        "Romanized": romanized_list,
        "Devanagari": devnagari_list
    })

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("results/soundex_phonetic_groups.csv", index=False, encoding="utf-8")
