In [9]:
# Import necessary libraries
import string
from mlphon import PhoneticAnalyser

In [10]:
# Create an object for PhoneticAnalyser()
mlphon = PhoneticAnalyser()

In [22]:
# Read the list of Malayalam words from the vocabulary file
mlwords = []
with open("ml_vocab.txt", encoding = "utf-8") as file:
    mlwords = file.read().splitlines()

In [23]:
len(mlwords)

120870

In [24]:
def split_ipa(word, ml_ipa):
    """
    Function to split the IPA representation of a Malayalam word into individual phonemes
    :param word: string, Malayalam word
    :param IPA: string, IPA representation of word
    :return: the split IPA as a string
    """
    ipa_split = []   # Initialize an empty list to store the split IPA
    idx = 0   # Initialize an index variable to keep track of the current position in the split IPA list
    for ch in ml_ipa[0]:   # Iterate over the characters of the IPA representation
        # if character is " ̪ "[1] or "ː", concatenate the character to the last element of ipa_split
        if ch == ' ̪ '[1] or ch == "ː":
            ipa_split[idx-1] = ipa_split[idx-1] + ch + (' ')
        # check if character is '̯'
        elif ch == '̯':
                # handle special cases where a single phoneme comrpises of multiple characters. Eg: "au̯" is one phoneme
                if ipa_split[idx-1] in ['u', 'i'] and ipa_split[idx-2] == 'a':
                    ipa_split[idx-2] = ipa_split[idx-2] + ipa_split[idx-1] + ch + (' ')
                    ipa_split = ipa_split[:-1]
                    idx -= 1
                else:
                    ipa_split[idx-1] = ipa_split[idx-1] + ch + (' ')
        # if character is is "ɨ" and the last element of ipa_split is "r", add character to ipa_split
        elif ch =="ɨ" and ipa_split[idx-1] == "r":
            ipa_split[idx-1] = ipa_split[idx-1] + ch + (' ')
        # check if the character is "ʃ" 
        elif ch == "ʃ":
            # if character occurs at the beginning of the IPA representation, add character to ipa_split
            if idx == 0:
                ipa_split.append(ch)
                idx += 1
            # the last element of ipa_split is " ͡ "[1], concatenate the last two elements and the current character to the second last element (before space).
            elif ipa_split[idx-1] == ' ͡ '[1]:
                ipa_split[idx-2] = ipa_split[idx-2] + ' ͡ '[1] + ch + (' ')
                ipa_split = ipa_split[:-1]
                idx -=1
        # check if the character is "ʰ" or "ʱ"
        elif ch == "ʰ" or ch == "ʱ":
            # if the last element of ipa_split is more than one character, it replaces the space with the current character
            if len(ipa_split[idx-1]) > 1:
                ipa_split[idx-1] = ipa_split[idx-1][:-1] + ch + (' ')
            # else add character to ipa_split
            else:
                ipa_split[idx-1] = ipa_split[idx-1] + ch + (' ')
        # if none of the conditions are met, append the current character to ipa_split
        else:
            ipa_split.append(ch)
            idx += 1
    # if continuent phoneme has only one character, append a space character. Eg., "a" -> "a "
    ipa_split = [ch + ' ' if len(ch) == 1 else ch for idx, ch in enumerate(ipa_split)]
    ipa_split = ''.join(ipa_split)   # Join all characters in the list to form a single string
    ipa_split = ipa_split.rstrip()   # Remove trailing whitespaces, if any
    return ipa_split

In [25]:
def get_ipa(input_words):
    """
    Function that generates IPA representation for a Malayalam word or a list of Malayaalam words.
    :param input_word: a string or a list of strings of Malayalam word(s)
    :return: a dictionary, with <malayalam_word>:<space seperated IPA representation> as the key-value pair
    """
    ipa = {}    # Initialize an empty dictionary to store the word-IPA pairs
    # Assign input to mlwords depending on datatype (string vs list of strings)
    mlwords = [''.join(input_words.split())] if isinstance(input_words, str) else input_words
    for word in mlwords:
        try:
            # If current word has a valid IPA representation in mlphon library, use it as the IPA representation
            ml_ipa = mlphon.grapheme_to_phoneme(word)
        except ValueError:
            # Else, concatenate IPA representation of constituent letters
            ml_ipa[0] = ''.join([mlphon.grapheme_to_phoneme(letter)[0] for letter in word])
        ipa[word] = ' '.join(split_ipa(word, ml_ipa).split()) 
    return ipa

In [26]:
ml_ipa = get_ipa(mlwords)
ml_ipa

{'അ': 'a',
 'അം': 'a m',
 'അആ': 'a aː',
 'അകം': 'a k a m',
 'അകണശ്വേതകോശങ്ങൾ': 'a k a ɳ a ʋ eː t̪ a k oː a ŋ ŋ a ɭ',
 'അകത്തളങ്ങളിൽ': 'a k a t̪ t̪ a ɭ a ŋ ŋ a ɭ i l',
 'അകത്തളത്തിൽ': 'a k a t̪ t̪ a ɭ a t̪ t̪ i l',
 'അകത്താക്കി': 'a k a t̪ t̪ aː k k i',
 'അകത്തി': 'a k a t̪ t̪ i',
 'അകത്തു': 'a k a t̪ t̪ u',
 'അകത്തും': 'a k a t̪ t̪ u m',
 'അകത്തുകയറി': 'a k a t̪ t̪ u k a j a r i',
 'അകത്തുനിന്നു': 'a k a t̪ t̪ u n i n̪ n̪ u',
 'അകത്തുനിന്നും': 'a k a t̪ t̪ u n i n̪ n̪ u m',
 'അകത്തുനിന്ന്': 'a k a t̪ t̪ u n i n̪ n̪ ə',
 'അകത്തുള്ള': 'a k a t̪ t̪ u ɭ ɭ a',
 'അകത്തെ': 'a k a t̪ t̪ e',
 'അകത്തേ': 'a k a t̪ t̪ eː',
 'അകത്തേക്കു': 'a k a t̪ t̪ eː k k u',
 'അകത്തേക്കും': 'a k a t̪ t̪ eː k k u m',
 'അകത്തേക്ക്': 'a k a t̪ t̪ eː k k ə',
 'അകത്തേത്തറ': 'a k a t̪ t̪ eː t̪ t̪ a r a',
 'അകത്തേയ്ക്കു': 'a k a t̪ t̪ eː j k k u',
 'അകത്തേയ്ക്ക്': 'a k a t̪ t̪ eː j k k ə',
 'അകത്തോ': 'a k a t̪ t̪ oː',
 'അകത്ത്': 'a k a t̪ t̪ ə',
 'അകനാനൂറ്': 'a k a n aː n uː r ə',
 'അകന്ന': 'a k a n̪ n̪ a',
 'അകന്നകന്

In [28]:
# Generate lexicon file 
with open("ml_lexicon.txt", "w", encoding = "utf-8") as out_file:
    for word, ipa in ml_ipa.items():
        out_file.write(word + " " + ipa + "\n")