In [1]:
import pandas as pd
from tabulate import tabulate
import json
from variables import LATIN_GURMUKHI

In [2]:
def find_incompatible_chars(series):
    """Collect the characters in ``series`` that have no entry in LATIN_GURMUKHI.

    Args:
        series: An iterable of strings (for example a pandas Series of text).
            Every value is iterated character by character.

    Returns:
        set: The distinct characters for which ``char not in LATIN_GURMUKHI``
        holds. Empty when every character is covered or ``series`` is empty.
    """
    return {
        char
        for val in series
        for char in val
        if char not in LATIN_GURMUKHI
    }

In [3]:
# Two-key sequences that collapse into a single precomposed Gurmukhi letter
# (Python equivalent of a JS Map of Maps).
#   outer key -> the first Latin key of the pair
#   inner key -> the Latin key that follows it
#   value     -> the Unicode character emitted for the pair
# Note: transform_latin_to_gurmukhi intercepts 'f' before it consults this
# table, so the 'D' -> 'f' entry is not reached through that function.
combinational_map = {
    'A': {
        'g': '\u0A09',  # ਉ
        't': '\u0A0A',  # ਊ
    },
    'd': {
        'e': '\u0A06',  # ਆ
        'w': '\u0A10',  # ਐ
        'q': '\u0A14',  # ਔ
    },
    'D': {
        'f': '\u0A07',  # ਇ
        'r': '\u0A08',  # ਈ
        's': '\u0A0F',  # ਏ
    },
}

def transform_latin_to_gurmukhi(latin_text):
    """Transliterate a Latin-keyed string into Gurmukhi Unicode text.

    Rules, applied character by character:

    * ``'f'`` (sihari) is a prefix marker: nothing is emitted for it directly.
      The character that follows is emitted first and then ``get_char('f')``
      is appended after it. The pair ``'f'`` + ``'D'`` is emitted as the single
      character U+0A07 instead.
    * A character that is a key of ``combinational_map`` is held back. If the
      next character is one of its combining keys, the precomposed character
      from the table is emitted; otherwise the held character is emitted via
      ``get_char`` and the next character is processed normally (so it may
      itself start a new combination).
    * Every other character is emitted via ``get_char``.

    Held-back characters are never dropped: a pending combinational character
    is flushed before a sihari marker and at the end of the input, and a
    trailing ``'f'`` with nothing after it is emitted as ``get_char('f')``.

    Args:
        latin_text: The string to transliterate.

    Returns:
        str: The transliterated text.
    """
    sihari_pending = False
    pending_base = None  # combinational key waiting to see its follower
    pieces = []

    for char in latin_text:
        if char == 'f':  # sihari marker: applies to the *next* character
            if pending_base is not None:
                # Emit the held character now so output order matches input
                # order instead of leaking past the sihari pair.
                pieces.append(get_char(pending_base))
                pending_base = None
            sihari_pending = True
            continue

        if sihari_pending:
            if char == 'D':
                pieces.append('\u0A07')  # 'f' + 'D' -> ਇ
            else:
                pieces.append(get_char(char))
                pieces.append(get_char('f'))  # sign goes after its character
            sihari_pending = False
            continue

        if pending_base is not None:
            held, pending_base = pending_base, None
            combined = combinational_map.get(held, {}).get(char)
            if combined is not None:
                pieces.append(combined)
                continue
            # No combination: transliterate the held character, then fall
            # through so the current character is handled like any other.
            pieces.append(get_char(held))

        if char in combinational_map:
            pending_base = char
        else:
            pieces.append(get_char(char))

    # Flush whatever is still held so trailing input is not lost.
    if pending_base is not None:
        pieces.append(get_char(pending_base))
    if sihari_pending:
        pieces.append(get_char('f'))

    return ''.join(pieces)

def get_char(char):
    """Look up ``char`` in LATIN_GURMUKHI, falling back to ``char`` itself.

    The input is returned unchanged when the table has no entry for it or
    when the stored value is ``None``.
    """
    mapped = LATIN_GURMUKHI.get(char)
    if mapped is None:
        return char
    return mapped

In [4]:
# Load the source dictionary and store 'akharIndex' as a nullable 8-bit
# integer column so missing values survive the cast.
df = pd.read_json('./data/in/dict.json').astype({'akharIndex': 'Int8'})

In [5]:
# Work on a renamed copy ('lexeme' -> 'headword') and forward-fill gaps in
# 'akharIndex' from the closest preceding row that has a value.
df2 = (
    df
    .rename(columns={'lexeme': 'headword'})
    .assign(akharIndex=lambda frame: frame['akharIndex'].ffill())
)

In [6]:
# Optional diagnostic (disabled): check whether every headword ends with ';'
# and list the ones that do not.
# ends_with_semicolon = df2['headword'].str.endswith(';')

# all_have_semicolon = ends_with_semicolon.all()
# print("Do all headwords have a ';' suffix?", all_have_semicolon)

# names_without_semicolon = df2[~ends_with_semicolon]['headword'].tolist()
# print("Headwords without a ';' suffix:", names_without_semicolon)

# Collect every character used in the definitions that has no entry in
# LATIN_GURMUKHI. The result is only kept for inspection; uncomment the
# print below to view it.
series = df2['definition']
incompatible_chars = find_incompatible_chars(series)
# print(incompatible_chars)

In [7]:
# Remove trailing ';' from every headword. str.rstrip strips *all* trailing
# semicolons, not just one; other characters are left untouched.
df2['headword'] = df2['headword'].str.rstrip(';')

In [8]:
# Add an 'id' column in first position, computed as the frame's index + 1.
# NOTE(review): this yields 1..n only if the index is still the default
# 0..n-1 range — confirm against the input file.
# DataFrame.insert rejects a column name that already exists, so re-running
# this cell without rebuilding df2 raises ValueError.
id_series = df2.index + 1
df2.insert(0, 'id', id_series)

In [9]:
# Transliterate the headword and definition columns element by element.
headword_series = df2['headword']
definition_series = df2['definition']
unicode_headword_series = headword_series.apply(transform_latin_to_gurmukhi)
unicode_definition_series = definition_series.apply(transform_latin_to_gurmukhi)

# The new columns are placed by position (3 and 5), so their final location
# depends on the column layout df2 has at this point.
df2.insert(3, 'unicodeHeadword', unicode_headword_series)
df2.insert(5, 'unicodeDefinition', unicode_definition_series)

In [10]:
# Serialise the frame as a JSON array with one object per row and write it out.
json_output = df2.to_json(orient='records')
# NOTE: json.dumps(json_output, indent=4) would not pretty-print the records;
# json_output is already a str, so dumps would wrap it in one quoted JSON string.
# json_formatted = json.dumps(json_output, indent=4)
# Pin the encoding so the file does not depend on the platform default.
with open('./data/out/dict_out.json', 'w', encoding='utf-8') as file:
    file.write(json_output)

In [11]:
# pretty_table = tabulate(df2, headers='keys', tablefmt='pretty')
# print(pretty_table)