In [18]:
import pandas as pd
from tabulate import tabulate
import json
import os
import re
from dotenv import load_dotenv
from meilisearch import Client
from variables import LATIN_GURMUKHI, PAIRI_RARA_CHARS

load_dotenv()

True

In [19]:
ms_url = os.getenv('MEILISEARCH_URL')
print(ms_url)
ms_key = f'{os.getenv('MEILISEARCH_KEY')}'
print(ms_key)

db = Client('http://localhost:7700', 'CTt64BmnTQp3SbERoImqXUEY-CegZfnuNnHZfiH1TQs')

http://localhost:7700
4a08fa4ddc18643e82c78475e74226e25e4714caecbb08774a9047d35c725bfa


In [20]:
def find_incompatible_chars(series):
    incompatible_chars = {}
    num = 0

    for val in series:
        num += 1
        for char in val:
            if char not in LATIN_GURMUKHI:
                incompatible_chars[char] = 1 if char not in incompatible_chars else incompatible_chars[char] + 1

    return incompatible_chars

In [21]:

def find_after_sihari(series):
    after_sihari = set()
    pattern = r'\b\w*f\w{1,2}J\w*\b'

    for val in series:
        matches = re.findall(pattern, val, re.DOTALL)
        for match in matches:
            after_sihari.add(match)
    
    return after_sihari

res = find_after_sihari([' afsd prefixfあJsuffix more words'])
print(res)

{'prefixfあJsuffix'}


In [22]:
# Define combinational_map in Python (equivalent to JS Map of Maps)
combinational_map = {
    'A': {'g': '\u0A09', 't': '\u0A0A'},
    'd': {'e': '\u0A06', 'w': '\u0A10', 'q': '\u0A14', 'X': 'ਆਂ'},
    'D': {'f': '\u0A07', 'r': '\u0A08', 's': '\u0A0F'}
}

def transform_latin_to_gurmukhi(latin_text):
    has_sihari = False
    combinational_char = None
    pairi_rara = False
    gurmukhi_text = []


    for i, char in enumerate(latin_text):
        if pairi_rara:
            pairi_rara = False
            continue

        if char == 'f':  # Handling sihari 'ਿ'
            has_sihari = True
            continue

        if has_sihari:
            if char == "D":  # Handling ੲ (Unicode: ਿ)
                gurmukhi_text.append("\u0A07")  # Add 'ਇ'
            else:
                if combinational_char:
                    gurmukhi_text.append(get_char(combinational_char))
                    combinational_char = None

                next_char = latin_text[i + 1] if i < len(latin_text) - 1 else None
                if next_char == 'J':
                    gurmukhi_text.extend([get_char(char) + get_char('J'), get_char('f')])
                    pairi_rara = True
                else:
                    gurmukhi_text.extend([get_char(char), get_char("f")])
            has_sihari = False
            continue

        if combinational_char:
            combi_map = combinational_map.get(combinational_char, {})

            if char in combi_map:
                gurmukhi_text.append(combi_map[char])
            else:
                gurmukhi_text.append(get_char(combinational_char))
                gurmukhi_text.append(get_char(char))
            combinational_char = None
        elif char in combinational_map:  # Handle combinational characters
            next_char = latin_text[i + 1] if i < len(latin_text) - 1 else None

            if next_char == None:
                gurmukhi_text.append(get_char(char))
            else:
                combinational_char = char
        else:
            gurmukhi_text.append(get_char(char))  # Default case for regular characters

    return ''.join(gurmukhi_text)

def get_char(char):
    c = LATIN_GURMUKHI.get(char)
    return c if c is not None else char

x = transform_latin_to_gurmukhi('hjflJd')
print(x)

ਪਰਤ੍ਰਿਅ


In [23]:
def split_definitions(unicode_txt):
    txt = '੧. ' + unicode_txt
    pattern = r'[\u0A66-\u0A6F]+\.\s*(.*?)\s*(?=[\u0A66-\u0A6F]+\.|$)'
    
    def_list = re.findall(pattern, txt, re.DOTALL)
    return def_list

In [24]:
df = pd.read_json('./data/in/dict.json')
df['akharIndex'] = df['akharIndex'].astype('Int8')

In [25]:
df2 = df.rename(columns={'lexeme': 'headword'})

df2['akharIndex'] = df2['akharIndex'].ffill()

In [None]:
# ends_with_semicolon = df2['headword'].str.endswith(';');

# all_have_semicolon = ends_with_semicolon.all()
# print("Do all headwords have a ';' suffix?", all_have_semicolon)

# names_without_semicolon = df2[~ends_with_semicolon]['headword'].tolist()
# print("Headwords without a ';' suffix:", names_without_semicolon)

series = df2['definition']
incompatible_chars = find_incompatible_chars(series)
sorted_incompatible_chars = dict(sorted(incompatible_chars.items(), key=lambda item: item[1]))
sorted_incompatible_chars_desc = dict(sorted(incompatible_chars.items(), key=lambda item: item[1], reverse=True))
pairi_rara = find_after_sihari(series)
pairi_rara_gurmukhi = map(transform_latin_to_gurmukhi, pairi_rara)

# print(sorted_incompatible_chars)
# print(sorted_incompatible_chars_desc.keys())
print(pairi_rara)
print(list(pairi_rara_gurmukhi))

In [27]:
df2['headword'] = df2['headword'].str.rstrip(';')

In [28]:
headword_list = df2['headword'].values
filtered_df = df2[df2['headword'].str.contains(r'f.*J', case=True, na=False)]
headword_list = filtered_df['headword'].values
print(headword_list)

['AgSWmJbefDm, AgSWmJbeDrM' 'AgNQfyJzfl' 'AgfOJl' ... 'fbJzOe' 'fbJzfO'
 'bJrfu, bJrur']


In [29]:
id_series = df2.index + 1
df2.insert(0, 'id', id_series)

In [30]:
headword_series = df2['headword']
definition_series = df2['definition']
unicode_headword_series = headword_series.apply(lambda hw: transform_latin_to_gurmukhi(hw))
unicode_definition_series = definition_series.apply(lambda definition: transform_latin_to_gurmukhi(definition))
definitions_list = unicode_definition_series.apply(lambda definition: split_definitions(definition))

In [31]:
df2.insert(3, 'unicodeHeadword', unicode_headword_series)
df2.insert(5, 'unicodeDefinition', unicode_definition_series)
df2.insert(6, 'definitionList', definitions_list)

In [32]:
json_output = df2.to_json(orient='records')
json_parsed = json.loads(json_output)
json_formatted = json.dumps(json_parsed, indent=4)

db.index('mk-6').add_documents(json_parsed)

with open('./data/out/dict_out.json', 'w') as file:
    file.write(json_formatted)

In [33]:
# pretty_table = tabulate(df2, headers='keys', tablefmt='pretty')
# print(pretty_table)