<a href="https://colab.research.google.com/github/detektor777/colab_list/blob/main/stress_syllables_ua.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title ##**Run** { display-mode: "form" }
import requests
from bs4 import BeautifulSoup
from IPython.display import display
import ipywidgets as widgets
import unicodedata
import regex  # Use regex module

# User-supplied stress overrides, rebuilt by on_file_upload_change().
# Key: the word lowercased with '+' and combining marks removed.
# Value: the NFC-normalized line exactly as written in the uploaded file.
custom_dict = {}

def normalize_text(text):
    """Return *text* converted to Unicode NFC (canonically composed) form."""
    composed = unicodedata.normalize('NFC', text)
    return composed

def remove_combining_chars(text):
    """Strip every nonspacing mark (Unicode category ``Mn``) from *text*.

    The text is decomposed (NFD) first, so marks that are part of a
    precomposed letter are removed as well: besides the stress accent this
    also folds e.g. U+0439 to U+0438 and U+0457 to U+0456. The result is
    recomposed to NFC before being returned.
    """
    kept = []
    for char in unicodedata.normalize('NFD', text):
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return unicodedata.normalize('NFC', ''.join(kept))

def convert_accented_text(text):
    """Rewrite acute-accent stress marks as a trailing ``+``.

    Each character of *text* is decomposed on its own. If the decomposition
    contains a code point whose Unicode name includes
    ``COMBINING ACUTE ACCENT``, that code point is dropped and ``+`` is
    emitted after what remains; otherwise the character is passed through.
    Every emitted piece is NFC-normalized.
    """
    marker = 'COMBINING ACUTE ACCENT'
    pieces = []
    for char in text:
        stressed = False
        remaining = []
        for part in unicodedata.normalize('NFD', char):
            if marker in unicodedata.name(part, ''):
                stressed = True
            else:
                remaining.append(part)
        if stressed:
            # A standalone combining accent leaves nothing behind, so only
            # the "+" is appended right after the previously emitted letter.
            pieces.append(unicodedata.normalize('NFC', ''.join(remaining)) + "+")
        else:
            pieces.append(unicodedata.normalize('NFC', char))
    return ''.join(pieces)

def adjust_case(original, replacement):
    """Return *replacement* re-cased to follow the casing of *original*.

    Rules, checked in order:

    * empty *original*        -> *replacement* unchanged
    * all upper case          -> ``replacement.upper()``
    * leading capital only    -> ``replacement.capitalize()``
    * all lower case          -> ``replacement.lower()``
    * anything else (mixed)   -> case is copied letter by letter

    In the mixed branch, stress markers (``+`` and Unicode combining marks)
    are skipped on both sides, so the n-th letter of *replacement* takes the
    case of the n-th letter of *original* even when the two strings place
    the stress at different positions. Letters of *replacement* beyond the
    last letter of *original* are kept as written.

    Args:
        original: The token as it appeared in the text.
        replacement: The dictionary form that will replace it.

    Returns:
        The re-cased replacement string.
    """
    # Guard: original[0] below would raise IndexError on an empty token.
    if not original:
        return replacement
    if original.isupper():
        return replacement.upper()
    if original[0].isupper() and original[1:].islower():
        return replacement.capitalize()
    if original.islower():
        return replacement.lower()

    def is_stress_marker(char):
        return char == '+' or unicodedata.category(char).startswith('M')

    # Case pattern of the original's letters only; markers carry no case and
    # would otherwise shift the alignment between the two strings.
    upper_flags = iter(
        [char.isupper() for char in original if not is_stress_marker(char)]
    )
    adjusted = []
    exhausted = False
    for r_char in replacement:
        if exhausted or is_stress_marker(r_char):
            adjusted.append(r_char)
            continue
        flag = next(upper_flags, None)
        if flag is None:
            # The replacement has more letters than the original: keep the tail.
            exhausted = True
            adjusted.append(r_char)
        else:
            adjusted.append(r_char.upper() if flag else r_char.lower())
    return ''.join(adjusted)

def replace_with_custom_dict(text):
    """Replace words of *text* that have an entry in ``custom_dict``.

    The text is NFC-normalized and split into word, whitespace and
    "everything else" tokens; joining the tokens reproduces the normalized
    text. For each word token a lookup key is built the same way the
    dictionary keys are built (combining marks and ``+`` removed, lowercased).
    On a hit the dictionary value, re-cased via ``adjust_case``, replaces the
    token; all other tokens are copied through unchanged.

    A word may contain an apostrophe between letters (``'`` or U+2019), so
    entries such as "м'я+со" can match; without this the tokenizer would
    split the word at the apostrophe and the entry could never be found.
    If the exact key misses, the lookup is retried with each apostrophe
    variant, so a typographic mismatch between the text and the dictionary
    file does not prevent the match.

    Args:
        text: Text possibly containing stress marks.

    Returns:
        The text with dictionary words substituted.
    """
    # U+02BC is a letter (category Lm) and is already covered by \p{L}; it is
    # listed here only so that key variants can be generated for it.
    apostrophes = "'\u2019\u02bc"
    letters = r"[\p{L}\p{M}\+]+"
    word = letters + "(?:['\u2019]" + letters + ")*"

    text = normalize_text(text)
    tokens = regex.findall(word + r"|\s+|[^\s\p{L}\p{M}]+", text)

    new_tokens = []
    for token in tokens:
        token_normalized = normalize_text(token)
        if not regex.fullmatch(word, token_normalized):
            new_tokens.append(token)
            continue

        stripped = remove_combining_chars(token_normalized)
        base_token = normalize_text(stripped.replace('+', '').lower())

        replacement = custom_dict.get(base_token)
        if replacement is None:
            # Retry with every apostrophe spelled the same way.
            for mark in apostrophes:
                variant = base_token.translate(
                    {ord(other): mark for other in apostrophes}
                )
                replacement = custom_dict.get(variant)
                if replacement is not None:
                    break

        if replacement is None:
            new_tokens.append(token)
        else:
            new_tokens.append(adjust_case(token, replacement))
    return ''.join(new_tokens)

def send_post_request(text, timeout=30):
    """Send *text* to slovnyk.ua for stress marking and show the result.

    The element with ``id="emph"`` is extracted from the response page, its
    ``<br>`` tags are turned into newlines, custom-dictionary overrides are
    applied, and acute accents are converted to the ``+`` notation. The
    outcome, or a short error message, is written to the ``output`` widget;
    nothing is returned.

    Args:
        text: The text to send.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            server, freezing the button callback.
    """
    url = "https://slovnyk.ua/nagolos.php"
    data = {'text': text}
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Only the network call can raise RequestException (timeouts included).
    try:
        response = requests.post(url, data=data, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        output.value = f"Connection error: {e}"
        return

    if response.status_code != 200:
        output.value = f"Error: {response.status_code}"
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    emph_content = soup.find(id="emph")
    if not emph_content:
        output.value = "Element with id='emph' not found."
        return

    # get_text() drops tags, so line breaks must be materialized first.
    for br in emph_content.find_all("br"):
        br.replace_with("\n")
    plain_text = emph_content.get_text()
    processed_text = replace_with_custom_dict(plain_text.strip())
    output.value = convert_accented_text(processed_text)

def on_submit_button_clicked(b):
    """Click handler for the submit button.

    Forwards the contents of the input area to ``send_post_request``; when
    the input is empty a prompt is written to the result area instead.
    The button argument *b* is not used.
    """
    entered = text_input.value
    if not entered:
        output.value = "Please enter text."
        return
    send_post_request(entered)

def on_file_upload_change(change):
    """Rebuild ``custom_dict`` from the file selected in ``file_upload``.

    The file is read as UTF-8 text with one word per line, written with its
    stress marked (``+`` or a combining accent). Each non-empty line is
    stored under its accent-free, lowercased form. Nothing happens while the
    widget holds no file. The *change* argument is not used; the widget is
    read directly.
    """
    global custom_dict
    uploads = file_upload.value
    if not uploads:
        return

    # ipywidgets 7 exposes a dict {filename: {...}}; ipywidgets 8 exposes a
    # tuple of per-file mappings. Support both.
    if isinstance(uploads, dict):
        uploaded_file = next(iter(uploads.values()))
    else:
        uploaded_file = uploads[0]

    # ipywidgets 8 delivers the content as a memoryview, which has no
    # .decode(); bytes() accepts both representations. 'utf-8-sig' drops a
    # leading BOM that would otherwise become part of the first key.
    content = bytes(uploaded_file['content']).decode('utf-8-sig')

    # Build the new mapping first so a failure above leaves the old one intact.
    entries = {}
    for line in content.splitlines():
        line = line.strip()
        if not line:
            continue
        line_normalized = normalize_text(line)
        base_word = remove_combining_chars(
            line_normalized.replace('+', '').lower()
        )
        entries[base_word] = line_normalized
    custom_dict = entries

try:
    import regex
except ImportError:
    !pip install regex
    import regex

# --- Widget construction and wiring -------------------------------------
# The names below are module-level globals that the callbacks defined
# earlier read (file_upload, text_input) and write (output).

# Picker for the optional custom dictionary: a single .txt file.
file_upload = widgets.FileUpload(
    accept='.txt',
    multiple=False
)
# Call on_file_upload_change whenever the widget's `value` trait changes.
file_upload.observe(on_file_upload_change, names='value')

# Input area for the text to be stress-marked.
text_input = widgets.Textarea(
    description='Text:',
    placeholder='Enter text...',
    layout=widgets.Layout(width='90%', height='200px')
)
submit_button = widgets.Button(description="Submit")

# Result area; created disabled, its value is set by the callbacks.
output = widgets.Textarea(
    description='Result:',
    disabled=True,
    layout=widgets.Layout(width='90%', height='200px')
)

submit_button.on_click(on_submit_button_clicked)

# Render: upload control with its label on one row, then input, button, result.
display(widgets.HBox([file_upload, widgets.Label('Custom Dictionary')]))
display(text_input, submit_button, output)
