# Импорт библиотек

In [132]:
from lxml import etree as ET
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from matplotlib import pyplot as plt
import numpy as np
import json
from imblearn.over_sampling import RandomOverSampler

# Пути к каталогам

In [133]:
XML_TRAINING_FILE_PATH = "./data/train.xml"
XML_TEST_FILE_PATH = "./data/test.xml"
PREDICTED_RESULT_PATH = "./data/result.json"

# Извлечение информации из XML-файла

In [134]:
def read_xml(path):
    """
    Функция загружает данные из XML-файла, возвращает список слов и меток.

    :param path: путь к файлу
    :return: список слов и список меток
    """
    word_cnt_in_sentence = 1
    curr_parent = 0
    words = []
    labels = []
    cnt = 0

    for action, word in ET.iterparse(path, tag="word"):
        try:
            features = {
                "word": word.get("original"),  # Исходное слово: <word original/>
                "phrasal_stress": True if word.get("nucleus") else False,
                "pause": False,
                "pause_len": -1,
            }

            dictitem = word.find("dictitem")

            # Род и одушевленность слова <word genesys/>, <dictitem genesys/>
            features["genesys"] = dictitem.get("genesys")

            # Значения семантики слова: теги: <word>, <dictitem>, атрибуты: semantics1, semantics2
            features["semantics1"] = dictitem.get("semantics1")
            features["semantics2"] = dictitem.get("semantics2")

            # Части речи слов, теги <word>, <dictitem>, subpart_of_speech
            features["subpart_of_speech"] = dictitem.get("subpart_of_speech")

            # Формы слова (тег <word> тег <dictitem>, form).
            features["form"] = dictitem.get("form")

            # Информация о слове: гласные, Ё, прописные/строчные … (тег <word> тег <letter> для группы букв).
            letters = []
            for letter in word.findall("letter"):
                if letter.get("char") is not None:
                    letters.append(letter.get("char"))

            if len(letters) == 0:
                letters = list(features["word"])

            features["length"] = len(letters)
            features["vowels_sounds"] = sum(
                1 if letter in "аоиыуэАОИЫУЭ" else 0 for letter in letters
            )
            features["vowels_letters"] = sum(
                1 if letter in "аяуюоеёэиыАЯУЮОЕЁЭИЫ" else 0 for letter in letters
            )

            # Знаки препинания и эмфазы на слове (тег <content>, PunktEnd, PunktBeg, EmphEnd, EmphBeg).
            content = word.getprevious()
            features["PunktEnd"] = content.get("PunktEnd")
            features["PunktBeg"] = content.get("PunktBeg")
            features["EmphEnd"] = content.get("EmphEnd")
            features["EmphBeg"] = content.get("EmphBeg")

            if curr_parent == 0:
                curr_parent = word.getparent()[1]
            elif curr_parent == word.getparent()[1]:
                word_cnt_in_sentence += 1
            else:
                word_cnt_in_sentence = 1
                curr_parent = word.getparent()[1]

            # сколько слов в предложении до текущего
            features["words_before"] = word_cnt_in_sentence - 1

            pause = None
            if word.getnext() is not None:
                if word.getnext().tag == "pause":
                    pause = word.getnext()
                elif (
                    word.getnext().getnext() is not None
                    and word.getnext().getnext().tag == "pause"
                ):
                    pause = word.getnext().getnext()

            label = [
                "undefined"
                if pause is None
                else pause.get(
                    "type"
                ),  # тип паузы, если добавляем отсутствующую паузу после слова, то undefined.
                0
                if pause is None or pause.get("time") is None
                else pause.get("time"),  # время паузы
                0
                if pause is None
                else 1,  # есть ли пауза после текущего слова или нет 1/0.
                0 if word.get("nucleus") is None else word.get("nucleus"),
            ]  # те самые нуклеус, если в word есть nucleus, то переносим в список, если нет то ставим 0.

            labels.append(label)

            words.append(features)

            word.clear()

        except:
            cnt += 1
    print("Количество исключений:", cnt)
    return labels, words

In [135]:
labels, words = read_xml(XML_TRAINING_FILE_PATH)

Количество исключений: 0


In [136]:
labels[:5]

[['undefined', 0, 0, 0],
 ['undefined', 0, 0, 0],
 ['undefined', 0, 0, 0],
 ['x-long', '1020', 1, '2'],
 ['undefined', 0, 0, 0]]

In [137]:
words[0]

{'word': 'ПРЕДИСЛОВИЕ',
 'phrasal_stress': False,
 'pause': False,
 'pause_len': -1,
 'genesys': '6',
 'semantics1': None,
 'semantics2': None,
 'subpart_of_speech': '1',
 'form': '1',
 'length': 11,
 'vowels_sounds': 3,
 'vowels_letters': 5,
 'PunktEnd': None,
 'PunktBeg': None,
 'EmphEnd': None,
 'EmphBeg': None,
 'words_before': 0}

# Преобразование информации в признаки для обучения

In [138]:
def get_features(words):
    keys = list(words[0].keys())[4:]
    df = pd.DataFrame([{k: word[k] if word[k] is not None else 0 for k in keys} for word in words])
    scaler = MinMaxScaler()
    features = scaler.fit_transform(df)
    return features

In [139]:
def get_labels(labels):
    pause_type = [element[0] for element in labels]
    pause_time = [element[1] for element in labels]
    pause = [element[2] for element in labels]
    phrasal_stress = [element[3] for element in labels]
    phrasal_stress_binary = []
    for i in phrasal_stress:
        if i == 0:
            i = "False"
        else:
            i = "True"
        phrasal_stress_binary.append(i)
    return pause_type, pause_time, pause, phrasal_stress, phrasal_stress_binary

In [140]:
features = get_features(words)

In [141]:
pause_type, pause_time, pause, phrasal_stress, phrasal_stress_binary = get_labels(labels)

In [142]:
pause_time= list(map(lambda x : int(x), pause_time))

# Обучение моделей

## Предсказание мест пауз

In [143]:
# Балансировка датасета c методом overSample
oversample = RandomOverSampler(sampling_strategy='minority')

In [144]:
X_over, y_pause_over = oversample.fit_resample(features, pause)

In [145]:
X_train, X_test, Y_train, Y_test = train_test_split(X_over, y_pause_over, test_size=0.2, random_state=42)
clf_pause = RandomForestClassifier(random_state=42)
clf_pause.fit(X_train,Y_train)
pred_pause = clf_pause.predict(X_test)
print(classification_report(Y_test, pred_pause))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      9853
           1       0.95      0.98      0.96      9855

    accuracy                           0.96     19708
   macro avg       0.96      0.96      0.96     19708
weighted avg       0.96      0.96      0.96     19708



## Предсказание длительности пауз

In [146]:
X_train_pause_time, X_test_pause_time, Y_train_pause_time, Y_test_pause_time = train_test_split(features, pause_time, test_size=0.2, random_state=42)
regressor_pause_time = RandomForestRegressor(random_state=42)
regressor_pause_time.fit(X_train_pause_time, Y_train_pause_time)
regressor_pause_time.score(X_test_pause_time, Y_test_pause_time)

0.8463472855112936

In [147]:
pred_pause_time = regressor_pause_time.predict(X_test_pause_time)

## Предсказание мест фразового ударения

In [148]:
X_over, y_phrasal_stress_binary_over = oversample.fit_resample(features, phrasal_stress_binary)

In [149]:
X_train_phrasal_stress, X_test_phrasal_stress, Y_train_phrasal_stress, Y_test_phrasal_stress = train_test_split(X_over, y_phrasal_stress_binary_over, test_size=0.2, random_state=42)
clf_phrasal_stress = RandomForestClassifier(random_state=42)
clf_phrasal_stress.fit(X_train_phrasal_stress,Y_train_phrasal_stress)
pred_phrasal_stress = clf_phrasal_stress.predict(X_test_phrasal_stress)
print(classification_report(Y_test_phrasal_stress, pred_phrasal_stress))

              precision    recall  f1-score   support

       False       0.97      0.94      0.95      9839
        True       0.94      0.97      0.96      9861

    accuracy                           0.96     19700
   macro avg       0.96      0.96      0.96     19700
weighted avg       0.96      0.96      0.96     19700



# Предсказание на тестовом файле

In [150]:
_, test_words = read_xml(XML_TEST_FILE_PATH)

Количество исключений: 0


In [151]:
test_features = get_features(test_words)

In [152]:
pred_pause = clf_pause.predict(test_features)
pred_pause_time = regressor_pause_time.predict(test_features)
pred_phrasal_stress = clf_phrasal_stress.predict(test_features)

In [153]:
pred_pause_time= list(map(lambda x : int(x), pred_pause_time))

In [154]:
results = []
for i in range(len(test_words)):
    record = {"content":test_words[i]["word"]}
    if pred_pause[i] == 0:
        record["pause_len"] = -1
    else:
        record["pause_len"] = int(pred_pause_time[i])
    record["phrasal_stress"] = pred_phrasal_stress[i]
    results.append(record)

In [155]:
with open(PREDICTED_RESULT_PATH, 'w', encoding='utf-8') as json_file:
    json.dump([{"words": results}], json_file, ensure_ascii=False, indent=4)