# Лабораторная работа №3. Акустика

In [1]:
from tqdm.auto import tqdm
from bs4 import BeautifulSoup as bs
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import classification_report
from scipy import spatial, stats
from numpy import mean

## Чтение XML-файла с размеченными данными

In [2]:
def read_xml(path, train=True):
    """Parse an annotated XML corpus and return its sentences.

    Args:
        path: Path to the XML file; it is read as UTF-8.
        train: When True, the label vocabularies are collected as well.

    Returns:
        If ``train`` is False: the result of ``find_all("sentence")``.
        If ``train`` is True: a tuple ``(sentences, allophones_classes,
        pauses_types)`` where the last two are sorted lists of the distinct
        ``ph`` attribute values of all ``<allophone>`` tags and the distinct
        ``type`` attribute values of all ``<pause>`` tags.
    """
    with open(path, "r", encoding="utf-8") as xml_file:
        raw_xml = xml_file.read()

    soup = bs(raw_xml, features="xml")
    sentences = soup.find_all("sentence")
    if not train:
        return sentences

    # Sets deduplicate the labels; sorting makes the label -> index mapping
    # used by the feature extractors stable between runs.
    phoneme_labels = {node.get("ph") for node in soup.find_all("allophone")}
    pause_labels = {node.get("type") for node in soup.find_all("pause")}
    return sentences, sorted(phoneme_labels), sorted(pause_labels)

In [3]:
# Load the annotated training corpus together with the phoneme and pause-type
# vocabularies; the vocabularies are reused later to encode labels as integers.
sentences, allophones_classes, pauses_types = read_xml("./data/train.xml")

In [4]:
# Show the label vocabularies collected from the training file.
print("allophones:", allophones_classes)
print("pauses types:", pauses_types)

allophones: ['C', 'CH', 'H', 'SC', 'a0', 'a1', 'a2', 'a4', 'b', "b'", 'c', 'ch', 'd', "d'", 'e0', 'e1', 'f', "f'", 'g', "g'", 'h', "h'", 'i0', 'i1', 'i4', 'j', 'k', "k'", 'l', "l'", 'm', "m'", 'n', "n'", 'o0', 'o1', 'o4', 'p', "p'", 'r', "r'", 's', "s'", 'sc', 'sh', 't', "t'", 'u0', 'u1', 'u4', 'v', "v'", 'y0', 'y1', 'y4', 'z', "z'", 'zh']
pauses types: ['long', 'minimal', 'spelling', 'weak', 'x-long']


## Извлечение признаков

In [5]:
def get_allophone_features(allophone, allophones_classes):
    """Build the feature dict for a single allophone element.

    Args:
        allophone: Object exposing ``.get(name)`` for its attributes; the
            ``ph`` (phoneme label) and ``mfcc`` attributes are read.
        allophones_classes: Sequence of known phoneme labels; a label's
            position in this sequence is used as its numeric id.

    Returns:
        dict with keys in this order: ``digit`` (the label's trailing digit
        as an int, otherwise None), ``soft`` (True when the label ends with
        an apostrophe), ``ph_id`` (index of the label in
        ``allophones_classes``, or None when the label is missing or not in
        the vocabulary) and, only when an ``mfcc`` attribute is present,
        ``mfcc_<i>`` floats.
    """
    features = {}
    ph = allophone.get("ph") or ""

    # Slicing (instead of ph[-1]) keeps an empty or missing label from raising.
    last_char = ph[-1:]
    # isdecimal() only accepts characters that int() is able to parse.
    features["digit"] = int(last_char) if last_char.isdecimal() else None
    features["soft"] = last_char == "'"

    # Convert the label into a numeric identifier. A label that is absent from
    # the vocabulary (e.g. seen only in the test file) becomes None, i.e. a
    # missing value, instead of aborting the whole run with ValueError.
    features["ph_id"] = (
        allophones_classes.index(ph) if ph in allophones_classes else None
    )

    mfcc = allophone.get("mfcc")
    if mfcc:
        # The first and last characters are dropped and the remainder is
        # split on "|" into the individual coefficients.
        for i, coefficient in enumerate(mfcc[1:-1].split("|")):
            features[f"mfcc_{i}"] = float(coefficient)
    return features


def get_word_features(word):
    """Extract the features of a single ``<word>`` tag.

    Args:
        word: Tag-like object exposing ``.get``, ``.find`` and ``.find_all``.

    Returns:
        dict with keys in this order: ``content`` (the ``original``
        attribute, "" when absent), ``phrasal_stress`` (truthiness of the
        ``nucleus`` attribute), ``pause`` / ``pause_type`` (placeholders,
        False / None), ``stressed`` (whether a ``<stress>`` descendant
        exists), the ``<dictitem>`` attributes ``subpart_of_speech``,
        ``form``, ``genesys``, ``semantics1``, ``semantics2`` (None when the
        attribute or the whole ``<dictitem>`` is absent), ``length`` (number
        of ``<letter>`` tags), ``vowels`` (how many of them are Russian
        vowels) and ``vowels_ratio`` (``vowels / length``, 0.0 for a word
        without letters).
    """
    features = {}
    features["content"] = word.get("original", "")

    features["phrasal_stress"] = bool(word.get("nucleus"))
    # Placeholders; the sentence-level pass fills them in when a <pause>
    # follows the word.
    features["pause"] = False
    features["pause_type"] = None

    dictitem = word.find("dictitem")
    # A word without <dictitem> gets None for every dictionary attribute
    # instead of failing with AttributeError on None.
    dict_attrs = dictitem if dictitem is not None else {}
    features["stressed"] = bool(word.find("stress"))
    features["subpart_of_speech"] = dict_attrs.get("subpart_of_speech")
    features["form"] = dict_attrs.get("form")
    features["genesys"] = dict_attrs.get("genesys")
    features["semantics1"] = dict_attrs.get("semantics1")
    features["semantics2"] = dict_attrs.get("semantics2")

    letters = [letter.get("char") for letter in word.find_all("letter")]
    # Membership in a set of single characters: a missing or empty `char`
    # no longer counts as a vowel (substring tests treat "" as contained in
    # any string), and the comparison is case-insensitive.
    vowel_chars = set("ауоиэыяюеё")
    features["length"] = len(letters)
    features["vowels"] = sum(
        1 for letter in letters if letter and letter.lower() in vowel_chars
    )
    # Guard against words that contain no <letter> tags at all.
    features["vowels_ratio"] = (
        features["vowels"] / features["length"] if features["length"] else 0.0
    )

    return features


def get_allophones_features_by_sentence(sentence, pauses_types, allophones_classes):
    """Build one feature dict per allophone of a sentence.

    Word-level features are computed first (including punctuation, emphasis
    and pause information taken from the word's siblings and the number of
    stressed words before/after it) and are then merged into the features of
    every allophone that belongs to the word.

    Args:
        sentence: ``<sentence>`` tag; its direct children are scanned for
            ``word``, ``pause`` and ``content`` elements.
        pauses_types: Sequence of known pause types; the position of a type
            in it is used as its numeric id.
        allophones_classes: Sequence of known phoneme labels, forwarded to
            ``get_allophone_features``.

    Returns:
        List of dicts, one per allophone, in document order.
    """
    allophones_features = []
    words_features = []
    punkt_flag = False
    emph_flag = False

    for child in sentence.children:
        if child.name == "word":
            current_features = get_word_features(child)
            current_features["punkt"] = punkt_flag
            current_features["emph"] = emph_flag
            # Both markers apply only to the first word that follows them, so
            # they are cleared once consumed. (The emphasis flag previously
            # was never reset because of a misspelled variable name.)
            punkt_flag = False
            emph_flag = False
            words_features.append(current_features)

        elif child.name == "pause":
            # A pause is attributed to the word that precedes it; a pause
            # that comes before any word has nothing to attach to.
            if words_features:
                pause_type = child.get("type")
                words_features[-1]["pause"] = True
                # Convert the type into a numeric identifier; a type outside
                # the known vocabulary is recorded as missing (None).
                words_features[-1]["pause_type"] = (
                    pauses_types.index(pause_type)
                    if pause_type in pauses_types
                    else None
                )

        elif child.name == "content":
            if child.has_attr("punktbeg") or child.has_attr("punktend"):
                punkt_flag = True
            if child.has_attr("emphbeg") or child.has_attr("emphend"):
                emph_flag = True

    # Count stressed words before and after every word in a single pass.
    stressed_flags = [1 if feat["stressed"] else 0 for feat in words_features]
    stressed_total = sum(stressed_flags)
    stressed_seen = 0
    for flag, feat in zip(stressed_flags, words_features):
        feat["words_before"] = stressed_seen
        feat["words_after"] = stressed_total - stressed_seen - flag
        stressed_seen += flag

    for word, features in zip(sentence.find_all("word"), words_features):
        for allophone in word.find_all("allophone"):
            cur_allophone_features = get_allophone_features(allophone, allophones_classes)
            # Enrich the allophone features with those of the word it belongs to.
            cur_allophone_features.update(features)
            allophones_features.append(cur_allophone_features)

    return allophones_features

In [6]:
# Flatten the corpus into one feature row per allophone; tqdm reports progress
# per sentence.
dataset = [
    allophone_row
    for train_sentence in tqdm(sentences)
    for allophone_row in get_allophones_features_by_sentence(
        train_sentence, pauses_types, allophones_classes
    )
]

  0%|          | 0/4490 [00:00<?, ?it/s]

In [7]:
# One row per allophone, one column per feature key.
df = pd.DataFrame(dataset)
df

Unnamed: 0,digit,soft,ph_id,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,...,genesys,semantics1,semantics2,length,vowels,vowels_ratio,punkt,emph,words_before,words_after
0,,False,37,-0.738621,-0.873899,-0.042522,-0.250536,-0.087017,0.095343,0.218702,...,6,,,11,5,0.454545,False,False,0,2
1,,True,40,-0.937843,0.717164,-0.721997,-0.102828,0.735957,0.273095,-0.323994,...,6,,,11,5,0.454545,False,False,0,2
2,1.0,False,23,-1.504312,0.565370,-0.484591,0.585640,0.931318,-0.035323,-0.472230,...,6,,,11,5,0.454545,False,False,0,2
3,,True,13,-0.351709,-0.792893,0.618621,0.639400,0.271709,0.380412,0.031763,...,6,,,11,5,0.454545,False,False,0,2
4,1.0,False,23,-0.598699,0.193774,-0.429317,0.580467,0.532159,0.367080,-0.390045,...,6,,,11,5,0.454545,False,False,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367916,1.0,False,48,-1.870997,1.131810,0.765903,0.136126,-0.019280,-0.259830,-0.115509,...,4,,,6,3,0.500000,False,False,16,0
367917,,False,18,-1.341518,0.952777,0.847543,0.159161,0.075260,-0.173946,-0.010557,...,4,,,6,3,0.500000,False,False,16,0
367918,1.0,False,48,-1.787228,1.118154,0.587195,0.141254,-0.012791,-0.243145,-0.109019,...,4,,,6,3,0.500000,False,False,16,0
367919,,False,32,-1.442069,0.766079,0.131344,0.594842,0.228244,-0.003596,0.241259,...,4,,,6,3,0.500000,False,False,16,0


In [8]:
# Inspect a single row (position 1) to see every feature column and its value.
df.iloc[1]

digit                        NaN
soft                        True
ph_id                         40
mfcc_0                 -0.937843
mfcc_1                  0.717164
mfcc_2                 -0.721997
mfcc_3                 -0.102828
mfcc_4                  0.735957
mfcc_5                  0.273095
mfcc_6                 -0.323994
mfcc_7                 -0.012875
mfcc_8                 -0.013139
mfcc_9                 -0.023937
mfcc_10                 0.088735
mfcc_11                 0.124683
content              ПРЕДИСЛОВИЕ
phrasal_stress             False
pause                      False
pause_type                   NaN
stressed                    True
subpart_of_speech              1
form                           1
genesys                        6
semantics1                  None
semantics2                  None
length                        11
vowels                         5
vowels_ratio            0.454545
punkt                      False
emph                       False
words_befo

## Обучение модели

In [9]:
# Targets are the 12 MFCC coefficients; every remaining column except the raw
# word text ("content") is used as an input feature.
target_columns = [f"mfcc_{i}" for i in range(12)]
X = df.drop(columns=target_columns + ["content"])
y = df[target_columns]

# Columns holding category codes rather than quantities. Selecting them by
# name keeps the mask correct if the feature extractors ever add, remove or
# reorder keys (a hand-written positional list silently goes out of sync).
# NOTE(review): "semantics2" is not treated as categorical although
# "semantics1" is; this mirrors the previous positional mask - confirm intent.
categorical_columns = {
    "digit",
    "ph_id",
    "pause_type",
    "subpart_of_speech",
    "form",
    "genesys",
    "semantics1",
}
categorical_mask = [column in categorical_columns for column in X.columns]


def train_mfcc_regressor():
    """Fit the MFCC regressor on the module-level ``X`` / ``y`` and report quality.

    The data is split 80/20 with a fixed seed, a gradient-boosting regressor
    wrapped in ``MultiOutputRegressor`` is fitted on the training part, and
    two diagnostics are printed for the held-out part: summary statistics of
    the per-row cosine distance between predicted and true MFCC vectors, and
    the value returned by the regressor's ``score`` method.

    Returns:
        The fitted ``MultiOutputRegressor``.
    """
    base_estimator = HistGradientBoostingRegressor(
        random_state=0, categorical_features=categorical_mask
    )
    regressor = MultiOutputRegressor(base_estimator, n_jobs=12)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    regressor.fit(X_train, y_train)

    # Compare every predicted MFCC vector with its ground-truth counterpart.
    y_pred = regressor.predict(X_test)
    cosine_distances = list(map(spatial.distance.cosine, y_pred, y_test.values))

    print(stats.describe(cosine_distances))
    print("R2 coefficient of determination:", regressor.score(X_test, y_test))
    return regressor


# Train once and keep the fitted model for the prediction cells below.
mfcc_regressor = train_mfcc_regressor()


DescribeResult(nobs=73585, minmax=(0.0002597091537360763, 1.7599758690147556), mean=0.04724914467471499, variance=0.007911564841390516, skewness=6.14422420009068, kurtosis=56.41366141951458)
R2 coefficient of determination: 0.7905871802701642


## Предсказание результатов

In [10]:
# Extract features for the test sentences, reusing the label vocabularies
# collected from the training file.
test_sentences = read_xml("./data/input_lab3.xml", train=False)
test_dataset = [
    allophones_features
    for sentence in tqdm(test_sentences)
    for allophones_features in get_allophones_features_by_sentence(
        sentence, pauses_types, allophones_classes
    )
]
test_df = pd.DataFrame(test_dataset)
# Select exactly the columns the model was trained on, in the same order.
# This drops "content" (and any mfcc_* columns, should the input carry them)
# and fails loudly with KeyError if an expected feature is missing, instead of
# handing the model a differently shaped frame.
X_test = test_df[X.columns]
predictions = mfcc_regressor.predict(X_test)

  0%|          | 0/200 [00:00<?, ?it/s]

In [11]:
# Group the per-allophone predictions back into words.
#
# Grouping by comparing the word text of neighbouring rows merges adjacent
# identical words (also across sentence boundaries) into a single entry, so
# the word boundaries are taken from the XML instead: every word owns as many
# consecutive prediction rows as it has <allophone> descendants. The iteration
# mirrors the feature extraction, which pairs `sentence.find_all("word")` with
# the sentence's direct <word> children via zip (i.e. truncated to that count).
result = []
row_start = 0
for sentence in test_sentences:
    direct_word_count = sum(1 for child in sentence.children if child.name == "word")
    for word in sentence.find_all("word")[:direct_word_count]:
        allophone_count = len(word.find_all("allophone"))
        if allophone_count == 0:
            # Words without allophones produced no feature rows.
            continue
        row_end = row_start + allophone_count
        result.append({
            "content": word.get("original", ""),
            "mfcc": [list(row) for row in predictions[row_start:row_end]]
        })
        row_start = row_end

# Every prediction row must have been assigned to exactly one word.
assert row_start == len(predictions), (row_start, len(predictions))

In [12]:
import json

# Write the grouped predictions as UTF-8 JSON; ensure_ascii=False keeps the
# Cyrillic word text readable in the output file.
with open("./data/output_lab3.json", "w", encoding="utf-8") as json_file:
    json.dump([{"words": result}], json_file, ensure_ascii=False, indent=4)