## Imports

In [6]:
%pip install fasttext

Collecting fasttext
  Using cached fasttext-0.9.2.tar.gz (68 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-macosx_10_15_x86_64.whl size=339740 sha256=ed6e4fe9ca86bc45b9ddc8612dd950fb80819a305a74183d270dc87555ede53b
  Stored in directory: /Users/luda-gordeeva/Library/Caches/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2
Note: you may need to restart the kernel to use updated packages.


In [56]:
import numpy as np
from numpy.linalg import norm
import json
from sklearn.linear_model import LogisticRegression
from scipy.special import expit

from pymorphy2 import MorphAnalyzer
from nltk.stem import PorterStemmer

import Levenshtein

from sklearn.utils import resample
import random
from catboost.utils import eval_metric
from scipy.special import expit 
from scipy.spatial import distance

stemmer = PorterStemmer()
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
article_words = {"the", "a", "an"}
negation_words = {"no", "not"}

### Global parameters

In [57]:
# LANGUAGE = "ru"
LANGUAGE = "en"

VECTORS = {} # cache
CLF = LogisticRegression(multi_class="multinomial", solver="newton-cg", random_state=0, max_iter=10000, tol=1e-5)  # solver

### FastText Embeddings

In [58]:
import fasttext.util
#fasttext.util.download_model(LANGUAGE)
ft = fasttext.load_model(f'cc.{LANGUAGE}.300.bin')



## Features

In [59]:
class Feature:
    def __init__(self, expression, lang, order):
        self.expression = expression
        self.lang = lang
        self.order = order

    def __call__(self, *args, **kwargs):
        return self.expression(*args)

In [60]:
FEATURES = {
    "Bias": Feature(
        expression=lambda word1, word2, analyzer: 1,
        lang={"ru", "en"},
        order=0
    ),
    "Words equality": Feature(
        expression=lambda word1, word2, analyzer:
            int(word1 == word2),

        lang={"ru", "en"},
        order=1
    ),
    "Words inequality": Feature(
        expression=lambda word1, word2, analyzer:
            int(word1 != word2),

        lang={"ru", "en"},
        order=2
    ),
    "Lemmas equality": Feature(
        expression=lambda word1, word2, analyzer:
            int(analyzer.parse(word1)[0].normal_form == analyzer.parse(word2)[0].normal_form),

        lang={"ru"},
        order=3
    ),
    "Lemmas inequality": Feature(
        expression=lambda word1, word2, analyzer:
            int(analyzer.parse(word1)[0].normal_form != analyzer.parse(word2)[0].normal_form),

        lang={"ru"},
        order=4
    ),
    "Words stem equality": Feature(
        expression=lambda word1, word2, analyzer:
            int(analyzer.stem(word1) == analyzer.stem(word2)),

        lang={"en"},
        order=5
    ),
    "Words stem inequality": Feature(
        expression=lambda word1, word2, analyzer:
            int(analyzer.stem(word1) == analyzer.stem(word2)),

        lang={"en"},
        order=6
    ),    
    "Word length difference": Feature(
        expression=lambda word1, word2, analyzer:
        abs(len(word1) - len(word2)),

        lang={"ru", "en"},
        order=7
    ),
    "Word length difference norm by max": Feature(
        expression=lambda word1, word2, analyzer:
        abs(len(word1) - len(word2)) / max(len(word1), len(word2)),

        lang={"ru", "en"},
        order=8
    ),

    "Levenshtein distance between words": Feature(
        expression=lambda word1, word2, analyzer:
        Levenshtein.distance(word1, word2),

        lang={"ru", "en"},
        order=9
    ),
    "Levenshtein distance between lemmas": Feature(
        expression=lambda word1, word2, analyzer:
        Levenshtein.distance(analyzer.parse(word1)[0].normal_form, analyzer.parse(word2)[0].normal_form),

        lang={"ru"},
        order=10
    ),
    "Levenshtein distance between lemmas norm by the sum of word lengths": Feature(
        expression=lambda word1, word2, analyzer:
        Levenshtein.distance(analyzer.parse(word1)[0].normal_form, analyzer.parse(word2)[0].normal_form) /
        (len(analyzer.parse(word1)[0].normal_form) + len(analyzer.parse(word2)[0].normal_form)),

        lang={"ru"},
        order=11
    ),
    "Levenshtein distance word stems": Feature(
        expression=lambda word1, word2, analyzer:
        Levenshtein.distance(analyzer.stem(word1), analyzer.stem(word2)),

        lang={"en"},
        order=12
    ),
    "Word similarity": Feature(
        expression=lambda word1, word2, analyzer: 1 - distance.cosine(ft.get_word_vector(word1), ft.get_word_vector(word2)) if word1 and word2 else 1,

        lang={"ru", "en"},
        order=13
    ),
    "Reference in dictionary": Feature(
        expression=lambda word1, word2, analyzer:
        analyzer.parse(word1)[0].is_known,

        lang={"ru"},
        order=14
    ),
    "Hypothesis in dictionary": Feature(
        expression=lambda word1, word2, analyzer:
        analyzer.parse(word2)[0].is_known,

        lang={"ru"},
        order=15
    ),
    "E with dots": Feature(
        expression=lambda word1, word2, analyzer:
        word1.replace('ё', 'е') == word2.replace('ё', 'е'),

        lang={"ru"},
        order=16
    ),
    "Error in no word": Feature(
        expression=lambda word1, word2, analyzer:
        word1 != word2 and "не" in {word1, word2},

        lang={"ru"},
        order=17
    ),
    "Insertion": Feature(
        expression=lambda word1, word2, analyzer:
        word1 == "",

        lang={"ru", "en"},
        order=18
    ),
    "Deletion": Feature(
        expression=lambda word1, word2, analyzer:
        word2 == "",

        lang={"ru", "en"},
        order=19
    ),
    "Stop-word": Feature(
        expression=lambda word1, word2, analyzer:
        int(word1 in stop_words),

        lang={"en"},
        order=20
    ),
    "IsArticle": Feature(
        expression=lambda word1, word2, analyzer:
        int(word1 != word2 and word1 in article_words and word2 in article_words),

        lang={"en"},
        order=21
    ),
    "IsNegation": Feature(
        expression=lambda word1, word2, analyzer:
        int(word1 != word2 and (word1 in negation_words or word2 in negation_words)),

        lang={"en"},
        order=22
    ),
}

# **MERa Model**

    lang: language (ru or en)
    mode: feature selection mode
        default: all features for the selected language
        black_list: features will be rejected
        white_list: features will be included
    features: list of features for the black_list or white_list mode

In [61]:
class MERa:
    def select_features(self, features):
        selected = set()

        for feature in FEATURES:
            if self.mode == "default" \
                    or self.mode == "white_list" and feature in features \
                    or self.mode == "black_list" and feature not in features:
                selected.add(feature)
        return selected

    def __init__(self, lang, mode="default", features=set()):
        if lang not in {"ru", "en"}:
            raise Exception(f"Invalid language {lang}")
        self.lang = lang

        self.analyzer = MorphAnalyzer() if lang == "ru" else PorterStemmer()

        if mode not in ["default", "black_list", "white_list"]:
            raise Exception("Incorrect mode")
        self.mode = mode

        if features | FEATURES.keys() != FEATURES.keys():
            raise Exception(f"Following features are not defined: {(features | FEATURES.keys()) - FEATURES.keys()}")

        self.features = sorted(list(self.select_features(features)), key=lambda x: FEATURES[x].order)
        self.size = len(self.features)

        self.weights = [0, 0, 1] + [0] * (self.size - 2)  # WER by default

    def __call__(self, ref, hyp, show=False, *args, **kwargs):
        ref_words, hyp_words = tokenizer_split(ref, hyp)
        pairs, costs = optimize(ref_words, hyp_words, self.cost)
        diff_ref, diff_hyp, diff_cost = alignment(pairs, costs)

        value = self.weights[0] + sum(costs) / max(len(ref_words), len(hyp_words))
        value = expit(value)

        if show:
            print(diff_ref)
            print(diff_hyp)
            print(diff_cost)

        return value, {
            "diff_ref": diff_ref,
            "diff_hyp": diff_hyp,
            "diff_cost": diff_cost
        }

    def vector(self, word1, word2):
        """
        Feature map (only if there is no cache)
        """
        if (word1, word2) in VECTORS:
            return VECTORS[(word1, word2)]
        x = [0] * self.size
        index = 0
        for feature in self.features:
            x[index] = FEATURES[feature](word1, word2, self.analyzer)
            index += 1
        VECTORS[(word1, word2)] = x
        return x

    def cost(self, word1, word2):
        return np.array(self.vector(word1, word2)).T.dot(np.array(self.weights[1:]))

    def fit(self, X_texts, y, probs, index_train, index_test, mode="meaning loss", save_path=None):
        global VECTORS, CLF 
        VECTORS = {}
        CLF = LogisticRegression(multi_class="multinomial", solver="newton-cg", random_state=0, max_iter=10000, tol=1e-5)
    
        assert set(index_train) | set(index_test) == set(range(len(y)))
        old_mean, old_std = np.zeros((self.size + 1,)), np.ones((self.size + 1,))
        old_weights = self.weights
        last_norm = 0
        for _ in range(10):
            X, old_mean, old_std = self.e_step(X_texts, old_mean, old_std) if mode == "meaning loss" else self.e_step_sbs(X_texts, old_mean, old_std)
            old_weights = self.weights
            
            self.m_step(X[index_train], np.array(y)[index_train], np.array(probs)[index_train])
            show_scores(index_train, index_test, X, probs, self.weights, self.features)

            print(norm(self.weights - old_weights))   
            if abs(norm(self.weights - old_weights) - last_norm) < 0.01:
                break
                
            last_norm = norm(self.weights - old_weights)

        self.weights = unstandard_weights(X, self.weights, old_mean, old_std)
        if save_path:
            save_coeff(["Sentence bias"] + self.features, self.weights, save_path)

    def e_step(self, X_texts, old_mean, old_std):
        X = np.ones((len(X_texts), self.size))
        i = 0
        for ref, hyp in X_texts:
            pairs, costs = optimize(*tokenizer_split(ref, hyp), self.cost)
#             # uncomment for logging
#             alignment(pairs, costs)
            X[i] = (np.sum([self.vector(word1, word2) for word1, word2 in pairs], 0) / max(len(ref), len(hyp)))
            i += 1
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        X, new_mean, new_std, self.weights = standard(X, self.weights, old_mean, old_std)
        return X, new_mean, new_std
    
    def e_step_sbs(self, X_texts, old_mean, old_std):
        X = np.ones((len(X_texts), self.size))
        i = 0
        for ref, hyp_left, hyp_right in X_texts:
            pairs_left, costs_left = optimize(*tokenizer_split(ref, hyp_left), self.cost)
            pairs_right, costs_right = optimize(*tokenizer_split(ref, hyp_right), self.cost)
#             # uncomment for logging
#             alignment(pairs, costs)
            X[i] = -(np.sum([self.vector(word1, word2) for word1, word2 in pairs_right], 0) / max(len(ref), len(hyp_right))) + \
            (np.sum([self.vector(word1, word2) for word1, word2 in pairs_left], 0) / max(len(ref), len(hyp_left)))
            i += 1
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        X, new_mean, new_std, self.weights = standard(X, self.weights, old_mean, old_std)
        return X, new_mean, new_std
    

    def m_step(self, X, y, probs):
        CLF.fit(X, y, sample_weight=[probs[i][y[i]] for i in range(len(y))])
        self.weights = CLF.coef_[0]

## Utils

In [71]:
def save_coeff(first_column, second_column, path):
        with open(path, "w") as f:
            first_column_size = max([len(row) for row in first_column])
            for i in range(len(first_column)):
                f.write(f'{first_column[i]}{" " * (first_column_size - len(first_column[i]))}: {round(second_column[i], 4)}\n')

def tokenizer_split(ref, hyp):
    return ref.split(), hyp.split()


def optimize(words_ref, words_hyp, cost):
    # --> define dynamic table size
    n = len(words_ref)
    m = len(words_hyp)

    # --> create table
    table = np.full((n + 1, m + 1), np.inf)
    table[0, 0] = 0

    # --> fill table
    for i in range(n):
        for j in range(m):
            # --> deletion
            table[i + 1, j] = min(table[i + 1][j], table[i][j] + cost(words_ref[i], ""))

            # --> insertion
            table[i, j + 1] = min(table[i][j + 1], table[i][j] + cost("", words_hyp[j]))

            # --> substitution
            table[i + 1, j + 1] = min(table[i + 1][j + 1],
                                      table[i][j] + cost(words_ref[i], words_hyp[j]))

    # --> fill table edge
    for j in range(m):
        # --> only insertion possible
        table[n, j + 1] = min(table[n][j + 1], table[n][j] + cost("", words_hyp[j]))

    for i in range(n):
        # --> deletion
        table[i + 1, m] = min(table[i + 1][m], table[i][m] + cost(words_ref[i], ""))

    ### RECONSTRUCTION ###
    eps = 1e-10
    pairs = []
    costs = []
    i, j = n, m

    while i > 0 or j > 0:
        new_i, new_j = i, j

        if i > 0:
            # --> deletion
            if abs(table[i - 1, j] + cost(words_ref[i - 1], "") - table[i, j]) < eps:
                pair = (words_ref[i - 1], "")
                new_i, new_j = i - 1, j

        if i > 0 and j > 0:
            # --> substitution
            if abs(table[i - 1, j - 1] + cost(words_ref[i - 1], words_hyp[j - 1]) - table[i, j]) < eps:
                pair = (words_ref[i - 1], words_hyp[j - 1])
                new_i, new_j = i - 1, j - 1

        if j > 0:
            # --> insertion
            if abs(table[i, j - 1] + cost("", words_hyp[j - 1]) - table[i, j]) < eps:
                pair = ("", words_hyp[j - 1])
                new_i, new_j = i, j - 1

        pairs.append(pair)
        costs.append(cost(pair[0], pair[1]))
        i, j = new_i, new_j

    # --> reverse pairs
    pairs.reverse()
    costs.reverse()

    return pairs, costs


def alignment(pairs, costs):

    diff_ref = []
    diff_hyp = []
    diff_cost = []

    for i in range(len(costs)):
        ref, hyp = pairs[i]
        cost = str(round(costs[i], 3))

        length = max(len(ref), len(hyp), len(cost))
        diff_ref.append(ref + "*" * (length - len(ref)))
        diff_hyp.append(hyp + "*" * (length - len(hyp)))
        diff_cost.append(cost + " " * (length - len(cost)))

    return ' '.join(diff_ref), ' '.join(diff_hyp), ' '.join(diff_cost)


def train_test_split(size, p=0.8):
    indexes = list(range(size))
    random.shuffle(indexes)
    k = round(size * p)
    return indexes[:k], indexes[k:]


def show_scores(index_train, index_test, X, probs, weights, features):
    auc_test = bootstrap(1000, 0.75,
                         np.array(probs)[:, 1][index_test],
                         expit(X[index_test].dot(np.array(weights))),
                         lambda a1, a2: eval_metric(a1, a2, 'AUC'))  # на классе 1
    auc_train = bootstrap(1000, 0.75,
                          np.array(probs)[:, 1][index_train],
                          expit(X[index_train].dot(np.array(weights))),
                          lambda a1, a2: eval_metric(a1, a2, 'AUC'))
    print("Standard weights\n")
    print("sent const:", weights[0])
    print(feature_weights_table(features, weights[1:]))
    print("AUC test", auc_test)
    print("AUC train", auc_train)


def bootstrap(k, p, seq1, seq2, func):
    """

    :param k: samples count
    :param p: samples part
    :param seq1, seq2: two list for comparing
    :param func: score function
    :return: interval
    """

    values = []
    n = round(len(seq1) * p)
    for i in range(k):
        indexes = resample(list(range(len(seq1))), n_samples=n)
        values.append(func(seq1[indexes], seq2[indexes]))
    values.sort()
    tails = int(0.025 * len(values))
    return values[tails], values[-tails]


def feature_weights_table(first_column, second_column):
    first_column_size = max([len(row) for row in first_column])
    for i in range(len(first_column)):
        print(f'{first_column[i]}{" " * (first_column_size - len(first_column[i]))}: {round(second_column[i], 4)}')


def standard(x, weights, old_mean, old_std):
    new_mean = np.mean(x, axis=0)
    new_std = np.std(x, axis=0)

    x_std = x - new_mean
    x_std /= new_std

    # --> consts
    for i in range(1, x.shape[1]):
        if not new_std[i] or not old_std[i] or (set(x[:, i]) | {0, 1}) == {0, 1}:
            x_std[:, i] = x[:, i]
        else:
            weights[0] -= weights[i] * (old_mean[i] / old_std[i] - new_mean[i] / old_std[i])
            weights[i] *= new_std[i] / old_std[i]

    x_std[:, 0] = np.ones((x.shape[0],))
    return x_std, new_mean, new_std, weights


def unstandard_weights(x, weights, old_mean, old_std):
    for i in range(1, weights.shape[0]):
        if old_std[i] and (set(x[:, i]) | {0, 1} != {0, 1}):
            weights[0] -= weights[i] * old_mean[i] / old_std[i]
            weights[i] /= old_std[i]

    return weights


## Meaning Loss

In [72]:
def aggregate_ml(path):
    desc_to_votes = {}
    with open(path, "r") as f:
        data = json.load(f)
    for markup in data:
        desc = (
            markup["reference"],
            markup["hypothesis"],
            markup["recognition_id"],
            markup["dataset"],
            markup["record"],
            markup["model"],
            str(markup["noise"]["level_idx"]) if markup["noise"] is not None else None,
        )

        if desc not in desc_to_votes:
            desc_to_votes[desc] = (0, 0)

        votes_total, votes_ok = desc_to_votes[desc]

        votes_total += 1
        if markup["ok"]:
            votes_ok += 1

        desc_to_votes[desc] = (votes_total, votes_ok)
    return [{
        "reference": desc[0],
        "hypothesis": desc[1],
        "votes_total": desc_to_votes[desc][0],
        "votes_ok": desc_to_votes[desc][1],
    } for desc in desc_to_votes]

def prepare_data_ml():
    data = aggregate_ml("votes_check_raw.json")
    
    X = []
    for item in data:
        ref = item["reference"]
        ref = "" if type(ref) == float else ref

        hyp = item["hypothesis"]
        hyp = "" if type(hyp) == float else hyp
        X.append((ref, hyp))
    y = [1 if item["votes_ok"]/item["votes_total"] < 0.5 else -1 for item in data]  # не правильно 1, правильно -1
    probs = []
    alpha = 0.5
    for item in data:
        p = (item["votes_ok"] + alpha) / (item["votes_total"] + 2 * alpha)
        probs.append([p, 1 - p])

    return X, y, probs    

In [73]:
X, y, probs = prepare_data_ml()
# size = 100
# X, y, probs = X[:size], y[:size], probs[:size]
index_train, index_test = train_test_split(len(y))

In [74]:
print(f'Dataset size: {len(X)}')

Dataset size: 11284


## Baseline (WER)

In [75]:
print("WER")
m = MERa("en", "white_list", features={
    "Bias",
    "Words inequality"
})
m.fit(X, y, probs, index_train, index_test)

WER




Standard weights

sent const: 8.138628659892788e-14
Bias            : -0.0381
Words inequality: 1.0345
None
AUC test ([0.7556062707952622], [0.7743076475686829])
AUC train ([0.7626682309091144], [0.7722889663293212])
0.997968528966293




Standard weights

sent const: 4.7684078907650473e-14
Bias            : -0.0388
Words inequality: 1.0347
None
AUC test ([0.7547862013997131], [0.7744926472382014])
AUC train ([0.7624432444824282], [0.7718993657427033])
0.0006537806875331318




Standard weights

sent const: 4.7684078907650473e-14
Bias            : -0.0388
Words inequality: 1.0347
None
AUC test ([0.7548134672577067], [0.773990988871138])
AUC train ([0.7622870805050236], [0.7724811316754475])
0.0


## Train MERa

In [76]:
m = MERa("en", "white_list", features={
    "Bias",
    "Words inequality",
    "Insertion",
    "Deletion",
    "Words stem inequality",
    "Word length difference",
    "Levenshtein distance between words",
    "Levenshtein distance word stems",
    "Stop-word",
    "Word similarity",
    "IsArticle",
    "IsNegation"
})

In [77]:
m.fit(X, y, probs, index_train, index_test, save_path="coeff_ml.json")

  dist = 1.0 - uv / np.sqrt(uu * vv)


Standard weights

sent const: 1.861863204880396e-06
Bias                              : 1.3753
Words inequality                  : -0.4953
Words stem inequality             : -0.0503
Word length difference            : -0.5814
Levenshtein distance between words: 0.7834
Levenshtein distance word stems   : -0.0468
Word similarity                   : -1.9375
Insertion                         : 0.209
Deletion                          : 0.2929
Stop-word                         : 0.1565
IsArticle                         : -0.0333
IsNegation                        : 0.0061
None
AUC test ([0.7661609239120406], [0.7841083354814438])
AUC train ([0.7758048864466123], [0.7843872654620101])
2.65372385325419




Standard weights

sent const: 8.872123286149565e-09
Bias                              : 0.6943
Words inequality                  : -0.2366
Words stem inequality             : -0.0948
Word length difference            : 0.8912
Levenshtein distance between words: 0.6784
Levenshtein distance word stems   : -0.606
Word similarity                   : -0.7444
Insertion                         : -0.2195
Deletion                          : 0.0486
Stop-word                         : 0.0019
IsArticle                         : 0.0
IsNegation                        : -0.0948
None
AUC test ([0.7187749970791677], [0.7423149097390087])
AUC train ([0.7257886816511769], [0.7374470363694346])
4.947400790817684




Standard weights

sent const: 7.451427491796217e-09
Bias                              : 1.3727
Words inequality                  : -0.6137
Words stem inequality             : -0.0186
Word length difference            : -0.3079
Levenshtein distance between words: 0.7124
Levenshtein distance word stems   : 0.0157
Word similarity                   : -1.8973
Insertion                         : 0.1877
Deletion                          : 0.2179
Stop-word                         : 0.1562
IsArticle                         : -0.0386
IsNegation                        : 0.0065
None
AUC test ([0.7666788685808763], [0.7848226421785778])
AUC train ([0.7759153956350809], [0.7848039647238276])
2.2532596657158335




Standard weights

sent const: -2.3715157369985803e-13
Bias                              : 1.4901
Words inequality                  : -0.6131
Words stem inequality             : 0.1026
Word length difference            : -0.2227
Levenshtein distance between words: 0.8881
Levenshtein distance word stems   : 0.57
Word similarity                   : -1.859
Insertion                         : 0.444
Deletion                          : 0.0981
Stop-word                         : 0.1411
IsArticle                         : 0.0
IsNegation                        : -0.0017
None
AUC test ([0.7648579264889958], [0.783330459409169])
AUC train ([0.7743992583635688], [0.7833470974588266])
1.2937840354410324




Standard weights

sent const: 1.1924969659436082e-07
Bias                              : 1.3625
Words inequality                  : -0.5772
Words stem inequality             : 0.0776
Word length difference            : -0.4363
Levenshtein distance between words: 0.6799
Levenshtein distance word stems   : 0.1495
Word similarity                   : -1.8743
Insertion                         : 0.2722
Deletion                          : 0.288
Stop-word                         : 0.156
IsArticle                         : -0.0055
IsNegation                        : 0.0084
None
AUC test ([0.766905866214356], [0.7852651564202962])
AUC train ([0.776215931121469], [0.7850764278650106])
0.782434684254137




Standard weights

sent const: 3.3894507410770625e-14
Bias                              : 1.4798
Words inequality                  : -0.5891
Words stem inequality             : 0.1019
Word length difference            : -0.1436
Levenshtein distance between words: 0.8086
Levenshtein distance word stems   : 0.601
Word similarity                   : -1.8404
Insertion                         : 0.4489
Deletion                          : 0.1035
Stop-word                         : 0.1414
IsArticle                         : 0.0
IsNegation                        : -0.0013
None
AUC test ([0.7660216478787678], [0.7835025982962716])
AUC train ([0.7746717529803577], [0.7833913574125548])
1.4518106877136978




Standard weights

sent const: 9.312648562183595e-11
Bias                              : 1.3369
Words inequality                  : -0.5504
Words stem inequality             : 0.0927
Word length difference            : -0.4287
Levenshtein distance between words: 0.6346
Levenshtein distance word stems   : 0.1906
Word similarity                   : -1.8675
Insertion                         : 0.2595
Deletion                          : 0.2777
Stop-word                         : 0.156
IsArticle                         : -0.0054
IsNegation                        : 0.0085
None
AUC test ([0.7668027304685249], [0.785019112997917])
AUC train ([0.7762468443232217], [0.7847937847190122])
0.7891922454115212




Standard weights

sent const: 5.568796936848393e-08
Bias                              : 1.484
Words inequality                  : -0.5876
Words stem inequality             : 0.1073
Word length difference            : -0.1195
Levenshtein distance between words: 0.769
Levenshtein distance word stems   : 0.617
Word similarity                   : -1.8477
Insertion                         : 0.4503
Deletion                          : 0.1034
Stop-word                         : 0.1412
IsArticle                         : 0.0
IsNegation                        : -0.0015
None
AUC test ([0.7648230255822491], [0.7829065542370818])
AUC train ([0.7741282974308911], [0.7829968844402373])
1.399889358297225




Standard weights

sent const: 5.973823242459237e-06
Bias                              : 1.3402
Words inequality                  : -0.5513
Words stem inequality             : 0.0952
Word length difference            : -0.4324
Levenshtein distance between words: 0.6386
Levenshtein distance word stems   : 0.1876
Word similarity                   : -1.8735
Insertion                         : 0.2611
Deletion                          : 0.28
Stop-word                         : 0.1554
IsArticle                         : -0.0054
IsNegation                        : 0.0086
None
AUC test ([0.7670895208744999], [0.7851200655071134])
AUC train ([0.7762770695123234], [0.785266665890535])
0.8040407217317165




Standard weights

sent const: -1.911442219646375e-11
Bias                              : 1.4627
Words inequality                  : -0.5887
Words stem inequality             : 0.1047
Word length difference            : -0.1348
Levenshtein distance between words: 0.7705
Levenshtein distance word stems   : 0.6361
Word similarity                   : -1.8263
Insertion                         : 0.4544
Deletion                          : 0.1043
Stop-word                         : 0.1416
IsArticle                         : 0.0
IsNegation                        : -0.0014
None
AUC test ([0.7646850639704701], [0.7831880267590513])
AUC train ([0.7741199226627751], [0.7834590488570656])
1.4380442515700593


In [78]:
m("i don't known that gave me much strength", "don't known what he gave me my friend")

(0.9999923639321406,
 {'diff_ref': "i***** don't* known* that* ****** gave** me**** much** ****** strength",
  'diff_hyp': "****** don't* known* what* he**** gave** me**** ****** my**** friend**",
  'diff_cost': '-2.702 -0.205 -4.117 9.066 13.538 -4.117 -0.205 12.861 13.538 61.534  '})

## Side by side

In [79]:
def aggregate_sbs(path):
    desc_to_votes = {}
    with open(path, "r") as f:
        data = json.load(f)
    for markup in data:
        desc = (
            markup["reference"],
            markup["hypothesis_left"],
            markup["hypothesis_right"],
            markup["recognition_id_left"],
            markup["recognition_id_right"],
            markup["dataset"],
            markup["record"],
            markup["model_left"],
            markup["model_right"],
            str(markup["noise_left"]["level_idx"]) if markup["noise_left"] is not None else None,
            str(markup["noise_right"]["level_idx"]) if markup["noise_right"] is not None else None,
        )

        if desc not in desc_to_votes:
            desc_to_votes[desc] = (0, 0)

        votes_total, votes_left = desc_to_votes[desc]

        votes_total += 1
        if markup["choice"] == "LEFT":
            votes_left += 1

        desc_to_votes[desc] = (votes_total, votes_left)
    return [{
        "reference": desc[0],
        "hypothesis_left": desc[1],
        "hypothesis_right": desc[2],
        "votes_total": desc_to_votes[desc][0],
        "votes_left": desc_to_votes[desc][1],
    } for desc in desc_to_votes]

def prepare_data_sbs():
    data = aggregate_sbs("votes_sbs_raw.json")
    alpha = 0
    X = []
    y = []
    probs = []
    for item in data:
        if item["votes_total"] != 5:
            continue
        persent = (item["votes_left"] + alpha) / (item["votes_total"] + 2 * alpha)
        y.append(1 if persent < 0.5 else -1)
        probs.append([persent, 1 - persent])
    
        sample = (item["reference"], item["hypothesis_left"], item["hypothesis_right"])
    
        X.append(sample)
    return X, y, probs  

In [80]:
X, y, probs = prepare_data_sbs()
# size = 2000
# X, y, probs = X[:size], y[:size], probs[:size]
index_train, index_test = train_test_split(len(y))

In [81]:
print(f'Dataset size: {len(X)}')

Dataset size: 26622


## Baseline (WER)

In [82]:
print("WER")
m = MERa("en", "white_list", features={
    "Bias",
    "Words inequality"
})
m.fit(X, y, probs, index_train, index_test, mode="sbs")

WER




Standard weights

sent const: -1.752764600126966e-14
Bias            : -0.2297
Words inequality: 1.6268
None
AUC test ([0.8200394303292436], [0.8355162589393816])
AUC train ([0.8188009004285949], [0.8267146462458018])
1.603657893237269




Standard weights

sent const: -5.6295246242399344e-14
Bias            : -0.2354
Words inequality: 1.6337
None
AUC test ([0.819220245913219], [0.83489163168746])
AUC train ([0.8187603701134456], [0.82672117154891])
0.0072697307194708




Standard weights

sent const: -5.6295246242399344e-14
Bias            : -0.2354
Words inequality: 1.6337
None
AUC test ([0.8192387732574637], [0.8349740445812472])
AUC train ([0.8188029303003121], [0.8266894025891252])
0.0


## Train MERa

In [83]:
m = MERa("en", "white_list", features={
    "Bias",
    "Words inequality",
    "Insertion",
    "Deletion",
    "Words stem inequality",
    "Word length difference",
    "Levenshtein distance between words",
    "Levenshtein distance word stems",
    "Stop-word",
    "Word similarity",
    "IsArticle",
    "IsNegation"
})

In [84]:
m.fit(X, y, probs, index_train, index_test, mode="sbs", save_path="coeff_sbs.json")

  dist = 1.0 - uv / np.sqrt(uu * vv)


Standard weights

sent const: -1.2592921497245335e-13
Bias                              : 0.1652
Words inequality                  : 0.2248
Words stem inequality             : -0.2373
Word length difference            : -0.548
Levenshtein distance between words: 1.1422
Levenshtein distance word stems   : -0.3121
Word similarity                   : -0.8831
Insertion                         : 0.1543
Deletion                          : 0.2781
Stop-word                         : -0.0165
IsArticle                         : -0.0326
IsNegation                        : 0.0159
None
AUC test ([0.8320726982676984], [0.8470487808472353])
AUC train ([0.8305882148607704], [0.837754277415063])
1.6440092443514494




Standard weights

sent const: 5.447861564468844e-06
Bias                              : 0.487
Words inequality                  : 0.0496
Words stem inequality             : -0.3178
Word length difference            : -1.1603
Levenshtein distance between words: 2.4735
Levenshtein distance word stems   : -0.3374
Word similarity                   : -0.8431
Insertion                         : 0.2936
Deletion                          : 0.2269
Stop-word                         : 0.0066
IsArticle                         : -0.0033
IsNegation                        : 0.0174
None
AUC test ([0.8356884483785381], [0.8502209575242529])
AUC train ([0.8345566896672743], [0.8419358267917665])
1.2127116316103532




Standard weights

sent const: 3.9561664792245446e-08
Bias                              : 0.1817
Words inequality                  : -0.012
Words stem inequality             : -0.2732
Word length difference            : -0.6493
Levenshtein distance between words: 1.6949
Levenshtein distance word stems   : -0.1138
Word similarity                   : -0.7257
Insertion                         : 0.2723
Deletion                          : 0.1276
Stop-word                         : 0.0542
IsArticle                         : 0.005
IsNegation                        : 0.0182
None
AUC test ([0.8367366701247538], [0.8507658463763725])
AUC train ([0.8353473788874382], [0.8425641008814695])
0.2834313133048748




Standard weights

sent const: 2.0425889848605088e-08
Bias                              : 0.1922
Words inequality                  : -0.0799
Words stem inequality             : -0.2487
Word length difference            : -0.4595
Levenshtein distance between words: 1.4701
Levenshtein distance word stems   : -0.1229
Word similarity                   : -0.8033
Insertion                         : 0.2682
Deletion                          : 0.1725
Stop-word                         : 0.0481
IsArticle                         : -0.0017
IsNegation                        : 0.0179
None
AUC test ([0.8366916689090818], [0.8502189978153976])
AUC train ([0.835569288317364], [0.8429530275099808])
0.1928036150826875




Standard weights

sent const: 7.736645436045804e-09
Bias                              : 0.1829
Words inequality                  : -0.0968
Words stem inequality             : -0.2503
Word length difference            : -0.4466
Levenshtein distance between words: 1.4455
Levenshtein distance word stems   : -0.1331
Word similarity                   : -0.8245
Insertion                         : 0.2633
Deletion                          : 0.1857
Stop-word                         : 0.0513
IsArticle                         : -0.007
IsNegation                        : 0.0162
None
AUC test ([0.8368855056743391], [0.8510161695530738])
AUC train ([0.8355916030817512], [0.8428331257304023])
0.0346092980813137




Standard weights

sent const: -6.822870041297766e-14
Bias                              : 0.1872
Words inequality                  : -0.0993
Words stem inequality             : -0.2577
Word length difference            : -0.4503
Levenshtein distance between words: 1.4614
Levenshtein distance word stems   : -0.1361
Word similarity                   : -0.8159
Insertion                         : 0.2679
Deletion                          : 0.1792
Stop-word                         : 0.0504
IsArticle                         : -0.0069
IsNegation                        : 0.0175
None
AUC test ([0.8364194219985028], [0.8511093621679311])
AUC train ([0.8355614130794454], [0.8428611222804724])
0.016226142509755157




Standard weights

sent const: 1.3499422170111085e-09
Bias                              : 0.1865
Words inequality                  : -0.101
Words stem inequality             : -0.2552
Word length difference            : -0.4414
Levenshtein distance between words: 1.4504
Levenshtein distance word stems   : -0.1354
Word similarity                   : -0.82
Insertion                         : 0.2657
Deletion                          : 0.181
Stop-word                         : 0.0499
IsArticle                         : -0.0068
IsNegation                        : 0.0174
None
AUC test ([0.8364313089952694], [0.8503927500808526])
AUC train ([0.8353149649811369], [0.8427072659564985])
0.011827923699301248


In [85]:
m("i don't known that gave me much strength", "don't known what he gave me my friend")

(0.9998455834989539,
 {'diff_ref': "i***** don't* known** that** ****** gave*** me**** much** ****** strength",
  'diff_hyp': "****** don't* known** what** he**** gave*** me**** ****** my**** friend**",
  'diff_cost': '14.394 -5.796 -26.136 20.926 12.398 -26.136 -5.796 10.569 12.398 59.663  '})

## For plots

In [100]:
def read_mera(path):
    with open(path, "r") as f:
        features = []
        weights = []
        for line in f:
            feature_name, weight = line.split(":")
            features.append(feature_name.strip())
            weights.append(float(weight.strip()))
    mera = MERa("en", "white_list", features=features[1:])        
    mera.weights = np.array(weights)
    return mera

def read_check_data(path):
    with open(path, "r") as f:
        data = json.load(f)
        
    result = {
        "dataset": [],
        "model": [],
        "record": [],
        "noise_level": [],
        "variant": [],
        "reference": [],
        "hypothesis": []
    }
    
    for item in data:
        result["dataset"].append(item["dataset"])
        result["model"].append(item["model"])
        result["record"].append(item["record"])
        result["noise_level"].append(item["noise"]["level"] if item["noise"] else None)
        result["variant"].append(item["noise"]["variant"] if item["noise"] else None)
        result["reference"].append(item["reference"])
        result["hypothesis"].append(item["hypothesis"])
        
    return result    

def read_sbs_data(path):
    with open(path, "r") as f:
        data = json.load(f)
        
    result = {
        "dataset": [],
        "model": [],
        "record": [],
        "noise_level": [],
        "variant": [],
        "reference": [],
        "hypothesis": []
    }
    
    for item in data:
        for side in {"left", "right"}:
            result["dataset"].append(item["dataset"])
            result["model"].append(item[f"model_{side}"])
            result["record"].append(item["record"])
            result["noise_level"].append(item[f"noise_{side}"]["level"] if item[f"noise_{side}"] else None)
            result["variant"].append(item[f"noise_{side}"]["variant"] if item[f"noise_{side}"] else None)
            result["reference"].append(item["reference"])
            result["hypothesis"].append(item[f"hypothesis_{side}"])
        
    return result  

def for_plots(raw_data_path, saved_coeff_path, result_path, mode):
    if mode == "check":
        data = read_check_data(raw_data_path)
    elif mode == "sbs":
        data = read_sbs_data(raw_data_path)
    else:
        print("Incorrect mode")
    m = read_mera("coeff_ml.json")
    mers = []
    for i in range(len(data["reference"])):
        mers.append(m(data["reference"][i], data["hypothesis"][i]))
        
    data["mer"] = mers
    with open(result_path, "w") as f:
        json.dump(data, f)

In [None]:
for_plots("votes_check_raw.json", "coeff_ml.json", "mera.json", mode="check")

In [101]:
for_plots("votes_sbs_raw.json", "coeff_sbs.json", "mera_sbs.json", mode="sbs")