In [128]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymorphy2
from pymorphy2.units.by_lookup import DictionaryAnalyzer
import re
import os
import json
import pickle

from tqdm import tqdm_notebook

%matplotlib inline

# Solver 9

In [191]:
morph = pymorphy2.MorphAnalyzer()
slovarnie_slova = pd.read_csv("../models/dictionaries/slovarnie_slova.txt", header=None).rename({0: "word"}, axis=1)
stress_dict = pd.read_csv("../models/dictionaries/orfoepicheskiy_automatic_gde_udarenie_rf.txt", header=None).rename({0: "word"}, axis=1)
with open("/Users/edgy/Downloads/task_9_dixt_drop_ya.pickle", "rb") as f:
    exact_labels = pickle.load(f)

In [122]:
def solver_9(task, testing=False):
    def is_unverifiable(w):
        for w2 in slovarnie_slova.word:
            if re.match(re.sub(r"\.\.", ".", w), w2):
                return True
        return False

    def is_stressed(w, pos):
        if len(w) == pos:
            w = w[:pos] + w[pos].upper()
        else:
            w = w[:pos] + w[pos].upper() + w[pos+1:]
        return w in stress_dict.word.values

    def word_exists(w):
        analysis = morph.parse(w)
        if (analysis[0].methods_stack[0][0].__class__.__name__ == "DictionaryAnalyzer") and (analysis[0].methods_stack[0][1] == w):
            return True
        return False

    def possible_variants(w):
        amount = 0
        for candidate in "аоеиы":
            w_n = re.sub(r"\.\.", candidate, w)
            analysis = morph.parse(w_n)
            if (analysis[0].methods_stack[0][0].__class__.__name__ == "DictionaryAnalyzer") and (analysis[0].methods_stack[0][1] == w_n):
                amount += 1
        if amount == 0:
            amount = 1
        return amount

    def is_alternant(w):
        #зависящие от конечной согласной корня
        patterns_1 = [
            (r"[а-я]*р\.\.(ст|щ)[а-я]*", "а"),
            (r"[а-я]*р\.\.с[а-су-я]*", "о"),
            (r"[а-я]*л\.\.г[а-я]*", "а"),
            (r"[а-я]*л\.\.ж[а-я]*", "о"),
            (r"[а-я]*ск\.\.к[а-я]*", "а"),
            (r"[а-я]*ск\.\.ч[а-я]*", "о"),
        ]
        #зависящие от суффикса "а" после корня
        patterns_2 = [
            (r"[а-я]*(б|д|м|п|т)\.\.ра[а-я]*", "и"),
            (r"[а-я]*бл\.\.ста[а-я]*", "и"),
            (r"[а-я]*ж\.\.га[а-я]*", "и"),
            (r"[а-я]*ст\.\.ла[а-я]*", "и"),
            (r"[а-я]*ч\.\.та[а-я]*", "и"),
            (r"[а-я]*к\.\.са[а-я]*", "а"),
            (r"[а-я]*(б|д|м|п|т)\.\.р[б-я]*", "е"),
            (r"[а-я]*бл\.\.ст[б-я]*", "е"),
            (r"[а-я]*ж\.\.г[б-я]*", "е"),
            (r"[а-я]*ст\.\.л[б-я]*", "е"),
            (r"[а-я]*ч\.\.т[б-я]*", "е"),
            (r"[а-я]*к\.\.с[б-я]*", "о"),
        ]
        #зависящие от ударения (плов-плав хз почему тут, всегда пишется "а" кроме исключений)
        patterns_3a = [
            (r"[а-я]*з\.\.р[а-я]*", "оа"),
            (r"[а-я]*г\.\.р[а-я]*", "ао"),
            (r"[а-я]*тв\.\.р[а-нп-я]+", "ао"),
        ]
        patterns_3b = [
            (r"[а-я]*пл\.\.в[а-я]*", "оа"),
        ]
        #зависящие от лексического значения
        patterns_4 = [
            (r"[а-я]*м\.\.к[а-я]*", "оа"),
            (r"[а-я]*р\.\.вн[а-я]*", "оа"),
        ]

        exceptions = [
            "росток", "ростов", "ростислав", "ростовщик",
            "отрасль", "скачок", "скачу", "сочетать", "сочетание",
            "чета", "зоревать", "зорянка", "пловец", "пловчиха",
            "плывуны", "уровень", "ровесник", "равнина", "равняйсь", 
            "равнение ",
        ]
        w = w.lower()
        pos_space = re.search(r"\.", w).span()[0]
        for p in patterns_1:
            if re.match(p[0], w):
                for p_i in p[1]:
                    filled_w = re.sub(r"\.\.", p_i, w)
                    if word_exists(filled_w):
                        stressed = is_stressed(filled_w, pos_space)
                        return True, 1, filled_w, pos_space, stressed
        for p in patterns_2:
            if re.match(p[0], w):
                for p_i in p[1]:
                    filled_w = re.sub(r"\.\.", p_i, w)
                    if word_exists(filled_w):
                        stressed = is_stressed(filled_w, pos_space)
                        return True, 2, filled_w, pos_space, stressed
        for p in patterns_4:
            if re.match(p[0], w):
                for p_i in p[1]:
                    filled_w = re.sub(r"\.\.", p_i, w)
                    if word_exists(filled_w):
                        stressed = is_stressed(filled_w, pos_space)
                        return True, 4, filled_w, pos_space, stressed
        for p in patterns_3a:
            if re.match(p[0], w):
                for q, p_i in enumerate(p[1]):
                    filled_w = re.sub(r"\.\.", p_i, w)
                    if word_exists(filled_w):
                        stress_ind = is_stressed(filled_w, pos_space)
                        if (stress_ind) and (q == 0):
                            return True, 3, filled_w, pos_space, True
                        if (q == 1) and (not stress_ind):
                            return True, 3, filled_w, pos_space, False
        for p in patterns_3b:
            if re.match(p[0], w):
                for p_i in p[1]:
                    filled_w = re.sub(r"\.\.", p_i, w)
                    if word_exists(filled_w):
                        stressed = is_stressed(filled_w, pos_space)
                        return True, 3, filled_w, pos_space, stressed
        return False, None, None, pos_space, None

    words = np.array([re.split(r", ", t["text"]) for t in task["question"]["choices"]])
    #обрезаем скобки
#     words = [[re.sub("\([а-я ]+\)", "", t2).strip() for t2 in t1] for t1 in words]
    words = [[re.sub(r"[0-9]+\)", "", re.sub(r"\([а-я ]+\)", "", t2)).strip() for t2 in t1] for t1 in words]
    #в зависимости от числа слов в каждом варианте, мы ожидаем разное число верных ответов
    num_answers = 2
    if len(words[0]) == 1:
        num_answers = 1
    #определяем какой тип нужно искать
    if "чередующ" in task["text"]:
        task_type = 0
    elif "непровер" in task["text"]:
        task_type = 1
    else:
        task_type = 2
    alt_labels = [[is_alternant(t2) for t2 in t1] for t1 in words]
    unver_labels = [[is_unverifiable(t2) for t2 in t1] for t1 in words]
    possible_ways = [[possible_variants(t2) for t2 in t1] for t1 in words]
    scores = np.zeros((len(words), len(words[0]), 3))
    for i in range(scores.shape[0]):
        for j in range(scores.shape[1]):
            scores[i, j, 0] = 0
            scores[i, j, 1] = unver_labels[i][j]
            if alt_labels[i][j][0]:
                scores[i, j, 0] = alt_labels[i][j][0]
            scores[i, j, 2] = 1 - scores[i, j, 0] - 10 * scores[i, j, 1] * (possible_ways[i][j]-1)
    if testing: print(scores)
    agg_scores = scores.mean(axis=1)
    if testing: print(agg_scores)
    agg_scores = agg_scores[:, task_type]
    if testing: print(agg_scores)
    max_score = agg_scores.max()
    second_value = agg_scores[agg_scores.argsort()[-2]]
    answer_numbers = np.arange(len(agg_scores))[agg_scores==max_score]
    if (len(answer_numbers) < 2) and (second_value > 0):
        answer_numbers = np.concatenate([answer_numbers,
                                         np.arange(len(agg_scores))[agg_scores==second_value]])
    answer_numbers = answer_numbers[:2]
#     answer_numbers = agg_scores.argsort()[2:]
    answer_numbers += 1
    answer_numbers = [str(t) for t in answer_numbers]
    return answer_numbers

In [208]:
with open("/Users/edgy/Downloads/task_9/T7009.json", "r") as f:
    js = json.load(f)

In [209]:
js

{'tasks': [{'id': '9',
   'text': 'Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная проверяемая гласная корня.',
   'meta': {'language': 'ru', 'source': 'yandex_test'},
   'attachments': [],
   'solution': {'correct_variants': [['4', '5'], ['5', '4']]},
   'score': 1,
   'question': {'type': 'multiple_choice',
    'min_choices': 1,
    'choices': [{'id': '1',
      'text': 'зас..вать (поля), разв..вающиеся (страны), попл..вок'},
     {'id': '2', 'text': 'зас..дание, прож..вать (в деревне), сл..гаемые'},
     {'id': '3', 'text': 'озл..бление, погл..щённый, преп..рательства'},
     {'id': '4', 'text': 'с..мволика, зат..вать (драку), кол..бание'},
     {'id': '5', 'text': 'к..робочка, под..конник, впеч..тление'}]}}]}

In [210]:
solver_9(js["tasks"][0], testing=True)

[[[   0.      0.   -100.  ]
  [   0.      0.   -100.  ]
  [   0.      0.   -100.  ]]

 [[   0.      0.      0.9 ]
  [   0.      0.      0.9 ]
  [   0.9     0.      0.45]]

 [[   0.      0.      0.9 ]
  [   0.      0.      1.  ]
  [   0.9     0.      0.45]]

 [[   0.      0.      0.9 ]
  [   0.      0.      0.9 ]
  [   0.      0.      0.9 ]]

 [[   0.      0.      0.9 ]
  [   0.      0.9     0.45]
  [   0.      0.9     0.45]]]
[[   0.            0.         -100.        ]
 [   0.3           0.            0.75      ]
 [   0.3           0.            0.78333333]
 [   0.            0.            0.9       ]
 [   0.            0.6           0.6       ]]
[-100.            0.75          0.78333333    0.9           0.6       ]


['4', '3']

In [126]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_9/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores_alt = {"hard": [], "soft": []}
scores_ver = {"hard": [], "soft": []}
scores_unver = {"hard": [], "soft": []}
for path_t in paths_to_tasks:
    with open(f"/Users/edgy/Downloads/task_9/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set(solver_9(js['tasks'][0]))
    soft_score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    hard_score = answer == true_ans
    if "череду" in js['tasks'][0]["text"]:
        scores_alt["soft"].append(soft_score)
        scores_alt["hard"].append(hard_score)
    if "непровер" in js['tasks'][0]["text"]:
        scores_unver["soft"].append(soft_score)
        scores_unver["hard"].append(hard_score)
    else:
        scores_ver["soft"].append(soft_score)
        scores_ver["hard"].append(hard_score)
    if not hard_score:
        print(f"Task {path_t.split('.')[0]}")
        print(js["tasks"][0]["text"])
        print(f"True answer: {true_ans}")
        print(f"Predicted answer: {answer}")

Task T5432
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная проверяемая гласная корня.
True answer: {'5', '2'}
Predicted answer: {'3', '2'}
Task T7008
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная проверяемая гласная корня.
True answer: {'1', '5'}
Predicted answer: {'1', '3'}
Task T8538
Укажите варианты ответов, в которых во всех словах одного ряда пропущена чередующаяся гласная корня.
True answer: {'5', '2'}
Predicted answer: {'3', '2'}
Task T3749
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная проверяемая гласная корня.
True answer: {'3', '4', '2'}
Predicted answer: {'1', '3'}
Task T7012
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная проверяемая гласная корня.
True answer: {'1', '4'}
Predicted answer: {'1', '3'}
Task T7013
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная непроверяемая гласная корня.
True a

In [127]:
print("Чередующаяся: ", np.mean(scores_alt["soft"]), np.mean(scores_alt["hard"]))
print("Проверяемая: ", np.mean(scores_ver["soft"]), np.mean(scores_ver["hard"]))
print("Непроверяемая: ", np.mean(scores_unver["soft"]), np.mean(scores_unver["hard"]))
print("Total: ", np.mean(scores_unver["soft"]+scores_alt["soft"]+scores_ver["soft"]),
      np.mean(scores_unver["hard"]+scores_alt["hard"]+scores_ver["hard"]))

Чередующаяся:  0.46527777777777773 0.20833333333333334
Проверяемая:  0.44819819819819817 0.16216216216216217
Непроверяемая:  0.6041666666666666 0.4375
Total:  0.4859307359307359 0.23376623376623376


In [214]:
def word_exists(w):
    analysis = morph.parse(w)
    if (analysis[0].methods_stack[0][0].__class__.__name__ == "DictionaryAnalyzer") and\
       (analysis[0].methods_stack[0][1] == w):
        return True
    return False

def solver_9(task, testing=False):
    def is_unverifiable(w):
        for w2 in slovarnie_slova.word:
            if re.match(re.sub(r"[\.]+", ".", w), w2):
                return True
        return False

    def is_stressed(w, pos):
        if len(w) == pos:
            w = w[:pos] + w[pos].upper()
        else:
            w = w[:pos] + w[pos].upper() + w[pos+1:]
        return w in stress_dict.word.values

    def possible_variants(w):
        amount = 0
        for candidate in "аоеиы":
            w_n = re.sub(r"[\.]+", candidate, w)
            if word_exists(w_n):
                amount += 1
        if amount == 0:
            amount = 1
        return amount

    def is_alternant(w):
        #зависящие от конечной согласной корня
        patterns_1 = [
            (r"[а-я]*р[\.]+(ст|щ)[а-я]*", "а"),
            (r"[а-я]*р[\.]+с[а-су-я]*", "о"),
            (r"[а-я]*л[\.]+г[а-я]*", "а"),
            (r"[а-я]*л[\.]+ж[а-я]*", "о"),
            (r"[а-я]*ск[\.]+к[а-я]*", "а"),
            (r"[а-я]*ск[\.]+ч[а-я]*", "о"),
        ]
        #зависящие от суффикса "а" после корня
        patterns_2 = [
            (r"[а-я]*(б|д|м|п|т)[\.]+ра[а-я]*", "и"),
            (r"[а-я]*бл[\.]+ста[а-я]*", "и"),
            (r"[а-я]*ж[\.]+га[а-я]*", "и"),
            (r"[а-я]*ст[\.]+ла[а-я]*", "и"),
            (r"[а-я]*ч[\.]+та[а-я]*", "и"),
            (r"[а-я]*к[\.]+са[а-я]*", "а"),
            (r"[а-я]*(б|д|м|п|т)[\.]+р[б-я]*", "е"),
            (r"[а-я]*бл[\.]+ст[б-я]*", "е"),
            (r"[а-я]*ж[\.]+г[б-я]*", "е"),
            (r"[а-я]*ст[\.]+л[б-я]*", "е"),
            (r"[а-я]*ч[\.]+т[б-я]*", "е"),
            (r"[а-я]*к[\.]+с[б-я]*", "о"),
        ]
        #зависящие от ударения (плов-плав хз почему тут, всегда пишется "а" кроме исключений)
        patterns_3a = [
            (r"[а-я]*з[\.]+р[а-я]*", "оа"),
            (r"[а-я]*г[\.]+р[а-я]*", "ао"),
            (r"[а-я]*тв[\.]+р[а-нп-я]+", "ао"),
        ]
        patterns_3b = [
            (r"[а-я]*пл[\.]+в[а-я]*", "оа"),
        ]
        #зависящие от лексического значения
        patterns_4 = [
            (r"[а-я]*м[\.]+к[а-я]*", "оа"),
            (r"[а-я]*р[\.]+вн[а-я]*", "оа"),
        ]

        exceptions = [
            r"росток", r"ростов", r"ростислав", r"ростовщик",
            r"отрасль", r"скачок", r"скачу", r"сочетать", r"сочетание",
            r"чета", r"зоревать", r"зорянка", r"пловец", r"пловчиха",
            r"плывун[ы]{0,1}", r"уровень", r"ровесник", r"равнина", r"равняйсь", 
            r"равнение ",
        ]
        w = w.lower()
        pos_space = re.search(r"\.", w).span()[0]
        for p in patterns_1:
            if re.match(p[0], w):
                for p_i in p[1]:
                    filled_w = re.sub(r"[\.]+", p_i, w)
                    if word_exists(filled_w):
                        stressed = is_stressed(filled_w, pos_space)
                        return True, 1, filled_w, pos_space, stressed
        for p in patterns_2:
            if re.match(p[0], w):
                for p_i in p[1]:
                    filled_w = re.sub(r"[\.]+", p_i, w)
                    if word_exists(filled_w):
                        stressed = is_stressed(filled_w, pos_space)
                        return True, 2, filled_w, pos_space, stressed
        for p in patterns_4:
            if re.match(p[0], w):
                for p_i in p[1]:
                    filled_w = re.sub(r"[\.]+", p_i, w)
                    if word_exists(filled_w):
                        stressed = is_stressed(filled_w, pos_space)
                        return True, 4, filled_w, pos_space, stressed
        for p in patterns_3a:
            if re.match(p[0], w):
                for q, p_i in enumerate(p[1]):
                    filled_w = re.sub(r"[\.]+", p_i, w)
                    if word_exists(filled_w):
                        stress_ind = is_stressed(filled_w, pos_space)
                        if (stress_ind) and (q == 0):
                            return True, 3, filled_w, pos_space, True
                        if (q == 1) and (not stress_ind):
                            return True, 3, filled_w, pos_space, False
        for p in patterns_3b:
            if re.match(p[0], w):
                for p_i in p[1]:
                    filled_w = re.sub(r"[\.]+", p_i, w)
                    if word_exists(filled_w):
                        stressed = is_stressed(filled_w, pos_space)
                        return True, 3, filled_w, pos_space, stressed
        return False, None, None, pos_space, None

    words = np.array([re.split(r", ", t["text"]) for t in task["question"]["choices"]])
    #обрезаем цифры 
    words = [[re.sub(r"…", "..", re.sub(r"[0-9]+\)", "", t2)).strip() for t2 in t1] for t1 in words]
    exact_types = np.zeros((len(words), len(words[0]))).astype(int)-1
    for i in range(exact_types.shape[0]):
        for j in range(exact_types.shape[1]):
            if words[i][j] in exact_labels["alt"]:
                exact_types[i, j] = 0
            elif words[i][j] in exact_labels["unver"]:
                exact_types[i, j] = 1
            elif words[i][j] in exact_labels["ver"]:
                exact_types[i, j] = 2
    words = [[re.sub(r"\([а-я ]+\)", "", t2).strip() for t2 in t1] for t1 in words]

    #определяем какой тип нужно искать
    if "чередующ" in task["text"]:
        task_type = 0
    elif "непровер" in task["text"]:
        task_type = 1
    else:
        task_type = 2
        
    alt_labels = [[is_alternant(t2) for t2 in t1] for t1 in words]
    unver_labels = [[is_unverifiable(t2) for t2 in t1] for t1 in words]
    possible_ways = [[possible_variants(t2) for t2 in t1] for t1 in words]
    scores = np.zeros((len(words), len(words[0]), 3))
    for i in range(scores.shape[0]):
        for j in range(scores.shape[1]):
            if exact_types[i, j] >= 0:
                if exact_types[i, j] != task_type:
                    scores[i, :, task_type] = -100
                    break
                scores[i, j, exact_types[i, j]] = 1
            else:
                scores[i, j, 0] = 0
                scores[i, j, 1] = unver_labels[i][j] * 0.9
                if alt_labels[i][j][0]:
                    scores[i, j, 0] = alt_labels[i][j][0] * 0.9
                scores[i, j, 2] = 0.9 - 0.5 * (scores[i, j, 0] + scores[i, j, 1])
                
    if testing: print(scores)
    agg_scores = scores.mean(axis=1)
    if testing: print(agg_scores)
    agg_scores = agg_scores[:, task_type]
    if testing: print(agg_scores)
    max_score = agg_scores.max()
    second_value = agg_scores[agg_scores.argsort()[-2]]
    answer_numbers = np.arange(len(agg_scores))[agg_scores==max_score]
    if ((len(answer_numbers) < 2) and (second_value > 0)) or ((len(answer_numbers) == 2) and (second_value > 2.2)):
        answer_numbers = np.concatenate([answer_numbers,
                                         np.arange(len(agg_scores))[agg_scores==second_value]])
        if second_value > 2.2:
            answer_numbers = answer_numbers[:3]
        else:
            answer_numbers = answer_numbers[:2]
    answer_numbers += 1
    answer_numbers = [str(t) for t in answer_numbers]
    return answer_numbers

In [215]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_9/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores_alt = {"hard": [], "soft": []}
scores_ver = {"hard": [], "soft": []}
scores_unver = {"hard": [], "soft": []}
for path_t in paths_to_tasks:
    with open(f"/Users/edgy/Downloads/task_9/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set(solver_9(js['tasks'][0]))
    soft_score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    hard_score = answer == true_ans
    if "череду" in js['tasks'][0]["text"]:
        scores_alt["soft"].append(soft_score)
        scores_alt["hard"].append(hard_score)
    if "непровер" in js['tasks'][0]["text"]:
        scores_unver["soft"].append(soft_score)
        scores_unver["hard"].append(hard_score)
    else:
        scores_ver["soft"].append(soft_score)
        scores_ver["hard"].append(hard_score)
    if not hard_score:
        print(f"Task {path_t.split('.')[0]}")
        print(js["tasks"][0]["text"])
        print(f"True answer: {true_ans}")
        print(f"Predicted answer: {answer}")

Task T7009
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная проверяемая гласная корня.
True answer: {'5', '4'}
Predicted answer: {'4', '3'}
Task T6903
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная чередующаяся гласная корня.
True answer: {'5', '3'}
Predicted answer: {'1', '3'}
Task T6908
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная чередующаяся гласная корня.
True answer: {'5', '2'}
Predicted answer: {'5', '4'}
Task T6899
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная чередующаяся гласная корня.
True answer: {'1', '4'}
Predicted answer: {'1', '3'}
Task T6909
Укажите варианты ответов, в которых во всех словах одного ряда пропущена безударная проверяемая гласная корня.
True answer: {'1', '4', '3'}
Predicted answer: {'1', '3'}


In [216]:
print("Чередующаяся: ", np.mean(scores_alt["soft"]), np.mean(scores_alt["hard"]))
print("Проверяемая: ", np.mean(scores_ver["soft"]), np.mean(scores_ver["hard"]))
print("Непроверяемая: ", np.mean(scores_unver["soft"]), np.mean(scores_unver["hard"]))
print("Total: ", np.mean(scores_unver["soft"]+scores_alt["soft"]+scores_ver["soft"]),
      np.mean(scores_unver["hard"]+scores_alt["hard"]+scores_ver["hard"]))

Чередующаяся:  0.9166666666666666 0.875
Проверяемая:  0.918918918918919 0.8648648648648649
Непроверяемая:  1.0 1.0
Total:  0.935064935064935 0.8961038961038961


In [8]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_9/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores_alt = []
scores_ver = []
scores_unver = []
for path_t in paths_to_tasks:
    with open(f"/Users/edgy/Downloads/task_9/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = {"1", "2", "3", "4", "5"}
    last_score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    if "череду" in js['tasks'][0]["text"]:
        scores_alt.append(last_score)
    if "непровер" in js['tasks'][0]["text"]:
        scores_unver.append(last_score)
    else:
        scores_ver.append(last_score)
        
print("Чередующаяся: ", np.mean(scores_alt))
print("Проверяемая: ", np.mean(scores_ver))
print("Непроверяемая: ", np.mean(scores_unver))
print("Total: ", np.mean(scores_unver+scores_alt+scores_ver))

Чередующаяся:  0.4416666666666667
Проверяемая:  0.4540540540540541
Непроверяемая:  0.41250000000000003
Total:  0.44155844155844154


# Solver 17

In [3]:
import sys
import codecs
from keras_bert import load_trained_model_from_checkpoint
import tokenization

# bert_folder = 'multi_cased_L-12_H-768_A-12'
rubert_folder = "/Users/edgy/Downloads/rubert_cased_L-12_H-768_A-12_v2"

Using TensorFlow backend.


In [4]:
# bert_tokenizer = tokenization.FullTokenizer(vocab_file=bert_folder+'/vocab.txt',
#                                             do_lower_case=False)
# bert = load_trained_model_from_checkpoint(bert_folder+'/bert_config.json',
#                                           bert_folder+'/bert_model.ckpt',
#                                           training=True)
rubert_tokenizer = tokenization.FullTokenizer(vocab_file=rubert_folder+'/vocab.txt',
                                              do_lower_case=False)
rubert = load_trained_model_from_checkpoint(rubert_folder+'/bert_config.json',
                                            rubert_folder+'/bert_model.ckpt',
                                            training=True)

W1009 22:09:41.423934 140735486686080 deprecation_wrapper.py:119] From /Users/edgy/projects/ai-journey-2019/notebooks/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

W1009 22:09:41.835791 140735486686080 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1009 22:09:41.876387 140735486686080 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1009 22:09:41.932894 140735486686080 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:131: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1009 22:09:41.933692 140735486686080 deprecation_wrapper.py:119] 

In [None]:
def solver_17(task, threshold=0.5, testing=False):
    
    max_length = 512
    
    text = task["text"]
    text = re.sub(r"\(\d\)", "[MASK]", text)
    if testing:
        print(text)
    text = text.replace("[ ]*\[MASK\][ ]*","[MASK]")
    text = text.split("[MASK]")
    
    tokens = ["[CLS]"]
    for i in range(len(text)):
        if i == 0:
            tokens = tokens + tokenizer.tokenize(text[i]) 
        else:
            tokens = tokens + ['[MASK]'] + tokenizer.tokenize(text[i]) 
    tokens = tokens + ['[SEP]'] 
    token_input = tokenizer.convert_tokens_to_ids(tokens)
    token_input = np.array(token_input + [0] * (512 - len(token_input)))
    
    mask_input = np.zeros(max_length)
    mask_input[token_input == 103] = 1
    
    seg_input = np.zeros(max_length)
    predicts = model.predict([token_input.reshape(1, -1), seg_input.reshape(1, -1), mask_input.reshape(1, -1)])[0]
    comma_likelihoods = predicts[0, :, 117][mask_input.astype(bool)]
    dot_likelihoods = predicts[0, :, 119][mask_input.astype(bool)]
    and_likelihoods = predicts[0, :, 549][mask_input.astype(bool)]
    or_likelihoods = predicts[0, :, 10880][mask_input.astype(bool)]
    complex_likelihoods = [t1 + t2 for t1, t2 in zip(comma_likelihoods, and_likelihoods)]
    if testing: print(f"',': {comma_likelihoods}, '.': {dot_likelihoods}, 'и': {and_likelihoods}, 'или': {or_likelihoods}")
    return [str(i+1) for i, t in enumerate(complex_likelihoods) if t >= threshold]

In [None]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_17/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores = []

for path_t in tqdm_notebook(paths_to_tasks[:]):
    with open(f"/Users/edgy/Downloads/task_17/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set(solver_17(js["tasks"][0], testing=True))
    print("true: ", true_ans)
    score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    scores.append(score)

print("Score: ", np.mean(scores))

In [None]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_17/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores_t = []

for path_t in tqdm_notebook(paths_to_tasks):
    with open(f"/Users/edgy/Downloads/task_17/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set([t["id"] for t in js["tasks"][0]["question"]["choices"]])
    score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    scores_t.append(score)

print("Score: ", np.mean(scores_t))

In [10]:
def tmp(task, model=rubert, tokenizer=rubert_tokenizer):
    
    max_length = 512
    
    text = task["text"]
#     for l in "ЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЁЯЧСМИТЬБЮ":
#         text = re.sub(fr"\n{l}", " "+l.lower(), text)
    text = re.sub(r"\(\d\)", "[MASK]", text)
    text = text.replace("[ ]*\[MASK\][ ]*","[MASK]")
    text = text.split("[MASK]")
    
    tokens = ["[CLS]"]
    for i in range(len(text)):
        if i == 0:
            tokens = tokens + tokenizer.tokenize(text[i]) 
        else:
            tokens = tokens + ['[MASK]'] + tokenizer.tokenize(text[i]) 
    tokens = tokens + ['[SEP]'] 
    token_input = tokenizer.convert_tokens_to_ids(tokens)
    token_input = np.array(token_input + [0] * (512 - len(token_input)))
    
    mask_input = np.zeros(max_length)
    mask_token_id = tokenizer.convert_tokens_to_ids(["[MASK]"])[0]
    mask_input[token_input == mask_token_id] = 1
    
    seg_input = np.zeros(max_length)
    predicts = model.predict([token_input.reshape(1, -1), seg_input.reshape(1, -1), mask_input.reshape(1, -1)])[0]
    comma_id = tokenizer.convert_tokens_to_ids([","])[0]
    dot_id = tokenizer.convert_tokens_to_ids(["."])[0]
    or_id = tokenizer.convert_tokens_to_ids(["или"])[0]
    and_id = tokenizer.convert_tokens_to_ids(["и"])[0]
    dotcom_id = tokenizer.convert_tokens_to_ids([";"])[0]
    or2_id = tokenizer.convert_tokens_to_ids(["либо"])[0]
    with_id = tokenizer.convert_tokens_to_ids(["с"])[0]
    tire_id = tokenizer.convert_tokens_to_ids(["-"])[0]
    da_id = tokenizer.convert_tokens_to_ids(["да"])[0]
    comma_likelihoods = predicts[0, :, comma_id][mask_input.astype(bool)]
    dot_likelihoods = predicts[0, :, dot_id][mask_input.astype(bool)]
    and_likelihoods = predicts[0, :, and_id][mask_input.astype(bool)]
    or_likelihoods = predicts[0, :, or_id][mask_input.astype(bool)]
    dotcom_likelihoods = predicts[0, :, dotcom_id][mask_input.astype(bool)]
    or2_likelihoods = predicts[0, :, or2_id][mask_input.astype(bool)]
    with_likelihoods = predicts[0, :, with_id][mask_input.astype(bool)]
    tire_likelihoods = predicts[0, :, tire_id][mask_input.astype(bool)]
    da_likelihoods = predicts[0, :, da_id][mask_input.astype(bool)]
    return comma_likelihoods, dot_likelihoods, and_likelihoods, or_likelihoods,\
           dotcom_likelihoods, or2_likelihoods, with_likelihoods, tire_likelihoods,\
           da_likelihoods

In [13]:
df2 = pd.DataFrame(columns=["task#", "task_id", "comma", "dot", "and", "or", "dotcom",
                            "or2", "with", "tire", "da", "true", "is_ru"])

for task_number in [17, 18, 19, 20]:
    paths_to_tasks = os.listdir(f"/Users/edgy/Downloads/task_{task_number}/")
    paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
    for path_t in tqdm_notebook(paths_to_tasks):
        with open(f"/Users/edgy/Downloads/task_{task_number}/{path_t}", "r") as f:
            js = json.load(f)
        if "correct" in js['tasks'][0]['solution']:
            true_ans = set(js['tasks'][0]['solution']['correct'])
        else: 
            true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
        model = rubert
        tokenizer = rubert_tokenizer
        comma_likelihoods, dot_likelihoods, and_likelihoods, or_likelihoods, dotcom_likelihoods, or2_likelihoods, with_likelihoods, tire_likelihoods, da_likelihoods = tmp(js["tasks"][0], model=model, tokenizer=tokenizer)
        for i, _ in enumerate(comma_likelihoods):
            true = 0
            if str(i + 1) in true_ans:
                true = 1
            model_name = "rubert"
            df2.loc[len(df2)] = [task_number,
                                   path_t,
                                   comma_likelihoods[i],
                                   dot_likelihoods[i],
                                   and_likelihoods[i],
                                   or_likelihoods[i],
                                   dotcom_likelihoods[i],
                                   or2_likelihoods[i],
                                   with_likelihoods[i],
                                   tire_likelihoods[i],
                                   da_likelihoods[i],
                                   true,
                                   model_name,
                                  ]

HBox(children=(IntProgress(value=0, max=62), HTML(value='')))




HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




HBox(children=(IntProgress(value=0, max=62), HTML(value='')))




HBox(children=(IntProgress(value=0, max=62), HTML(value='')))




In [15]:
from sklearn.metrics import accuracy_score

df_t = df2[(df2["task#"]==17) & (df2["is_ru"]=="rubert")]
true = df_t["true"].values.astype(int)
comma_ru = df_t["comma"].values
dot_ru = df_t["dot"].values
an_ru = df_t["and"].values
o_ru = df_t["or"].values
preds = (comma_ru + an_ru + dot_ru > 0.65).astype(int)
print(f"Accuracy: {accuracy_score(true, preds)}")
print(f"Baseline: {sum(true) / len(true)}")

Accuracy: 0.8913043478260869
Baseline: 0.577639751552795


In [23]:
from sklearn.metrics import accuracy_score

df_t = df2[(df2["task#"]==17) & (df2["is_ru"]=="rubert")]
true = df_t["true"].values.astype(int)
comma_ru = df_t["comma"].values
dot_ru = df_t["dot"].values
an_ru = df_t["and"].values
o_ru = df_t["or"].values
dotcom_ru = df_t["dotcom"].values
or2_ru = df_t["or2"].values
with_ru = df_t["with"].values
tire_ru = df_t["tire"].values
da_ru = df_t["da"].values
preds = (comma_ru + an_ru + dot_ru > 0.65).astype(int)
print(f"Accuracy: {accuracy_score(true, preds)}")
print(f"Baseline: {sum(true) / len(true)}")

Accuracy: 0.8913043478260869
Baseline: 0.577639751552795


In [45]:
from sklearn.metrics import accuracy_score

df_t = df2[(df2["task#"]==18) & (df2["is_ru"]=="rubert")]
true = df_t["true"].values.astype(int)
comma = df_t["comma"].values
dot = df_t["dot"].values
an = df_t["and"].values
o = df_t["or"].values
dotcom_ru = df_t["dotcom"].values
or2_ru = df_t["or2"].values
with_ru = df_t["with"].values
tire_ru = df_t["tire"].values
da_ru = df_t["da"].values
preds = (comma + dot + an > 0.35).astype(int)
print(f"Accuracy: {accuracy_score(true, preds)}")
print(f"Baseline: {sum(true) / len(true)}")

Accuracy: 0.743859649122807
Baseline: 0.5578947368421052


In [46]:
from sklearn.metrics import accuracy_score

df_t = df2[(df2["task#"]==18) & (df2["is_ru"]=="rubert")]
true = df_t["true"].values.astype(int)
comma_ru = df_t["comma"].values
dot_ru = df_t["dot"].values
an_ru = df_t["and"].values
o_ru = df_t["or"].values
dotcom_ru = df_t["dotcom"].values
or2_ru = df_t["or2"].values
with_ru = df_t["with"].values
tire_ru = df_t["tire"].values
da_ru = df_t["da"].values
preds = (comma_ru + dot_ru + an_ru + with_ru + dotcom_ru > 0.3).astype(int)
print(f"Accuracy: {accuracy_score(true, preds)}")
print(f"Baseline: {sum(true) / len(true)}")

Accuracy: 0.7543859649122807
Baseline: 0.5578947368421052


In [53]:
from sklearn.metrics import accuracy_score

df_t = df2[(df2["task#"]==19) & (df2["is_ru"]=="rubert")]
true = df_t["true"].values.astype(int)
comma_ru = df_t["comma"].values
dot_ru = df_t["dot"].values
an_ru = df_t["and"].values
o_ru = df_t["or"].values
dotcom_ru = df_t["dotcom"].values
or2_ru = df_t["or2"].values
with_ru = df_t["with"].values
tire_ru = df_t["tire"].values
da_ru = df_t["da"].values
preds = (comma_ru + dot_ru + o_ru > 0.55).astype(int)
print(f"Accuracy: {accuracy_score(true, preds)}")
print(f"Baseline: {sum(true) / len(true)}")

Accuracy: 0.8681318681318682
Baseline: 0.5238095238095238


In [58]:
from sklearn.metrics import accuracy_score

df_t = df2[(df2["task#"]==19) & (df2["is_ru"]=="rubert")]
true = df_t["true"].values.astype(int)
comma_ru = df_t["comma"].values
dot_ru = df_t["dot"].values
an_ru = df_t["and"].values
o_ru = df_t["or"].values
dotcom_ru = df_t["dotcom"].values
or2_ru = df_t["or2"].values
with_ru = df_t["with"].values
tire_ru = df_t["tire"].values
da_ru = df_t["da"].values
preds = (comma_ru + dot_ru + o_ru > 0.55).astype(int)
print(f"Accuracy: {accuracy_score(true, preds)}")
print(f"Baseline: {sum(true) / len(true)}")

Accuracy: 0.8681318681318682
Baseline: 0.5238095238095238


In [70]:
from sklearn.metrics import accuracy_score

df_t = df2[(df2["task#"]==20) & (df2["is_ru"]=="rubert")]
true = df_t["true"].values.astype(int)
comma_ru = df_t["comma"].values
dot_ru = df_t["dot"].values
an_ru = df_t["and"].values
o_ru = df_t["or"].values
dotcom_ru = df_t["dotcom"].values
or2_ru = df_t["or2"].values
with_ru = df_t["with"].values
tire_ru = df_t["tire"].values
da_ru = df_t["da"].values
preds = (comma_ru > 0.7).astype(int)
print(f"Accuracy: {accuracy_score(true, preds)}")
print(f"Baseline: {sum(true) / len(true)}")

Accuracy: 0.8237179487179487
Baseline: 0.7115384615384616


In [76]:
from sklearn.metrics import accuracy_score

df_t = df2[(df2["task#"]==20) & (df2["is_ru"]=="rubert")]
true = df_t["true"].values.astype(int)
comma_ru = df_t["comma"].values
dot_ru = df_t["dot"].values
an_ru = df_t["and"].values
o_ru = df_t["or"].values
dotcom_ru = df_t["dotcom"].values
or2_ru = df_t["or2"].values
with_ru = df_t["with"].values
tire_ru = df_t["tire"].values
da_ru = df_t["da"].values
preds = (comma_ru > 0.7).astype(int)
print(f"Accuracy: {accuracy_score(true, preds)}")
print(f"Baseline: {sum(true) / len(true)}")

Accuracy: 0.8237179487179487
Baseline: 0.7115384615384616


# Solver 18

In [None]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_18/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores = []

for path_t in tqdm_notebook(paths_to_tasks):
    with open(f"/Users/edgy/Downloads/task_18/{path_t}", "r") as f:
        js = json.load(f)
    text = js["tasks"][0]["text"]
    text = re.sub(r"\n", " ", text)
    for l in "ЙЦУКЕНГШЩЗХФЫВАПРОЛДЖЭЁЯЧСМИТЬБЮ":
        if " " +l in text:
            text = re.sub(" " +l, " " +l.lower(), text)
    js["tasks"][0]["text"] = text
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set(solver_17(js["tasks"][0], True, threshold=0.001))
    print("true: ", true_ans)
    score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    scores.append(score)

print("Score: ", np.mean(scores))

In [None]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_18/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores_t = []

for path_t in tqdm_notebook(paths_to_tasks):
    with open(f"/Users/edgy/Downloads/task_18/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set([t["id"] for t in js["tasks"][0]["question"]["choices"]])
    score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    scores_t.append(score)

print("Score: ", np.mean(scores_t))

# Solver 19

In [None]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_19/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores = []

for path_t in tqdm_notebook(paths_to_tasks):
    with open(f"/Users/edgy/Downloads/task_19/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else:
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set(solver_17(js["tasks"][0], True, threshold=0.6))
    print("true: ", true_ans)
    score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    scores.append(score)

print("Score: ", np.mean(scores))

In [None]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_19/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores_t = []

for path_t in tqdm_notebook(paths_to_tasks):
    with open(f"/Users/edgy/Downloads/task_19/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set([t["id"] for t in js["tasks"][0]["question"]["choices"]])
    score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    scores_t.append(score)

print("Score: ", np.mean(scores_t))

# Solver 20

In [None]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_20/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores = []

for path_t in tqdm_notebook(paths_to_tasks):
    with open(f"/Users/edgy/Downloads/task_20/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else:
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set(solver_17(js["tasks"][0], True, threshold=0.5))
    print("true: ", true_ans)
    score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    scores.append(score)

print("Score: ", np.mean(scores))

In [None]:
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_20/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
scores_t = []

for path_t in tqdm_notebook(paths_to_tasks):
    with open(f"/Users/edgy/Downloads/task_20/{path_t}", "r") as f:
        js = json.load(f)
    if "correct" in js['tasks'][0]['solution']:
        true_ans = set(js['tasks'][0]['solution']['correct'])
    else: 
        true_ans = set(js['tasks'][0]['solution']['correct_variants'][0])
    answer = set([t["id"] for t in js["tasks"][0]["question"]["choices"]])
    score = len(answer.intersection(true_ans)) / len(answer.union(true_ans))
    scores_t.append(score)

print("Score: ", np.mean(scores_t))

# Solver 14

In [3]:
from keras_bert import load_trained_model_from_checkpoint
import tokenization

rubert_folder = "/Users/edgy/Downloads/rubert_cased_L-12_H-768_A-12_v2"
tokenizer_bert = tokenization.FullTokenizer(vocab_file=rubert_folder+'/vocab.txt',
                                              do_lower_case=False)
bert = load_trained_model_from_checkpoint(rubert_folder+'/bert_config.json',
                                          rubert_folder+'/bert_model.ckpt',
                                          training=True)

Using TensorFlow backend.
W1011 02:16:18.975415 140735486686080 deprecation_wrapper.py:119] From /Users/edgy/projects/ai-journey-2019/notebooks/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

W1011 02:16:19.416730 140735486686080 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1011 02:16:19.457597 140735486686080 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1011 02:16:19.522840 140735486686080 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:131: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1011 02:16:19.523860 140735486686080 de

In [4]:
def word_exists(w):
    analysis = morph.parse(w)
    if (analysis[0].methods_stack[0][0].__class__.__name__ == "DictionaryAnalyzer") and \
            (analysis[0].methods_stack[0][1] == w):
        return True
    return False

In [5]:
tasks = []
paths_to_tasks = os.listdir("/Users/edgy/Downloads/task_14/")
paths_to_tasks = [t for t in paths_to_tasks if t.startswith("T")]
for path_t in paths_to_tasks:
    with open(f"/Users/edgy/Downloads/task_14/{path_t}", "r") as f:
        js = json.load(f)
        tasks.append(js)

In [6]:
with open("../models/dictionaries/freq_dict_ruscorpora.json", "r") as f:
    freq = json.load(f)

In [58]:
len(freq)

1054208

In [59]:
def solver_14(task, frequency_normalization=False):
    def possible_variants(w):
        w1 = re.sub(r"[\(\)]", "", w)
        if w.startswith("("):
            w2 = re.sub(r"\)", "-", re.sub(r"\(", "", w))
            w3 = re.sub(r"\)", " ", re.sub(r"\(", "", w))
        else:
            w2 = re.sub(r"\(", "-", re.sub(r"\)", "", w))
            w3 = re.sub(r"\(", " ", re.sub(r"\)", "", w))
        w1_exists = word_exists(w1)
        w2_exists = word_exists(w2)
        w3_exists = word_exists(w3.split(" ")[0]) and word_exists(w3.split(" ")[1])
        # пропишем явно что делать, если слово начинается на ПОЛ, пайморфи тупит
        # в этих случаях, правила приблизительно реализованы (не рассмотрены заглавные,
        # случай с У)
        if w1.startswith("пол"):
            if w1[3] in "леыаояиюэё":
                w1_exists, w2_exists, w3_exists = False, True, False
            else:
                w1_exists, w2_exists, w3_exists = True, False, False
        if (not w2_exists) and (not w3_exists):
            w1_exists = True
        return w1, w1_exists, w2, w2_exists, w3, w3_exists

    def both_together_likelihood(w1_orig, w1_cand, p1, w2_orig, w2_cand, p2, sent):
        max_length = 512

#         if p1 == 1:
#             sent = re.sub(w1_orig, w1_cand, sent)
#         if p2 == 1:
#             sent = re.sub(w2_orig, w2_cand, sent)

        w1_orig_shield = re.sub("\)", "\\)", re.sub("\(", "\\(", w1_orig))
        w2_orig_shield = re.sub("\)", "\\)", re.sub("\(", "\\(", w2_orig))
        sent = re.split(fr"({w1_orig_shield}|{w2_orig_shield})", sent)
        sent = [t for t in sent if len(t) > 0]
        tokens = ["[CLS]"]
        exp_tokens = ["[CLS]"]
        word_masks = [0]

        w1_used = False
        for part in sent:
            if (part == w1_orig) and not w1_used:
                kek = tokenizer_bert.tokenize(w1_cand)
                exp_tokens += kek
                tokens += ["[MASK]"] * len(kek)
                word_masks += [1] * len(kek)
                w1_used = True
            elif part == w2_orig:
                kek = tokenizer_bert.tokenize(w2_cand)
                exp_tokens += kek
                tokens += ["[MASK]"] * len(kek)
                word_masks += [2] * len(kek)
            else:
                kek = tokenizer_bert.tokenize(part.strip())
                exp_tokens += kek
                tokens += kek
                word_masks += [0] * len(kek)
        tokens += ["[SEP]"]
        exp_tokens += ["[SEP]"]
        token_input = tokenizer_bert.convert_tokens_to_ids(tokens)
        token_input = np.array(token_input + [0] * (max_length - len(token_input)))
        exp_token_input = tokenizer_bert.convert_tokens_to_ids(exp_tokens)
        exp_token_input = np.array(exp_token_input + [0] * (max_length - len(exp_token_input)))
        word_masks = np.array(word_masks + [0] * (max_length - len(word_masks)))
        mask_input = word_masks != 0
        seg_input = np.zeros(max_length)

        predicts = bert.predict([token_input.reshape(1, -1),
                                       seg_input.reshape(1, -1),
                                       mask_input.reshape(1, -1)])[0]
        preds_1 = predicts[0, word_masks==1]
        exp_token_id_1 = exp_token_input[word_masks==1]
        subprobas_1 = []
        for i, t_id in enumerate(exp_token_id_1):
            subprobas_1.append(preds_1[i, t_id])
        preds_2 = predicts[0, word_masks==2]
        exp_token_id_2 = exp_token_input[word_masks==2]
        subprobas_2 = []
        for i, t_id in enumerate(exp_token_id_2):
            subprobas_2.append(preds_2[i, t_id])
        print(sent)
        norm_1, norm_2 = 1, 1
        if frequency_normalization:
            norm_1 /= freq[w1_cand]
            norm_2 /= freq[w2_cand]
        print(subprobas_1, norm_1, subprobas_2, norm_2)
        print(min(np.mean(subprobas_1) * norm_1,
                   np.mean(subprobas_2) * norm_2))
        if p1 == 1:
            return np.mean(subprobas_2) * norm_2
        if p2 == 1:
            return np.mean(subprobas_1) * norm_1
        return min(np.mean(subprobas_1) * norm_1,
                   np.mean(subprobas_2) * norm_2)
    text = task["text"]
    tmp = re.split(r"[\n\.\?\!]+", text)
    sentences = []
    word_pairs = []
    possibilities = []
    together_variants = []
    for s in tmp:
        if len(s) < 1:
            continue
        if not s[-1] in ".?!":
            s += "."
        words = re.findall("[А-ЯЁ]*\([А-ЯЁ]+\)[А-ЯЁ]*", s)
        if len(words) == 2:
            w1_1, t1_1, w1_2, t1_2, w1_3, t1_3 = possible_variants(words[0].lower())
            w2_1, t2_1, w2_2, t2_2, w2_3, t2_3 = possible_variants(words[1].lower())
            if t1_1 and t2_1:
                word_pairs.append(words)
                sentences.append(s)
                possibilities.append([[t1_1, t1_2, t1_3], [t2_1, t2_2, t2_3]])
                together_variants.append([w1_1, w2_1])
                if not(t1_2 or t1_3 or t2_2 or t2_3):
                    # У нас есть досрочный ответ
                    return w1_1+w2_1
    max_likelihood = 0
    if len(together_variants) == 1:
        w1_cand, w2_cand = together_variants[0]
        return w1_cand + w2_cand
    elif len(together_variants) == 0:
        return w1_1 + w2_1
    for s, word_pair, possibility, together_variant in zip(sentences,
                                                           word_pairs,
                                                           possibilities,
                                                           together_variants):
        w1_orig, w2_orig = word_pair
        w1_cand, w2_cand = together_variant
        p1, p2 = possibility
        likelihood = both_together_likelihood(w1_orig, w1_cand, sum(p1),
                                              w2_orig, w2_cand, sum(p2),
                                              s)
        if likelihood > max_likelihood:
            max_likelihood = likelihood
            answer = w1_cand + w2_cand
    return answer

In [60]:
scores = []

for i, task in enumerate(tasks):
    print(f"Task # {i}")
    task = task["tasks"][0]
    pred = solver_14(task, frequency_normalization=True)
    print(pred)
    if "correct" in task["solution"]:
        answer = task["solution"]["correct"]
        print(answer)
        scores.append(answer == pred)
    else:
        answer = task["solution"]["correct_variants"]
        print(answer)
        scores.append(any([pred == t for t in answer]))

Task # 0
['Группами и ', '(ПО)ОДИНОЧКЕ', ' бродили туристы в курортных шапочках, ', '(И)ТАК', ' было почти весь год.']
[0.021590982] 0.0013477088948787063 [9.774301e-07, 7.4745697e-07] 0.0014970059880239522
1.2910831259715795e-09
['(НЕ)СМОТРЯ', ' на прошедшие годы, Николай не смог простить человеку, которого считал другом, его ', '(МАЛО)ДУШИЕ', '.']
[0.0006111483] 4.504301608035674e-05 [0.00077143055, 0.00013530701, 2.719771e-05] 0.003125
2.752796310808448e-08
['(С)НАЧАЛА', ' дети шли в колонне ', '(ПО)ДВОЕ', ', но потом стали двигаться беспорядочными кучками.']
[0.00016406883] 4.3863496797964735e-05 [2.819315e-06, 1.4796928e-05] 0.05
7.196632505975666e-09
['Я всё ', 'ТАК(ЖЕ)', ' очень жалел о сказанном, хотел забыть обо всём, ', '(ПРИ)ТОМ', ' как можно скорее.']
[0.00041144894] 1.0867675186924013e-05 [1.7098411e-05] 0.0001312163758037003
2.243591557912384e-09
несмотрямалодушие
['несмотрямалодушие', 'малодушиенесмотря']
Task # 1
['Собираясь ', '(НА)ВСТРЕЧУ', ' с представителем фирмы, п

['Причиной выбрасывания китов на берег могут быть гидролокаторы военных, достаточно мощные, ', 'ЧТО(БЫ)', ' проникнуть ', '(В)ГЛУБЬ', ' океана и напугать животных.']
[0.9747608] 3.751500600240096e-06 [0.34955] 0.0009652509652509653
3.656815616738181e-06
['Недолго пройдя в темноте, Костя понял, что ', '(СО)ВСЕМ', ' сбился с пути, наверно, ', 'ТО(ЖЕ)', ' понял и командир.']
[0.00056440855] 1.1442825920289274e-05 [0.00035654698] 7.72153071624919e-06
2.7530884693842166e-09
['Новобранцев до принятия присяги не полагалось отпускать в город ', '(ПО)ОДИНОЧКЕ', ', но инструктор, ', '(В)ВИДУ', ' моего необычайного успеха по словесности, сделал для меня исключение.']
[6.987365e-05] 0.0013477088948787063 [0.36560205] 0.00028368794326241134
9.416933637426167e-08
['Никита сказал, ', 'ЧТО(БЫ)', ' мы шли ', '(ПО)ДВОЕ', '.']
[0.21487334] 3.751500600240096e-06 [1.95791e-05, 3.1421438e-05] 0.05
8.060974778891468e-07
чтобывглубь
['поодиночкеввиду', 'ввидупоодиночке']
Task # 15
['Петр I хотел, ', 'ЧТО(БЫ)'

KeyError: 'термокостюмы'

In [57]:
np.mean(scores)

0.5769230769230769

In [50]:
scores = []

for i, task in enumerate(tasks):
    print(f"Task # {i}")
    task = task["tasks"][0]
    pred = solver_14(task)
    print(pred)
    if "correct" in task["solution"]:
        answer = task["solution"]["correct"]
        print(answer)
        scores.append(answer == pred)
    else:
        answer = task["solution"]["correct_variants"]
        print(answer)
        scores.append(any([pred == t for t in answer]))

Task # 0
['Группами и ', '(ПО)ОДИНОЧКЕ', ' бродили туристы в курортных шапочках, ', '(И)ТАК', ' было почти весь год.']
[0.021590982] 1 [9.774301e-07, 7.4745697e-07] 1
7.474569656551466e-07
['(НЕ)СМОТРЯ', ' на прошедшие годы, Николай не смог простить человеку, которого считал другом, его ', '(МАЛО)ДУШИЕ', '.']
[0.0006111483] 1 [0.00077143055, 0.00013530701, 2.719771e-05] 1
2.7197709641768597e-05
['(С)НАЧАЛА', ' дети шли в колонне ', '(ПО)ДВОЕ', ', но потом стали двигаться беспорядочными кучками.']
[0.00016406883] 1 [2.819315e-06, 1.4796928e-05] 1
2.819314886437496e-06
['Я всё ', 'ТАК(ЖЕ)', ' очень жалел о сказанном, хотел забыть обо всём, ', '(ПРИ)ТОМ', ' как можно скорее.']
[0.00041144894] 1 [1.7098411e-05] 1
1.7098411262850277e-05
несмотрямалодушие
['несмотрямалодушие', 'малодушиенесмотря']
Task # 1
['Собираясь ', '(НА)ВСТРЕЧУ', ' с представителем фирмы, парень волновался, ', '(НЕ)СМОТРЯ', ' на большой опыт работы.']
[3.4215332e-06] 1 [0.71765965] 1
3.4215331652376335e-06
['(ВО)ВРЕМЯ'

['Теперь, даже ', '(НЕ)СМОТРЯ', ' на седину, морщины и очки, его ', '(НА)ЧИСТО', ' лишённое эмоций лицо кажется прекрасным.']
[0.98087686] 1 [4.838296e-07, 0.0001007038] 1
4.838295808440307e-07
['Илье Антонычу наскучило носить ружьё ', '(ПОД)МЫШКОЙ', ', ', '(ПО)ЭТОМУ', ' он перекинул его на плечо.']
[7.894238e-06, 0.00012953689, 0.003978688] 1 [0.00182377] 1
7.894237569416873e-06
['(НА)КОНЕЦ', ' прошла неделя, а комната всё ', 'ТАК(ЖЕ)', ' была заперта.']
[9.930327e-07] 1 [0.0006739774] 1
9.930326996254735e-07
подмышкойпоэтому
['несмотряначисто', 'начистонесмотря']
Task # 17
подальшевмиг
['подальшевмиг', 'вмигподальше']
Task # 18
['ТО(ЖЕ)', ' тихое мерцание зеленоватого прозрачного неба, ', 'ТАК(ЖЕ)', ' тянет с реки холодной влагой.']
[1.4368223e-07] 1 [7.5075775e-05] 1
1.4368222878147208e-07
['(ПОЛ)ДНЯ', ' ушло на сборы и подготовку к выступлению, ', '(ПРИ)ЧЁМ', ' оказалось, что ещё далеко не всё готово.']
[6.3048516e-05, 1.6333253e-05] 1 [0.0010269227] 1
1.6333253370248713e-05
полдня

['Нет, судите наш народ не ', '(ПО)ТОМУ', ', что он есть, а ', '(ПО)ТОМУ', ', чем он желал бы стать.']
[0.461019] 1 [0.046049424] 1
0.04604942351579666
['(В)ПОСЛЕДСТВИИ', ' речь автора была проста, ', '(ПРИ)ЧЁМ', ' выразительна.']
[1.2495465e-07] 1 [6.4830434e-05] 1
1.2495465284700913e-07
потомупотому
['впоследствиипричём', 'причёмвпоследствии']
Task # 39
['Мы отправили посылку ', '(В)НАЧАЛЕ', ' месяца, и она пришла ', '(ВО)ВРЕМЯ', '.']
[7.671429e-05] 1 [0.0683443] 1
7.67142919357866e-05
['Во ', 'ЧТО(БЫ)', ' нам поиграть, ', 'ЧТО(БЫ)', ' весело провести время.']
[6.4696e-06] 1 [0.5206495] 1
6.469600066338899e-06
['Идите ', '(В)НИЗ', ' по склону, там и будет ', '(АВТО)СТОЯНКА', '.']
[0.54001874] 1 [1.2882702e-05, 0.023408066] 1
1.2882702321803663e-05
внизавтостоянка
['внизавтостоянка', 'автостоянкавниз']
Task # 40
полгодазамуж
['полгодазамуж', 'замужполгода']
Task # 41
['«И имейте ', '(В)ВИДУ', ': ', 'ЧТО(БЫ)', ' там ещё вы ни придумали, мне всё станет известно.']
[0.023901682] 1 [0.000

In [52]:
np.mean(scores)

0.5961538461538461