In [4]:
%load_ext autoreload
%autoreload 2

#!/usr/bin/env python3
import argparse
import lzma
import pickle
import os
os.chdir("hw_14")
import urllib.request
import sys

flatten = lambda l: [item for sublist in l for item in sublist]
import unicodedata

from collections import namedtuple
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize("NFKD", input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])


import pandas as pd
from diacritization import recodex_predict
from sklearn.model_selection import train_test_split
import numpy as np

import string


class Dictionary:
    def __init__(
        self,
        name="fiction-dictionary.txt",
        url="https://ufal.mff.cuni.cz/~straka/courses/npfl129/1920/datasets/",
    ):
        if not os.path.exists(name):
            print("Downloading {}...".format(name), file=sys.stderr)
            urllib.request.urlretrieve(url + name, filename=name)
            urllib.request.urlretrieve(
                url + name.replace(".txt", ".LICENSE"),
                filename=name.replace(".txt", ".LICENSE"),
            )

        # Load the dictionary to `variants`
        self.variants = {}
        with open(name, "r", encoding="utf-8") as dictionary_file:
            for line in dictionary_file:
                nodia_word, *variants = line.rstrip("\n").split()
                self.variants[nodia_word] = variants


class Dataset:
    LETTERS_NODIA = "acdeeinorstuuyz"
    LETTERS_DIA = "áčďéěíňóřšťúůýž"

    # A translation table usable with `str.translate` to rewrite characters with dia to the ones without them.
    DIA_TO_NODIA = str.maketrans(
        LETTERS_DIA + LETTERS_DIA.upper(), LETTERS_NODIA + LETTERS_NODIA.upper()
    )

    def __init__(
        self,
        name="fiction-train.txt",
        url="https://ufal.mff.cuni.cz/~straka/courses/npfl129/1920/datasets/",
    ):
        if not os.path.exists(name):
            print("Downloading dataset {}...".format(name), file=sys.stderr)
            urllib.request.urlretrieve(url + name, filename=name)
            urllib.request.urlretrieve(
                url + name.replace(".txt", ".LICENSE"),
                filename=name.replace(".txt", ".LICENSE"),
            )

        # Load the dataset and split it into `data` and `target`.
        with open(name, "r", encoding="utf-8") as dataset_file:
            self.data = dataset_file.read()


train = Dataset().data

characters = list(np.unique(list(remove_accents(train.lower())))[1:]) + ["#"]

sentences_train = train.split("\n")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
diac_source_target_combo = namedtuple("diac_combo", ["source", "target"])
diacritization_mapping = {
    diac_source_target_combo("a", "á"): "acute",
    diac_source_target_combo("c", "č"): "caron",
    diac_source_target_combo("d", "ď"): "caron",
    diac_source_target_combo("e", "ě"): "caron",
    diac_source_target_combo("i", "í"): "acute",
    diac_source_target_combo("n", "ň"): "caron",
    diac_source_target_combo("o", "ó"): "acute",
    diac_source_target_combo("r", "ř"): "caron",
    diac_source_target_combo("s", "š"): "caron",
    diac_source_target_combo("t", "ť"): "caron",
    diac_source_target_combo("u", "ů"): "ring",
    diac_source_target_combo("y", "ý"): "acute",
    diac_source_target_combo("z", "ž"): "caron",
}

In [9]:
input_len = 21
def find_ngrams(s, n):
    return zip(*[s[i:] for i in range(n)])


def get_simple_labels(
    feature,
    target,
    letter_position=input_len // 2,
    diacritization_mapping=diacritization_mapping,
):
    feature_letter = feature[letter_position]
    source_target_mapping = diac_source_target_combo(
        source=feature_letter, target=target
    )
    if source_target_mapping in diacritization_mapping:
        return diacritization_mapping[source_target_mapping]
    else:
        return "no_change"


features = np.array(
    flatten(
        [
            list(
                find_ngrams(
                    "".join(["#" for i in range(int((input_len - 1) / 2))])
                    + remove_accents(sentence.lower())
                    + "".join(["#" for i in range(int((input_len - 1) / 2))]),
                    input_len,
                )
            )
            for sentence in sentences_train
        ]
    )
)
targets = np.array(
    [
        feature[int((input_len - 1) / 2)]
        for feature in flatten(
            [
                list(
                    find_ngrams(
                        "".join(["#" for i in range(int((input_len - 1) / 2))])
                        + sentence.lower()
                        + "".join(["#" for i in range(int((input_len - 1) / 2))]),
                        input_len,
                    )
                )
                for sentence in sentences_train
            ]
        )
    ]
)
targets = np.array(
    [get_simple_labels(feature, target) for feature, target in zip(features, targets)]
)

In [10]:
with lzma.open("diacritization.model", "rb") as model_file:
    nn = pickle.load(model_file)

with lzma.open("onehot.encoder", "rb") as model_file:
    ohe = pickle.load(model_file)

with lzma.open("label.encoder", "rb") as model_file:
    le = pickle.load(model_file)


In [22]:
for orig_char, char_features for zip(str.translate(train,Dataset.DIA_TO_NODIA),features):
    orig_char_lowered = orig_char.lower()
    orig_char_is_upper = orig_char.isupper()
    if orig_char_lowered in Dataset.LETTERS_NODIA:
        pass
    else:
        

 nosili cizokrajne obleceni zarivych barev , jejich rec byla prostoupena zvlastnimi zvuky .\nDomy mely okrove zlutou , modrou nebo cervenou omitku a na trzisti nabizeli plno zbozi .\nPoutnikovi jen oci prechazely .\nOsvezil se u prodavace vody a kdyz prochazel kolem stanku s pamlsky , zmocnila se ho nesmirna chut na neco sladkeho .\nCtyri cestovatele ctyr ruznych narodnosti se nahodou potkali na trzisti .\nDomluvili se , ze daji dohromady tu trochu penez , kterou mohou utratit , a koupi si neco , co by si jinak kazdy zvlast nemohl dovolit .\n" Ja chci uva ! " kricel Ital .\n" Lepsi bude , kdyz koupime druiventros , " radil Holandan .\n" Ne , kupme si raisins ! " dozadoval se Francouz .\n" Rozumne se mi zda koupit jedine grapes ! " rekl Anglican .\nA protoze ani jeden z nich nerozumel vyznamu tech cizich slov , zacali se hadat .\nJeden z prihlizejicich zaslechl jejich dohadovani a nabidl se jim delat prostrednika .\n" Splnim vsechna vase prani s tim malem , co mate .\nTento pohled se mu

In [16]:
train.replace(Dataset.DIA_TO_NODIA)

TypeError: replace() takes at least 2 arguments (1 given)

In [None]:
for orig_char in 


In [0]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
targets = le.fit_transform(targets)
ohe = OneHotEncoder(
    handle_unknown="ignore", categories=[sorted(characters) for i in range(input_len)]
)
features = ohe.fit_transform(features)

nn = MLPClassifier(
    hidden_layer_sizes=(
        400,
        375,
        350,
        325,
        300,
        275,
        250,
        225,
        200,
        175,
        150,
        125,
        100,
        75,
        50,
        30,
    ),
    verbose=4,
    early_stopping=True,
    validation_fraction=0.05,
    max_iter=400,
    n_iter_no_change=20,
)

In [32]:
nn.fit(features, targets)

Iteration 1, loss = 0.18176929
Validation score: 0.952455
Iteration 2, loss = 0.08877850
Validation score: 0.964982
Iteration 3, loss = 0.06081016
Validation score: 0.965848
Iteration 4, loss = 0.04281816
Validation score: 0.969892
Iteration 5, loss = 0.03242030
Validation score: 0.968953
Iteration 6, loss = 0.02480789
Validation score: 0.970830
Iteration 7, loss = 0.02280229
Validation score: 0.970397
Iteration 8, loss = 0.01789353
Validation score: 0.970939
Iteration 9, loss = 0.01690040
Validation score: 0.970722
Iteration 10, loss = 0.01596756
Validation score: 0.971625
Iteration 11, loss = 0.01577379
Validation score: 0.971011
Iteration 12, loss = 0.01438978
Validation score: 0.971155
Iteration 13, loss = 0.01279207
Validation score: 0.972202
Iteration 14, loss = 0.01148718
Validation score: 0.971697
Iteration 15, loss = 0.01149893
Validation score: 0.972058
Iteration 16, loss = 0.01182189
Validation score: 0.972455
Iteration 17, loss = 0.01111291
Validation score: 0.972635
Iterat

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(400, 375, 350, 325, 300, 275, 250, 225, 200,
                                  175, 150, 125, 100, 75, 50, 30),
              learning_rate='constant', learning_rate_init=0.001, max_iter=400,
              momentum=0.9, n_iter_no_change=20, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.05, verbose=4,
              warm_start=False)

In [27]:
with lzma.open("diacritization.model", "wb") as model_file:
    pickle.dump(nn, model_file)

with lzma.open("onehot.encoder", "wb") as model_file:
    pickle.dump(ohe, model_file)

with lzma.open("label.encoder", "wb") as model_file:
    pickle.dump(le, model_file)

In [208]:

sentences = data.split("\n")


def find_ngrams(s, n):
    return zip(*[s[i:] for i in range(n)])


input_len = 13
features = np.array(
    [
        list(
            find_ngrams(
                "".join(["#" for i in range(int((input_len - 1) / 2))])
                + remove_accents(sentence.lower())
                + "".join(["#" for i in range(int((input_len - 1) / 2))]),
                input_len,
            )
        )
        for sentence in sentences
    ]
)


def capitalize_word(accented_word, capitalized_word):
    final_capitalized_word = []
    if len(accented_word) != len(capitalized_word):
        return capitalized_word
    for accented_character, capitalized_character in zip(
        accented_word, capitalized_word
    ):
        if capitalized_character.isupper():
            final_capitalized_word.append(accented_character.upper())
        else:
            final_capitalized_word.append(accented_character)
    return "".join(final_capitalized_word)


def sentence_predict(sentence_sliding_window, sentence_orig, nn, ohe, le):
    if len(sentence_orig) == 0:
        return ""
    ohe_sentence = ohe.transform(sentence_sliding_window)
    predictions = nn.predict(ohe_sentence)
    characters = le.inverse_transform(predictions)
    sentence_predicted = "".join(characters)
    words_predicted = sentence_predicted.split(" ")
    words_orig = sentence_orig.split(" ")
    words_capitalized = [
        capitalize_word(accented_word, orig_word)
        for accented_word, orig_word in zip(words_predicted, words_orig)
    ]
    return " ".join(words_capitalized)


predicted_sentences = [
    sentence_predict(transformed_sentence, orig_sentence, nn, ohe, le)
    for transformed_sentence, orig_sentence in zip(features, sentences)
]