In [83]:
import math
import random
import time
from random import shuffle

import torch

In [84]:
# --- Experiment configuration ----------------------------------------------
# Every value a reader might want to tune lives in this cell.
BATCH_SIZE = 10            # samples per mini-batch in both DataLoaders
STRING_SIZE = 60           # characters per sample; the corpus is cut into chunks of this size
NUM_EPOCHS = 50            # passes over the training split
LEARNING_RATE = 0.1        # step size handed to the SGD optimizer
FILE_NAME = "./3pigs.txt"  # corpus used for both the alphabet and the samples
CAESAR_N = 2               # Caesar shift, applied to alphabet indices
SEED = 42                  # fixed seed so Restart & Run All reproduces the same run
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every generator the notebook draws from: `random` drives the
# train/val/test shuffle, torch drives weight initialisation and the
# DataLoader shuffling order.
random.seed(SEED)
torch.manual_seed(SEED)

In [85]:
class Alphabet(object):
    """Ordered collection of characters with a two-way char <-> index mapping.

    Characters are kept in first-seen order in ``self.letters``; the position
    of a character inside that string is its index.
    """

    def __init__(self):
        self.letters = ""

    def __len__(self):
        return len(self.letters)

    def __contains__(self, item):
        return item in self.letters

    def __getitem__(self, item):
        """Translate between characters and indices.

        Args:
            item: an ``int`` index or a ``str`` character.

        Returns:
            For an ``int``: the character at that index, wrapped modulo the
            alphabet size (so shifted indices past the end cycle around).
            For a ``str``: its index as reported by ``str.find``, i.e. ``-1``
            when the character is not in the alphabet.

        Raises:
            IndexError: integer lookup on an empty alphabet.
            TypeError: ``item`` is neither ``int`` nor ``str`` (previously
                this silently returned ``None``).
        """
        if isinstance(item, int):
            if not self.letters:
                raise IndexError("cannot index an empty Alphabet")
            return self.letters[item % len(self.letters)]
        if isinstance(item, str):
            return self.letters.find(item)
        raise TypeError(
            f"Alphabet indices must be int or str, not {type(item).__name__}"
        )

    def __str__(self):
        letters = " ".join(self.letters)
        return f"Alphabet is:\n {letters}\n {len(self)} chars"

    def load_from_file(self, file_path, encoding=None):
        """Add every character of a text file that is not yet in the alphabet.

        New characters are appended in order of first appearance, after any
        characters the alphabet already holds.

        Args:
            file_path: path of the text file to scan.
            encoding: text encoding passed to ``open``; ``None`` keeps the
                platform default.

        Returns:
            ``self``, so the call can be chained onto the constructor.
        """
        # A set gives O(1) membership checks; the list preserves first-seen order.
        seen = set(self.letters)
        fresh = []
        with open(file_path, encoding=encoding) as file:
            for line in file:
                for ch in line:
                    if ch not in seen:
                        seen.add(ch)
                        fresh.append(ch)
        self.letters += "".join(fresh)
        return self


# Build the alphabet from every distinct character found in the corpus file
# and print it together with its size.
ALPHABET = Alphabet().load_from_file(FILE_NAME)
print(ALPHABET)

Alphabet is:
 Т Р И   П О С Е Н К А 
 Ж и л - б ы н а с в е т р п о к . В д г , у ь з м х Д ж З я : ф й ш ц —   Я М Г ч ю У ! щ 	 Х Л э Ч ? Э ъ Б « Ф » 1 2 3 4 5 6 7 8 9 0 P R E F A C S U O I N G t h a T r u i s w o m n e g d f p c l y v b k ; B z W H ( j ) " V L ' D Y K q M x J _ Q X [ ] Z ä = æ ë é Æ – * ё … Ю ’ „ “ Ш ” è ê ï ç à Ц Ь â á
 164 chars


In [86]:
class SentenceDataset(torch.utils.data.Dataset):
    """Pairs of (Caesar-shifted indices, original indices) for equal-length strings.

    ``y`` holds the alphabet index of every character; ``x`` holds the same
    indices shifted by the cipher offset. Both tensors live on the chosen
    device, so indexing returns ready-to-use tensors.
    """

    def __init__(self, raw_data, alphabet, shift=None, device=None):
        """Encode the strings once, up front.

        Args:
            raw_data: sequence of strings, all of the same length.
            alphabet: object mapping a character to its index via
                ``alphabet[ch]``.
            shift: offset added to every index to build ``x``; defaults to
                the notebook-level ``CAESAR_N``.
            device: where the tensors are placed; defaults to the
                notebook-level ``DEVICE``.

        Raises:
            ValueError: a character maps to a negative index, i.e. it is
                missing from the alphabet.
        """
        super().__init__()
        if shift is None:
            shift = CAESAR_N
        if device is None:
            device = DEVICE
        self._len = len(raw_data)
        self.y = torch.tensor(
            [[alphabet[ch] for ch in line] for line in raw_data],
            dtype=torch.long,
            device=device,
        )
        # A negative index means the alphabet did not know the character;
        # fail here rather than with an obscure error inside the loss.
        if bool((self.y < 0).any()):
            raise ValueError("raw_data contains characters missing from the alphabet")
        # One vectorised add instead of rebuilding the tensor element by element.
        self.x = self.y + shift

    def __len__(self):
        return self._len

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [87]:
def get_text_array(file_path, step, encoding=None):
    """Cut a text file into consecutive chunks of exactly ``step`` characters.

    A trailing chunk shorter than ``step`` is discarded so that every
    returned string has the same length; a full-length final chunk is kept.

    Args:
        file_path: path of the text file to read.
        step: number of characters per chunk; must be at least 1.
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        List of strings, each ``step`` characters long. Empty when the file
        holds fewer than ``step`` characters.

    Raises:
        ValueError: ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    with open(file_path, encoding=encoding) as file:
        # iter(callable, sentinel) keeps reading until read() returns "" at EOF.
        text_array = list(iter(lambda: file.read(step), ""))
    # Only the last chunk can be short; drop it only when it really is.
    if text_array and len(text_array[-1]) < step:
        del text_array[-1]
    return text_array

In [88]:
# Cut the corpus into fixed-length samples and shuffle them before splitting.
raw_data = get_text_array(FILE_NAME, STRING_SIZE)
shuffle(raw_data)

# Hold out the first 10% as a validation split, then divide the remainder
# 20% / 80% into test and train.
_10_percent = math.ceil(len(raw_data) * 0.1)
val_data = raw_data[:_10_percent]
raw_data = raw_data[_10_percent:]
_20_percent = math.ceil(len(raw_data) * 0.2)
test_data = raw_data[:_20_percent]
train_data = raw_data[_20_percent:]

# Encoded validation split (kept on the CPU). The training loop below
# evaluates on `test_dl` after every epoch and does not read these tensors.
Y_val = torch.tensor(
    [[ALPHABET[ch] for ch in line] for line in val_data], dtype=torch.long
)
# Vectorised shift instead of rebuilding the tensor element by element.
X_val = Y_val + CAESAR_N


def _make_loader(samples, shuffle_batches):
    """Wrap ``samples`` in a SentenceDataset and a fixed-size-batch DataLoader."""
    return torch.utils.data.DataLoader(
        SentenceDataset(samples, ALPHABET),
        batch_size=BATCH_SIZE,
        shuffle=shuffle_batches,
        # Full batches only, so the per-batch accuracy average in the
        # training loop weights every sample equally.
        drop_last=True,
    )


train_dl = _make_loader(train_data, True)
# Evaluation order is fixed: with drop_last=True a shuffled loader would
# score a different subset of the test samples on every epoch.
test_dl = _make_loader(test_data, False)

In [89]:
class RNNModel(torch.nn.Module):
    """Embedding -> single-layer RNN -> linear head, one prediction per position."""

    def __init__(self, num_tokens=None, embedding_size=32, hidden_size=128):
        """Build the layers.

        Args:
            num_tokens: size of both the input vocabulary and the output
                layer. Defaults to ``len(ALPHABET) + CAESAR_N`` because the
                shifted input indices run past the end of the alphabet by
                up to ``CAESAR_N``.
            embedding_size: width of the token embeddings.
            hidden_size: width of the RNN hidden state.
        """
        super().__init__()
        if num_tokens is None:
            num_tokens = len(ALPHABET) + CAESAR_N
        self.embed = torch.nn.Embedding(num_tokens, embedding_size)
        self.rnn = torch.nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, num_tokens)

    def forward(self, sentence, state=None):
        """Score every class at every position of the input.

        Args:
            sentence: integer tensor of token indices, batch dimension first.
            state: optional initial hidden state for the RNN; ``None`` lets
                the RNN start from zeros.

        Returns:
            Tensor of class scores with one vector per input position.
        """
        embed = self.embed(sentence)
        # Forward the caller's initial state; it used to be silently ignored.
        o, _ = self.rnn(embed, state)
        return self.linear(o)
    
    

class RnnFlex(torch.nn.Module):
    """Sequence classifier with a pluggable recurrent layer.

    The input is embedded, run through the recurrent layer, and the final
    hidden state is mapped to class scores: one prediction per sequence.
    """

    def __init__(self, rnnClass, dictionary_size, embedding_size, num_hiddens, num_classes):
        """Build the layers.

        Args:
            rnnClass: recurrent layer constructor, called as
                ``rnnClass(embedding_size, num_hiddens, batch_first=True)``
                (e.g. ``torch.nn.RNN``, ``torch.nn.GRU``, ``torch.nn.LSTM``).
            dictionary_size: number of distinct input tokens.
            embedding_size: width of the token embeddings.
            num_hiddens: width of the recurrent hidden state.
            num_classes: number of output classes.
        """
        super().__init__()
        self.num_hiddens = num_hiddens
        self.embedding = torch.nn.Embedding(dictionary_size, embedding_size)
        self.hidden = rnnClass(embedding_size, num_hiddens, batch_first=True)
        self.output = torch.nn.Linear(num_hiddens, num_classes)

    def forward(self, X):
        """Return one vector of class scores per input sequence."""
        out = self.embedding(X)
        _, state = self.hidden(out)
        # LSTM returns (h_n, c_n); RNN and GRU return h_n directly. Without
        # this unpacking an LSTM would yield an extra leading dimension.
        if isinstance(state, tuple):
            state = state[0]
        # state is (num_layers, batch, hidden); take the layer's final state.
        predictions = self.output(state[0])
        return predictions

In [90]:
# Instantiate the network and move its parameters to the selected device.
model = RNNModel().to(DEVICE)
# Cross-entropy criterion; the training loop flattens outputs and targets
# before calling it.
loss = torch.nn.CrossEntropyLoss().to(DEVICE)
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [91]:
def _run_epoch(net, batches, criterion, optimizer=None):
    """Run one pass over ``batches``.

    With an optimizer the pass trains ``net``; without one it only evaluates,
    with gradient tracking switched off.

    Args:
        net: the model to run.
        batches: iterable of ``(inputs, targets)`` tensor pairs.
        criterion: loss callable taking ``(scores, targets)``.
        optimizer: optimizer to step after each batch, or ``None`` to evaluate.

    Returns:
        ``(loss_sum, mean_accuracy)``: the per-batch losses summed over the
        pass, and the per-batch accuracies averaged over the pass (``nan``
        when ``batches`` is empty).
    """
    training = optimizer is not None
    net.train(training)
    loss_sum, acc_sum, num_batches = 0.0, 0.0, 0
    with torch.set_grad_enabled(training):
        for x_in, y_in in batches:
            # One target per character, flattened across the batch.
            targets = y_in.reshape(-1)
            # Call the module, not .forward(), so registered hooks still run.
            scores = net(x_in)
            scores = scores.reshape(-1, scores.shape[-1])
            batch_loss = criterion(scores, targets)
            if training:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
            loss_sum += batch_loss.item()
            hits = scores.argmax(dim=1) == targets
            acc_sum += hits.sum().item() / hits.shape[0]
            num_batches += 1
    mean_acc = acc_sum / num_batches if num_batches else float("nan")
    return loss_sum, mean_acc


for epoch in range(NUM_EPOCHS):
    start_epoch_time = time.time()
    train_loss, train_acc = _run_epoch(model, train_dl, loss, optimizer)
    print(
        f"Epoch: {epoch}, train loss: {train_loss:.4f}, acc: "
        f"{train_acc:.4f}",
        end=" | "
    )
    # The reported losses are sums over batches, so train and test values
    # scale with the number of batches in each loader.
    test_loss, test_acc = _run_epoch(model, test_dl, loss)
    print(
        f"test loss: {test_loss:.4f}, test acc: {test_acc:.4f} | "
        f"{time.time() - start_epoch_time:.2f} sec."
    )
    
    

Epoch: 0, train loss: 675.8390, acc: 0.9537 | test loss: 35.4452, test acc: 0.9910 | 37.45 sec.
Epoch: 1, train loss: 91.3472, acc: 0.9949 | test loss: 15.5549, test acc: 0.9974 | 37.99 sec.
Epoch: 2, train loss: 45.1953, acc: 0.9985 | test loss: 8.7967, test acc: 0.9989 | 37.17 sec.
Epoch: 3, train loss: 27.4798, acc: 0.9991 | test loss: 5.8567, test acc: 0.9992 | 37.49 sec.
Epoch: 4, train loss: 19.2780, acc: 0.9993 | test loss: 4.3502, test acc: 0.9993 | 37.77 sec.
Epoch: 5, train loss: 14.7749, acc: 0.9994 | test loss: 3.4447, test acc: 0.9995 | 37.72 sec.
Epoch: 6, train loss: 11.9113, acc: 0.9996 | test loss: 2.8307, test acc: 0.9996 | 37.81 sec.
Epoch: 7, train loss: 9.9264, acc: 0.9997 | test loss: 2.3904, test acc: 0.9997 | 39.27 sec.
Epoch: 8, train loss: 8.4788, acc: 0.9997 | test loss: 2.0596, test acc: 0.9997 | 37.71 sec.
Epoch: 9, train loss: 7.3795, acc: 0.9998 | test loss: 1.8044, test acc: 0.9998 | 39.10 sec.
Epoch: 10, train loss: 6.5295, acc: 0.9998 | test loss: 1.60

In [92]:
sentence = """Судьба.
Был только один выход, ибо наши жизни сплелись в слишком запутанный узел гнева и блаженства, 
чтобы решить все как-нибудь иначе. Доверимся жребию: орел — и мы поженимся, решка — и мы расстанемся навсегда.
Монетка была подброшена. Она звякнула, завертелась и остановилась. Орел.
Мы уставились на нее с недоумением.
Затем, в один голос, мы сказали: «Может, еще разок?». Джей Рип
Elizabeth II was Queen of the United Kingdom and other Commonwealth realms from 6 February 1952 until her death in 2022. 
She was queen regnant of 32 sovereign states during her lifetime and 15 at the time of her death.
Her reign of 70 years and 214 days is the longest of any British monarch, 
the longest recorded of any female head of state in history, 
and the second-longest verified reign of any monarch in history.
"""
# A character the alphabet does not know would be looked up as -1 and then
# silently encrypted as a valid but wrong index, so reject it up front.
unknown_chars = sorted({ch for ch in sentence if ch not in ALPHABET})
if unknown_chars:
    raise ValueError(f"Characters missing from the alphabet: {unknown_chars!r}")

# Encrypt: character -> alphabet index -> index shifted by CAESAR_N.
sentence_idx = [ALPHABET[i] for i in sentence]
encrypted_sentence_idx = [i + CAESAR_N for i in sentence_idx]
# Integer lookups wrap around the alphabet, so every shifted index maps to a
# printable character for display.
encrypted_sentence = "".join([ALPHABET[i] for i in encrypted_sentence_idx])

# Decrypt with the trained model: inference mode, no gradient tracking.
model.eval()
with torch.no_grad():
    result = model(torch.tensor([encrypted_sentence_idx]).to(DEVICE)).argmax(dim=2)
deencrypted_sentence = "".join([ALPHABET[i.item()] for i in result.flatten()])
print(f"""Исходный текст:           {sentence}
""")
print(f"""Зашифрованный текст:      {encrypted_sentence}
""")
print(f"Расшифрованный текст:     {deencrypted_sentence}")

Исходный текст:           Судьба.
Был только один выход, ибо наши жизни сплелись в слишком запутанный узел гнева и блаженства, 
чтобы решить все как-нибудь иначе. Доверимся жребию: орел — и мы поженимся, решка — и мы расстанемся навсегда.
Монетка была подброшена. Она звякнула, завертелась и остановилась. Орел.
Мы уставились на нее с недоумением.
Затем, в один голос, мы сказали: «Может, еще разок?». Джей Рип
Elizabeth II was Queen of the United Kingdom and other Commonwealth realms from 6 February 1952 until her death in 2022. 
She was queen regnant of 32 sovereign states during her lifetime and 15 at the time of her death.
Her reign of 70 years and 214 days is the longest of any British monarch, 
the longest recorded of any female head of state in history, 
and the second-longest verified reign of any monarch in history.


Зашифрованный текст:      Нз,мнвдиФабОп.бмВ.О.,-сОтаж.,ьО-н.Осв—-Оя-хс-Оекбрб-емОтОеб-—В.ДОхвкзпвссацОзхрбОусртвО-ОнбвярсептвьОиУп.наОор—-пмОтерОВвВыс-нз,мО-свУрдОЗ.

In [93]:
from evaluate import load
wer = load("wer")

# Word error rate is normalised by the length of the reference, so the
# original sentence is the reference and the model's decryption is the
# prediction (the two were swapped before).
predictions = [deencrypted_sentence]
references = [sentence]
wer_score = wer.compute(predictions=predictions, references=references)
print(wer_score)

0.0
