### **Основное задание**



Задание 1.  
Обучите нейронную сеть решать шифр цезаря.  
Что надо сделать:  
1.Написать алгоритм шифра цезаря для генерации выборки (сдвиг на К каждой буквы. Например, при сдвиге на 2 буква “А” переходит в букву “В” и т.п.)  
2.Сделать нейронную сеть.  
3.Обучить ее (вход - зашифрованная фраза, выход - дешифрованная фраза).  
4.Проверить качество.  



# Загружаем библиотеки. Смотрим, что доступно cpu или cuda. Загружаем данные.  

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import random
import time

In [3]:
dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
dev

device(type='cpu')

In [5]:
data_dir = '/content/drive/My Drive/Colab Notebooks/caesar_cypher/'


# **1. Написать алгоритм шифра цезаря для генерации выборки (сдвиг на К каждой буквы. Например, при сдвиге на 2 буква “А” переходит в букву “В” и т.п.)**

Создадим функцию, которая сдвигает буквы согласно алгоритму шифра цезаря.

In [6]:
key = 11
vocab = [char for char in 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя']


def encrypt(text):
    indexes = [vocab.index(char) for char in text]
    encrypted_indexes = [(idx + key) % len(vocab) for idx in indexes]
    encrypted_chars = [vocab[idx] for idx in encrypted_indexes]
    encrypted = ''.join(encrypted_chars)
    return encrypted

Применим нашу функцию

In [7]:
print(encrypt('абвгдеёжзийклмнопрстуфхцчшщъыьэюя'))

клмнопрстуфхцчшщъыьэюяабвгдеёжзий


Создадим датасет со случайным выбором букв.

In [8]:
num_examples = 128
message_length = 32


def dataset(num_examples):
    dataset = []
    for x in range(num_examples):
        ex_out = ''.join([random.choice(vocab) for x in range(message_length)])
        ex_in = encrypt(''.join(ex_out))
        ex_in = [vocab.index(x) for x in ex_in]
        ex_out = [vocab.index(x) for x in ex_out]
        dataset.append([torch.tensor(ex_in), torch.tensor(ex_out)])
    return dataset

## **RNN**

In [9]:
embedding_dim = 10
hidden_dim = 10
vocab_size = len(vocab)

In [10]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        ## Здесь создать слои
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, sentences, state=None):
        ## Здесь применить
        embed = self.embed(sentences)
        o, s = self.rnn(embed)
        out = self.linear(o)
        return out

In [11]:
model = Network ()

In [12]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=.05)

In [None]:
Обучение:

In [13]:
for ep in range(10):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for encrypted, original in dataset(num_examples):
        
        optimizer.zero_grad()
        answers = model.forward(encrypted.unsqueeze(1))
        answers = answers.view(-1, vocab_size)
        loss = criterion(answers, original)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 0.265, Train loss: 3.106
Epoch 1. Time: 0.137, Train loss: 2.386
Epoch 2. Time: 0.139, Train loss: 1.803
Epoch 3. Time: 0.130, Train loss: 1.343
Epoch 4. Time: 0.148, Train loss: 1.015
Epoch 5. Time: 0.134, Train loss: 0.791
Epoch 6. Time: 0.145, Train loss: 0.636
Epoch 7. Time: 0.136, Train loss: 0.526
Epoch 8. Time: 0.144, Train loss: 0.441
Epoch 9. Time: 0.134, Train loss: 0.374


Проверка качества:

In [14]:
with torch.no_grad():
        matches, total = 0, 0
        for encrypted, original in dataset(num_examples):
            answers = model.forward(encrypted.unsqueeze(1))
            predictions = torch.nn.functional.softmax(answers, dim=2)
            _, batch_out = predictions.max(dim=2)
            batch_out = batch_out.squeeze(1)
            matches += torch.eq(batch_out, original).sum().item()
            total += torch.numel(batch_out)
        accuracy = matches / total
        print('Accuracy: {:4.2f}%'.format(accuracy * 100))

Accuracy: 96.88%


## **LSTM**

In [15]:
embedding_dim = 10
hidden_dim = 10
vocab_size = len(vocab)

embed = torch.nn.Embedding(vocab_size, embedding_dim)
lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
linear = torch.nn.Linear(hidden_dim, vocab_size)
softmax = torch.nn.functional.softmax
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(embed.parameters()) +
                             list(lstm.parameters()) +
                             list(linear.parameters()), lr=0.001)

In [16]:
def zero_hidden():
    return (torch.zeros(1, 1, hidden_dim),
            torch.zeros(1, 1, hidden_dim))

Обучение:

In [17]:
num_epochs = 10

accuracies, max_accuracy = [], 0
for x in range(num_epochs):
    print('Epoch: {}'.format(x))
    for encrypted, original in dataset(num_examples):
        lstm_in = embed(encrypted)
        lstm_in = lstm_in.unsqueeze(1)
        lstm_out, lstm_hidden = lstm(lstm_in, zero_hidden())
        scores = linear(lstm_out)
        scores = scores.transpose(1, 2)
        original = original.unsqueeze(1)
        loss = loss_fn(scores, original) 
        loss.backward()
        optimizer.step()
    print('Loss: {:6.4f}'.format(loss.item()))

Epoch: 0
Loss: 3.1223
Epoch: 1
Loss: 1.9948
Epoch: 2
Loss: 1.0568
Epoch: 3
Loss: 0.5020
Epoch: 4
Loss: 0.2941
Epoch: 5
Loss: 0.1096
Epoch: 6
Loss: 0.0940
Epoch: 7
Loss: 0.0564
Epoch: 8
Loss: 0.0407
Epoch: 9
Loss: 0.0177


Проверка качества:

In [18]:
with torch.no_grad():
        matches, total = 0, 0
        for encrypted, original in dataset(num_examples):
            lstm_in = embed(encrypted)
            lstm_in = lstm_in.unsqueeze(1)
            lstm_out, lstm_hidden = lstm(lstm_in, zero_hidden())
            scores = linear(lstm_out)
            predictions = softmax(scores, dim=2)
            _, batch_out = predictions.max(dim=2)
            batch_out = batch_out.squeeze(1)
            matches += torch.eq(batch_out, original).sum().item()
            total += torch.numel(batch_out)
        accuracy = matches / total
        print('Accuracy: {:4.2f}%'.format(accuracy * 100))

Accuracy: 100.00%


# **Задание 2.**
# **Выполнить практическую работу из лекционного ноутбука.**  
# **а) построить RNN-ячейку на основе полносвязных слоев**  
# **б) применить построенную ячейку для генерации текста с выражениями героев сериала “Симпсоны”**  

## **Загружаем данные** 

In [21]:
import pandas as pd

In [22]:
df = pd.read_csv(data_dir+'data.csv')

In [23]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,0,10368,35,29,"Lisa Simpson: Maggie, look. What's that?",235000,True,9,5.0,Lisa Simpson,Simpson Home,"Maggie, look. What's that?",maggie look whats that,4.0
1,1,10369,35,30,Lisa Simpson: Lee-mur. Lee-mur.,237000,True,9,5.0,Lisa Simpson,Simpson Home,Lee-mur. Lee-mur.,lee-mur lee-mur,2.0
2,2,10370,35,31,Lisa Simpson: Zee-boo. Zee-boo.,239000,True,9,5.0,Lisa Simpson,Simpson Home,Zee-boo. Zee-boo.,zee-boo zee-boo,2.0
3,3,10372,35,33,Lisa Simpson: I'm trying to teach Maggie that ...,245000,True,9,5.0,Lisa Simpson,Simpson Home,I'm trying to teach Maggie that nature doesn't...,im trying to teach maggie that nature doesnt e...,24.0
4,4,10374,35,35,"Lisa Simpson: It's like an ox, only it has a h...",254000,True,9,5.0,Lisa Simpson,Simpson Home,"It's like an ox, only it has a hump and a dewl...",its like an ox only it has a hump and a dewlap...,18.0


In [24]:
phrases = df['normalized_text'].tolist()
phrases[:10]

['maggie look whats that',
 'lee-mur lee-mur',
 'zee-boo zee-boo',
 'im trying to teach maggie that nature doesnt end with the barnyard i want her to have all the advantages that i didnt have',
 'its like an ox only it has a hump and a dewlap hump and dew-lap hump and dew-lap',
 'you know his blood type how romantic',
 'oh yeah whats my shoe size',
 'ring',
 'yes dad',
 'ooh look maggie what is that do-dec-ah-edron dodecahedron']

In [25]:
text = [[c for c in ph] for ph in phrases if type(ph) is str]


 ## **Делаем массив с данными**

In [26]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}

In [69]:
INDEX_TO_CHAR

['none',
 'f',
 'n',
 'v',
 'p',
 'u',
 'b',
 'd',
 'k',
 'o',
 'g',
 'h',
 's',
 'y',
 'a',
 'l',
 'z',
 'q',
 'i',
 'j',
 'm',
 ' ',
 'c',
 'r',
 't',
 'w',
 'x',
 'e']

In [70]:
CHAR_TO_INDEX

{' ': 21,
 'a': 14,
 'b': 6,
 'c': 22,
 'd': 7,
 'e': 27,
 'f': 1,
 'g': 10,
 'h': 11,
 'i': 18,
 'j': 19,
 'k': 8,
 'l': 15,
 'm': 20,
 'n': 2,
 'none': 0,
 'o': 9,
 'p': 4,
 'q': 17,
 'r': 23,
 's': 12,
 't': 24,
 'u': 5,
 'v': 3,
 'w': 25,
 'x': 26,
 'y': 13,
 'z': 16}

In [71]:
MAX_LEN = 50
X = torch.zeros((len(text), MAX_LEN), dtype=int)
for i in range(len(text)):
    for j, w in enumerate(text[i]):
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [72]:
X[0:10]

tensor([[20, 14, 10, 10, 18, 27, 21, 15,  9,  9,  8, 21, 25, 11, 14, 24, 12, 21,
         24, 11, 14, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [15, 27, 27,  0, 20,  5, 23, 21, 15, 27, 27,  0, 20,  5, 23,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [16, 27, 27,  0,  6,  9,  9, 21, 16, 27, 27,  0,  6,  9,  9,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [18, 20, 21, 24, 23, 13, 18,  2, 10, 21, 24,  9, 21, 24, 27, 14, 22, 11,
         21, 20, 14, 10, 10, 18, 27, 21, 24, 11, 14, 24, 21,  2, 14, 24,  5, 23,
         27, 21,  7,  9, 27, 12,  2, 24, 21, 27,  2,  7, 21, 25],
        [18, 24, 12, 21, 15, 18,  8, 27, 21, 14,  2, 21,  9, 26, 21,  9,  2, 15,
       

## **Смотрим на Embedding и RNN ячейку**

In [29]:
embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)
t = embeddings(X[0:10])
t

tensor([[[ 0.2240, -1.5190,  0.8191,  ..., -1.0770,  0.7025, -0.6865],
         [-0.5465,  1.7345, -0.6493,  ..., -1.0592, -1.5871,  0.3748],
         [ 2.1783, -1.2437,  0.2973,  ...,  0.4653, -0.7692,  1.2772],
         ...,
         [-0.3687, -0.1730, -0.4970,  ...,  1.4607,  1.5439, -1.0710],
         [-0.3687, -0.1730, -0.4970,  ...,  1.4607,  1.5439, -1.0710],
         [-0.3687, -0.1730, -0.4970,  ...,  1.4607,  1.5439, -1.0710]],

        [[-0.6415, -1.0177, -1.4828,  ...,  0.1927, -1.3682,  0.5728],
         [-1.3439, -0.9808,  0.6842,  ...,  0.2097,  0.7494,  0.7906],
         [-1.3439, -0.9808,  0.6842,  ...,  0.2097,  0.7494,  0.7906],
         ...,
         [-0.3687, -0.1730, -0.4970,  ...,  1.4607,  1.5439, -1.0710],
         [-0.3687, -0.1730, -0.4970,  ...,  1.4607,  1.5439, -1.0710],
         [-0.3687, -0.1730, -0.4970,  ...,  1.4607,  1.5439, -1.0710]],

        [[-2.0601,  0.5091, -0.5492,  ...,  1.4292,  0.9575,  0.0883],
         [-1.3439, -0.9808,  0.6842,  ...,  0

In [30]:
t.shape, X[0:10].shape

(torch.Size([10, 50, 28]), torch.Size([10, 50]))

In [31]:
rnn = torch.nn.RNN(28, 128, batch_first=True)
o, s = rnn(t)
o.shape, s.shape

(torch.Size([10, 50, 128]), torch.Size([1, 10, 128]))

In [32]:
o, s2 = rnn(t, s)
o.shape, s2.shape

(torch.Size([10, 50, 128]), torch.Size([1, 10, 128]))

## **Практика. Реализуйте код модели нейронной сети.**

3 слоя - embeding (28), скрытая ячейка (128), полносвязанный из состояния rnn в букву (28)

Был код на лекции (оставляем в качестве примера скелета):

In [73]:
class Network(torch.nn.Module):

    def __init__(self):
        super(Network, self).__init__()
        self.word_embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)
        self.gru = torch.nn.GRU(28, 128, batch_first=True)
        self.hidden2tag = torch.nn.Linear(128, len(INDEX_TO_CHAR))

    def forward(self, sentences):
        embeds = self.word_embeddings(sentences)
        gru_out, state = self.gru(embeds)
        tag_space = self.hidden2tag(gru_out.reshape(-1, 128))
        return tag_space.reshape(sentences.shape[0], sentences.shape[1], -1), state

    def forward_state(self, sentences, state):
        embeds = self.word_embeddings(sentences)
        gru_out, state = self.gru(embeds, state)
        tag_space = self.hidden2tag(gru_out.reshape(-1, 128))
        return tag_space.reshape(sentences.shape[0], sentences.shape[1], -1), state
        

In [74]:
model = Network().to(dev)

In [75]:
X[0:1]

tensor([[20, 14, 10, 10, 18, 27, 21, 15,  9,  9,  8, 21, 25, 11, 14, 24, 12, 21,
         24, 11, 14, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [76]:
model.forward(X[0:1].to(dev))[0].shape

torch.Size([1, 50, 28])

In [79]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
batch_size = 100
n_epochs = 100

In [80]:
for ep in range(10):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        batch = X[i * 100:(i + 1) * 100]
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()

        optimizer.zero_grad()
        answers, _ = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 9.472, Train loss: 2.213
Epoch 1. Time: 9.259, Train loss: 1.958
Epoch 2. Time: 9.349, Train loss: 1.891
Epoch 3. Time: 9.281, Train loss: 1.850
Epoch 4. Time: 9.326, Train loss: 1.817
Epoch 5. Time: 9.232, Train loss: 1.787
Epoch 6. Time: 9.359, Train loss: 1.760
Epoch 7. Time: 9.271, Train loss: 1.735
Epoch 8. Time: 9.329, Train loss: 1.713
Epoch 9. Time: 9.345, Train loss: 1.694


## **Практика. Реализуйте код генерации следующей буквы на основе модели.**

In [52]:
import numpy as np
import string

In [77]:
def generate_sentence():
  sentence = ['h', 'e', 'l', 'l', 'o']
  state = None
  for i in range(MAX_LEN):
    X = torch.Tensor([[CHAR_TO_INDEX[sentence[i]]]]).type(torch.long).to(dev)
    if i == 0:
      result, state = model.forward(X)
    else:
      result, state = model.forward_state(X, state)
    prediction = result[0, -1, :]
    index_of_prediction = prediction.argmax()
    if i >= len(sentence) - 1:
      if index_of_prediction == 0:
        break
      sentence.append(INDEX_TO_CHAR[index_of_prediction])

  print(''.join(sentence))

In [78]:
generate_sentence()

hellohrzz iieexzz iieexzz iieexzz iieexzz iieexzz i


In [82]:
for ep in range(300):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        batch = X[i * 100:(i + 1) * 100]
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()

        optimizer.zero_grad()
        answers, _ = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))
    generate_sentence()

Epoch 0. Time: 9.373, Train loss: 1.608
hello the the the the the the the the the the the t
Epoch 1. Time: 9.368, Train loss: 1.601
hello the the the the the the the the the the the t
Epoch 2. Time: 9.369, Train loss: 1.594
hello the the the the the the the the the the the t
Epoch 3. Time: 9.327, Train loss: 1.588
hello the the the the the the the the the the the t
Epoch 4. Time: 9.391, Train loss: 1.581
hello the the the the the the the the the the the t
Epoch 5. Time: 9.333, Train loss: 1.576
hello the the the the the the the the the the the t
Epoch 6. Time: 9.388, Train loss: 1.570
hello the the the the the the the the the the the t
Epoch 7. Time: 9.367, Train loss: 1.565
hello the the the the the the the the the the the t
Epoch 8. Time: 9.357, Train loss: 1.559
hello the the the the the the the the the the the t
Epoch 9. Time: 9.345, Train loss: 1.554
hello the the the the the the the the the the the t
Epoch 10. Time: 9.381, Train loss: 1.550
hello the the the the the the the the t