In [53]:
import pandas as pd
import time
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

## Рекуррентные нейросети

### Задание 1

In [54]:
def caesar_cipher(text, shift):
    result = []
    for char in text:
        if char.isalpha():  
            shift_base = ord('a') 
            result.append(chr((ord(char) - shift_base + shift) % 26 + shift_base))
        else:
            result.append(char)  
    return ''.join(result)

In [55]:
df = pd.read_csv('data/simpsons_script_lines.csv')


  df = pd.read_csv('data/simpsons_script_lines.csv')


In [56]:
phrases = df['normalized_text'].tolist()  

In [57]:
simpsons_data = {'orig': phrases}
df_simpsons = pd.DataFrame(simpsons_data)
df_simpsons['orig'] = df_simpsons['orig'].astype(str)

shift = 3
df_simpsons['encrypted'] = df_simpsons['orig'].apply(lambda x: caesar_cipher(x, shift))

df_simpsons

Unnamed: 0,orig,encrypted
0,no actually it was a little of both sometimes ...,qr dfwxdoob lw zdv d olwwoh ri erwk vrphwlphv ...
1,wheres mr bergstrom,zkhuhv pu ehujvwurp
2,i dont know although id sure like to talk to h...,l grqw nqrz dowkrxjk lg vxuh olnh wr wdon wr k...
3,that life is worth living,wkdw olih lv zruwk olylqj
4,the polls will be open from now until the end ...,wkh sroov zloo eh rshq iurp qrz xqwlo wkh hqg ...
...,...,...
158266,im back,lp edfn
158267,you see class my lyme disease turned out to be,brx vhh fodvv pb obph glvhdvh wxuqhg rxw wr eh
158268,psy-cho-so-ma-tic,svb-fkr-vr-pd-wlf
158269,does that mean you were crazy,grhv wkdw phdq brx zhuh fudcb


In [58]:
def text_to_sequence(text, char_to_idx):
    return [char_to_idx[char] for char in text]

In [59]:
unique_chars = set(' '.join(df_simpsons['orig']))
char_to_idx = {char: idx for idx, char in enumerate(sorted(unique_chars))}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}
char_to_idx['<PAD>'] = len(char_to_idx)  
idx_to_char[len(idx_to_char)] = '<PAD>'

df_simpsons['encrypted_seq'] = df_simpsons['encrypted'].apply(lambda x: text_to_sequence(x, char_to_idx))
df_simpsons['original_seq'] = df_simpsons['orig'].apply(lambda x: text_to_sequence(x, char_to_idx))

print(df_simpsons[['orig', 'encrypted', 'encrypted_seq', 'original_seq']])

                                                     orig  \
0       no actually it was a little of both sometimes ...   
1                                     wheres mr bergstrom   
2       i dont know although id sure like to talk to h...   
3                               that life is worth living   
4       the polls will be open from now until the end ...   
...                                                   ...   
158266                                            im back   
158267     you see class my lyme disease turned out to be   
158268                                  psy-cho-so-ma-tic   
158269                      does that mean you were crazy   
158270                    no that means she was faking it   

                                                encrypted  \
0       qr dfwxdoob lw zdv d olwwoh ri erwk vrphwlphv ...   
1                                     zkhuhv pu ehujvwurp   
2       l grqw nqrz dowkrxjk lg vxuh olnh wr wdon wr k...   
3                      

In [60]:
train, test = train_test_split(df_simpsons, test_size=0.3, random_state=42)

In [61]:
class CaesarDataset(Dataset):
    def __init__(self, data, char_to_idx):
        self.data = data
        self.char_to_idx = char_to_idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        encrypted_seq = self.data.iloc[idx]['encrypted_seq']
        original_seq = self.data.iloc[idx]['original_seq']
        return torch.tensor(encrypted_seq, dtype=torch.long), torch.tensor(original_seq, dtype=torch.long)

In [62]:
def pad_sequences(batch):
    encrypted_seqs = [torch.tensor(item[0], dtype=torch.long) for item in batch]
    original_seqs = [torch.tensor(item[1], dtype=torch.long) for item in batch]
    
    padded_encrypted = pad_sequence(encrypted_seqs, batch_first=True, padding_value=char_to_idx['<PAD>'])
    padded_original = pad_sequence(original_seqs, batch_first=True, padding_value=char_to_idx['<PAD>'])
    
    return padded_encrypted, padded_original

In [63]:
train_dataset = CaesarDataset(train, char_to_idx)
test_dataset = CaesarDataset(test, char_to_idx)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=pad_sequences)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=pad_sequences)

In [64]:
class CaesarRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CaesarRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)  
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True) 
        self.fc = nn.Linear(hidden_dim, vocab_size) 

    def forward(self, x):
        x = self.embedding(x) 
        output, hidden = self.rnn(x) 
        output = self.fc(output)  
        return output

In [65]:
vocab_size = len(char_to_idx)

embed_dim = 64
hidden_dim = 128

model = CaesarRNN(vocab_size, embed_dim, hidden_dim)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [66]:
criterion = nn.CrossEntropyLoss(ignore_index=char_to_idx['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [67]:
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, device):
    for epoch in range(num_epochs):
        start_time = time.time()
        
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        for encrypted_seq, original_seq in train_loader:
            encrypted_seq = encrypted_seq.to(device)
            original_seq = original_seq.to(device)

            output = model(encrypted_seq)
            
            output = output.view(-1, output.size(-1))  
            original_seq = original_seq.view(-1)  

            loss = criterion(output, original_seq)
            train_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        test_loss = 0
        test_correct = 0
        test_total = 0

        with torch.no_grad():
            for encrypted_seq, original_seq in test_loader:
                encrypted_seq = encrypted_seq.to(device)
                original_seq = original_seq.to(device)

                output = model(encrypted_seq)

                output = output.view(-1, output.size(-1))
                original_seq = original_seq.view(-1)

                loss = criterion(output, original_seq)
                test_loss += loss.item()

        epoch_time = time.time() - start_time

        print(f"Ep [{epoch+1}/{num_epochs}], Time: {epoch_time:.2f} sec, Train Loss: {train_loss/len(train_loader):.4f}, Test Loss: {test_loss/len(test_loader):.4f}")

In [68]:
num_epochs = 10
train_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
    device=device
)

  encrypted_seqs = [torch.tensor(item[0], dtype=torch.long) for item in batch]
  original_seqs = [torch.tensor(item[1], dtype=torch.long) for item in batch]


Ep [1/10], Time: 13.81 sec, Train Loss: 0.0667, Test Loss: 0.0013
Ep [2/10], Time: 13.77 sec, Train Loss: 0.0009, Test Loss: 0.0008
Ep [3/10], Time: 13.81 sec, Train Loss: 0.0006, Test Loss: 0.0007
Ep [4/10], Time: 14.07 sec, Train Loss: 0.0005, Test Loss: 0.0006
Ep [5/10], Time: 14.07 sec, Train Loss: 0.0004, Test Loss: 0.0005
Ep [6/10], Time: 14.02 sec, Train Loss: 0.0004, Test Loss: 0.0005
Ep [7/10], Time: 14.01 sec, Train Loss: 0.0003, Test Loss: 0.0005
Ep [8/10], Time: 13.46 sec, Train Loss: 0.0003, Test Loss: 0.0005
Ep [9/10], Time: 13.57 sec, Train Loss: 0.0002, Test Loss: 0.0005
Ep [10/10], Time: 13.52 sec, Train Loss: 0.0002, Test Loss: 0.0005


In [69]:
def decode_phrase(model, phrase, char_to_idx, idx_to_char, device):
    model.eval() 
    with torch.no_grad():
        input_seq = torch.tensor([char_to_idx[char] for char in phrase], dtype=torch.long).unsqueeze(0).to(device)

        output = model(input_seq)
        
        predicted_indices = torch.argmax(output, dim=2).squeeze(0).tolist()

        decoded_phrase = ''.join([idx_to_char[idx] for idx in predicted_indices])
        return decoded_phrase

In [72]:
encrypted_phrase = "khoor zruog"  

decoded_phrase = decode_phrase(model, encrypted_phrase, char_to_idx, idx_to_char, device)
print(f"Сдвиг: {shift}")
print(f"Зашифрованная фраза: {encrypted_phrase}")
print(f"Дешифрованная фраза: {decoded_phrase}")

Сдвиг: 3
Зашифрованная фраза: khoor zruog
Дешифрованная фраза: hello world


### Задание 2

#### Загрузка данных

In [73]:
df = pd.read_csv('data/simpsons_script_lines.csv')
df.head()

  df = pd.read_csv('data/simpsons_script_lines.csv')


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33


In [74]:
phrases = df['normalized_text'].tolist()  # колонка с предобработанными текстами
phrases[:10]

['no actually it was a little of both sometimes when a disease is in all the magazines and all the news shows its only natural that you think you have it',
 'wheres mr bergstrom',
 'i dont know although id sure like to talk to him he didnt touch my lesson plan what did he teach you',
 'that life is worth living',
 'the polls will be open from now until the end of recess now just in case any of you have decided to put any thought into this well have our final statements martin',
 'i dont think theres anything left to say',
 'bart',
 'victory party under the slide',
 nan,
 'mr bergstrom mr bergstrom']

In [75]:
text = [[c for c in ph] for ph in phrases if type(ph) is str]

#### Создаем массив с данными

In [76]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')  # все символы, которые мы хотим использовать для кодировки = наш словарь
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]  # все неизвестные символы будут получать тег none
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}  # словарь токен-индекс

In [78]:
len(INDEX_TO_CHAR)

28

In [79]:
MAX_LEN = 50  # мы хотим ограничить максимальную длину ввода
X = torch.zeros((len(text), MAX_LEN), dtype=int)  # создаём пустой вектор для текста, чтобы класть в него индексы токенов
for i in range(len(text)):  # для каждого предложения
    for j, w in enumerate(text[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [80]:
X[0:5]

tensor([[ 6, 21, 11, 16,  5,  1, 14, 16,  7,  7, 13, 11, 27,  1, 11, 17, 16, 20,
         11, 16, 11,  7, 27,  1,  1,  7, 19, 11, 21,  9, 11, 15, 21,  1, 12, 11,
         20, 21, 23, 19,  1, 27, 23, 19, 20, 11, 17, 12, 19,  6],
        [17, 12, 19, 18, 19, 20, 11, 23, 18, 11, 15, 19, 18, 22, 20,  1, 18, 21,
         23,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [27, 11,  2, 21,  6,  1, 11,  3,  6, 21, 17, 11, 16,  7,  1, 12, 21, 14,
         22, 12, 11, 27,  2, 11, 20, 14, 18, 19, 11,  7, 27,  3, 19, 11,  1, 21,
         11,  1, 16,  7,  3, 11,  1, 21, 11, 12, 27, 23, 11, 12],
        [ 1, 12, 16,  1, 11,  7, 27,  9, 19, 11, 27, 20, 11, 17, 21, 18,  1, 12,
         11,  7, 27, 26, 27,  6, 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 12, 19, 11, 25, 21,  7,  7, 20, 11, 17, 27,  7,  7, 11, 15, 19, 11,
       

#### Embedding и RNN ячейки

In [81]:
X[0:5].shape

torch.Size([5, 50])

In [82]:
embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)  # размер словаря * размер вектора для кодировки каждого слова
t = embeddings(X[0:5])
t.shape

torch.Size([5, 50, 28])

In [83]:
t.shape, X[0:5].shape

(torch.Size([5, 50, 28]), torch.Size([5, 50]))

In [84]:
rnn = torch.nn.RNN(28, 128, batch_first=True)  # на вход - размер эмбеддинга, размер скрытого состояния и порядок размерностей
o, s = rnn(t)
# вектора для слов: батч * число токенов * размер скрытого состояния
# вектор скрытого состояния: число вектров (один) * батч * размер скрытого состояния
o.shape, s.shape

(torch.Size([5, 50, 128]), torch.Size([1, 5, 128]))

In [85]:
o, s2 = rnn(t, s)
o.shape, s2.shape

(torch.Size([5, 50, 128]), torch.Size([1, 5, 128]))

#### Реализация сети с RNN

In [86]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(28, 30)
        self.rnn = torch.nn.RNN(30, 128)
        self.out = torch.nn.Linear(128, 28)

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x) # берём выход с последнего слоя для всех токенов, а не скрытое состояние
        return self.out(x)

In [87]:
model = Network()

In [88]:
criterion = torch.nn.CrossEntropyLoss()  # типичный лосс многоклассовой классификации
optimizer = torch.optim.SGD(model.parameters(), lr=.05)

In [89]:
for ep in range(20):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        # берём батч в 100 элементов
        batch = X[i * 100:(i + 1) * 100]
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()

        optimizer.zero_grad()
        answers = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 57.233, Train loss: 1.827
Epoch 1. Time: 59.447, Train loss: 1.729
Epoch 2. Time: 60.259, Train loss: 1.713
Epoch 3. Time: 59.961, Train loss: 1.703
Epoch 4. Time: 59.517, Train loss: 1.695
Epoch 5. Time: 59.810, Train loss: 1.688
Epoch 6. Time: 59.438, Train loss: 1.684
Epoch 7. Time: 60.340, Train loss: 1.681
Epoch 8. Time: 60.055, Train loss: 1.678
Epoch 9. Time: 59.759, Train loss: 1.675
Epoch 10. Time: 59.697, Train loss: 1.673
Epoch 11. Time: 59.851, Train loss: 1.671
Epoch 12. Time: 59.993, Train loss: 1.670
Epoch 13. Time: 59.510, Train loss: 1.668
Epoch 14. Time: 59.439, Train loss: 1.667
Epoch 15. Time: 60.134, Train loss: 1.665
Epoch 16. Time: 59.893, Train loss: 1.668
Epoch 17. Time: 59.521, Train loss: 1.666
Epoch 18. Time: 58.612, Train loss: 1.664
Epoch 19. Time: 60.518, Train loss: 1.662


#### Генерация

In [90]:
CHAR_TO_INDEX['none']

0

In [91]:
def generate_sentence(word):
    sentence = list(word)
    sentence = [CHAR_TO_INDEX.get(s, 0) for s in sentence]
    answers = model.forward(torch.tensor(sentence))
    probas, indices = answers.topk(1)
    return ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])

In [97]:
generate_sentence('that life is')

' enhil   in '

In [96]:
generate_sentence('victory party')

'enohueotrte  '