## Lab 4. Text generation with deep learning
Melnikov, Malysheva, Selivanovskaya




In [None]:
from collections import Counter
import csv

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

### Считывание данных из файла


In [None]:
# Disabled alternative data source: the loader below is wrapped in a string
# literal, so it is never executed. It would read the whole of a
# cp1251-encoded 'train_text.txt' into `text_sample`. As the last expression of
# the cell, the string itself is what gets echoed as the cell output.
'''TRAIN_TEXT_FILE_PATH = 'train_text.txt'

with open(TRAIN_TEXT_FILE_PATH, 'r', encoding='cp1251') as text_file:
    text_sample = text_file.read()'''


"TRAIN_TEXT_FILE_PATH = 'train_text.txt'\n\nwith open(TRAIN_TEXT_FILE_PATH, 'r', encoding='cp1251') as text_file:\n    text_sample = text_file.read()"

In [None]:
def DataLoader(filename, column=7):  # function to load arxiv.csv
    """Read one text column of a CSV file into a list of single-line strings.

    The first row is treated as a header and skipped. Line breaks inside a
    field are replaced with spaces so that every entry is one line of text.

    Args:
        filename: Path to a UTF-8 encoded, comma-separated file.
        column: Zero-based index of the column to extract. Defaults to 7,
            the column this notebook reads from arxiv.csv.

    Returns:
        A list with one string per non-blank data row, in file order.

    Raises:
        IndexError: If a non-blank data row has no such column.
    """
    corpus = []

    # newline='' is what the csv module asks for: it lets the reader itself,
    # not the text layer, interpret line endings embedded in quoted fields.
    with open(filename, encoding='utf-8', newline='') as r_file:
        file_reader = csv.reader(r_file, delimiter=",")
        next(file_reader, None)  # skip the header row; harmless on an empty file
        for row in file_reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one);
                # indexing such a row would raise IndexError.
                continue
            line = row[column]
            # With newline translation off, a field may contain any of the
            # three line-ending styles; collapse each one to a single space.
            line = line.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
            corpus.append(line)
    return corpus


def text_to_seq(text_sample):  # making dicts with chars and indexes (char_to_idx, idx_to_char)
    """Encode a text as an array of character indices.

    Characters are numbered by descending frequency (index 0 is the most
    common character); characters with equal counts keep the order in which
    they first appear in the text. The resulting vocabulary is printed.

    Args:
        text_sample: The text to encode.

    Returns:
        A tuple ``(sequence, char_to_idx, idx_to_char)``: a NumPy array with
        the index of every character of ``text_sample``, the char -> index
        mapping, and its inverse.
    """
    char_counts = Counter(text_sample)
    # most_common() yields (char, count) pairs, most frequent first. Iterating
    # the Counter directly yields bare one-character keys, which cannot be
    # unpacked into two names.
    sorted_chars = [char for char, _ in char_counts.most_common()]
    print(sorted_chars)
    char_to_idx = {char: index for index, char in enumerate(sorted_chars)}
    idx_to_char = dict(enumerate(sorted_chars))
    sequence = np.array([char_to_idx[char] for char in text_sample])

    return sequence, char_to_idx, idx_to_char

In [None]:
# Load column 7 of every data row of arxiv.csv (see DataLoader) ...
corpus_list = DataLoader('arxiv.csv') 
# ... and glue the entries into one long training string, separated by spaces.
train_arxiv = ' '.join(corpus_list)

In [None]:
# Encode the whole corpus as character indices and keep both lookup tables;
# text_to_seq also prints the vocabulary list (the cell output).
sequence, char_to_idx, idx_to_char = text_to_seq(train_arxiv)

[' ', 'e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'l', 'c', 'h', 'd', 'm', 'p', 'u', 'f', 'g', 'y', 'b', 'w', 'v', '.', ',', '-', 'k', 'x', 'T', '$', ')', '(', 'I', 'q', 'S', 'W', 'z', 'A', 'M', 'C', 'P', 'N', 'D', 'F', '\\', 'R', 'O', 'E', 'j', 'L', 'B', '1', 'G', 'H', '2', '0', '}', '{', "'", 'U', 'V', '3', '/', ':', '"', '_', '^', '5', '4', 'K', '6', 'Q', ';', '9', '%', '8', '7', '+', '=', 'J', 'X', '[', ']', '|', 'Z', 'Y', '?', '~', '<', '>', '`', '*', '&', '!', '#', '@', '\x7f']


### Генерация батчей текста для обучения

In [None]:
SEQ_LEN = 256     # length of each sampled window, in characters
BATCH_SIZE = 16   # number of windows per batch


def get_batch(sequence):
    """Sample a random batch of next-character prediction pairs.

    ``BATCH_SIZE`` windows of ``SEQ_LEN`` consecutive items are drawn at
    random positions of ``sequence``. For each window the input is every item
    but the last and the target is the same window shifted by one.

    Args:
        sequence: 1-D array of integer character indices with at least
            ``SEQ_LEN`` items.

    Returns:
        A tuple ``(trains, targets)`` of int64 tensors, each of shape
        ``(BATCH_SIZE, SEQ_LEN - 1, 1)``.

    Raises:
        ValueError: If ``sequence`` is shorter than ``SEQ_LEN``.
    """
    last_start = len(sequence) - SEQ_LEN
    if last_start < 0:
        raise ValueError(
            f'sequence has {len(sequence)} items, need at least SEQ_LEN={SEQ_LEN}'
        )

    trains = []
    targets = []
    for _ in range(BATCH_SIZE):
        # randint's upper bound is exclusive; +1 makes the final window
        # reachable and lets a sequence of exactly SEQ_LEN items work.
        batch_start = np.random.randint(0, last_start + 1)
        # as_tensor with an explicit dtype does not depend on the platform's
        # default NumPy integer width.
        chunk = torch.as_tensor(
            sequence[batch_start: batch_start + SEQ_LEN], dtype=torch.long
        )
        trains.append(chunk[:-1].view(-1, 1))
        targets.append(chunk[1:].view(-1, 1))
    return torch.stack(trains, dim=0), torch.stack(targets, dim=0)

### Функция, генерирующая текст
Удобно использовать при обучении.

In [None]:
def evaluate(model, char_to_idx, idx_to_char, start_text=' ', prediction_len=200, temp=0.3):
    """Sample text from the model one character at a time.

    The hidden state is first warmed up on ``start_text``; then
    ``prediction_len`` characters are drawn from the temperature-scaled
    softmax of the model output, each sampled character becoming the next
    input. The function does not switch the model between train and eval
    mode; callers do that.

    Args:
        model: Callable as ``model(x, hidden) -> (logits, hidden)`` and
            providing ``init_hidden()`` and ``parameters()``.
        char_to_idx: Mapping from character to vocabulary index.
        idx_to_char: Inverse mapping from index to character.
        start_text: Non-empty prompt; every character must be in
            ``char_to_idx``.
        prediction_len: Number of characters to generate.
        temp: Softmax temperature; lower values make sampling greedier.

    Returns:
        ``start_text`` followed by the generated characters.

    Raises:
        ValueError: If ``start_text`` is empty.
        KeyError: If ``start_text`` has a character missing from ``char_to_idx``.
    """
    if not start_text:
        raise ValueError('start_text must contain at least one character')

    # Run on the device the model's weights actually live on, so a model that
    # was rebuilt or moved still gets inputs on the matching device.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    hidden = model.init_hidden()
    if isinstance(hidden, torch.Tensor):
        hidden = hidden.to(run_device)
    else:
        hidden = tuple(state.to(run_device) for state in hidden)

    idx_input = [char_to_idx[char] for char in start_text]
    prompt = torch.LongTensor(idx_input).view(-1, 1, 1).to(run_device)
    pieces = [start_text]

    # Sampling needs no gradients; without no_grad an autograd graph would
    # keep growing through every generation step.
    with torch.no_grad():
        # Warm up on everything except the last prompt character: that one is
        # the first input of the sampling loop and must not be consumed twice.
        if prompt.size(0) > 1:
            _, hidden = model(prompt[:-1], hidden)
        inp = prompt[-1].view(-1, 1, 1)

        for _ in range(prediction_len):
            output, hidden = model(inp, hidden)
            output_logits = output.cpu().view(-1)
            p_next = F.softmax(output_logits / temp, dim=-1).numpy()
            top_index = int(np.random.choice(len(char_to_idx), p=p_next))
            inp = torch.LongTensor([top_index]).view(-1, 1, 1).to(run_device)
            pieces.append(idx_to_char[top_index])

    return ''.join(pieces)

### Класс модели

In [None]:
class RNN(nn.Module):
    """Character-level language model: embedding -> LSTM -> dropout -> linear.

    The LSTM is built with PyTorch's default sequence-first layout, so inputs
    are expected as ``(seq_len, batch, 1)`` index tensors, which is how the
    rest of this notebook calls the model.
    """

    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1):
        """Build the layers.

        Args:
            input_size: Vocabulary size; also the size of the output logits.
            hidden_size: Width of each LSTM layer.
            embedding_size: Width of the character embedding.
            n_layers: Number of stacked LSTM layers.
        """
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.hidden_size, self.input_size)

    def forward(self, x, hidden):
        """Return per-step logits and the updated LSTM state.

        Args:
            x: Integer tensor of character indices whose third axis (if of
                size 1) is squeezed away after embedding.
            hidden: ``(h, c)`` LSTM state, e.g. from ``init_hidden``.

        Returns:
            ``(logits, (h, c))`` where the last axis of ``logits`` has
            ``input_size`` entries.
        """
        # The embedding adds a trailing feature axis; drop the singleton axis
        # left over from the input so the LSTM receives a 3-D tensor.
        x = self.encoder(x).squeeze(2)
        out, (ht1, ct1) = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc(out)
        return x, (ht1, ct1)

    def init_hidden(self, batch_size=1):
        """Return a zero ``(h, c)`` state for ``batch_size`` sequences.

        The state is created on the same device and with the same dtype as
        the model's own weights, so it stays usable after ``model.to(...)``
        and does not depend on any module-level ``device`` variable. The
        tensors do not require gradients: nothing optimizes the initial state.
        """
        reference = self.fc.weight
        shape = (self.n_layers, batch_size, self.hidden_size)
        return (torch.zeros(shape, device=reference.device, dtype=reference.dtype),
                torch.zeros(shape, device=reference.device, dtype=reference.dtype))

### Обучение

In [None]:
# Training script: builds the model, then repeatedly fits it on random batches
# and prints a text sample every 50 steps.

# Use the GPU when one is available; this global is also read by evaluate()
# and RNN.init_hidden().
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = RNN(input_size=len(idx_to_char), hidden_size=128, embedding_size=128, n_layers=2)

# Optional warm start from a checkpoint (TRAIN_MODEL_FILE_PATH is not defined
# in the visible cells):
#model.load_state_dict(torch.load(TRAIN_MODEL_FILE_PATH)) 
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
# Halve the learning rate when the reported mean loss stops improving.
# NOTE(review): recent PyTorch releases deprecate the `verbose` argument —
# confirm against the installed version.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    patience=5, 
    verbose=True, 
    factor=0.5
)

# Despite the name, one "epoch" here is a single optimization step on one
# randomly sampled batch, not a pass over the whole corpus.
n_epochs = 50000
loss_avg = []  # losses collected since the last report

for epoch in range(n_epochs):
    model.train()  # re-enable dropout (evaluation below switches it off)
    train, target = get_batch(sequence)
    # get_batch returns (batch, seq, 1); the model is called sequence-first.
    train = train.permute(1, 0, 2).to(device)
    target = target.permute(1, 0, 2).to(device)
    # Fresh zero state for every batch: windows are sampled independently.
    hidden = model.init_hidden(BATCH_SIZE)

    output, hidden = model(train, hidden)
    # Rearrange to what CrossEntropyLoss expects for sequences:
    # logits (batch, classes, seq) against targets (batch, seq).
    loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()  # clear gradients so the next step starts from zero
    
    loss_avg.append(loss.item())
    # Every 50 steps: report the mean loss, feed it to the scheduler, and
    # print a sample generated with the default prompt and temperature.
    if len(loss_avg) >= 50:
        mean_loss = np.mean(loss_avg)
        print(f'Loss: {mean_loss}')
        scheduler.step(mean_loss)
        loss_avg = []
        model.eval()  # disable dropout while sampling
        predicted_text = evaluate(model, char_to_idx, idx_to_char)
        print(predicted_text)

Loss: 2.9256852436065675
 peret tes are the tir al the the the the the the or the or the to ter rerer an ror the te fo the an the the the the an te or prerin the the te an the renan the the the the tion the an ter ter the the 
Loss: 2.2634836196899415
 and the a the and the and of and modes the the resent and the the the dering the the conting and the decation the in and the the of the proal the dethe conding the the the the a the the the and the th
Loss: 1.9999976205825805
 a proposed the and complations and the and problems and a a the and a stection and and a dearned for complered the are constration and strate the present and bation and the the both the and distrate a
Loss: 1.8299010109901428
 the and the strances and and in the propose the interformal the the proposed and the a state the between and the experical the prove the strate to the strodum the stration the and be intermation to th
Loss: 1.720155074596405
 secess and sets for a computation and and the proposed to the demon

KeyboardInterrupt: ignored

### Сохранение модели

In [None]:
# Persist only the learned weights, then rebuild the network with the same
# hyperparameters and restore the weights into it (a round-trip check).
torch.save(model.state_dict(), 'model_trained_arxiv.pt')
# The fresh model must live on `device` like the trained one did: evaluate()
# sends its inputs there, and CPU weights with CUDA inputs would fail.
model = RNN(input_size=len(idx_to_char), hidden_size=128, embedding_size=128, n_layers=2).to(device)
# map_location makes the checkpoint loadable even when it was saved from a
# device that is not available in the current session.
model.load_state_dict(torch.load('model_trained_arxiv.pt', map_location=device))

### Генерирование текста на обученной модели

In [None]:
# Switch off dropout, then sample 1000 characters that continue the prompt
# '. ' at temperature 0.5 (the training-time samples used the default 0.3).
model.eval()

print(evaluate(
    model, 
    char_to_idx, 
    idx_to_char, 
    temp=0.5, 
    prediction_len=1000, 
    start_text='. '
    )
)

.  Understanding for decomposition of a challenge of the second reconstruction of the minimum results detect the decision and a probability of some settings for the handles the distribution for the neural networks that expected on the parameters and design of the supervision and presently frequency of the interference of strongly strategy is a semantic case of the control links to many design system to find the proposed functions of the generating variables is the participant generation and the achieved data is achieved for the state-of-the-art of the experimental learning and linear set of our model of the state-of-the-art manipulation of the context level at the proposed algorithm (artificial network for the structure. The state development of the control problem of a controller complexity and $\epsilon}$ is even which the approaches and a simple models of the proposed approach to the case of the optimization of the layer of work system for a systems for the problem of deep learning 