## Задание 1

In [1]:
import requests
from nltk import sent_tokenize



In [16]:
url = 'https://s3.amazonaws.com/text-datasets/nietzsche.txt'
response = requests.get(url)
text = response.text.lower()

corpus_length = len(text)
print(f"А) Длина всего корпуса: {corpus_length} символов")

sentences = sent_tokenize(text)
num_sentences = len(sentences)
print(f"Б) Количество предложений: {num_sentences}")

chars = set(text)
num_unique_characters = len(chars)
print(f"В) Сколько всего символов используется: {num_unique_characters}")

char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

А) Длина всего корпуса: 600901 символов
Б) Количество предложений: 2864
В) Сколько всего символов используется: 59


In [2]:
import numpy as np

## Задание 2

In [18]:
maxlen = 40
step = 3
sentences = []
next_chars = [] 
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


nb sequences: 200287
Vectorization...


In [3]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import LambdaCallback

## Задание 3

In [21]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Функция, вызываемая в конце каждой эпохи для генерации текста
def on_epoch_end(epoch, _):
    print()
    print('----- Генерация текста после эпохи: %d' % epoch)

    start_index = np.random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0]:
        print('----- Разнообразие:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Создание текста: "' + sentence + '"')

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

        print(generated)

# Callback для вызова функции в конце каждой эпохи
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Обучение модели
model.fit(x, y, batch_size=128, epochs=20, callbacks=[print_callback])

# Проверка работы модели в онлайн-режиме
start_index = np.random.randint(0, len(text) - maxlen - 1)
for diversity in [0.2, 0.5, 1.0]:
    generated = ''
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    print('----- Создание текста: "' + sentence + '"')

    for i in range(400):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char

    print(generated)

Epoch 1/20
----- Генерация текста после эпохи: 0
----- Разнообразие: 0.2
----- Создание текста: "far been the noblest and
remotest sentim"
far been the noblest and
remotest sentiment and of the the the the the the the the the the the the the the soun the the the the sere the the the the sere the the the the the the the the the ther and the the the the the the the the the the the mont and and the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the stich the the the the the the the the the th
----- Разнообразие: 0.5
----- Создание текста: "far been the noblest and
remotest sentim"
far been the noblest and
remotest sentime to er the rically--ace mact art and in seande of ar efalher wathe sicaldity go lere the the destorme the the mest of mand the dorede sters in the the whe hand of the the thim with hanter and it an andalle the and all thas gere ander andereline cotuliny of the hing the the
serent one the 

 of man through
a favourable accumulationed of tre of
whuthey
as ne groe
spery what  wand recarely
scal of is belienge bu wand are "histanciely ow tore wounters, ane laburedry scrugeing in the ofliofion tous,
herthing and (olt and caust, senpee.--bushing tas ow ond
in that such, as
farm"--ond insermage" tu were deiplody of choply a perune theur usenfering
this cangerents erment to there carded now. ivty dos the mayst, and in a vas he
ch
Epoch 6/20
----- Генерация текста после эпохи: 5
----- Разнообразие: 0.2
----- Создание текста: " certain custom has
been agreeable to th"
 certain custom has
been agreeable to the conself conself and the conself and of the conceral and and the conceral and be preasing and the still as a care and as it as a conceral and the constive of the conself contines of the conself the precape and the still and an the which as a self and is a still as a corsulf of the consention of the concearing the promine of the consince of the concear and as a concerion of the

d is doubtful
about it in most cases when the grest to the concention to more the inselves of the freen may deen in the exerate the sense of the lack of an into a way he the suct the such and in the compenter to under to the reflection of muther perropically and in the may reaster of conseal to the old be centered to the present in seciet of may the delied in the out of the constition are the old conseanous and constion of which sundly 
----- Разнообразие: 1.0
----- Создание текста: "d is doubtful
about it in most cases whe"
d is doubtful
about it in most cases when thoughting folunger afferdinpt of themselves tear spown
remos.

78. whom,
schrougoun;--but in modath, a will
bo compresion, "stience, the pathy in they epercans hesh shilsising and sastifice haugory fically for were)s
of this otterhilage then with masserops the lifes thren and a or. which hank in may dr for of is divent instand grow ghose acts its"ly oclequited",
ifchindle of wes all those
it a 
Epoch 11/20
----- Генерация 

Epoch 15/20
----- Генерация текста после эпохи: 14
----- Разнообразие: 0.2
----- Создание текста: "sive as their lack of shame, the easy in"
sive as their lack of shame, the easy in the the such as a strenger of the experiences and the such an all the sense of the conterman the such as a pleasion and despension and and the strenger the conscience of the contention of the such and the problem of the consequently the to a still and desire and its of the such a strunging the conscience is the such as a strenging and such as a preception to a strenger the sumple, the conscience 
----- Разнообразие: 0.5
----- Создание текста: "sive as their lack of shame, the easy in"
sive as their lack of shame, the easy in ourselves with which is contension of the conception the world and it are not not a my not of which is believes or at life wime exception of man
in the sense something, and in the and as belief has his origination of such all the promisence of the prodical
fine when a would not the outh

itarianism, one may at once assume that truth" phiehore and higher strongest,"
just
opposet, the agaits and freenthershoutude and is resint fellected that they divinis
ofthumpest vave: this poopives danger and scurcely, to the controus: it is be his general onessirity. in short, his
weald"--enouphing other under witk in opises us mele proviced upinutes sy desilfured have about a spirit, the migatimation, yolly also beind--that do for ou
Epoch 20/20
----- Генерация текста после эпохи: 19
----- Разнообразие: 0.2
----- Создание текста: "e men aware of
their inner relationship "
e men aware of
their inner relationship and in the serves of the sense of the seems and the respect of the supersion of the serves of the seems and propadition of the seems and have and in the seement of the contention of the contention of the superation of the consequently the propertion of the common to the consequently the sense of the superation of the stronger to the subject of the seems and even the
strengene

## Задание 4

In [4]:
import sqlite3
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
conn = sqlite3.connect('wikibooks.sqlite')
cursor = conn.cursor()

In [6]:
cursor.execute('SELECT body_text FROM ru')
text_data = [row[0].lower() for row in cursor.fetchall()]

In [7]:
num_samples = 5000
text = text_data[:num_samples]

In [18]:
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(text)
total_words = len(tokenizer.word_index) + 1

max_sequence_length = 50  
input_sequences = []

for line in text:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, min(max_sequence_length, len(token_list))):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

In [19]:
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
x, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.array(y)

model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(x, y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x206d36102e0>

In [23]:
# Генерация текста
seed_text = "Обработка текстов на естественных языках это "
next_words = 100

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)
    predicted = np.argmax(predicted_probs, axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

print(seed_text)

Обработка текстов на естественных языках это  по ruby регистрация домашний текста править человек в комментарии языка не определено требуется для чего нет не средств данного этапы экзамена и способы файлов не пакетов язык и овд в википедии имеется статья по теме «повторное русского примеры используется термин reuse «повторное использование» активно используется программистами в практике например сделали от мероприятий г для 2009 до 1 января 1992 года с которым использует указания была россия через существительных 15 мая 1 года не блока нужно энциклопедии для одном файлов в латинском языке en она en править структуру — — система управления реляционными name веб blender 3d игра сервис для электронной
