In [1]:
from datetime import datetime

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_memory_growth(gpus[0], True)
    print('GPU ok')
else:
    print('GPU not ok, CPU')

GPU ok


In [2]:
import pandas as pd

df = pd.read_csv("dialogues.tsv", sep="\t")

In [3]:
import re


def clean_text_simple(text):
    if not text:
        return []

    text = text.lower().replace("<br />", " ")
    text = re.sub(r"<span class=participant_\d+>", "", text)
    msgs = text.split("</span> ")

    replics = []

    for message in msgs:
        user_id = None
        if 'пользователь 1: ' in message:
            message = message.replace('пользователь 1: ', '')
            user_id = 1
        elif 'пользователь 2: ' in message:
            message = message.replace('пользователь 2: ', '')
            user_id = 2

        if user_id is not None:
            message = re.sub(r"(?<=[a-zA-Zа-яА-ЯёЁ])[^a-zA-Zа-яА-ЯёЁ ](?=[a-zA-Zа-яА-ЯёЁ])", " ", message)
            message = re.sub(r"[^a-zA-Zа-яА-ЯёЁ ]+", " ", message)
            message = re.sub(r"\s+", " ", message)
            message = message.strip()
            if message: replics.append([user_id, message])

    return replics


def merge_consecutive_replicas(replics):
    if not replics:
        return []

    merged = []
    current_user, current_msg = replics[0]

    for user_id, msg in replics[1:]:
        if user_id == current_user:
            current_msg += " " + msg
        else:
            merged.append(current_msg.strip())
            current_user, current_msg = user_id, msg

    merged.append(current_msg.strip())
    return merged

In [4]:
pairs = []

min_repl_len = 2
max_repl_len = 40

for d in df["dialogue"]:
    reps = clean_text_simple(d)
    reps = merge_consecutive_replicas(reps)
    for i in range(len(reps) - 1):
        q = reps[i].strip()
        a = reps[i + 1].strip()
        if min_repl_len <= len(q.split()) <= max_repl_len and min_repl_len <= len(a.split()) <= max_repl_len:
            pairs.append((q, a))


In [5]:
len(pairs)

130884

In [6]:
from collections import Counter
import numpy as np
from itertools import chain

all_reps = list(chain.from_iterable(pairs))

lengths = [len(r.split()) for r in all_reps]

print("Всего реплик:", len(all_reps))
print("Средняя длина:", np.mean(lengths))
print("Медиана длины:", np.median(lengths))
print("Максимальная длина:", np.max(lengths))
print("Минимальная длина:", np.min(lengths))

hist = Counter(lengths)
print("\nТоп частот длин:")
for length, freq in hist.most_common(20):
    print(length, freq)

all_tokens = list(chain.from_iterable(r.split() for r in all_reps))
vocab = Counter(all_tokens)

print("\nРазмер словаря:", len(vocab))
print("Топ-20 самых частых слов:")
for tok, freq in vocab.most_common(20):
    print(tok, freq)

print("\nПримеры редких слов (встречаются 1 раз):")
for tok, freq in list(vocab.items())[:20]:
    if freq == 1:
        print(tok)


Всего реплик: 261768
Средняя длина: 9.505810488676996
Медиана длины: 7.0
Максимальная длина: 40
Минимальная длина: 2

Топ частот длин:
3 25912
4 25760
5 22893
6 20376
2 20175
7 17539
8 15461
9 13869
10 12447
11 10818
12 9781
13 8335
14 7484
15 6539
16 5781
17 4913
18 4296
19 3817
20 3237
21 2864

Размер словаря: 55695
Топ-20 самых частых слов:
я 109454
а 87588
в 57041
у 54465
и 48962
не 42013
люблю 41881
ты 41844
на 35452
меня 34204
как 28188
есть 26994
с 25803
очень 23765
это 23075
тебя 22111
что 22064
да 21910
но 18919
чем 18287

Примеры редких слов (встречаются 1 раз):


In [7]:
from collections import Counter
import numpy as np

q_lengths = [len(q.split()) for q, a in pairs]
a_lengths = [len(a.split()) for q, a in pairs]

all_lengths = q_lengths + a_lengths

print("Всего пар:", len(pairs))
print("Средняя длина вопроса:", np.mean(q_lengths))
print("Средняя длина ответа:", np.mean(a_lengths))
print("Минимальная длина:", np.min(all_lengths))
print("Максимальная длина:", np.max(all_lengths))
print("Медиана длины:", np.median(all_lengths))

# распределение длин
hist = Counter(all_lengths)
print("\nТоп частот длин:")
for length, freq in hist.most_common(20):
    print(length, freq)

short_pairs = sum(1 for q, a in pairs if len(q.split()) <= min_repl_len or len(a.split()) <= min_repl_len)
long_pairs = sum(1 for q, a in pairs if len(q.split()) > max_repl_len or len(a.split()) > max_repl_len)

print("\nКоротких пар:", short_pairs)
print("Длинных пар:", long_pairs)
print("Доля коротких:", short_pairs / len(pairs))
print("Доля длинных:", long_pairs / len(pairs))


Всего пар: 130884
Средняя длина вопроса: 9.342578160814156
Средняя длина ответа: 9.669042816539838
Минимальная длина: 2
Максимальная длина: 40
Медиана длины: 7.0

Топ частот длин:
3 25912
4 25760
5 22893
6 20376
2 20175
7 17539
8 15461
9 13869
10 12447
11 10818
12 9781
13 8335
14 7484
15 6539
16 5781
17 4913
18 4296
19 3817
20 3237
21 2864

Коротких пар: 18862
Длинных пар: 0
Доля коротких: 0.14411234375477522
Доля длинных: 0.0


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

questions, answers = zip(*pairs)

min_num_words = 8000

tokenizer = Tokenizer(num_words=min_num_words, oov_token="<OOV>")
tokenizer.fit_on_texts(questions + answers)

word_index = tokenizer.word_index
vocab_size = min(min_num_words, len(word_index)) + 1

questions_seq = tokenizer.texts_to_sequences(questions)
answers_seq = tokenizer.texts_to_sequences(answers)

max_len = max(max(len(seq) for seq in questions_seq),
              max(len(seq) for seq in answers_seq))

questions_pad = pad_sequences(questions_seq, maxlen=max_len, padding='post')
answers_pad = pad_sequences(answers_seq, maxlen=max_len, padding='post')

decoder_input = answers_pad[:, :-1]
decoder_target = answers_pad[:, 1:]

print("Размер словаря:", vocab_size)
print("Максимальная длина последовательности:", max_len)
print("Форма входных данных:", questions_pad.shape, decoder_input.shape, decoder_target.shape)


Размер словаря: 8001
Максимальная длина последовательности: 40
Форма входных данных: (130884, 40) (130884, 39) (130884, 39)


In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, AdditiveAttention, Concatenate
from tensorflow.keras.optimizers import Adam

embedding_dim = 128
latent_dim = 128

encoder_inputs = Input(shape=(max_len,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True, name='encoder_embedding')(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_len - 1,), name='decoder_inputs')
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True, name='decoder_embedding')(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

attention = AdditiveAttention(name='attention_layer')
context_vector = attention([decoder_outputs, encoder_outputs])
decoder_combined = Concatenate(axis=-1)([decoder_outputs, context_vector])

decoder_dense = Dense(vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_combined)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs, name='seq2seq_attention')
model.compile(optimizer=Adam(learning_rate=0.0005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "seq2seq_attention"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 40)]         0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, 39)]         0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, 40, 128)      1024128     ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_embedding (Embedding)  (None, 39, 128)      1024128     ['decoder_inputs[0][0]']         
                                                                                  

In [10]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True
)

history = model.fit(
    [questions_pad, decoder_input],
    decoder_target,
    batch_size=128,
    epochs=20,
    validation_split=0.1,
    callbacks=[early_stop]
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


In [11]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate

encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

decoder_state_input_h = Input(shape=(latent_dim,), name='decoder_state_input_h')
decoder_state_input_c = Input(shape=(latent_dim,), name='decoder_state_input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,), name='decoder_inputs_single')
decoder_embedding_layer = model.get_layer('decoder_embedding')
decoder_embedded = decoder_embedding_layer(decoder_inputs_single)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_embedded, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

attention_layer = model.get_layer('attention_layer')
context_vector = attention_layer([decoder_outputs2, encoder_outputs])
decoder_combined = Concatenate(axis=-1)([decoder_outputs2, context_vector])

decoder_outputs2 = decoder_dense(decoder_combined)

decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs + [encoder_outputs],
    [decoder_outputs2] + decoder_states2
)


In [12]:
import heapq
import numpy as np

def beam_search_decode(input_seq, beam_width=3, max_response_len=7):
    encoder_outs, state_h, state_c = encoder_model.predict(input_seq)
    states_value = [state_h, state_c]
    start_token = tokenizer.word_index.get('я', 1)

    sequences = [(0.0, [start_token], states_value)]

    for _ in range(max_response_len):
        all_candidates = []
        for score, seq, states in sequences:
            target_seq = np.array([[seq[-1]]])
            output_tokens, h, c = decoder_model.predict([target_seq] + states + [encoder_outs])

            log_probs = np.log(output_tokens[0, -1, :] + 1e-8)
            top_indices = np.argsort(log_probs)[-beam_width:]

            for i in top_indices:
                candidate = (score + log_probs[i], seq + [i], [h, c])
                all_candidates.append(candidate)

        sequences = heapq.nlargest(beam_width, all_candidates, key=lambda tup: tup[0])

    best_seq = sequences[0][1]
    decoded_sentence = []
    for token_idx in best_seq[1:]:
        word = tokenizer.index_word.get(token_idx, '')
        if word == '' or word == '<OOV>':
            continue
        if decoded_sentence and word == decoded_sentence[-1]:
            continue
        decoded_sentence.append(word)

    return ' '.join(decoded_sentence)


In [13]:
import ipywidgets as widgets
from IPython.display import display

user_input = widgets.Text(
    value='',
    placeholder='Напиши сообщение...',
    description='Ты:',
    layout=widgets.Layout(width='80%')
)

chat_output = widgets.Output(layout=widgets.Layout(width='80%', border='1px solid black', padding='5px'))

send_button = widgets.Button(description="Отправить")


def on_send_clicked(b):
    msg = user_input.value.strip()
    if not msg:
        return

    with chat_output:
        print(f"Ты: {msg}")

    seq = pad_sequences(tokenizer.texts_to_sequences([msg]), maxlen=max_len, padding='post')
    response = beam_search_decode(seq)

    with chat_output:
        print(f"Бот: {response}\n")

    user_input.value = ''


send_button.on_click(on_send_clicked)

display(chat_output, user_input, send_button)


Output(layout=Layout(border_bottom='1px solid black', border_left='1px solid black', border_right='1px solid b…

Text(value='', description='Ты:', layout=Layout(width='80%'), placeholder='Напиши сообщение...')

Button(description='Отправить', style=ButtonStyle())

In [14]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
encoder_model.save(f'checkpoints/{timestamp}_20epch/encoder_model.h5')
decoder_model.save(f'checkpoints/{timestamp}_20epch/decoder_model.h5')


