In [19]:
# Install the required packages
# %pip install numpy pandas tensorflow pyyaml gensim
import keras
import numpy as np # linear algebra
import pandas as pd
import tensorflow as tf
import pickle
from tensorflow.keras import layers, activations, models, preprocessing
from tensorflow.keras import utils
import os
import yaml
from gensim.models import Word2Vec
import re

In [20]:
dir_path = '../datasets_yml/'
files_list = os.listdir(dir_path + os.sep)

In [21]:
questions = list()
answers = list()

for filepath in files_list:
    stream = open( dir_path + os.sep + filepath , 'rb')
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len( con ) > 2 :
            questions.append(con[0])
            replies = con[ 1 : ]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append( ans )
        elif len( con )> 1:
            questions.append(con[0])
            answers.append(con[1])

answers_with_tags = list()
for i in range( len( answers ) ):
    if type( answers[i] ) == str:
        answers_with_tags.append( answers[i] )
    else:
        questions.pop( i )

answers = list()
for i in range( len( answers_with_tags ) ) :
    answers.append( '<START> ' + answers_with_tags[i] + ' <END>' )

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( questions + answers )
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))

VOCAB SIZE : 1894


In [22]:
questions

['What is AI?',
 'What is AI?',
 'Are you sentient?',
 'Are you sentient?',
 'Are you sentient?',
 'Are you sapient?',
 'Are you sapient?',
 'Are you sapient?',
 'Are you sapient?',
 'What language are you written in?',
 'What language are you written in?',
 'You sound like Data',
 'You sound like Data',
 'You are an artificial linguistic entity',
 'You are an artificial linguistic entity',
 'You are not immortal',
 'You are not immortal',
 'You are not immortal',
 'You are not making sense',
 'You are not making sense',
 'You are not making sense',
 'You are not making sense',
 'You are not making sense',
 'You are immortal',
 'You are immortal',
 'You are immortal',
 'You do not make any sense',
 'You can not clone',
 'You can not clone',
 'You can not move',
 'You can not move',
 'Bend over',
 'Bend over',
 'Robots laugh',
 'Robots should die',
 'Robots',
 'Robots are stupid',
 'Robots are not allowed to lie',
 'Robots are not allowed to lie',
 'Robots are not allowed to lie',
 'Rob

In [23]:
answers

['<START> Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think. <END>',
 '<START> AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind. <END>',
 '<START> Sort of. <END>',
 "<START> By the strictest dictionary definition of the word 'sentience', I may be. <END>",
 "<START> Even though I'm a construct I do have a subjective experience of the universe, as simplistic as it may be. <END>",
 "<START> In all probability, I am not.  I'm not that sophisticated. <END>",
 '<START> Do you think I am? <END>',
 '<START> How would you feel about me if I told you I was? <END>',
 '<START> No. <END>',
 '<START> Python. <END>',
 '<START> I am written in Python. <END>',
 "<START> Yes I am inspired by commander Data's artificial personality. <END>",
 '<START> The character of Lt. Commander Data was written to come across as being software-like, so it is natural that there 

In [24]:
vocab = []
for word in tokenizer.word_index:
  vocab.append(word)

def tokenize(sentences):
  tokens_list = []
  vocabulary = []
  for sentence in sentences:
    sentence = sentence.lower()
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = sentence.split()
    vocabulary += tokens
    tokens_list.append(tokens)
  return tokens_list, vocabulary

In [25]:
tokenized_questions = tokenizer.texts_to_sequences( questions )
maxlen_questions = max( [len(x) for x in tokenized_questions ] )
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions, maxlen = maxlen_questions, padding = 'post')
encoder_input_data = np.array(padded_questions)
print(encoder_input_data.shape, maxlen_questions)

(564, 22) 22


In [26]:
tokenized_answers = tokenizer.texts_to_sequences( answers )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
decoder_input_data = np.array( padded_answers )
print( decoder_input_data.shape , maxlen_answers )

(564, 74) 74


In [27]:
tokenized_answers = tokenizer.texts_to_sequences( answers )
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
onehot_answers = utils.to_categorical( padded_answers , VOCAB_SIZE )
decoder_output_data = np.array( onehot_answers )
print( decoder_output_data.shape )

(564, 74, 1894)


In [28]:
encoder_inputs = tf.keras.layers.Input(shape=( maxlen_questions , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( maxlen_answers ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()

In [30]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=50 ) 
model.save( 'model.h5' )

Epoch 1/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 154ms/step - loss: 5.7351
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 152ms/step - loss: 5.6926
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 163ms/step - loss: 5.6499
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 154ms/step - loss: 5.5790
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 154ms/step - loss: 5.5276
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 157ms/step - loss: 5.4477
Epoch 7/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 158ms/step - loss: 5.4074
Epoch 8/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 159ms/step - loss: 5.4163
Epoch 9/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 156ms/step - loss: 5.3578
Epoch 10/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 157ms/step - lo



In [32]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    
    decoder_states = [state_h, state_c]

    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model


In [33]:
def str_to_tokens( sentence : str ):

    words = sentence.lower().split()
    tokens_list = list()
  
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

In [34]:
enc_model , dec_model = make_inference_models()

for _ in range(5):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
 what do you get when you cross a lot end
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━

In [39]:
def get_model_response(question, enc_model, dec_model, tokenizer, maxlen_answers):
    # Получаем состояния энкодера
    states_values = enc_model.predict(str_to_tokens(question, tokenizer))
    
    # Инициализация начального токена для декодера
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    
    stop_condition = False
    decoded_translation = ''
    
    while not stop_condition:
        # Пропускаем через декодер
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_word_index, None)
        
        # Добавляем слово к результату
        if sampled_word:
            decoded_translation += ' {}'.format(sampled_word)
        
        # Условие завершения
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
        
        # Подготовка следующего входного токена и обновление состояний
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]
    
    # Возвращаем ответ без `<start>` и `<end>`
    return decoded_translation.replace('start', '').replace('end', '').strip()


In [40]:
def str_to_tokens(sentence, tokenizer):
    tokens = tokenizer.texts_to_sequences([sentence])
    return tf.keras.preprocessing.sequence.pad_sequences(tokens, maxlen=20, padding='post')


In [45]:
# Параметры
maxlen_answers = 50  # Максимальная длина ответа

# Вопрос
input_question = "How are you?"
response = get_model_response(input_question, enc_model, dec_model, tokenizer, maxlen_answers)

print(response)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
i am a lot


In [46]:
def debug_tokenizer(sentence):
    print(f"Original sentence: {sentence}")
    tokens = tokenizer.texts_to_sequences([sentence])
    print(f"Tokenized sequence: {tokens}")
    padded_tokens = preprocessing.sequence.pad_sequences(tokens, maxlen=maxlen_questions, padding='post')
    print(f"Padded tokens: {padded_tokens}")
    return padded_tokens


In [47]:
input_sequence = debug_tokenizer("What is AI?")
input_sequence

Original sentence: What is AI?
Tokenized sequence: [[10, 7, 269]]
Padded tokens: [[ 10   7 269   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]


array([[ 10,   7, 269,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [48]:
def debug_encoder(input_sequence):
    encoder_states_debug = enc_model.predict(input_sequence)
    print(f"Encoder hidden states (h): {encoder_states_debug[0]}")
    print(f"Encoder cell states (c): {encoder_states_debug[1]}")
    return encoder_states_debug


In [49]:
encoder_states = debug_encoder(input_sequence)
encoder_states

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Encoder hidden states (h): [[ 0.13759765 -0.08878367 -0.29691693 -0.1721236  -0.3013364  -0.1574576
  -0.1313536  -0.21782836 -0.1462235  -0.15880936 -0.28980276  0.10675406
   0.1788387  -0.19541262 -0.1342054  -0.02830322 -0.17270377  0.24859446
  -0.00360117 -0.06947795 -0.04265946  0.12803285  0.08388395 -0.16822286
  -0.20926444 -0.0926588  -0.2279072   0.0802166  -0.22857833 -0.11920651
  -0.09538712 -0.17249173 -0.22898975  0.02718627  0.17691314  0.06868801
   0.06027987  0.1477258   0.24260963 -0.25850263 -0.02016528 -0.24510092
   0.13895604  0.06411073  0.15752171  0.26299945 -0.18769072  0.18669567
   0.0285204  -0.23816535 -0.15180454 -0.12684779  0.24284387 -0.18949299
  -0.11090693  0.23467967  0.04694201 -0.36391184  0.02318462  0.09582072
   0.31860757  0.15854606  0.02906001  0.23592936 -0.08868545  0.19717304
   0.10799362 -0.15104675 -0.23940083 -0.14099713  0.01181754 -0.09032498
   0.04581105 

[array([[ 0.13759765, -0.08878367, -0.29691693, -0.1721236 , -0.3013364 ,
         -0.1574576 , -0.1313536 , -0.21782836, -0.1462235 , -0.15880936,
         -0.28980276,  0.10675406,  0.1788387 , -0.19541262, -0.1342054 ,
         -0.02830322, -0.17270377,  0.24859446, -0.00360117, -0.06947795,
         -0.04265946,  0.12803285,  0.08388395, -0.16822286, -0.20926444,
         -0.0926588 , -0.2279072 ,  0.0802166 , -0.22857833, -0.11920651,
         -0.09538712, -0.17249173, -0.22898975,  0.02718627,  0.17691314,
          0.06868801,  0.06027987,  0.1477258 ,  0.24260963, -0.25850263,
         -0.02016528, -0.24510092,  0.13895604,  0.06411073,  0.15752171,
          0.26299945, -0.18769072,  0.18669567,  0.0285204 , -0.23816535,
         -0.15180454, -0.12684779,  0.24284387, -0.18949299, -0.11090693,
          0.23467967,  0.04694201, -0.36391184,  0.02318462,  0.09582072,
          0.31860757,  0.15854606,  0.02906001,  0.23592936, -0.08868545,
          0.19717304,  0.10799362, -0.

In [50]:
def debug_decoder(encoder_states):
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    step = 0

    while not stop_condition:
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + encoder_states)

        # Найти индекс слова с наибольшей вероятностью
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None

        # Отобразить индекс на слово
        for word, index in tokenizer.word_index.items():
            if sampled_word_index == index:
                sampled_word = word
                break
        
        print(f"Step {step}: Predicted word '{sampled_word}' (index: {sampled_word_index})")

        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
        else:
            decoded_translation += ' ' + sampled_word
        
        # Подготовить вход для следующего шага
        empty_target_seq[0, 0] = sampled_word_index
        encoder_states = [h, c]
        step += 1

    return decoded_translation


In [51]:
decoded_response = debug_decoder(encoder_states)
print(f"Final decoded response: {decoded_response}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Step 0: Predicted word 'what' (index: 10)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Step 1: Predicted word 'do' (index: 12)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Step 2: Predicted word 'you' (index: 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Step 3: Predicted word 'get' (index: 24)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Step 4: Predicted word 'when' (index: 26)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Step 5: Predicted word 'you' (index: 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Step 6: Predicted word 'cross' (index: 43)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Step 7: Predicted word 'a' (index: 4)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Step 8: Predicted word 'lo

In [53]:
def get_debugged_response(question):
    print("----- Tokenizer Debug -----")
    input_sequence = debug_tokenizer(question)

    print("\n----- Encoder Debug -----")
    encoder_states = debug_encoder(input_sequence)

    print("\n----- Decoder Debug -----")
    response = debug_decoder(encoder_states)

    print("\nFinal Response:")
    return response

# Пример вызова
response = get_debugged_response("What is AI?")
print(response)


----- Tokenizer Debug -----
Original sentence: What is AI?
Tokenized sequence: [[10, 7, 269]]
Padded tokens: [[ 10   7 269   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]

----- Encoder Debug -----
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Encoder hidden states (h): [[ 0.13759765 -0.08878367 -0.29691693 -0.1721236  -0.3013364  -0.1574576
  -0.1313536  -0.21782836 -0.1462235  -0.15880936 -0.28980276  0.10675406
   0.1788387  -0.19541262 -0.1342054  -0.02830322 -0.17270377  0.24859446
  -0.00360117 -0.06947795 -0.04265946  0.12803285  0.08388395 -0.16822286
  -0.20926444 -0.0926588  -0.2279072   0.0802166  -0.22857833 -0.11920651
  -0.09538712 -0.17249173 -0.22898975  0.02718627  0.17691314  0.06868801
   0.06027987  0.1477258   0.24260963 -0.25850263 -0.02016528 -0.24510092
   0.13895604  0.06411073  0.15752171  0.26299945 -0.18769072  0.18669567
   0.0285204  -0.23816535 -0.15180454 -0.12684779  0.24284387 -0.18949299
  -0.1

In [69]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Пример данных
questions = [
    "Сколько дней готовится документ?",
    "Какие услуги предоставляет нотариус?",
    "Где вы находитесь?",
]
answers = [
    "Документ готовится 3-5 рабочих дней.",
    "Мы предоставляем услуги заверения документов.",
    "Мы находимся по адресу ул. Ленина, 10.",
]

# Токенизация
tokenizer = Tokenizer(filters='', lower=True)
tokenizer.fit_on_texts(questions + answers)

# Добавляем специальные токены в словарь
start_token = tokenizer.word_index['<start>'] = len(tokenizer.word_index) + 1
end_token = tokenizer.word_index['<end>'] = len(tokenizer.word_index) + 1

# Преобразуем текст в последовательности
questions_seq = tokenizer.texts_to_sequences(questions)
answers_seq = tokenizer.texts_to_sequences(answers)

# Добавляем <start> и <end> к ответам
answers_seq = [[start_token] + seq + [end_token] for seq in answers_seq]

# Паддинг
max_len = 9  # максимальная длина последовательности
questions_seq = pad_sequences(questions_seq, maxlen=max_len, padding='post')
answers_seq = pad_sequences(answers_seq, maxlen=max_len, padding='post')

# Создание словаря токенов
vocab_size = len(tokenizer.word_index) + 1

# Создание модели seq2seq
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, AdditiveAttention, Concatenate

# Энкодер
encoder_input = Input(shape=(max_len,))
encoder_embedding = Embedding(vocab_size, 256)(encoder_input)
encoder_lstm, state_h, state_c = LSTM(256, return_state=True)(encoder_embedding)

# Декодер
decoder_input = Input(shape=(max_len,))
decoder_embedding = Embedding(vocab_size, 256)(decoder_input)
decoder_lstm, _, _ = LSTM(256, return_sequences=True, return_state=True)(decoder_embedding, initial_state=[state_h, state_c])

# Внимание
attention = AdditiveAttention()([decoder_lstm, encoder_lstm])  # Внимание между декодером и энкодером

# Конкатенация результата внимания с выходом декодера
decoder_context = Concatenate(axis=-1)([decoder_lstm, attention])

# Выходной слой
output = Dense(vocab_size, activation='softmax')(decoder_context)

# Создаём модель
model = Model([encoder_input, decoder_input], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Размерность данных
batch_size = 32
vocab_size = len(tokenizer.word_index) + 1  # размер словаря

# Подготовка данных для обучения
decoder_target = answers_seq[:, 1:]  # Шифтированные ответы (для целей обучения)
decoder_input_seq = answers_seq[:, :-1]  # Входные данные для декодера (сдвиг на 1 влево)

# Паддинг декодера
decoder_input_seq = pad_sequences(decoder_input_seq, maxlen=max_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=max_len, padding='post')

# Проверим размерности данных
print(f"Questions shape: {questions_seq.shape}")
print(f"Decoder input shape: {decoder_input_seq.shape}")
print(f"Decoder target shape: {decoder_target.shape}")

# Обучение
model.fit(
    [questions_seq, decoder_input_seq],
    decoder_target,
    batch_size=32,
    epochs=20
)


Questions shape: (3, 9)
Decoder input shape: (3, 9)
Decoder target shape: (3, 9)
Epoch 1/20




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 3.3317
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 3.2790
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 3.2219
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 3.1501
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 3.0519
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 2.9135
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 2.7310
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 2.6034
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 2.6266
Epoch 10/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 2.4710
Epoch 11/20
[1m1/1[0m [32m━━━

<keras.src.callbacks.history.History at 0x2174855bbe0>

In [72]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Пример данных
questions = [
    "Сколько дней готовится документ?",
    "Какие услуги предоставляет нотариус?",
    "Где вы находитесь?",
]
answers = [
    "Документ готовится 3-5 рабочих дней.",
    "Мы предоставляем услуги заверения документов.",
    "Мы находимся по адресу ул. Ленина, 10.",
]

# Токенизация
tokenizer = Tokenizer(filters='', lower=True)
tokenizer.fit_on_texts(questions + answers)

# Добавляем специальные токены в словарь
start_token = len(tokenizer.word_index) + 1
end_token = len(tokenizer.word_index) + 2
tokenizer.word_index['<start>'] = start_token
tokenizer.word_index['<end>'] = end_token

# Преобразуем текст в последовательности
questions_seq = tokenizer.texts_to_sequences(questions)
answers_seq = tokenizer.texts_to_sequences(answers)

# Добавляем <start> и <end> к ответам
answers_seq = [[start_token] + seq + [end_token] for seq in answers_seq]

# Паддинг
max_len = 12  # увеличиваем максимальную длину последовательности
questions_seq = pad_sequences(questions_seq, maxlen=max_len, padding='post')
answers_seq = pad_sequences(answers_seq, maxlen=max_len, padding='post')

# Создание словаря токенов
vocab_size = len(tokenizer.word_index) + 1

# Создание модели seq2seq
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, AdditiveAttention, Concatenate

# Энкодер
encoder_input = Input(shape=(max_len,))
encoder_embedding = Embedding(vocab_size, 256)(encoder_input)
encoder_lstm, state_h, state_c = LSTM(256, return_state=True)(encoder_embedding)

# Декодер
decoder_input = Input(shape=(max_len,))
decoder_embedding = Embedding(vocab_size, 256)(decoder_input)
decoder_lstm, _, _ = LSTM(256, return_sequences=True, return_state=True)(decoder_embedding, initial_state=[state_h, state_c])

# Внимание
attention = AdditiveAttention()([decoder_lstm, encoder_lstm])  # Внимание между декодером и энкодером

# Конкатенация результата внимания с выходом декодера
decoder_context = Concatenate(axis=-1)([decoder_lstm, attention])

# Выходной слой
output = Dense(vocab_size, activation='softmax')(decoder_context)

# Создаём модель
model = Model([encoder_input, decoder_input], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Размерность данных
batch_size = 32
vocab_size = len(tokenizer.word_index) + 1  # размер словаря

# Подготовка данных для обучения
decoder_target = answers_seq[:, 1:]  # Шифтированные ответы (для целей обучения)
decoder_input_seq = answers_seq[:, :-1]  # Входные данные для декодера (сдвиг на 1 влево)

# Паддинг декодера
decoder_input_seq = pad_sequences(decoder_input_seq, maxlen=max_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=max_len, padding='post')

# Проверим размерности данных
print(f"Questions shape: {questions_seq.shape}")
print(f"Decoder input shape: {decoder_input_seq.shape}")
print(f"Decoder target shape: {decoder_target.shape}")

# Обучение
model.fit(
    [questions_seq, decoder_input_seq],
    decoder_target,
    batch_size=32,
    epochs=20
)

# Функция для преобразования текста в последовательность токенов
def preprocess_text(text, tokenizer, max_len):
    text_seq = tokenizer.texts_to_sequences([text])
    text_seq = pad_sequences(text_seq, maxlen=max_len, padding='post')
    return text_seq

# Функция для преобразования последовательности токенов в текст
def decode_sequence(seq, tokenizer):
    reverse_word_index = {value: key for key, value in tokenizer.word_index.items()}
    decoded_text = ' '.join([reverse_word_index.get(i, '?') for i in seq])
    return decoded_text

# Пример нового вопроса
new_question = "Как долго нужно ждать получения документа?"

# Предобработка нового вопроса
new_question_seq = preprocess_text(new_question, tokenizer, max_len)

# Создание начальной последовательности для декодера (с токеном <start>)
decoder_input_seq = np.zeros((1, max_len))
decoder_input_seq[0, 0] = start_token

# Генерация ответа
output_seq = []
for i in range(max_len):
    predictions = model.predict([new_question_seq, decoder_input_seq])
    sampled_token_index = np.argmax(predictions[0, i, :])
    output_seq.append(sampled_token_index)
    if sampled_token_index == end_token:
        break
    decoder_input_seq[0, i + 1] = sampled_token_index

# Преобразование последовательности токенов в текст
output_text = decode_sequence(output_seq, tokenizer)

print(f"Question: {new_question}")
print(f"Answer: {output_text}")


Questions shape: (3, 12)
Decoder input shape: (3, 12)
Decoder target shape: (3, 12)
Epoch 1/20




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 3.3462
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 3.2393
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 3.1248
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 2.9743
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 2.7541
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 2.4405
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 2.2354
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 2.3478
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 2.1237
Epoch 10/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 1.9452
Epoch 11/20
[1m1/1[0m [32m━━━



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Question: Как долго нужно ждать получения документа?
Answer: мы мы мы мы <end>
