In [None]:
import keras
from keras.api.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate
from keras.api.models import Model
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.api.preprocessing.sequence import pad_sequences

print(keras.__version__)
print(Tokenizer)


3.8.0
<class 'keras.src.legacy.preprocessing.text.Tokenizer'>


In [None]:
import keras
from keras.api.layers import Input, LSTM, Dense, Embedding, Attention,  AdditiveAttention, Concatenate, Masking, Reshape
from keras.api.models import Model
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.api.preprocessing.sequence import pad_sequences
import numpy as np
import os
import zipfile
import urllib

# 1. Download and prepare Cornell Movie Dialog dataset
def download_and_prepare_data():
    # Download dataset
    url = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
    zip_path = "cornell_movie_dialogs_corpus.zip"
    extract_dir = "cornell movie-dialogs corpus"

    if not os.path.exists(extract_dir):
        urllib.request.urlretrieve(url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")
        os.remove(zip_path)

    # Read conversations
    lines = {}
    with open(os.path.join(extract_dir, "movie_lines.txt"), encoding='iso-8859-1') as f:
        for line in f:
            parts = line.split(" +++$+++ ")
            lines[parts[0]] = parts[-1].strip()

    # Get question-answer pairs
    conversations = []
    with open(os.path.join(extract_dir, "movie_conversations.txt"), encoding='iso-8859-1') as f:
        for line in f:
            parts = line.split(" +++$+++ ")[-1][1:-2].replace("'", "").split(", ")
            for i in range(len(parts)-1):
                conversations.append((lines[parts[i]], lines[parts[i+1]]))

    questions = [pair[0] for pair in conversations]
    answers = [pair[1] for pair in conversations]

    return questions[:5000], answers[:5000]  # Use subset for faster training

# Load data
questions, answers = download_and_prepare_data()

# Add special tokens
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
answers = [f"{SOS_TOKEN} {a} {EOS_TOKEN}" for a in answers]

#Tokenization
tokenizer =  Tokenizer(num_words=5000, oov_token="<OOV>", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(questions + answers)

#convert text to sequences
question_sequences = tokenizer.texts_to_sequences(questions)
answer_sequences = tokenizer.texts_to_sequences(answers)

# Pad after splitting
max_len = 20
decoder_input_sequences = [seq[:-1] for seq in answer_sequences]
decoder_target_sequences = [seq[1:] for seq in answer_sequences]

X = pad_sequences(question_sequences, maxlen=max_len, padding='post')
y_output = pad_sequences(decoder_target_sequences, maxlen=max_len, padding='post')

vocab_size = len(tokenizer.word_index) + 1

#Encoder
encoder_inputs = Input(shape=(max_len,))
enc_emb = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=True)(encoder_inputs)
masked_emb = Masking(mask_value=0.0)(enc_emb)
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(masked_emb)

#Decoder with Attention
decoder_inputs = Input(shape=(max_len,))
dec_emb = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=True)(decoder_inputs)
masked_dec = Masking(mask_value=0.0)(dec_emb)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(masked_dec, initial_state=[state_h, state_c])
'''
# Reshape decoder_outputs
decoder_outputs = Reshape((-1, decoder_outputs.shape[-1]))(decoder_outputs)
# Reshape encoder_outputs
encoder_outputs = Reshape((-1, encoder_outputs.shape[-1]))(encoder_outputs)
'''
#Print the shape
print(encoder_outputs.shape)
print(decoder_outputs.shape)

#Attention layer
#attention_layer = AdditiveAttention()
#attention = attention_layer([decoder_outputs, encoder_outputs])
attention_layer = Attention(use_scale=True)
attention = attention_layer([decoder_outputs, encoder_outputs])
'''
# Reshape attention output back
attention = Reshape((-1, attention.shape[-1]))(attention)
'''
# Concatenate attention with the original decoder outputs
decoder_concat = Concatenate()([decoder_outputs, attention])

#Output layer
output = Dense(vocab_size, activation='softmax')(decoder_concat)

#Define Model
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs = output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

#y_output = np.expand_dims(y_output, -1)  # for sparse_categorical_crossentropy
y_output = np.expand_dims(y_output, -1)
decoder_input_data = pad_sequences(decoder_input_sequences, maxlen=max_len, padding='post')
print(X.shape)
print(decoder_input_data.shape)
print(y_output.shape)
#Training
model.fit(
    [X, decoder_input_data],     #Teacher forcing : decoder input is shifted left (None, 20)
    y_output,                             # Target is next word in sequence (None, 19, 1)
    epochs = 30,
    batch_size = 64,
)

#Inference(Prediction Loop) setup

#Define encoder model
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])
encoder_inf_outputs = Input(shape=(max_len, 256))

#Define decoder model(for step by step prediction)
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_inf_inputs = Input(shape=(1,)) #for one word at a time

#reuse embedding and LSTM
dec_emb_layer = Embedding(input_dim=vocab_size, output_dim=256)
dec_emb_inf = dec_emb_layer(decoder_inf_inputs)
decoder_inf_outputs, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb_inf, initial_state=[decoder_state_input_h, decoder_state_input_c]
)
#apply attention
attention_inf = Attention(use_scale=True)([decoder_inf_outputs, encoder_inf_outputs])
decoder_inf_concat = Concatenate()([decoder_inf_outputs, attention_inf])

#Output layer
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_inf_output = decoder_dense(decoder_inf_concat)

#Final decoder inference model
decoder_model = Model(
    [decoder_inf_inputs, decoder_state_input_h, decoder_state_input_c, encoder_inf_outputs],
    [decoder_inf_output, state_h_inf, state_c_inf]
)

#Generate Responses
def generate_response(input_text):
  #Tokenize input
  input_seq = tokenizer.texts_to_sequences([input_text])
  input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

  #Encode input
  encoder_output, state_h, state_c = encoder_model.predict(input_seq)

  # Start decoding with "<sos>" token
  target_seq = np.zeros((1,1))
  target_seq[0,0] = tokenizer.word_index[SOS_TOKEN]

  #Generate response word by word
  response = []
  for _ in range(max_len):
    output, state_h, state_c = decoder_model.predict([target_seq, state_h,state_c, encoder_output])
    predicted_word_index = np.argmax(output[0,-1, :])
    if predicted_word_index == tokenizer.word_index[EOS_TOKEN]:
      break
    word = tokenizer.index_word.get(predicted_word_index, "<OOV>")
    response.append(word)

    target_seq[0,0] = predicted_word_index

  return " ".join(response)

#test

print(generate_response("Hello, How Are You?"))

In [2]:
import keras
import tensorflow as tf
from keras.api.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate, Masking, Lambda
from keras.api.models import Model
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.api.preprocessing.sequence import pad_sequences
import numpy as np
import os
import zipfile
import urllib

# Custom Attention Layer (simplified version)
class SimpleAttention(keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.units = units
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, decoder_output, encoder_output):
        # decoder_output shape: (batch_size, max_len, units) #Modified to match actual shape
        # encoder_output shape: (batch_size, max_len, units)

        #decoder_output = keras.ops.expand_dims(decoder_output, axis=1) #change-1 remove this line to avoid the extra dimension

        # Score shape: (batch_size, max_len, 1)
        score = self.V(keras.ops.tanh(
            self.W1(decoder_output) + self.W2(encoder_output)))

        # attention_weights shape: (batch_size, max_len, 1)
        attention_weights = keras.ops.softmax(score, axis=1)

        # context_vector shape: (batch_size, units)
        context_vector = attention_weights * encoder_output
        #context_vector = keras.ops.sum(context_vector, axis=1) #Removed: This cause the shape mismatch

        # Change-2 : expand dimension to be compatible with decoder_outputs
        #context_vector = keras.ops.expand_dims(context_vector, axis=1) # Remove : No need to expand dimensions anymore

        return context_vector, attention_weights

# 1. Download and prepare Cornell Movie Dialog dataset
def download_and_prepare_data():
    # Download dataset
    url = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
    zip_path = "cornell_movie_dialogs_corpus.zip"
    extract_dir = "cornell movie-dialogs corpus"

    if not os.path.exists(extract_dir):
        urllib.request.urlretrieve(url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")
        os.remove(zip_path)

    # Read conversations
    lines = {}
    with open(os.path.join(extract_dir, "movie_lines.txt"), encoding='iso-8859-1') as f:
        for line in f:
            parts = line.split(" +++$+++ ")
            lines[parts[0]] = parts[-1].strip()

    # Get question-answer pairs
    conversations = []
    with open(os.path.join(extract_dir, "movie_conversations.txt"), encoding='iso-8859-1') as f:
        for line in f:
            parts = line.split(" +++$+++ ")[-1][1:-2].replace("'", "").split(", ")
            for i in range(len(parts)-1):
                conversations.append((lines[parts[i]], lines[parts[i+1]]))

    questions = [pair[0] for pair in conversations]
    answers = [pair[1] for pair in conversations]

    # Filter out short Q/A pairs (e.g., < 3 words)
    filtered_qa = [
        (q, a) for q, a in zip(questions, answers)
        if len(q.split()) >= 3 and len(a.split()) >= 3
    ]

    # Limit to top 20000 for faster training
    filtered_qa = filtered_qa[:20000]

    questions = [q for q, a in filtered_qa]
    answers = [a for q, a in filtered_qa]

    return questions, answers

# Load data
questions, answers = download_and_prepare_data()

# Add special tokens
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"
tokenizer = Tokenizer(
    num_words=5000,
    oov_token=UNK_TOKEN,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
)
# Manually add special tokens
#tokenizer.word_index = {SOS_TOKEN: 1, EOS_TOKEN: 2, UNK_TOKEN: 3}
#tokenizer.index_word = {1: SOS_TOKEN, 2: EOS_TOKEN, 3: UNK_TOKEN}
tokenizer.fit_on_texts(questions + answers)
tokenizer.word_index[SOS_TOKEN] = len(tokenizer.word_index) + 1
tokenizer.word_index[EOS_TOKEN] = len(tokenizer.word_index) + 1
tokenizer.index_word[tokenizer.word_index[SOS_TOKEN]] = SOS_TOKEN
tokenizer.index_word[tokenizer.word_index[EOS_TOKEN]] = EOS_TOKEN
#Debug tokenizer
print(tokenizer.word_index.get(SOS_TOKEN))
print(tokenizer.word_index.get(EOS_TOKEN))

# Update vocabulary size to include special tokens
vocab_size = len(tokenizer.word_index) + 1

answers = [f"{SOS_TOKEN} {a} {EOS_TOKEN}" for a in answers]
#Debug answer
print(answers[:5])
'''
# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
# Add special tokens to tokenizer.word_index before fitting
tokenizer.word_index[SOS_TOKEN] = len(tokenizer.word_index) + 1
tokenizer.word_index[EOS_TOKEN] = len(tokenizer.word_index) + 1
'''
# Convert text to sequences
question_sequences = tokenizer.texts_to_sequences(questions)
answer_sequences = tokenizer.texts_to_sequences(answers)

# Pad sequences
max_len = 20
X = pad_sequences(question_sequences, maxlen=max_len, padding='post')
decoder_input_data = pad_sequences(answer_sequences, maxlen=max_len, padding='post')

# Shift targets by one word
y_output = np.zeros_like(decoder_input_data)
y_output[:, :-1] = decoder_input_data[:, 1:]
y_output = np.expand_dims(y_output, -1)  # Add extra dimension for sparse_categorical_crossentropy

vocab_size = len(tokenizer.word_index) + 1

# Encoder
encoder_inputs = Input(shape=(max_len,))
enc_emb = Embedding(input_dim=vocab_size, output_dim=256)(encoder_inputs)
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(input_dim=vocab_size, output_dim=256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Custom Attention
attention_layer = SimpleAttention(256)
context_vector, _ = attention_layer(decoder_outputs, encoder_outputs)
#decoder_concat = Concatenate()([decoder_outputs, keras.ops.expand_dims(context_vector, axis=1)])
# Change-3: Remove this line. decoder_concat = Concatenate()([decoder_outputs, keras.ops.expand_dims(context_vector, axis=1)])
# Change-4: Added this line for concatenation
decoder_concat = Concatenate()([decoder_outputs, context_vector]) # Concatenate context_vector with decoder_outputs
'''
# Simplified Attention implementation
attention = Attention(use_scale=True)([decoder_outputs, encoder_outputs])
decoder_concat = Concatenate()([decoder_outputs, attention])
'''
# Output layer
output = Dense(vocab_size, activation='softmax')(decoder_concat)

# Define Model
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Training
model.fit(
    [X, decoder_input_data],
    y_output,
    epochs=10,  # Reduced for testing
    batch_size=64,
)

# Inference Model Setup
# Encoder inference model
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Decoder inference model
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_inf_inputs = Input(shape=(1,))
dec_emb_inf = Embedding(vocab_size, 256)(decoder_inf_inputs)
decoder_inf_outputs, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb_inf, initial_state=[decoder_state_input_h, decoder_state_input_c]
)

# Attention in inference
encoder_inf_outputs = Input(shape=(max_len, 256))
attention_inf = SimpleAttention(256)
context_vector_inf, _ = attention_inf(decoder_inf_outputs, encoder_inf_outputs)
context_vector_inf = Lambda(lambda x: tf.expand_dims(x, axis=1))(context_vector_inf)
#Change: Concatenate context_vector_inf directly with decoder_inf_outputs in inference model
context_vector_inf = Lambda(lambda x: tf.reduce_sum(x, axis=2))(context_vector_inf) # sum across timesteps to get (batch_size, 1, units)
decoder_inf_concat = Concatenate()([decoder_inf_outputs, context_vector_inf])
decoder_inf_output = Dense(vocab_size, activation='softmax')(decoder_inf_concat)
'''
# Attention in inference #using Attention()
encoder_inf_outputs = Input(shape=(max_len, 256))
attention_inf = Attention(use_scale=True)([decoder_inf_outputs, encoder_inf_outputs])
decoder_inf_concat = Concatenate()([decoder_inf_outputs, attention_inf])
decoder_inf_output = Dense(vocab_size, activation='softmax')(decoder_inf_concat)
'''
decoder_model = Model(
    [decoder_inf_inputs, decoder_state_input_h, decoder_state_input_c, encoder_inf_outputs],
    [decoder_inf_output, state_h_inf, state_c_inf]
)

# Generate Responses
def generate_response(input_text):
    # Tokenize input
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

    # Encode input
    enc_output, state_h, state_c = encoder_model.predict(input_seq)

    # Start decoding
    target_seq = np.zeros((1, 1))
    #sos_token_index = tokenizer.word_index.get(SOS_TOKEN, 1)
    #target_seq[0, 0] = sos_token_index
    target_seq[0, 0] = tokenizer.word_index[SOS_TOKEN]

    response = []
    for _ in range(max_len):
        output, state_h, state_c = decoder_model.predict(
            [target_seq, state_h, state_c, enc_output],
            verbose=0
        )
        predicted_word_idx = np.argmax(output[0, -1, :])

        if predicted_word_idx == tokenizer.word_index[EOS_TOKEN]:
            break

        response.append(tokenizer.index_word.get(predicted_word_idx, UNK_TOKEN))
        target_seq[0, 0] = predicted_word_idx

    return " ".join(response)

# Test
print("Response 1 :")
print(generate_response("hello how are you?"))
print("Response 2 :")
print(generate_response("hello"))
model.summary()



17364
17365
["<sos> Well, I thought we'd start with pronunciation, if that's okay with you. <eos>", '<sos> Not the hacking and gagging and spitting part.  Please. <eos>', "<sos> Okay... then how 'bout we try out some French cuisine.  Saturday?  Night? <eos>", '<sos> Seems like she could get a date easy enough... <eos>', "<sos> That's a shame. <eos>"]
Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 81ms/step - loss: 4.4063
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 80ms/step - loss: 2.8806
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - loss: 2.7533
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - loss: 2.6048
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - loss: 2.5337
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - loss: 2.4637
Epoch 7/10
[1m313/313[0m [