In [1]:
# ==========================
# STEP 1: Install & Import
# ==========================
!pip install tensorflow nltk numpy

import numpy as np
import tensorflow as tf
import re
import nltk
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')

# ==========================
# STEP 2: Load Dataset
# ==========================
# Intended dataset: the Cornell Movie-Dialogs Corpus.
# Download link: http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

# For demonstration, a tiny in-line example dataset is used below
# (replace it with question/answer pairs from the Cornell corpus).
# Toy question -> answer pairs. The two lists derived below stay aligned
# index-by-index because both follow the dict's insertion order.
conversations = {
    "hi": "hello",
    "how are you": "i am fine, thank you",
    "what is your name": "i am a chatbot",
    "bye": "goodbye",
}

questions = list(conversations)
answers = [conversations[prompt] for prompt in questions]

# ==========================
# STEP 3: Data Preprocessing
# ==========================
def clean_text(text):
    """Normalise raw text before tokenisation.

    The text is lower-cased, every run of characters other than ASCII
    letters and digits is collapsed into a single space, and spaces left at
    either end (e.g. by leading or trailing punctuation) are removed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if ``text`` contains no letters or digits.
    """
    text = text.lower()
    # Upper-case letters cannot occur after lower(), so a-z0-9 is sufficient.
    text = re.sub(r"[^a-z0-9]+", " ", text)
    # Punctuation at the edges would otherwise leave stray spaces behind.
    return text.strip()

# Normalise the questions with the shared cleaner.
clean_questions = list(map(clean_text, questions))

# Clean each answer and wrap it in start/end markers. Keras' Tokenizer
# lower-cases text and filters out '<' and '>' by default, so the markers end
# up in the vocabulary as the plain words 'start' and 'end'.
clean_answers = [f"<START> {clean_text(reply)} <END>" for reply in answers]

# One shared vocabulary for both questions and answers.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_questions + clean_answers)

# One extra slot on top of the vocabulary: id 0 is used for padding below.
VOCAB_SIZE = 1 + len(tokenizer.word_index)

# Text -> lists of integer word ids.
question_seq = tokenizer.texts_to_sequences(clean_questions)
answer_seq = tokenizer.texts_to_sequences(clean_answers)

# Right-pad everything with zeros to the length of the longest sequence.
MAX_LEN = max(len(ids) for ids in question_seq + answer_seq)
question_padded = pad_sequences(question_seq, padding='post', maxlen=MAX_LEN)
answer_padded = pad_sequences(answer_seq, padding='post', maxlen=MAX_LEN)

# ==========================
# STEP 4: Build Seq2Seq Model
# ==========================
# Width of the word vectors produced by both Embedding layers.
EMBEDDING_DIM = 256
# Number of units in the encoder and decoder LSTMs (size of the h and c states).
LATENT_DIM = 512

# Encoder: embeds the padded question ids and runs them through an LSTM.
# Only the final hidden/cell states are kept; they seed the decoder.
# NOTE(review): neither Embedding sets mask_zero, so the zero padding is
# presumably processed like an ordinary token - confirm this is intended.
encoder_inputs = Input(shape=(MAX_LEN,))
enc_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(encoder_inputs)
# `encoder_lstm` is the LSTM's output tensor (not the layer object).
encoder_lstm, state_h, state_c = LSTM(LATENT_DIM, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: the layers are kept as named objects (dec_emb_layer, decoder_lstm,
# decoder_dense) so the inference section can call the same layers again.
decoder_inputs = Input(shape=(MAX_LEN,))
dec_emb_layer = Embedding(VOCAB_SIZE, EMBEDDING_DIM)
dec_emb = dec_emb_layer(decoder_inputs)

# return_sequences=True yields one output per timestep; the decoder starts
# from the encoder's final states. Its own returned states are discarded here.
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# Softmax over the vocabulary at every timestep.
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (question ids, decoder input ids) -> per-step word distribution.
# The sparse loss takes integer word ids as targets instead of one-hot vectors.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")

# ==========================
# STEP 5: Training
# ==========================
# Teacher forcing: the target at step t is the decoder input at step t + 1.
# Shift every row one position to the left and blank the wrapped-around last
# column so it holds the padding id 0.
answer_target = np.roll(answer_padded, -1, axis=1)
answer_target[:, -1] = 0

# The targets get a trailing axis of size 1 before being passed to fit().
model.fit(
    [question_padded, answer_padded],
    answer_target[..., np.newaxis],
    epochs=300,
    batch_size=16,
    verbose=1,
)

# ==========================
# STEP 6: Inference Model
# ==========================
# Encoder half on its own: question ids -> final LSTM states [h, c].
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# At inference time the decoder receives its recurrent state through explicit
# inputs instead of directly from the encoder graph.
decoder_states_inputs = [Input(shape=(LATENT_DIM,)), Input(shape=(LATENT_DIM,))]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs

# Call the same embedding, LSTM and dense layer objects used by the training
# model, so the inference decoder shares their weights.
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, *decoder_states2 = decoder_lstm(
    dec_emb2,
    initial_state=decoder_states_inputs,
)
state_h2, state_c2 = decoder_states2
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Decoder half: (token ids, h, c) -> (word distribution, new h, new c).
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs2, *decoder_states2],
)

# ==========================
# STEP 7: Chatbot Response Function
# ==========================
# Inverse vocabulary lookup (integer id -> word) for turning predictions back into text.
reverse_word_index = dict(
    zip(tokenizer.word_index.values(), tokenizer.word_index.keys())
)

def decode_sequence(input_seq):
    """Greedily decode a reply for one encoded, padded question.

    The question is encoded once. The decoder is then run repeatedly from the
    encoder states on the reply prefix generated so far (right-padded with
    zeros to ``MAX_LEN``, the fixed input length of ``decoder_model``). At
    step ``t`` the prediction is read at timestep ``t``, so it depends only on
    the real tokens fed up to that point and never on the trailing padding.

    Decoding stops at the end marker, when the model predicts the padding id
    (which has no entry in ``reverse_word_index``), or after ``MAX_LEN``
    steps, so the loop always terminates.

    Args:
        input_seq: Array of shape ``(1, MAX_LEN)`` holding the padded word
            ids of the question.

    Returns:
        The decoded reply as a space-separated string (possibly empty).
    """
    # Encode once; list() so the states can be concatenated with the decoder
    # input below. verbose=0 silences the per-call progress bar.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # The decoder input starts with only the start marker; the rest is padding.
    target_seq = np.zeros((1, MAX_LEN))
    target_seq[0, 0] = tokenizer.word_index['start']

    decoded_words = []
    for step in range(MAX_LEN):
        # Always restart from the encoder states: the states returned here
        # have already been advanced over the padding and must not be reused.
        output_tokens, _, _ = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Prediction for the token that follows position `step`.
        sampled_token_index = int(np.argmax(output_tokens[0, step, :]))
        sampled_word = reverse_word_index.get(sampled_token_index, "")

        # An empty word means the padding id was predicted; treat it like the
        # end marker rather than looping on it.
        if not sampled_word or sampled_word == 'end':
            break
        decoded_words.append(sampled_word)

        # Extend the prefix with the chosen token for the next step.
        if step + 1 < MAX_LEN:
            target_seq[0, step + 1] = sampled_token_index

    return " ".join(decoded_words)

# ==========================
# STEP 8: Test Chatbot
# ==========================
def chatbot_response(text):
    """Return the chatbot's reply to a raw user message.

    The message is cleaned with ``clean_text``, converted to word ids by the
    fitted tokenizer, right-padded to ``MAX_LEN`` and passed to
    ``decode_sequence``.

    Args:
        text: The user's message.

    Returns:
        The decoded reply string.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(text)])
    encoder_input = pad_sequences(token_ids, padding='post', maxlen=MAX_LEN)
    return decode_sequence(encoder_input)

# Interactive loop: read a line, answer it, repeat until the user quits.
print("Chatbot is ready! Type 'quit' to stop.")
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin (Ctrl-D) or an interrupt (Ctrl-C) ends the chat
        # cleanly instead of with a traceback.
        print()
        break
    # Ignore surrounding whitespace so e.g. "quit " still exits.
    if user_input.strip().lower() == 'quit':
        break
    print("Bot:", chatbot_response(user_input))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Epoch 1/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 2.9981
Epoch 2/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step - loss: 2.8706
Epoch 3/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step - loss: 2.6742
Epoch 4/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318ms/step - loss: 2.0990
Epoch 5/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 330ms/step - loss: 2.1605
Epoch 6/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 332ms/step - loss: 2.0749
Epoch 7/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step - loss: 1.5999
Epoch 8/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 240ms/step - loss: 1.5445
Epoch 9/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step - loss: 1.4964
Epoch 10/300
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 305ms/step - loss: 1.4482
Ep