In [1]:
import kagglehub
import pandas as pd

path = kagglehub.dataset_download("niraliivaghani/chatbot-dataset")

json_file = f"{path}/intents.json"

data = pd.read_json(json_file)

print(data.head())


Downloading from https://www.kaggle.com/api/v1/datasets/download/niraliivaghani/chatbot-dataset?dataset_version_number=1...


100%|██████████| 4.57k/4.57k [00:00<00:00, 7.17MB/s]

Extracting files...
                                             intents
0  {'tag': 'greeting', 'patterns': ['Hi', 'How ar...
1  {'tag': 'goodbye', 'patterns': ['cya', 'see yo...
2  {'tag': 'creator', 'patterns': ['what is the n...
3  {'tag': 'name', 'patterns': ['name', 'your nam...
4  {'tag': 'hours', 'patterns': ['timing of colle...





In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
import nltk
import os

In [3]:
from google.colab import files
uploaded = files.upload()


Saving dialogs.txt to dialogs.txt


In [4]:
def load_data(filepath):
    questions = []
    answers = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            question, answer = line.strip().split('\t')
            questions.append(question)
            answers.append(answer)
    return questions, answers


In [5]:
def tokenize(texts):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)
    return tokenizer

In [6]:
questions, answers = load_data('dialogs.txt')  # Ganti dengan nama file yang sesuai
input_tokenizer = tokenize(questions)
output_tokenizer = tokenize(answers)

In [7]:
max_input_len = max([len(q.split()) for q in questions])
max_output_len = max([len(a.split()) for a in answers])

input_sequences = input_tokenizer.texts_to_sequences(questions)
output_sequences = output_tokenizer.texts_to_sequences(answers)

In [8]:
encoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_input_len)
decoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(output_sequences, maxlen=max_output_len)


In [9]:
decoder_output_data = np.zeros((len(answers), max_output_len, len(output_tokenizer.word_index)+1))


In [10]:
def create_model(input_vocab_size, output_vocab_size, input_timesteps, output_timesteps, latent_dim=256):
    encoder_inputs = Input(shape=(input_timesteps,))
    encoder_embedding = Embedding(input_vocab_size, latent_dim)(encoder_inputs)
    encoder_lstm = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(output_timesteps,))
    decoder_embedding = Embedding(output_vocab_size, latent_dim)(decoder_inputs)
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

    decoder_dense = Dense(output_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [11]:
model = create_model(len(input_tokenizer.word_index)+1, len(output_tokenizer.word_index)+1, max_input_len, max_output_len)
model.summary()

In [12]:
model.fit([encoder_input_data, decoder_input_data], decoder_output_data, epochs=50, batch_size=64)


Epoch 1/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 1s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 2/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 941ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 3/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 926ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 4/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 1s/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 5/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 948ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 6/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 967ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 7/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 889ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 8/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 952ms/step - accuracy: 0.00

<keras.src.callbacks.history.History at 0x793f77410e50>