In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
data_path = "/content/drive/MyDrive/Week 4/data"
dataset_files = [
    "police_db.json",
    "slot_descriptions.json",
    "ontology.json",
    "attraction_db.json",
    "hospital_db.json",
    "hotel_db.json",
    "restaurant_db.json",
    "taxi_db.json",
    "train_db.json"
]


structured_data = {}

def load_data(file_name):
    file_path = os.path.join(data_path, file_name)
    try:
        with open(file_path, 'r', encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"file not found: {file_path}")
        return None
    except json.JSONDecodeError as e:
        print(f"error decoding JSON from file {file_path}: {e}")
        return None
    except Exception as e:
        print(f" error occurred while processing file {file_path}: {e}")
        return None


def extract_keys(data):
    keys = set()
    if isinstance(data, dict):
        for key, value in data.items():
            keys.add(key)
            keys.update(extract_keys(value))
    elif isinstance(data, list):
        for item in data:
            keys.update(extract_keys(item))
    return keys

for file_name in dataset_files:
    data = load_data(file_name)

    if data is not None:
        keys = extract_keys(data)
        print(f"Keys in {file_name}:")
        print(keys)
        print("-" * 50)

        domain = file_name.replace("_db.json", "").replace(".json", "")

        if "_db.json" in file_name:
            structured_data[domain] = data

output_file = os.path.join(data_path, "processed_database.json")
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(structured_data, file, indent=4, ensure_ascii=False)

print(f" processed database data saved to '{output_file}'.")

Keys in police_db.json:
{'address', 'phone', 'id', 'name'}
--------------------------------------------------
Keys in slot_descriptions.json:
{'restaurant-area', 'attraction-area', 'train-day', 'restaurant-book people', 'taxi-leaveat', 'bus-day', 'hotel-book people', 'hospital-department', 'restaurant-name', 'train-departure', 'hotel-internet', 'bus-arriveBy', 'hotel-stars', 'bus-departure', 'train-book people', 'taxi-arriveby', 'attraction-type', 'train-arriveby', 'train-leaveat', 'attraction-name', 'hotel-parking', 'hotel-name', 'taxi-departure', 'hotel-pricerange', 'bus-people', 'hotel-book stay', 'restaurant-food', 'train-destination', 'restaurant-book day', 'restaurant-pricerange', 'hotel-book day', 'bus-destination', 'hotel-type', 'taxi-destination', 'bus-leaveAt', 'hotel-area', 'restaurant-book time'}
--------------------------------------------------
Keys in ontology.json:
{'train-semi-arriveBy', 'hotel-semi-parking', 'attraction-semi-area', 'train-book-people', 'restaurant-sem

In [3]:
import os
import json
data_path = "/content/drive/MyDrive/Week 4/data"
slot_file = os.path.join(data_path, "slot_descriptions.json")

with open(slot_file, "r", encoding="utf-8") as file:
    slot_data = json.load(file)

slot_dict = {}

for slot in slot_data.keys():
    domain = slot.split("-")[0]
    if domain not in slot_dict:
        slot_dict[domain] = []
    slot_dict[domain].append(slot)

output_slot_file = os.path.join(data_path, "processed_slots.json")
with open(output_slot_file, "w", encoding="utf-8") as file:
    json.dump(slot_dict, file, indent=4, ensure_ascii=False)

print(f"conversational slots processed and saved to '{output_slot_file}'.")

conversational slots processed and saved to '/content/drive/MyDrive/Week 4/data/processed_slots.json'.


In [4]:
import os
import json
data_path = "/content/drive/MyDrive/Week 4/data"
ontology_file = os.path.join(data_path, "ontology.json")

with open(ontology_file, "r", encoding="utf-8") as file:
    ontology_data = json.load(file)

ontology_dict = {}

for slot, values in ontology_data.items():
    domain = slot.split("-")[0]
    if domain not in ontology_dict:
        ontology_dict[domain] = {}

    ontology_dict[domain][slot] = values
output_ontology_file = os.path.join(data_path, "processed_ontology.json")
with open(output_ontology_file, "w", encoding="utf-8") as file:
    json.dump(ontology_dict, file, indent=4, ensure_ascii=False)

print(f"ontology values processed and saved to '{output_ontology_file}'.")

ontology values processed and saved to '/content/drive/MyDrive/Week 4/data/processed_ontology.json'.


In [5]:
import os
import json
import random
data_path = "/content/drive/MyDrive/Week 4/data"

with open(os.path.join(data_path, "processed_database.json"), "r", encoding="utf-8") as file:
    database = json.load(file)

with open(os.path.join(data_path, "processed_slots.json"), "r", encoding="utf-8") as file:
    slots = json.load(file)

with open(os.path.join(data_path, "processed_ontology.json"), "r", encoding="utf-8") as file:
    ontology = json.load(file)

training_data = []

for domain, entries in database.items():
    if domain in slots:
        for entry in entries:
            for slot in slots[domain]:
                slot_key = slot.replace(domain + "-", "")

                if slot_key in entry:
                    question = f"What is the {slot_key} of the {domain}?"
                    answer = str(entry[slot_key])

                    training_data.append({"input": question, "output": answer})

output_file = os.path.join(data_path, "chatbot_training_data.json")
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(training_data, file, indent=4, ensure_ascii=False)

print(f"chatbot training data prepared and saved to '{output_file}'!")

chatbot training data prepared and saved to '/content/drive/MyDrive/Week 4/data/chatbot_training_data.json'!


In [6]:
import os
import json
import sentencepiece as spm
import numpy as np

data_path = "/content/drive/MyDrive/Week 4/data"
tokenizer_model_prefix = os.path.join(data_path, "tokenizer")
training_data_file = os.path.join(data_path, "chatbot_training_data.json")

with open(training_data_file, "r", encoding="utf-8") as file:
    training_data = json.load(file)

text_corpus = [item["input"] for item in training_data] + [item["output"] for item in training_data]

corpus_file = os.path.join(data_path, "corpus.txt")
with open(corpus_file, "w", encoding="utf-8") as file:
    file.write("\n".join(text_corpus))

spm.SentencePieceTrainer.train(
    input=corpus_file,
    model_prefix=tokenizer_model_prefix,
    vocab_size=500
)
sp = spm.SentencePieceProcessor(model_file=f"{tokenizer_model_prefix}.model")

vocab = {sp.id_to_piece(i): i for i in range(sp.get_piece_size())}

vocab_file = os.path.join(data_path, "vocab.json")
with open(vocab_file, "w", encoding="utf-8") as file:
    json.dump(vocab, file, indent=4)

print(f"vocabulary saved: {vocab_file}")
tokenized_data = [
    {
        "input_ids": sp.encode(item["input"], out_type=int),
        "output_ids": sp.encode(item["output"], out_type=int)
    }
    for item in training_data
]

tokenized_data_file = os.path.join(data_path, "tokenized_training_data.json")
with open(tokenized_data_file, "w", encoding="utf-8") as file:
    json.dump(tokenized_data, file, indent=4)

print(f"tokenized data saved: {tokenized_data_file}")
print(tokenized_data[:5])


vocabulary saved: /content/drive/MyDrive/Week 4/data/vocab.json
tokenized data saved: /content/drive/MyDrive/Week 4/data/tokenized_training_data.json
[{'input_ids': [3, 6, 12, 9, 3, 8, 4, 11, 50, 83, 3, 7, 4, 3, 14, 63, 68, 5], 'output_ids': [3, 19, 113, 170, 53, 84]}, {'input_ids': [3, 6, 12, 9, 3, 8, 4, 70, 3, 7, 4, 3, 14, 63, 68, 5], 'output_ids': [3, 157, 9]}, {'input_ids': [3, 6, 12, 9, 3, 8, 4, 3, 71, 3, 7, 4, 3, 14, 63, 68, 5], 'output_ids': [145, 151, 151, 94, 50, 3, 84, 3, 128, 3, 465, 46, 108, 185, 9, 112]}, {'input_ids': [3, 6, 12, 9, 3, 8, 4, 11, 50, 83, 3, 7, 4, 3, 14, 63, 68, 5], 'output_ids': [184]}, {'input_ids': [3, 6, 12, 9, 3, 8, 4, 70, 3, 7, 4, 3, 14, 63, 68, 5], 'output_ids': [95]}]


In [24]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import json
import numpy as np

MAX_SEQUENCE_LENGTH = 128
BATCH_SIZE = 32
BUFFER_SIZE = 10000
VOCAB_SIZE = 5000
D_MODEL = 512
DROPOUT_RATE = 0.1
DFF = 2048
NUM_HEADS = 8
NUM_LAYERS = 6

class TokenEmbedding(layers.Layer):
    def __init__(self, vocab_size, d_model, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.emb = layers.Embedding(input_dim=vocab_size, output_dim=d_model)

    def call(self, x):
        return self.emb(x)

    def get_config(self):
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "d_model": self.d_model
        })
        return config

class LSHSelfAttention(layers.Layer):
    def __init__(self, d_model, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.wq = layers.Dense(d_model // 2)
        self.wk = layers.Dense(d_model // 2)
        self.wv = layers.Dense(d_model // 2)
        self.dense = layers.Dense(d_model // 2)

    def call(self, x):
        q, k, v = tf.split(x, num_or_size_splits=3, axis=-1)
        q, k, v = self.wq(q), self.wk(k), self.wv(v)
        attention_weights = tf.nn.softmax(tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(tf.shape(k)[-1], tf.float32)), axis=-1)
        return self.dense(tf.matmul(attention_weights, v))

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model,
            "num_heads": self.num_heads
        })
        return config

class FeedForward(layers.Layer):
    def __init__(self, d_model, dff, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.dff = dff
        self.dense1 = layers.Dense(dff, activation='gelu')
        self.dense2 = layers.Dense(d_model // 2)

    def call(self, x):
        return self.dense2(self.dense1(x))

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model,
            "dff": self.dff
        })
        return config

class ReversibleResidualLayer(layers.Layer):
    def __init__(self, d_model, num_heads, dff, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.f = LSHSelfAttention(d_model, num_heads)
        self.g = FeedForward(d_model, dff)

    def call(self, inputs):
        x1, x2 = tf.split(inputs, num_or_size_splits=2, axis=-1)
        y1 = x1 + self.f(tf.concat([x2, x2, x2], axis=-1))
        y2 = x2 + self.g(y1)
        return tf.concat([y1, y2], axis=-1)

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "dff": self.dff
        })
        return config

class ReformerBlock(layers.Layer):
    def __init__(self, d_model, num_heads, dff, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.residual = ReversibleResidualLayer(d_model, num_heads, dff)

    def call(self, x):
        return self.residual(x)

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "dff": self.dff
        })
        return config

class Reformer(keras.Model):
    def __init__(self, vocab_size, d_model, num_heads, dff, num_layers, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.num_layers = num_layers

        self.embedding = TokenEmbedding(vocab_size, d_model)
        self.reformer_blocks = [ReformerBlock(d_model, num_heads, dff)
                              for _ in range(num_layers)]
        self.final_layer = layers.Dense(vocab_size)

    def call(self, x):
        x = self.embedding(x)
        for layer in self.reformer_blocks:
            x = layer(x)
        return self.final_layer(x)

    def get_config(self):
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "dff": self.dff,
            "num_layers": self.num_layers
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

data_path = "/content/drive/MyDrive/Week 4/data/tokenized_training_data.json"
with open(data_path, "r", encoding="utf-8") as file:
    tokenized_data = json.load(file)

input_sequences = [item['input_ids'] for item in tokenized_data]
target_sequences = [item['output_ids'] for item in tokenized_data]

input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    input_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    target_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

reformer = Reformer(VOCAB_SIZE, D_MODEL, NUM_HEADS, DFF, NUM_LAYERS)
reformer.compile(
    optimizer=keras.optimizers.Adam(learning_rate=3e-4),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

history = reformer.fit(
    dataset,
    epochs=10,
    callbacks=[
        keras.callbacks.ModelCheckpoint(
            'reformer_checkpoint_{epoch}.keras',
            save_best_only=True,
            monitor='loss'
        ),
        keras.callbacks.EarlyStopping(
            monitor='loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)


reformer.save('reformer_final_model.keras')

custom_objects = {
    "Reformer": Reformer,
    "ReformerBlock": ReformerBlock,
    "ReversibleResidualLayer": ReversibleResidualLayer,
    "LSHSelfAttention": LSHSelfAttention,
    "FeedForward": FeedForward,
    "TokenEmbedding": TokenEmbedding
}


try:
    loaded_model = keras.models.load_model('reformer_final_model.keras',
                                         custom_objects=custom_objects)
    print("model successfully saved and loaded!")
except Exception as e:
    print(f"rrror loading model: {e}")

Epoch 1/10


Exception ignored in: <function ConcreteFunctionGarbageCollector.__del__ at 0x7f13960b19e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/polymorphic_function/concrete_function.py", line 1749, in __del__
    def __del__(self):
  
KeyboardInterrupt: 


[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 116ms/step - accuracy: 0.9553 - loss: 0.6376
Epoch 2/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 102ms/step - accuracy: 0.9758 - loss: 0.1190
Epoch 3/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 102ms/step - accuracy: 0.9766 - loss: 0.1065
Epoch 4/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 103ms/step - accuracy: 0.9782 - loss: 0.0898
Epoch 5/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 102ms/step - accuracy: 0.9778 - loss: 0.0890
Epoch 6/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 102ms/step - accuracy: 0.9779 - loss: 0.0873
Epoch 7/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 102ms/step - accuracy: 0.9779 - loss: 0.0866
Epoch 8/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 102ms/step - accuracy: 0.9781 - loss: 0.0853
Epoch 9/10
[1m296/296[0m

In [8]:
import os
print("Model saved at:", os.path.abspath("reformer_final_model.h5"))


Model saved at: /content/reformer_final_model.h5


In [9]:
import os
print(os.listdir("/content"))


['.config', 'reformer_checkpoint_7.h5', 'reformer_checkpoint_10.h5', 'reformer_checkpoint_2.h5', 'reformer_checkpoint_9.h5', 'reformer_checkpoint_6.h5', 'reformer_checkpoint_3.h5', 'reformer_checkpoint_4.h5', 'reformer_final_model.h5', 'reformer_checkpoint_1.h5', 'drive', 'reformer_checkpoint_5.h5', 'reformer_checkpoint_8.h5', 'sample_data']


In [10]:
reformer.save('reformer_final_model.keras')


In [32]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import sentencepiece as spm
import json
import os
from typing import Dict, List, Optional, Tuple

class DatabaseManager:
    def __init__(self, data_path: str):
        with open(os.path.join(data_path, "processed_slots.json"), "r", encoding="utf-8") as f:
            self.slots = json.load(f)
        with open(os.path.join(data_path, "processed_ontology.json"), "r", encoding="utf-8") as f:
            self.ontology = json.load(f)
        with open(os.path.join(data_path, "processed_database.json"), "r", encoding="utf-8") as f:
            self.database = json.load(f)

    def extract_slots(self, text: str) -> Dict[str, str]:
        slots_found = {}
        text = text.lower()

        categories = ['hotel', 'restaurant', 'train', 'attraction', 'taxi', 'bus', 'hospital']
        category = next((cat for cat in categories if cat in text), None)

        if category:
            for slot in self.slots.get(category, []):
                slot_values = self.ontology.get(category, {}).get(f"{category}-semi-{slot.split('-')[-1]}", [])
                for value in slot_values:
                    if value.lower() in text:
                        slots_found[slot] = value

        return slots_found

    def query_database(self, category: str, slots: Dict[str, str]) -> List[Dict]:
        results = []
        entries = self.database.get(category, [])

        for entry in entries:
            matches = True
            for slot, value in slots.items():
                if slot.split('-')[-1] in entry:
                    if entry[slot.split('-')[-1]].lower() != value.lower():
                        matches = False
                        break
            if matches:
                results.append(entry)

        return results

class EnhancedChatbot:
    def __init__(self, model_path: str, tokenizer_path: str, data_path: str):
        self.max_sequence_length = 128

        self.model = keras.models.load_model(model_path, custom_objects={
            "Reformer": Reformer,
            "ReformerBlock": ReformerBlock,
            "ReversibleResidualLayer": ReversibleResidualLayer,
            "LSHSelfAttention": LSHSelfAttention,
            "FeedForward": FeedForward,
            "TokenEmbedding": TokenEmbedding
        })

        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.load(tokenizer_path)
        self.db_manager = DatabaseManager(data_path)

    def preprocess_input(self, text: str) -> np.ndarray:
        """Preprocess input text for the model."""
        tokens = self.tokenizer.encode_as_ids(text)
        if len(tokens) > self.max_sequence_length:
            tokens = tokens[:self.max_sequence_length]
        padded_tokens = np.zeros((1, self.max_sequence_length), dtype=np.int32)
        padded_tokens[0, :len(tokens)] = tokens
        return padded_tokens

    def postprocess_output(self, predictions: np.ndarray) -> str:
        predicted_ids = np.argmax(predictions[0], axis=-1)
        if self.tokenizer.eos_id() in predicted_ids:
            predicted_ids = predicted_ids[:np.where(predicted_ids == self.tokenizer.eos_id())[0][0]]
        return self.tokenizer.decode_ids(predicted_ids.tolist())

    def format_database_response(self, category: str, results: List[Dict]) -> str:
        if not results:
            return f"I couldn't find any {category} matching your criteria."

        response = f"I found {len(results)} matching {category}(s):\n"
        for i, result in enumerate(results, 1):
            response += f"\n{i}. "
            if 'name' in result:
                response += f"{result['name']}"
            if 'type' in result:
                response += f" ({result['type']})"
            if 'area' in result:
                response += f" in {result['area']}"
            if 'pricerange' in result:
                response += f" - {result['pricerange']} price range"
            if 'phone' in result:
                response += f"\n   phone: {result['phone']}"
            if 'address' in result:
                response += f"\n   address: {result['address']}"

        return response

    def handle_query(self, user_input: str) -> str:
        slots = self.db_manager.extract_slots(user_input)

        if slots:
            category = next((cat for cat in self.db_manager.slots.keys()
                           if any(slot.startswith(cat) for slot in slots)), None)
            if category:
                results = self.db_manager.query_database(category, slots)
                if results:
                    return self.format_database_response(category, results)

        try:
            preprocessed_input = self.preprocess_input(user_input)
            print(f"preprocessed input: {preprocessed_input}")
            with tf.device('/CPU:0'):
                prediction = self.model.predict(preprocessed_input, verbose=0)
            print(f"model prediction: {prediction}")
            return self.postprocess_output(prediction)
        except Exception as e:
            return f"I apologize, but Im facing an error: {str(e)}"

    def chat(self):
        print("welcome! Type 'exit' to end the chat.")

        while True:
            try:
                user_input = input("You: ").strip()

                if user_input.lower() == 'exit':
                    print("Goodbye! have a nice day !")
                    break

                if not user_input:
                    print("please type something!")
                    continue

                response = self.handle_query(user_input)
                print(f"assistant: {response}")

            except KeyboardInterrupt:
                print("\nChat ended by user.")
                break
            except Exception as e:
                print(f"error occurred: {str(e)}")
                print("try again!")

if __name__ == "__main__":
    MODEL_PATH = 'reformer_final_model.keras'
    TOKENIZER_PATH = "/content/drive/MyDrive/Week 4/data/tokenizer.model"
    DATA_PATH = "/content/drive/MyDrive/Week 4/data"

    chatbot = EnhancedChatbot(MODEL_PATH, TOKENIZER_PATH, DATA_PATH)
    chatbot.chat()

welcome! Type 'exit' to end the chat.
You: i need cheap restaurant 
assistant: I found 22 matching restaurant(s):

1. pizza hut city centre (restaurant) in centre - cheap price range
   phone: 01223323737
   address: Regent Street City Centre
2. the missing sock (restaurant) in east - cheap price range
   phone: 01223812660
   address: Finders Corner Newmarket Road
3. charlie chan (restaurant) in centre - cheap price range
   phone: 01223361763
   address: Regent Street City Centre
4. ask restaurant (restaurant) in centre - cheap price range
   phone: 01223364917
   address: 12 Bridge Street City Centre
5. kohinoor (restaurant) in centre - cheap price range
   phone: 01223323639
   address: 74 Mill Road City Centre
6. rice house (restaurant) in centre - cheap price range
   phone: 01223367755
   address: 88 Mill Road City Centre
7. thanh binh (restaurant) in west - cheap price range
   phone: 01223362456
   address: 17 Magdalene Street City Centre
8. da vinci pizzeria (restaurant) in n