<a href="https://colab.research.google.com/github/dhanu902/FoodieChat-Bot/blob/main/BOT_PREPROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK resources the preprocessing below relies on: the stopword
# list and WordNet data (used by clean_text) and the Punkt tokenizer models
# (used by word_tokenize). The last call's return value is what the cell shows.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import json
import os

# Mount Google Drive at /content/drive; base_path below points into this mount.
from google.colab import drive
drive.mount('/content/drive')

def load_data(folder_path):
  """Load every ``*.json`` dialogue file in a folder into a single dict.

  Files are visited in sorted filename order. Each file is expected to hold a
  JSON list of dialogue objects, each carrying a ``dialogue_id`` key.

  Args:
    folder_path: Directory containing the JSON files. Entries whose name does
      not end in ``.json`` are ignored.

  Returns:
    dict mapping ``dialogue_id`` to its dialogue object. If an id occurs more
    than once, the occurrence read last wins.

  Raises:
    OSError: if the folder or one of its files cannot be read.
    json.JSONDecodeError: if a file does not contain valid JSON.
    KeyError: if a dialogue has no ``dialogue_id``.
  """
  data = {}

  for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
      continue

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
      dialogues = json.load(f)

    for dialogue in dialogues:
      data[dialogue['dialogue_id']] = dialogue

  return data


# Root folder of the MultiWOZ 2.2 copy on the mounted Drive.
base_path = '/content/drive/MyDrive/ChatBot/MultiWOZ_2.2'

# Load the three splits in the order train, dev, test; the validation data
# lives in the 'dev' sub-folder.
train_data, val_data, test_data = [
    load_data(os.path.join(base_path, subdir))
    for subdir in ('train', 'dev', 'test')
]

# Report how many dialogues each split contains.
print(f"Total dialogues loaded for Train: {len(train_data)}")
print(f"Total dialogues loaded for Test: {len(test_data)}")
print(f"Total dialogues loaded for Validation: {len(val_data)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total dialogues loaded for Train: 8437
Total dialogues loaded for Test: 1000
Total dialogues loaded for Validation: 1000


## **General NLP Pre-Processing**

In [4]:
# Resources that are identical for every clean_text() call. They are built on
# first use and then reused, instead of being rebuilt for every utterance.
_CLEAN_TEXT_RESOURCES = {}


def _get_stop_words():
  """Return the English stopword set, building it on first use."""
  if 'stop_words' not in _CLEAN_TEXT_RESOURCES:
    _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
  return _CLEAN_TEXT_RESOURCES['stop_words']


def _get_lemmatizer():
  """Return a shared WordNetLemmatizer, creating it on first use."""
  if 'lemmatizer' not in _CLEAN_TEXT_RESOURCES:
    _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
  return _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text, remove_stopwords=True, do_lemmatize=True):
  """Normalise an utterance into a space-joined string of cleaned tokens.

  Steps, in order: lowercase, delete every character that is not an ASCII
  letter or whitespace, tokenize with ``nltk.word_tokenize``, optionally drop
  English stopwords, optionally lemmatize each token.

  Args:
    text: The raw utterance.
    remove_stopwords: If True, tokens found in NLTK's English stopword list
      are removed.
    do_lemmatize: If True, each remaining token is passed through
      ``WordNetLemmatizer.lemmatize``.

  Returns:
    The processed tokens joined by single spaces. This is an empty string
    when no token survives.
  """
  text = text.lower()

  # Digits and punctuation are deleted outright (not replaced by a space),
  # so e.g. "don't" becomes "dont" before stopword filtering runs.
  text = re.sub(r'[^a-zA-Z\s]', '', text)

  tokens = nltk.word_tokenize(text)

  if remove_stopwords:
    stop_words = _get_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

  if do_lemmatize:
    lemmatize = _get_lemmatizer().lemmatize
    tokens = [lemmatize(word) for word in tokens]

  return ' '.join(tokens)

## **Intent Classification PreProcessing**

In [5]:
from sklearn.preprocessing import LabelEncoder

def intents_pp(data, split, label_encoder=None):
  """Build the intent-classification table for one split and save it as CSV.

  One row is produced for every frame of a USER turn whose
  ``state.active_intent`` is present and not ``"NONE"``. A turn with several
  such frames therefore yields several rows with the same text.

  Args:
    data: dict of dialogues as returned by ``load_data``; each dialogue has a
      ``"turns"`` list whose items carry ``"speaker"``, ``"utterance"`` and
      optionally ``"frames"``.
    split: Name used in the output file name ``DF_intents_<split>.csv``.
    label_encoder: Optional, already fitted ``LabelEncoder``. When given, it
      is used with ``transform`` so that intent IDs agree with the split it
      was fitted on. When omitted, a new encoder is fitted on this split
      only (the original behaviour), so IDs are only comparable across
      splits if every split contains the same set of intents.

  Returns:
    DataFrame with columns ``original``, ``cleaned``, ``intent`` and
    ``intent_ID``. The same frame is written to ``DF_intents_<split>.csv``.

  Raises:
    ValueError: from ``label_encoder.transform`` if a supplied encoder meets
      an intent it was not fitted on.
  """

  ### ---- Capture Intent ----
  intent_data = []

  for dialogue in data.values():
    for turn in dialogue["turns"]:
      if turn["speaker"] != "USER":
        continue

      # The cleaned text depends only on the turn, so compute it at most
      # once per turn (and only if some frame actually has an intent).
      cleaned = None

      for frame in turn.get("frames", []):
        intent = frame.get("state", {}).get("active_intent", "NONE")
        if intent == "NONE":
          continue

        original = turn["utterance"]
        if cleaned is None:
          cleaned = clean_text(original)
        intent_data.append((original, cleaned, intent))

  ### ---- Convert to DataFrame ----
  DF_intents = pd.DataFrame(intent_data, columns=["original", "cleaned", "intent"])

  ### ---- Label Encode the Intent ----
  if label_encoder is None:
    DF_intents["intent_ID"] = LabelEncoder().fit_transform(DF_intents["intent"])
  else:
    DF_intents["intent_ID"] = label_encoder.transform(DF_intents["intent"])

  DF_intents.to_csv(f"DF_intents_{split}.csv", index=False)

  return DF_intents

## **NER PreProcessing / Slot Filling**

In [6]:
from nltk.tokenize import word_tokenize

### ---- BIO Tagging ----

def bio_tag_utt(utterance, slot_values):
  """Tokenize an utterance and BIO-tag every occurrence of the slot values.

  Both the utterance and each value are lowercased and tokenized with
  ``word_tokenize``. Wherever a value's token sequence occurs in the
  utterance, its first token is tagged ``B-<slot>`` and the rest
  ``I-<slot>``. Slots are processed in dict order and later matches overwrite
  tags written by earlier ones, so overlapping values are resolved in favour
  of whichever is processed last.

  Args:
    utterance: The raw utterance string.
    slot_values: dict mapping a slot name to a list of string values.

  Returns:
    ``(tokens, tags)``: two lists of equal length; tokens that match no
    value keep the tag ``"O"``.
  """

  tokens = word_tokenize(utterance.lower())
  tags = ["O"] * len(tokens)

  for slot, values in slot_values.items():
    for value in values:
      value_tokens = word_tokenize(value.lower())
      span = len(value_tokens)

      # A value that tokenizes to nothing (e.g. an empty string) would match
      # at every offset, including one past the last token, and raise
      # IndexError on the tag write. There is nothing to tag, so skip it.
      if span == 0:
        continue

      for i in range(len(tokens) - span + 1):
        if tokens[i : i + span] == value_tokens:
          tags[i] = f"B-{slot}"

          for j in range(1, span):
            tags[i+j] = f"I-{slot}"

  return tokens, tags

In [7]:
### ---- Apply to Dataset ----
def ner_pp(data, split):
  """Create BIO-tagged samples for one split and save them as JSON.

  For every frame of a USER turn that has a non-empty ``state.slot_values``,
  the turn's utterance is tagged against that frame's slot values with
  ``bio_tag_utt``. One sample is emitted per such frame.

  Args:
    data: dict of dialogues, each with a ``"turns"`` list.
    split: Name used in the output file name ``BIO_<split>.json``.

  Returns:
    List of ``{"tokens": [...], "tags": [...]}`` dicts, which is also what is
    written to ``BIO_<split>.json``.
  """
  # Lazily walk all USER turns across all dialogues, in order.
  user_turns = (
      turn
      for dialogue in data.values()
      for turn in dialogue["turns"]
      if turn["speaker"] == "USER"
  )

  bio_samples = []
  for turn in user_turns:
    for frame in turn.get("frames", []):
      slot_values = frame.get("state", {}).get("slot_values", {})
      if not slot_values:
        continue

      tokens, tags = bio_tag_utt(turn["utterance"], slot_values)
      bio_samples.append({"tokens": tokens, "tags": tags})

  with open(f"BIO_{split}.json", "w") as f:
    json.dump(bio_samples, f, indent=2)

  return bio_samples

## **DST PreProcessing**

In [8]:
def dst_pp(data, split):
  """Create dialogue-state-tracking samples for one split and save as JSON.

  Within each dialogue a running transcript of ``"<speaker>: <utterance>"``
  entries is kept (speaker lowercased, utterance stripped). At every USER
  turn the slot values of all its frames are merged into one belief state,
  taking the last value listed for each slot; a later frame overrides an
  earlier one for the same slot. A sample is emitted only when the belief
  state is non-empty.

  Args:
    data: dict of dialogues, each with a ``"turns"`` list.
    split: Name used in the output file name ``DST_<split>.json``.

  Returns:
    List of ``{"history": str, "belief_state": dict}`` dicts, which is also
    what is written to ``DST_<split>.json``.
  """
  dst_data = []

  for dialogue in data.values():
    transcript = []  # all turns of this dialogue seen so far

    for turn in dialogue["turns"]:
      speaker = turn["speaker"]
      utt = turn["utterance"]
      transcript.append(f"{speaker.lower()}: {utt.strip()}")

      if speaker != "USER":
        continue

      # Merge every frame's slots; slots with an empty value list are skipped.
      belief_state = {
          slot: values[-1]
          for frame in turn.get("frames", [])
          for slot, values in frame.get("state", {}).get("slot_values", {}).items()
          if values
      }

      if belief_state:
        dst_data.append({
            "history": " ".join(transcript),
            "belief_state": belief_state,
        })

  with open(f"DST_{split}.json", "w") as f:
    json.dump(dst_data, f, indent=2)

  return dst_data

## **Response Generation PreProcessing**

In [9]:
def response_pp(data, split):
    """Create (state -> system reply) pairs for one split and save as JSON.

    For every SYSTEM turn that is not the first turn of its dialogue, the
    turn immediately before it is read. Each of that turn's frames
    contributes ``intent=<active_intent>`` (unless it is ``"NONE"``) followed
    by ``<slot>=<last value>`` for every slot with a non-empty value list.
    The pieces are joined with spaces to form the input; the SYSTEM utterance
    is the output. Pairs with an empty input or output are dropped.

    Args:
        data: dict of dialogues, each with a ``"turns"`` list.
        split: Name used in the output file name ``response_<split>.json``.

    Returns:
        List of ``{"input": str, "output": str}`` dicts, which is also what is
        written to ``response_<split>.json``.
    """
    response_data = []

    for dialogue in data.values():
        for i, turn in enumerate(dialogue["turns"]):
            if turn["speaker"] != "SYSTEM" or i == 0:
                continue

            input_components = []
            for frame in dialogue["turns"][i-1].get("frames", []):
                state = frame.get("state", {})

                intent = state.get("active_intent", "NONE")
                if intent != "NONE":
                    input_components.append(f"intent={intent}")

                input_components.extend(
                    f"{slot}={values[-1]}"
                    for slot, values in state.get("slot_values", {}).items()
                    if values
                )

            input_text = " ".join(input_components)
            output_text = turn["utterance"]
            if input_text and output_text:
                response_data.append({"input": input_text, "output": output_text})

    with open(f"response_{split}.json", "w") as f:
        json.dump(response_data, f, indent=2)

    return response_data

## **Dataset Splitting**

In [10]:
# Splits to preprocess, keyed by the name used in the output file names.
# Only "train" is active; the "val" and "test" entries are disabled.
splits = {
    "train": train_data,
    #"val": val_data,
    #"test": test_data
}

for split_name in splits:
    data = splits[split_name]
    print(f"🔄 Preprocessing {split_name.upper()}...")

    # Each step returns its result and also writes it to a per-split file.
    df_intent = intents_pp(data, split_name)
    bio_data = ner_pp(data, split_name)
    dst_data = dst_pp(data, split_name)
    response_data = response_pp(data, split_name)

🔄 Preprocessing TRAIN...


In [12]:
from sklearn.model_selection import train_test_split

import pandas as pd
from sklearn.model_selection import train_test_split
import json

# ---------- SPLIT HELPER ----------
def _split_and_save_json(samples, prefix, seed=42):
    """Shuffle-split ``samples`` 70/15/15 and write ``<prefix>_{train,val,test}.json``.

    Args:
        samples: List of JSON-serialisable samples.
        prefix: File-name prefix of the three output files.
        seed: ``random_state`` passed to both ``train_test_split`` calls.

    Returns:
        ``(train, val, test)`` lists.
    """
    train_part, held_out = train_test_split(samples, test_size=0.3, random_state=seed)
    val_part, test_part = train_test_split(held_out, test_size=0.5, random_state=seed)

    for part_name, part in (("train", train_part), ("val", val_part), ("test", test_part)):
        with open(f"{prefix}_{part_name}.json", "w") as f:
            json.dump(part, f, indent=2)

    return train_part, val_part, test_part


# NOTE(review): every task below is split per sample (i.e. per turn), so turns
# of one dialogue can end up in different partitions — confirm this is intended.

# ---------- INTENT SPLIT ----------
df = pd.read_csv("DF_intents_train.csv")  # file written by intents_pp()

X = df["cleaned"]
y = df["intent_ID"]

# Stratify on the label so each partition keeps the intent distribution.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

pd.DataFrame({"text": X_train, "label": y_train}).to_csv("intent_train.csv", index=False)
pd.DataFrame({"text": X_val, "label": y_val}).to_csv("intent_val.csv", index=False)
pd.DataFrame({"text": X_test, "label": y_test}).to_csv("intent_test.csv", index=False)

# ---------- BIO SPLIT ----------
# The unsplit file ("BIO_...") and the train partition ("bio_...") differ only
# in letter case, so they are distinct files only on a case-sensitive filesystem.
with open("BIO_train.json", "r") as f:
    bio_samples = json.load(f)

bio_train, bio_val, bio_test = _split_and_save_json(bio_samples, "bio")

# ---------- DST SPLIT ----------
with open("DST_train.json", "r") as f:
    dst_samples = json.load(f)

dst_train, dst_val, dst_test = _split_and_save_json(dst_samples, "dst")

# ---------- RESPONSE SPLIT ----------
# response_pp() writes the unsplit data to "response_train.json", which is the
# very file the train partition is saved to below. Reading that file back here
# would pick up the already-shrunken 70% partition on any re-run of this cell,
# so the unsplit samples are rebuilt from train_data instead.
response_samples = response_pp(train_data, "train")

res_train, res_val, res_test = _split_and_save_json(response_samples, "response")

## **DEBUG**

In [13]:
# Reload the three intent partitions from disk to check what was written.
df_train, df_val, df_test = [
    pd.read_csv(csv_name)
    for csv_name in ("intent_train.csv", "intent_val.csv", "intent_test.csv")
]

for caption, frame in (
    ("Train set shape:", df_train),
    ("Validation set shape:", df_val),
    ("Test set shape:", df_test),
):
    print(caption, frame.shape)

print("\nSample rows: ")
print(df_train.head())

# Class balance of the training partition.
print("\nLabel value counts: ")
print(df_train["label"].value_counts())

Train set shape: (38603, 2)
Validation set shape: (8272, 2)
Test set shape: (8273, 2)

Sample rows: 
                                                text  label
0                       look hotel east free parking      6
1        would perfect get reservation monday people      1
2                     get thhe phone number postcode      5
3         yes phone number lan hong house restaurant      8
4  thanks also looking restaurant serf lebanese f...      8

Label value counts: 
label
8     7541
6     7378
10    6716
3     5693
1     3086
0     3082
9     2586
2     1686
5      495
7      334
4        6
Name: count, dtype: int64


In [14]:
# Reload the BIO training partition and show its size and first sample.
with open("bio_train.json", "r") as f:
    bio_data = json.load(f)

print("Total BIO samples:", len(bio_data))
print("Sample:", bio_data[0], sep="\n")

# Validate format: every sample needs exactly one tag per token.
mismatched = [entry for entry in bio_data if len(entry["tokens"]) != len(entry["tags"])]
assert not mismatched, "Mismatch between tokens and tags"

Total BIO samples: 58107
Sample:
{'tokens': ['i', "'m", 'also', 'looking', 'for', 'a', 'cinema', 'to', 'visit', '?'], 'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'B-attraction-type', 'O', 'O', 'O']}


In [15]:
# Reload the DST training partition and show its size and first sample.
with open("dst_train.json", "r") as f:
    dst_data = json.load(f)

print("Total DST samples:", len(dst_data))
print("Sample:", dst_data[0], sep="\n")

# Basic checks on the first sample's field types.
first_sample = dst_data[0]
assert isinstance(first_sample["history"], str)
assert isinstance(first_sample["belief_state"], dict)

Total DST samples: 38249
Sample:
{'history': 'user: Find a budget hotel with free parking in Cambridge. system: There are 10 hotels that meet your needs. Would you like to narrow your search by area? user: I would like it to have a four star rating and be located on the west side. system: The Cambridge Belfry fits your requirements, would you like to book a reservation there? user: Please book it for 5 people and 5 nights starting from wednesday.', 'belief_state': {'hotel-area': 'west', 'hotel-bookday': 'wednesday', 'hotel-bookpeople': '5', 'hotel-bookstay': '5', 'hotel-name': 'cambridge belfry', 'hotel-parking': 'yes', 'hotel-pricerange': 'cheap', 'hotel-stars': '4'}}


In [16]:
# Reload the response training partition and show its size and first pair.
with open("response_train.json", "r") as f:
    res_data = json.load(f)

print("Total response pairs:", len(res_data))
print("Sample:", res_data[0], sep="\n")

# The first pair must carry both fields.
assert {"input", "output"} <= res_data[0].keys()

Total response pairs: 39404
Sample:
{'input': 'intent=book_hotel hotel-bookday=sunday hotel-bookpeople=5 hotel-bookstay=3 hotel-name=ashley hotel attraction-name=cherry hinton hall and grounds attraction-type=entertainment', 'output': 'I was able to book it, reference number 6BIQ6UWS'}


## **Save**

In [17]:
import shutil

# 📂 Define your target folder path
preprocessed_folder = '/content/drive/MyDrive/ChatBot/Preprocessed'

# ✅ Create the folder if it doesn’t exist
os.makedirs(preprocessed_folder, exist_ok=True)

# The partition files already exist in the working directory in their final
# form, so copy them as-is instead of parsing and re-serialising each one.
# Order: intent CSVs, then the BIO, DST and response-generation JSON files.
for task, extension in (("intent", "csv"), ("bio", "json"), ("dst", "json"), ("response", "json")):
    for part in ("train", "val", "test"):
        filename = f"{task}_{part}.{extension}"
        shutil.copyfile(filename, os.path.join(preprocessed_folder, filename))

print("✅ All preprocessed files saved to:", preprocessed_folder)

✅ All preprocessed files saved to: /content/drive/MyDrive/ChatBot/Preprocessed
