<a href="https://colab.research.google.com/github/dhanu902/FoodieChat-Bot/blob/main/BOT_PREPROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Download the NLTK data packages this notebook relies on.
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('punkt_tab')  # presumably the punkt data format newer NLTK releases expect — TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
import json
import os

from google.colab import drive
drive.mount('/content/drive')

# Directory on the mounted Drive that holds the training JSON files.
train_path = '/content/drive/MyDrive/ChatBot/MultiWOZ_2.2/train'

# Maps dialogue_id -> full dialogue record, merged across every JSON file.
# A dialogue_id that appears more than once keeps the record read last.
train_data = {}

# Sort the directory listing so files are read in the same order on every run.
for filename in sorted(os.listdir(train_path)):
  if not filename.endswith('.json'):
    continue
  # JSON is read explicitly as UTF-8 instead of relying on the locale default,
  # and the file is closed before its dialogues are merged.
  with open(os.path.join(train_path, filename), 'r', encoding='utf-8') as f:
    dialogues = json.load(f)
  for dialogue in dialogues:
    train_data[dialogue['dialogue_id']] = dialogue

print(f"Total dialogues loaded: {len(train_data)}")

Mounted at /content/drive
Total dialogues loaded: 8437


## **General NLP Pre-Processing**

In [5]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
  """Return the NLTK English stop-word list as a frozenset, loaded once."""
  return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
  """Return a single WordNetLemmatizer instance reused across calls."""
  return WordNetLemmatizer()


def clean_text(text, remove_stopwords=True, do_lemmatize=True):
  """Normalize an utterance into a space-joined string of cleaned tokens.

  Steps, in order: lowercase, delete every character that is not an ASCII
  letter or whitespace, tokenize with nltk.word_tokenize, optionally drop
  English stop words, optionally lemmatize each token.

  Args:
    text: The raw utterance string.
    remove_stopwords: When True, tokens found in NLTK's English stop-word
      list are removed.
    do_lemmatize: When True, each remaining token is passed through
      WordNetLemmatizer.lemmatize (default part of speech).

  Returns:
    The surviving tokens joined by single spaces; an empty string when no
    token survives.
  """
  text = text.lower()

  # Unwanted characters are deleted, not replaced by a space, so the pieces
  # on either side fuse into one token (e.g. "wi-fi" -> "wifi").
  text = re.sub(r'[^a-zA-Z\s]', '', text)

  tokens = nltk.word_tokenize(text)

  if remove_stopwords:
    # Cached: the stop-word list was previously re-read on every call.
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

  if do_lemmatize:
    # Cached: avoids constructing a new lemmatizer for every utterance.
    lemmatize = _shared_lemmatizer().lemmatize
    tokens = [lemmatize(word) for word in tokens]

  return ' '.join(tokens)

In [6]:
# Pair every USER utterance with its cleaned form as (original, cleaned).
cleaned_utt = [
  (turn["utterance"], clean_text(turn["utterance"]))
  for dialogue in train_data.values()
  for turn in dialogue["turns"]
  if turn["speaker"] == "USER"
]

## **Intent Classification Pre-Processing**

In [11]:
### ---- Capture Intent ----

# One (original, cleaned, intent) row per USER frame whose active intent is
# set; a turn with several active frames therefore yields several rows.
intent_data = []

for dialogue in train_data.values():
  for turn in dialogue["turns"]:
    if turn["speaker"] != "USER":
      continue
    for frame in turn.get("frames", []):
      state = frame.get("state", {})
      intent = state.get("active_intent", "NONE")
      if intent == "NONE":
        continue
      original = turn["utterance"]
      cleaned = clean_text(original)
      intent_data.append((original, cleaned, intent))

In [12]:
### ---- Convert to DataFrame ----

import pandas as pd

# One row per captured (original utterance, cleaned utterance, intent) triple.
DF_intens = pd.DataFrame(
  data=intent_data,
  columns=["original", "cleaned", "intent"],
)

In [13]:
### ---- Label Encode the Intent ----

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the intent column first, then map each row to its
# integer class id; the fitted encoder is kept for later decoding.
label_encoder = LabelEncoder().fit(DF_intens["intent"])
DF_intens["intent_ID"] = label_encoder.transform(DF_intens["intent"])

## **NER Pre-Processing / Slot Filling**

In [14]:
### ---- BIO Tagging ----

from nltk.tokenize import word_tokenize

def bio_tag_utt(utterance, slot_values):
  """Tokenize an utterance and BIO-tag every occurrence of each slot value.

  The utterance and every slot value are lowercased and tokenized with
  word_tokenize. Wherever a value's token sequence appears in the utterance
  tokens, the first token is tagged "B-<slot>" and the rest "I-<slot>".
  Tokens that match no value keep the tag "O".

  Args:
    utterance: The raw utterance string.
    slot_values: Mapping of slot name -> iterable of value strings.

  Returns:
    A (tokens, tags) pair of equal-length lists.

  Note:
    Tags are written unconditionally, so when two matches overlap the one
    processed later overwrites the earlier tags.
  """
  tokens = word_tokenize(utterance.lower())
  tags = ["O"] * len(tokens)

  for slot, values in slot_values.items():
    for value in values:
      value_tokens = word_tokenize(value.lower())
      span = len(value_tokens)
      # A value with no tokens would match the empty slice at every index,
      # overwriting every tag and then raising IndexError one past the end.
      if span == 0:
        continue
      for i in range(len(tokens) - span + 1):
        if tokens[i : i + span] == value_tokens:
          tags[i] = f"B-{slot}"
          tags[i + 1 : i + span] = [f"I-{slot}"] * (span - 1)

  return tokens, tags

In [15]:
### ---- Apply to Dataset ----

# One {"tokens", "tags"} sample per USER frame that carries slot values.
bio_samples = []

for dialogue in train_data.values():
  for turn in dialogue["turns"]:
    if turn["speaker"] != "USER":
      continue
    for frame in turn.get("frames", []):
      slot_values = frame.get("state", {}).get("slot_values", {})
      if not slot_values:
        continue
      tokens, tags = bio_tag_utt(turn["utterance"], slot_values)
      bio_samples.append({"tokens": tokens, "tags": tags})

In [16]:
# Persist the BIO samples as indented JSON in the current working directory.
with open("BIO_train_data.json", "w") as out_file:
  out_file.write(json.dumps(bio_samples, indent=2))

## **DST Pre-Processing**