Dataset generation code for training the intent model

In [4]:
# generate_dataset.py
import random
import csv
import json
import re
from itertools import product
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

random.seed(42)

# placeholder lists
destinations = ["home", "office", "airport", "downtown", "station", "school", "market", "college"]
contacts = ["mom", "dad", "ravi", "anita", "michael", "support"]
preferences = ["fastest route", "scenic route", "avoid tolls", "shortest route"]

# templates per intent
TEMPLATES = {
    "NAVIGATE": [
        "Take me to {destination}",
        "Navigate to {destination}",
        "Drive to {destination} via {preference}",
        "Get directions to {destination}",
        "Route to {destination} avoiding tolls"
    ],
    "NAVIGATE_QUERY": [
        "What's the ETA to {destination}?",
        "How long will it take to reach {destination}?",
        "Distance to {destination}?"
    ],
    "CALL_CONTACT": [
        "Call {contact}",
        "Phone {contact}",
        "Make a call to {contact}"
    ],
    "PLAY_MUSIC": [
        "Play {song}",
        "Play the playlist {song}",
        "Next song",
        "Play some music"
    ],
    "STOP": [
        "Stop",
        "Cancel",
        "Stop navigation",
        "Stop music"
    ],
    "VEHICLE_STATUS": [
        "What's the tyre pressure?",
        "Show fuel level",
        "Check battery status",
        "Engine temperature status"
    ],
    "REPORT_FAULT": [
        "My engine is making a knocking sound",
        "There is a strange noise when idling",
        "Check engine light is on"
    ],
    "NONE": [
        "I was at the market yesterday",
        "That's a nice song",
        "What time is it?",
        "I like the color of this car"
    ]
}

# small polite paraphrase swaps to create variants
POLITE_SWAPS = [
    ("Could you", "Can you"),
    ("Please", ""),
    ("Would you", "Can you"),
    ("I want to", "Please start")
]

# simple synonym replace (non-named entities)
def synonym_replace(sentence, p=0.2):
    words = re.findall(r"\w+|\W+", sentence)  # preserve punctuation
    new_words = []
    for w in words:
        if not w.isalpha():
            new_words.append(w)
            continue
        if random.random() < p:
            syns = wordnet.synsets(w)
            lemmas = {l.name().replace('_', ' ') for s in syns for l in s.lemmas()}
            lemmas.discard(w)
            if lemmas:
                choice = random.choice(list(lemmas))
                new_words.append(choice)
                continue
        new_words.append(w)
    return "".join(new_words)

def generate_from_templates(num_per_intent=50):
    rows = []
    for intent, templates in TEMPLATES.items():
        # expand templates with placeholders
        candidates = []
        for t in templates:
            if "{destination}" in t:
                for d in destinations:
                    for pref in preferences + [""]:
                        s = t.format(destination=d, preference=pref) if "{preference}" in t else t.format(destination=d)
                        s = re.sub('  +', ' ', s).strip()
                        candidates.append((s, {"destination": d}))
            elif "{contact}" in t:
                for c in contacts:
                    s = t.format(contact=c)
                    candidates.append((s, {"contact": c}))
            elif "{song}" in t:
                # use some dummy songs/playlists
                songs = ["Top Hits", "Chill Vibes", "Road Trip Playlist"]
                for song in songs:
                    s = t.format(song=song)
                    candidates.append((s, {}))
            else:
                candidates.append((t, {}))
        # sample/generate multiples with paraphrases + synonym replacement
        selected = []
        while len(selected) < num_per_intent:
            base, slots = random.choice(candidates)
            # apply polite swaps sometimes
            if random.random() < 0.3:
                swap = random.choice(POLITE_SWAPS)
                base = base.replace(swap[0], swap[1]).strip()
            # maybe synonym replace
            aug = synonym_replace(base, p=0.25) if random.random() < 0.4 else base
            # lowercasing and normalize spaces
            aug_norm = re.sub(r'\s+', ' ', aug).strip()
            selected.append((aug_norm, slots))
        # deduplicate
        unique = []
        seen = set()
        for u, s in selected:
            if u.lower() not in seen:
                seen.add(u.lower())
                unique.append((u, s))
        # append rows
        for u, s in unique:
            slots_json = json.dumps(s) if s else ""
            rows.append((u, intent, slots_json))
    return rows

if __name__ == "__main__":
    rows = generate_from_templates(num_per_intent=60)  # tweak to reach 300-500 utterances
    random.shuffle(rows)
    with open("intents.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["utterance", "intent", "slots"])
        for r in rows:
            writer.writerow(r)
    print("Saved intents.csv with", len(rows), "rows")


Saved intents.csv with 184 rows


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ojasv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ojasv\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
