## Dummy Dataset Generation

In [None]:
import json
import time
import random
import os
from pathlib import Path
from groq import Groq
import re

In [None]:
# Shared Groq API client used by every generation call in this notebook.
# The key is read from the GROQ_API_KEY environment variable so that real
# credentials never have to be written into the notebook; the placeholder
# is only a fallback that keeps the demo cell runnable without a key.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", "dummy_key_for_demo"))

In [None]:
# Dummy data generation for NLU training in a real estate chatbot context.

# Location strings that get injected into the DISCOVERY prompt. The list mixes
# full postal addresses with bare city names; order is kept stable because
# random.sample() draws from it by position.
CLEAN_LOCATIONS = [
    # United Kingdom / France
    "Baker Street 221B, 12345 London, UK",
    "Paris",
    # Spain
    "Nueva España, Madrid",
    "Gran Vía 10, 28013 Madrid, Spain",
    # Italy / United States
    "Milan",
    "Fifth Avenue 725, 10022 New York, USA",
    # Portugal / Austria
    "Rua Augusta 20, 1100-053 Lisbon, Portugal",
    "Kärntner Straße 15, 1010 Vienna, Austria",
]

# Maps a canonical feature key to the surface forms a user might type for it.
# One surface form per sampled key is injected into the DISCOVERY prompt.
AMENITY_SYNONYMS = {
    # --- Amenities ---
    "AIR_CONDITIONING": ["air conditioning", "ac", "a/c", "climate control", "cooling"],
    "BALCONY": ["balcony", "outdoor space", "private terrace"],
    "COFFEE_MACHINE": ["coffee machine", "nespresso", "espresso maker"],
    "FITNESS_CENTER": ["fitness center", "gym", "workout room", "weights"],
    "INTERNET": ["internet", "wifi", "wi-fi", "high-speed connection"],
    "KITCHEN": ["kitchen", "fully equipped kitchen", "kitchenette"],
    "PET_POLICY": ["pet friendly", "pets allowed", "dog friendly", "cat friendly"],
    "POOL": ["pool", "swimming pool", "rooftop pool"],
    "SMART_TV": ["smart tv", "netflix", "streaming apps", "hbo"],
    "WORK_DESK": ["work desk", "home office", "monitor setup", "working area"],
    "WASHING_MACHINE": ["washer", "laundry machine", "washing machine"],
    # --- Apartment sizes ---
    "STUDIO": ["studio", "studio apartment", "loft"],
    "ONE_BEDROOM": ["1 bedroom", "one bedroom", "1-bed"],
    "TWO_BEDROOM": ["2 bedroom", "two bedroom", "2-bed"],
    "THREE_BEDROOM": ["3 bedroom", "three bedroom", "family apartment"],
}

# Names of the apartments the DETAILS prompt may refer to. Every name is the
# "SPLENDOM" brand followed by a site suffix, so the list is built from the
# suffixes; the resulting strings and their order are what matter downstream.
REAL_APARTMENTS = [
    f"SPLENDOM {site}"
    for site in (
        "Downtown",
        "Central Park",
        "Riverside",
        "Financial District",
        "Old Town",
        "Harbor View",
        "University",
        "West End",
        "Opera",
        "Market Square",
    )
]

In [None]:
def get_random_synonyms(count=6):
    """
    Pick random amenity keys and return one random surface form for each.

    At most ``count`` distinct keys are drawn from AMENITY_SYNONYMS (fewer if
    the table is smaller than ``count``); for every drawn key a single synonym
    is chosen at random. The result is a flat list of synonym strings.
    """
    how_many = min(count, len(AMENITY_SYNONYMS))
    chosen_keys = random.sample(list(AMENITY_SYNONYMS), k=how_many)
    return [random.choice(AMENITY_SYNONYMS[key]) for key in chosen_keys]

def get_random_apartments(count=4):
    """
    Sample apartment names and inject brand-name noise into some of them.

    Up to ``count`` distinct names are drawn from REAL_APARTMENTS. Each drawn
    name has a 20% chance of having "SPLENDOM" replaced by a noisy variant:
    a lower-case form, a capitalised form, or the misspelling "Splenom".
    Returns the list of (possibly altered) names.
    """
    brand = "SPLENDOM"
    noisy_forms = ["splendom", "Splendom", "Splenom"]
    sample_size = min(count, len(REAL_APARTMENTS))

    result = []
    for apartment in random.sample(REAL_APARTMENTS, k=sample_size):
        # The random draw comes first so the RNG is consumed once per name,
        # whether or not the brand is present.
        should_corrupt = random.random() < 0.2 and brand in apartment
        if should_corrupt:
            apartment = apartment.replace(brand, random.choice(noisy_forms))
        result.append(apartment)
    return result

In [None]:
def generate_nlu_samples(intent_type, count=50):
    """
    Ask the LLM for one batch of synthetic user queries for a single intent.

    Random amenities, apartment names and locations are sampled on every call
    and injected into the system prompt so that successive batches differ.

    Args:
        intent_type: One of "DISCOVERY", "DETAILS" or "OTHER".
        count: Number of examples requested from the model in this batch.

    Returns:
        A list of ``{"text": <query>, "label": intent_type}`` dicts. The list
        is empty when the API call fails or the response cannot be decoded;
        it may be shorter than ``count`` if the model returns fewer usable
        items.

    Raises:
        ValueError: If ``intent_type`` is not a supported intent.
    """
    # Fail fast with a clear message instead of hitting an unbound
    # ``system_msg`` further down.
    if intent_type not in ("DISCOVERY", "DETAILS", "OTHER"):
        raise ValueError(
            f"Unsupported intent_type {intent_type!r}; "
            "expected 'DISCOVERY', 'DETAILS' or 'OTHER'."
        )

    # Sampled unconditionally (even when a given intent does not use them) so
    # that the random stream consumed per call does not depend on the intent.
    current_synonyms = get_random_synonyms(count=6)
    current_apartments = get_random_apartments(count=3)

    # Select random locations directly from the clean list
    current_locations = random.sample(CLEAN_LOCATIONS, k=min(5, len(CLEAN_LOCATIONS)))

    fake_amenities = ["underwater room", "gold toilet", "butler named James"]

    # System prompts based on intent type
    if intent_type == "DISCOVERY":
        system_msg = f"""
        Generate synthetic user queries for a real estate chatbot.
        Intent: DISCOVERY (User wants to find an apartment by CRITERIA).
        
        INSTRUCTIONS:
        1. Use ENGLISH syntax but preserve location names exactly as provided.
        2. Inject these LOCATIONS: {', '.join(current_locations)}.
        3. Inject these AMENITIES: {', '.join(current_synonyms)}.
        4. Queries should look like: "flat near [LOCATION] with [AMENITY]", "studio in [LOCATION]".
        """

    elif intent_type == "DETAILS":
        system_msg = f"""
        Generate synthetic user queries for a real estate chatbot.
        Intent: DETAILS (User asks specific info about a NAMED apartment).
        
        INSTRUCTIONS:
        1. Queries MUST include one of these identifiers: {', '.join(current_apartments)}.
        2. Ask about policies: "Does [NAME] allow pets?", "check-in for [NAME]".
        """

    else:  # "OTHER" (guaranteed by the validation above)
        system_msg = f"""
        Intent: OTHER (Out of Domain).
        Generate queries the bot should ignore:
        1. Greetings ("Hello").
        2. Off-topic ("How is the weather?").
        3. Impossible requests: {', '.join(fake_amenities)}.
        """

    # JSON mode (response_format json_object) yields a top-level JSON object,
    # so the prompt asks for an object wrapping the list instead of a bare
    # list that the model could not legally emit.
    prompt = f"""
    Generate {count} unique training examples.
    Output strictly a JSON object of the form: {{"samples": [{{"text": "query...", "label": "{intent_type}"}}, ...]}}
    """

    try:
        # API call to Llama3
        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": prompt}
            ],
            model="llama-3.3-70b-versatile",
            temperature=0.85,
            response_format={"type": "json_object"}
        )
        content = completion.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: the caller treats an empty batch as "retry".
        print(f"Error during API call: {e}")
        return []

    try:
        data = json.loads(content)
    except (TypeError, ValueError) as e:
        # TypeError: content was None; ValueError: malformed JSON.
        print(f"Error decoding model response: {e}")
        return []

    return _extract_samples(data, intent_type)


def _extract_samples(data, intent_type):
    """Locate the list of samples in decoded JSON and keep only usable entries."""
    if isinstance(data, list):
        candidates = data
    elif isinstance(data, dict):
        candidates = data.get("samples")
        if not isinstance(candidates, list):
            # The model may pick its own wrapper key; fall back to the first
            # list-valued entry.
            candidates = next(
                (value for value in data.values() if isinstance(value, list)),
                [],
            )
    else:
        candidates = []

    samples = []
    for item in candidates:
        if not isinstance(item, dict):
            continue
        text = item.get("text")
        if isinstance(text, str) and text.strip():
            # The batch was generated for exactly one intent, so the label is
            # set from the request rather than trusted from the model output.
            samples.append({"text": text.strip(), "label": intent_type})
    return samples

In [None]:
# Driver cell: collect TARGET_SAMPLES_PER_INTENT samples for each intent,
# shuffle everything together and write the result to OUTPUT_FILE as JSON.
OUTPUT_FILE = "data/nlu_dummy_dataset.json"
TARGET_SAMPLES_PER_INTENT = 20  # In production this is set to thousands
MAX_BATCH_SIZE = 50  # Upper bound on samples requested in a single API call
MAX_CONSECUTIVE_FAILURES = 5  # Empty batches in a row before giving up on an intent

# Create the directory the output file actually lives in, so changing
# OUTPUT_FILE does not silently leave the target directory missing.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
all_data = []
intent_list = ["DISCOVERY", "DETAILS", "OTHER"]

for intent in intent_list:
    print(f"\nGenerating samples for Intent: {intent}...")
    current_count = 0
    consecutive_failures = 0
    while current_count < TARGET_SAMPLES_PER_INTENT:
        remaining = TARGET_SAMPLES_PER_INTENT - current_count
        # Only ask for what is still missing instead of a fixed 50 per call.
        batch = generate_nlu_samples(intent, count=min(MAX_BATCH_SIZE, remaining))
        if not batch:
            consecutive_failures += 1
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                # Without this cap a persistent API failure (e.g. an invalid
                # key) would keep this loop spinning forever.
                print(
                    f"Giving up on {intent} after {consecutive_failures} "
                    f"consecutive empty batches ({current_count} samples collected)."
                )
                break
            time.sleep(1)
            continue

        consecutive_failures = 0
        # The model may return more items than requested; keep the target exact.
        batch = batch[:remaining]
        all_data.extend(batch)
        current_count += len(batch)
        if current_count < TARGET_SAMPLES_PER_INTENT:
            # Short pause between successive API calls.
            time.sleep(0.5)

# Save results
random.shuffle(all_data)

with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(all_data, f, indent=2, ensure_ascii=False)
print(f"\nGeneration completed with {len(all_data)} samples to {OUTPUT_FILE}")