In [8]:
import json

def load_seed(path):
    seed = []
    with open(path) as f:
        for line in f:
            seed.append(json.loads(line))
    return seed

seed_train = load_seed("data/train.jsonl")
sample_texts = [x["text"] for x in seed_train]

In [7]:
print(sample_texts[:10])

['this is dhruv joshi my phone number is seven nine six three eight six three nine six zero please call me tomorrow', 'this is pooja chatterjee from mumbai my phone is 83315 41006 and email is pooja dot chatterjee at rediffmail dot co.in we can meet on 14/07/2026', 'tomorrow problem delivery please information payment issue update balance complaint resolve checking plan status yesterday', 'this is amit chatterjee my phone number is 8150939834 please call me tomorrow', 'this is rohan gupta from hyderabad my phone is 66636 51559 and email is rohan dot gupta at yahoo dot co.in we can meet on 09-01-2026', 'complaint order tomorrow ticket feedback please support balance issue today', 'my name is vijay verma i am from gurgaon my credit card number is three four eight eight eight one zero seven nine eight three three four two three three and it expires on 21-06-2023 you can email me on vijay dot verma at rediffmail dot co.in', 'this is sunita agarwal my phone number is 7622444721 please call 

In [9]:
# Common entity examples extracted from your sample dataset
person_names = set()
cities = set()
locations = set()
emails = set()
phones = set()
credit_cards = set()
dates = set()

In [10]:
for example in seed_train:
    text = example["text"]
    for ent in example["entities"]:
        span = text[ent["start"]: ent["end"]]
        label = ent["label"]

        if label == "PERSON_NAME":
            person_names.add(span)
        elif label == "CITY":
            cities.add(span)
        elif label == "LOCATION":
            locations.add(span)
        elif label == "EMAIL":
            emails.add(span)
        elif label == "PHONE":
            phones.add(span)
        elif label == "CREDIT_CARD":
            credit_cards.add(span)
        elif label == "DATE":
            dates.add(span)

In [29]:
# Convert sets to lists
person_names = list(person_names)
cities = list(cities)
locations = list(locations)
emails = list(emails)
phones = list(phones)
credit_cards = list(credit_cards)
dates = list(dates)

# ---------------------------
# Templates
# ---------------------------
templates = [
    "my name is {NAME} and my email is {EMAIL}",
    "please update my phone number it is {PHONE}",
    "i used my credit card number {CC} yesterday",
    "i live in {CITY} near {LOC}",
    "my appointment is on {DATE}",
    "contact {NAME} in {CITY} at email {EMAIL}",
    "the delivery address is {LOC} in {CITY}",
    "call {NAME} on {PHONE} for confirmation"
]


In [34]:
import json
import random
import re
import string
from pathlib import Path

# ---------------------------
# Number → word conversion
# ---------------------------
num_words = {
    "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
    "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine"
}

def number_to_words(num_str):
    words = []
    i = 0
    while i < len(num_str):
        # Double numbers: 99 → "double nine"
        if i < len(num_str)-1 and num_str[i] == num_str[i+1]:
            words.append("double " + num_words[num_str[i]])
            i += 2
        else:
            words.append(num_words[num_str[i]])
            i += 1
    return " ".join(words)

# ---------------------------
# STT Noise Functions
# ---------------------------
fillers = ["uh", "you know", "basically", "like", "right", "umm"]
common_typos = {
    "mumbai": "mumbay",
    "delhi": "dilli",
    "kolkata": "kolkataa",
    "bangalore": "banglore",
    "krishnan": "krishan",
    "kumar": "komar"
}

def add_fillers(text):
    if random.random() < 0.30:
        return random.choice(fillers) + " " + text
    return text

def stutter_words(text):
    if random.random() < 0.15:
        words = text.split()
        idx = random.randint(0, len(words)-1)
        words[idx] = words[idx] + " " + words[idx]
        return " ".join(words)
    return text

def remove_vowels(text):
    if random.random() < 0.10:
        return re.sub(r"[aeiou]", "", text)
    return text

def typo(text):
    words = text.split()
    for i,w in enumerate(words):
        if w.lower() in common_typos and random.random() < 0.3:
            words[i] = common_typos[w.lower()]
    return " ".join(words)

def drop_words(text):
    words = text.split()
    if len(words) > 5 and random.random() < 0.15:
        drop_idx = random.randint(0, len(words)-1)
        words.pop(drop_idx)
    return " ".join(words)

def merge_words(text):
    if random.random() < 0.10:
        words = text.split()
        idx = random.randint(0, len(words)-2)
        words[idx] = words[idx] + words[idx+1]
        words.pop(idx+1)
        return " ".join(words)
    return text

def add_noise(text):
    text = add_fillers(text)
    text = stutter_words(text)
    text = typo(text)
    text = drop_words(text)
    text = merge_words(text)
    text = remove_vowels(text)
    return text

# ---------------------------
# Entity Generators
# ---------------------------
first_names = [
    "rahul", "aditya", "neha", "sanjay", "deepak",
    "aarav", "riya", "priya", "ananya", "mehul",
    "vikas", "arjun", "kiran", "varun", "tanya"
]

last_names = [
    "kumar", "rao", "dubey", "krishnan", "sharma",
    "mehta", "singh", "verma", "joshi", "patel",
    "reddy", "nair", "desai", "kapoor", "bhatt"
]

def gen_person_name():
    return random.choice(first_names) + " " + random.choice(last_names)

cities = ["mumbai", "delhi", "bangalore", "kolkata", "pune", "hyderabad"]
locations = ["andheri east", "koramangala", "salt lake", "baner", "sector five"]

def gen_email(name):
    # spoken STT version
    spoken = name.replace(" ", " dot ") + " at gmail dot com"
    
    # symbol version
    first, last = name.split()
    symbol = f"{first}.{last}@gmail.com"
    
    # randomly choose between STT or symbol-style (30% symbol, 70% spoken)
    if random.random() < 0.30:
        return symbol
    else:
        return spoken


def gen_phone():
    digits = "".join(random.choice("9876543210") for _ in range(10))
    return number_to_words(digits)

def gen_credit_card():
    digits = "".join(random.choice("1234567890") for _ in range(12))
    return number_to_words(digits)

def gen_date():
    days = ["one", "two", "seven", "ten", "twelve", "twenty four"]
    months = ["january", "march", "may", "june", "july"]
    years = ["twenty twenty", "twenty twenty one", "twenty twenty two"]
    return f"{random.choice(days)} {random.choice(months)} {random.choice(years)}"

# ---------------------------
# Build example + compute spans
# ---------------------------
def build_example():
    name = gen_person_name()
    email = gen_email(name)
    phone = gen_phone()
    cc = gen_credit_card()
    city = random.choice(cities)
    loc = random.choice(locations)
    date = gen_date()

    template = random.choice(templates)
    base = template.format(
        NAME=name, EMAIL=email, PHONE=phone,
        CC=cc, CITY=city, LOC=loc, DATE=date
    )

    # Add STT noise
    noisy = add_noise(base.lower())

    return noisy, {
        "PERSON_NAME": name,
        "EMAIL": email,
        "PHONE": phone,
        "CREDIT_CARD": cc,
        "CITY": city,
        "LOCATION": loc,
        "DATE": date
    }

def find_entities(text, mapping):
    entities = []
    for label, value in mapping.items():
        if value is None:
            continue
        idx = text.find(value)
        if idx != -1:
            entities.append({
                "start": idx,
                "end": idx + len(value),
                "label": label
            })
    return entities

def generate_dataset(n, path):
    with open(path, "w") as f:
        for i in range(n):
            text, mapping = build_example()
            ents = find_entities(text, mapping)
            json.dump({
                "id": f"utt_gen_{i:05d}",
                "text": text,
                "entities": ents
            }, f)
            f.write("\n")


In [35]:
generate_dataset(500, "data/train.jsonl")
generate_dataset(150, "data/dev.jsonl")

print("STT-style noisy datasets generated successfully!")

STT-style noisy datasets generated successfully!


In [36]:
count = sum(1 for _ in open("data/train.jsonl"))
print(count)


500


In [47]:
def sample_jsonl(path, n=1):
    with open(path) as f:
        lines = [json.loads(line) for line in f]
    for item in random.sample(lines, n):
        print(json.dumps(item, indent=4), "\n")

# Show 1 random example
sample_jsonl("data/train.jsonl", 1)

{
    "id": "utt_gen_00087",
    "text": "right i used my credit card number double three five zero nine six two four four nine six nine three yesterday",
    "entities": []
} 

