In [None]:
# Intent name -> comma-separated keyword description of that intent.
# The sample generators in this file emit these descriptions as the
# "passage: ..." side of each training pair.
LABELS = {
    "info": "general information, opening hours, ticket price, introduction",
    "media": "play audio, play video, open media, audio guide",
    "location": "where is, address, directions, maps",
    "chitchat": "casual conversation, greeting, small talk"
}

In [None]:
# English query templates grouped by intent. "{x}" is filled with an entity
# name through str.format; the "chitchat" templates contain no placeholder,
# so they format to the same text for every entity.
TEMPLATES = {
    "media": [
        # unambiguous
        "Play audio for {x}",
        "Open audio guide for {x}",
        "Play video about {x}",

        # easily confused with other intents
        "Can I listen to something about {x}",
        "I want to hear more about {x}",
        "Show me something about {x}"
    ],

    "location": [
        # unambiguous
        "Where is {x}",
        "Address of {x}",
        "How to get to {x}",

        # easily confused with other intents
        "How do I find {x}",
        "Where can I find {x}",
        "Is {x} nearby"
    ],

    "info": [
        # unambiguous
        "Opening hours of {x}",
        "Ticket price of {x}",
        "Information about {x}",

        # easily confused with other intents
        "Tell me about {x}",
        "I want to know more about {x}",
        "What should I know about {x}"
    ],

    "chitchat": [
        # unambiguous
        "Hello",
        "How are you",
        "Nice to meet you",

        # easily confused (often misrouted to "info")
        "Can you help me",
        "I have a question",
        "Hey there"
    ]
}


In [None]:
# Place names substituted for the "{x}" placeholder in TEMPLATES.
ENTITIES = [
    "Independence Palace",
    "Notre Dame Cathedral",
    "National Museum"
]


In [None]:
# Target language codes handed to translate() for every generated query.
# NOTE(review): confirm that each code (e.g. "fil", "zh-HK") is accepted by
# the translation backend - translate() falls back to the untranslated
# English text whenever a call raises, so an unsupported code would not
# surface as an error.
LANGS = [
    "ar","hi","en","pt","de","ko","hu","id","ms","ru",
    "ja","fi","fr","fil","es","th","tr","zh-CN","zh-HK","vi","it"
]


In [None]:
import logging
import time

from deep_translator import GoogleTranslator

# Memo of successful translations, keyed by (text, lang).
CACHE = {}

logger = logging.getLogger(__name__)


def translate(text, lang):
    """Translate English *text* into *lang*, memoising successful results.

    Args:
        text: English source string.
        lang: Target language code passed to ``GoogleTranslator``.

    Returns:
        The translated string. If the translator raises or returns a
        non-string value, the original *text* is returned unchanged
        (best-effort fallback) and a warning is logged.
    """
    # The source language is always English, so there is nothing to translate
    # and no request to rate-limit.
    if lang == "en":
        return text

    key = (text, lang)
    if key in CACHE:
        return CACHE[key]

    try:
        out = GoogleTranslator(source="en", target=lang).translate(text)
        # Guard against a non-string result so that e.g. "None" never ends up
        # inside a generated sample.
        if not isinstance(out, str):
            raise TypeError(f"translator returned {type(out).__name__}")
    except Exception as exc:  # best-effort boundary: keep the pipeline running
        logger.warning(
            "Translation to %r failed for %r; using the English text: %s",
            lang, text, exc,
        )
        out = text  # safe fallback
    else:
        # Only real translations are cached; a failed attempt stays retryable
        # instead of pinning the English fallback for the rest of the run.
        CACHE[key] = out

    # Pause after every translator attempt to stay under the service's rate limit.
    time.sleep(0.15)
    return out


In [None]:
def gen_intent_samples():
    """Build query -> intent-description pairs for every template and language.

    Each template in ``TEMPLATES`` is formatted with each entity in
    ``ENTITIES``, translated into every language in ``LANGS``, and paired
    with the ``LABELS`` description of its intent.

    Returns:
        A list of dicts with keys ``"task"`` (always ``"intent"``),
        ``"anchor"`` (``"query: <translated text>"``) and ``"positive"``
        (``"passage: <intent description>"``).
    """
    data = []
    for intent, templates in TEMPLATES.items():
        positive = f"passage: {LABELS[intent]}"
        # Templates without an "{x}" placeholder format to the same text for
        # every entity; remember what was already emitted so those samples
        # are not repeated once per entity.
        seen = set()
        for ent in ENTITIES:
            for t in templates:
                base = t.format(x=ent)
                if base in seen:
                    continue
                seen.add(base)

                data.extend(
                    {
                        "task": "intent",
                        "anchor": f"query: {translate(base, lang)}",
                        "positive": positive,
                    }
                    for lang in LANGS
                )
    return data


In [None]:
def gen_crosslingual_samples():
    """Build query -> query pairs linking neighbouring languages in ``LANGS``.

    Only the first template of each intent is used. It is formatted with each
    entity, translated into every language, and each translation is paired
    with the translation for the next language in ``LANGS``.

    Returns:
        A list of dicts with keys ``"task"`` (always ``"crosslingual"``),
        ``"anchor"`` and ``"positive"``, both of the form
        ``"query: <translated text>"``.
    """
    data = []
    # A first template without "{x}" formats identically for every entity;
    # track emitted texts so the same chain of pairs is not added repeatedly.
    seen = set()
    for templates in TEMPLATES.values():
        if not templates:
            # Nothing to index with [0]; skip instead of raising IndexError.
            continue
        for ent in ENTITIES:
            base = templates[0].format(x=ent)
            if base in seen:
                continue
            seen.add(base)

            translations = [translate(base, lang) for lang in LANGS]
            # Pair each language with its successor in LANGS.
            for q1, q2 in zip(translations, translations[1:]):
                data.append({
                    "task": "crosslingual",
                    "anchor": f"query: {q1}",
                    "positive": f"query: {q2}"
                })
    return data


In [None]:
import random

def gen_hard_negative_samples():
    """Build (query, correct intent, wrong intent) triplets.

    For every (intent, entity) pair, one other intent and one template of the
    current intent are drawn with ``random.choice``; the formatted template is
    translated into every language in ``LANGS``.

    Returns:
        A list of dicts with keys ``"task"`` (always ``"hard_negative"``),
        ``"anchor"`` (``"query: <translated text>"``), ``"positive"`` and
        ``"negative"`` (``"passage: <intent description>"`` for the correct
        and the randomly chosen wrong intent respectively).
    """
    data = []
    intents = list(TEMPLATES)

    for intent in intents:
        templates = TEMPLATES[intent]
        # Candidate negatives do not depend on the entity, so build them once.
        others = [i for i in intents if i != intent]
        if not others or not templates:
            # random.choice([]) would raise; a triplet needs both a template
            # and at least one competing intent.
            continue

        positive = f"passage: {LABELS[intent]}"
        # Entity-free templates can be drawn again with the same negative for
        # another entity; skip such exact repeats.
        seen = set()
        for ent in ENTITIES:
            neg_intent = random.choice(others)
            base = random.choice(templates).format(x=ent)
            if (base, neg_intent) in seen:
                continue
            seen.add((base, neg_intent))

            negative = f"passage: {LABELS[neg_intent]}"
            for lang in LANGS:
                q = translate(base, lang)

                data.append({
                    "task": "hard_negative",
                    "anchor": f"query: {q}",
                    "positive": positive,
                    "negative": negative
                })
    return data


In [None]:
import json
import random

def build_dataset():
    """Return all intent, cross-lingual and hard-negative samples, shuffled.

    The three generators are called in that order, their results are
    concatenated into one list, and the list is shuffled in place with
    ``random.shuffle`` before being returned.
    """
    samples = [
        *gen_intent_samples(),
        *gen_crosslingual_samples(),
        *gen_hard_negative_samples(),
    ]
    random.shuffle(samples)
    return samples


if __name__ == "__main__":
    # Build the full sample list, store it as indented UTF-8 JSON (non-ASCII
    # characters are written as-is), then report how many samples were made.
    dataset = build_dataset()

    with open("multilingual_embedding_dataset.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(dataset, ensure_ascii=False, indent=2))

    print(f"Total samples: {len(dataset)}")
