In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import json
import random
from torch.utils.data import DataLoader
from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    losses
)

In [2]:
cd /content/drive/MyDrive/finetune

/content/drive/MyDrive/finetune


In [15]:
DATASET_PATH = "multilingual_embedding_dataset.json"
MODEL_NAME = "intfloat/multilingual-e5-base"
OUTPUT_DIR = "outputs/e5_intent_router"

In [16]:
EPOCHS = 3
LR = 1e-5
PAIR_BATCH_SIZE = 128
TRIPLET_BATCH_SIZE = 64
WARMUP_STEPS = 300
MAX_SEQ_LENGTH = 128

In [17]:
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

random.shuffle(data)

In [18]:
pair_examples = []
triplet_examples = []

for sample in data:
    task = sample.get("task")

    if task in ["intent", "crosslingual"]:
        pair_examples.append(
            InputExample(
                texts=[
                    sample["anchor"],
                    sample["positive"]
                ]
            )
        )

    elif task == "hard_negative":
        triplet_examples.append(
            InputExample(
                texts=[
                    sample["anchor"],
                    sample["positive"],
                    sample["negative"]
                ]
            )
        )

print("Pair samples    :", len(pair_examples))
print("Triplet samples :", len(triplet_examples))

Pair samples    : 5840
Triplet samples : 840


In [19]:
model = SentenceTransformer(MODEL_NAME)
model.max_seq_length = MAX_SEQ_LENGTH

In [20]:
train_objectives = []

if len(pair_examples) > 0:
    pair_loader = DataLoader(
        pair_examples,
        shuffle=True,
        batch_size=PAIR_BATCH_SIZE,
        drop_last=True
    )
    pair_loss = losses.MultipleNegativesRankingLoss(model)
    train_objectives.append((pair_loader, pair_loss))

if len(triplet_examples) > 0:
    triplet_loader = DataLoader(
        triplet_examples,
        shuffle=True,
        batch_size=TRIPLET_BATCH_SIZE,
        drop_last=True
    )
    triplet_loss = losses.TripletLoss(model)
    train_objectives.append((triplet_loader, triplet_loss))

In [21]:
model.fit(
    train_objectives=train_objectives,
    epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    optimizer_params={"lr": LR},
    output_path=OUTPUT_DIR,
    use_amp=True,
    show_progress_bar=True
)

print("✅ Training finished!")
print("Model saved to:", OUTPUT_DIR)

Computing widget examples:   0%|          | 0/2 [00:00<?, ?example/s]

Step,Training Loss


✅ Training finished!
Model saved to: outputs/e5_intent_router


In [7]:
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer("outputs/e5_intent_router")

LABELS = {
    "info": "general information, opening hours, ticket price",
    "media": "play audio, play video, open media",
    "location": "where is, address, directions",
    "chitchat": "casual conversation, greeting, small talk"
}

label_texts = [f"passage: {v}" for v in LABELS.values()]
label_embs = model.encode(label_texts, normalize_embeddings=True)

query = "query: How far from there to Singapore River"
q_emb = model.encode(query, normalize_embeddings=True)

scores = q_emb @ label_embs.T
for name, score in zip(LABELS.keys(), scores):
    print(name, float(score))


The tokenizer you are loading from 'outputs/e5_intent_router' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


info 0.7007967233657837
media 0.677935779094696
location 0.7232661843299866
chitchat 0.693950891494751


In [30]:
from sentence_transformers import SentenceTransformer

m = SentenceTransformer("outputs/e5_intent_router")
print("model_type:", m._first_module().auto_model.config.model_type)
print("tokenizer class:", type(m._first_module().tokenizer))
print("tokenizer path:", m._first_module().tokenizer.name_or_path)


The tokenizer you are loading from 'outputs/e5_intent_router' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


model_type: xlm-roberta
tokenizer class: <class 'transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast'>
tokenizer path: outputs/e5_intent_router


In [5]:
!hf auth login




    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
The token `anansupercuteeeee` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-

In [6]:
from sentence_transformers import SentenceTransformer

# load model đã train
model = SentenceTransformer("outputs/e5_intent_router")

# push lên Hugging Face
model.push_to_hub(
    repo_id="anansupercuteeeee/multilingual-traveling",
    commit_message="v1.0 - multilingual travel intent embedding (E5)"
)


The tokenizer you are loading from 'outputs/e5_intent_router' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...g/sentencepiece.bpe.model: 100%|##########| 5.07MB / 5.07MB            

  ...mpdohc69fg/tokenizer.json: 100%|##########| 17.1MB / 17.1MB            

  ...ohc69fg/model.safetensors:   1%|          | 6.62MB / 1.11GB            

'https://huggingface.co/anansupercuteeeee/multilingual-traveling/commit/6ec3640d9b5ce32e02fc66352bf2e27f0bb9bbbb'