In [None]:
from typing import List
from numpy import mean
from sentence_transformers import SentenceTransformer, util

# Shared sentence-embedding model; `is_in_topic` encodes the context and the new
# message through `embed_model.encode`.
embed_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
# Alternative checkpoints, currently disabled:
# embed_model = SentenceTransformer('sentence-transformers/LaBSE')
# embed_model = SentenceTransformer('sentence-transformers/gtr-t5-large')

In [None]:
def is_in_topic(
    new_msg: str,
    ctx_list: List[str],
    decay_rate: float = 0.85,
    max_threshold: float = 0.85,
    mean_threshold: float = 0.75,
    verbose: bool = True,
) -> bool:
    """Decide whether ``new_msg`` continues the topic of ``ctx_list``.

    The new message and every context message are embedded with the
    module-level ``embed_model`` and compared by cosine similarity. Each
    similarity is scaled by ``decay_rate ** age``, where the last entry of
    ``ctx_list`` has age 0 and the first entry has age ``len(ctx_list) - 1``.

    Args:
        new_msg: The incoming message to classify.
        ctx_list: Messages of the current conversation, oldest first.
        decay_rate: Per-step multiplier applied to older context messages.
        max_threshold: Value the largest weighted similarity must exceed.
        mean_threshold: Value the mean weighted similarity must exceed.
        verbose: When true, print every weighted similarity and the decision.

    Returns:
        ``True`` when ``ctx_list`` is empty, or when both the largest and the
        mean weighted similarity exceed their thresholds; ``False`` otherwise.
        The result is always a plain ``bool``.
    """
    if not ctx_list:
        # Nothing to compare against: the message opens the conversation.
        return True

    ctx_emb = embed_model.encode(ctx_list, convert_to_tensor=True)
    msg_emb = embed_model.encode(new_msg, convert_to_tensor=True)

    # Row 0 is taken as new_msg's similarity to each context message.
    similarities = util.cos_sim(msg_emb, ctx_emb)[0]

    # Exponents run n-1 ... 0, so the newest context message keeps weight 1.
    weights = [decay_rate**i for i in reversed(range(len(ctx_list)))]
    weighted_similarities = [sim.item() * w for sim, w in zip(similarities, weights)]

    max_sim = max(weighted_similarities)
    # numpy's mean returns a numpy scalar; convert so comparisons yield bool.
    mean_sim = float(mean(weighted_similarities))

    # Evaluate the decision once so the printed verdict and the return value
    # cannot drift apart, and coerce to a builtin bool for callers.
    in_topic = bool(max_sim > max_threshold and mean_sim > mean_threshold)

    if verbose:
        print(f"Current message: {new_msg}")
        print("Weighted Similarities")

        for i, (msg, score) in enumerate(zip(ctx_list, weighted_similarities)):
            print(f" [{i}] {msg[:40]}... -> {score:.4f}")

        print(f"\n Max Sim: {max_sim:.4f}, Mean Sim: {mean_sim:.4f}")
        print(f" Decision: {'In Context' if in_topic else 'New Topic'}\n")

    return in_topic

# Simulate message input
# Sample utterances replayed in order by the loop that follows; blank lines
# separate groups of four messages that share a subject.
messages = [
    # Disabled sample set: a trip to Taipei.
    # "我上星期去了台北玩。",
    # "天氣很熱，但是很好玩。",
    # "我還去了故宮，裡面有很多古董。",
    
    # Disabled sample set: exams, sleep and health.
    # "下週學校要開始期中考了。",
    # "我數學還沒讀完，有點緊張。",
    # "最近睡不好，常常熬夜。",
    # "醫生建議我早點睡，多運動。",
    
    # Disabled sample set: meeting an acquaintance.
    # "我昨天遇到小美。",
    # "她說她也去了台北出差。",
    
    # Group 1: today's weather.
    "今天天氣怎麼樣？",
    "今天天氣很好，陽光明媚。",
    "是啊，這樣的天氣真讓人心情愉快。",
    "對，適合外出走走。",
    
    # Group 2: a recent trip (Kyoto).
    "你最近有去哪裡旅行嗎？",
    "有，我去了京都。",
    "哇，那裡怎麼樣？",
    "非常美麗，特別是秋天的紅葉。",
    
    # Group 3: health and exercise.
    "你最近身體怎麼樣？",
    "還不錯，就是有點累。",
    "那你有去運動嗎？",
    "有，我每天早上跑步。",
    
    # Group 4: a recently watched film.
    "你最近看過什麼好電影嗎？",
    "有，我看了《流浪地球》。",
    "怎麼樣？好看嗎？",
    "非常精彩，特效做得很棒。",
]

# Replay the sample messages in order. A message judged on-topic extends the
# running conversation; any other message starts a new conversation that
# contains only itself.
conversation = []

for message in messages:
    res = is_in_topic(message, conversation)

    if not res:
        conversation = []
    conversation.append(message)

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset, load_dataset
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# Step 1: Define intent labels and mappings
# The position of a label in this list is the integer class id used in training.
labels = ["store_in_stm", "store_in_ltm", "retrieve_stm", "retrieve_ltm", "none"]
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Step 2: Load training data
raw_dataset = load_dataset("csv", data_files="./models/intent_classifier/training_data.csv")
training_dataset = raw_dataset["train"].to_list()

# Fail with a readable message (instead of a bare KeyError) when the CSV
# contains a label that is not listed above, e.g. a typo or an empty cell.
unknown_labels = {row["label"] for row in training_dataset} - set(label2id)
if unknown_labels:
    raise ValueError(
        f"Unknown labels in training data: {sorted(map(repr, unknown_labels))}"
    )

# Convert labels to IDs
for data in training_dataset:
    data["label"] = label2id[data["label"]]

# Step 3: Convert to HuggingFace Dataset
dataset = Dataset.from_list(training_dataset)
# Fixed seed: without it the held-out 20% changes on every run, so evaluation
# metrics from different runs are not comparable.
train_test = dataset.train_test_split(test_size=0.2, seed=42)

# Step 4: Loading
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Step 5: Tokenize
def tokenize(batch):
    """Encode the ``"text"`` field of ``batch`` with the global ``tokenizer``.

    Truncation is enabled and padding uses the ``"max_length"`` strategy.
    Whatever the tokenizer returns is passed back unchanged.
    """
    encode_options = {"truncation": True, "padding": "max_length"}
    return tokenizer(batch["text"], **encode_options)

# batched=True passes `tokenize` a batch of rows (batch["text"] is a list of
# strings) instead of calling it once per example. Every sequence is padded to
# the same fixed length either way, so the encoded output is unchanged.
tokenized = train_test.map(tokenize, batched=True)


# Step 6: Define training args
training_args = TrainingArguments(
    # Output locations.
    output_dir="./models/intent_classifier",
    logging_dir="./logs",
    # Evaluate and checkpoint on the same schedule: once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)


# Optional: Evaluation function
def compute_metrics(eval_pred):
    """Score one evaluation pass; passed to ``Trainer`` as ``compute_metrics``.

    ``eval_pred`` is unpacked as ``(logits, labels)``. The predicted class of
    each row is the argmax of the logits along axis 1.

    Returns:
        A dict with ``"accuracy"`` and the ``"weighted"``-average ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average="weighted")
    return metrics


# Step 7: Train
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)
trainer.train()

# Save the fine-tuned model and the tokenizer into the same directory, which
# the inference cell passes as both `model` and `tokenizer`.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./models/intent_classifier/model")

In [None]:
from transformers import pipeline

# Load the fine-tuned intent model and its tokenizer from the saved directory.
# The deprecated `return_all_scores=False` argument is dropped: it only
# triggered a deprecation warning, and the pipeline already returns just the
# top-scoring label by default. Use `top_k=None` to get every class score.
classifier = pipeline(
    "text-classification",
    model="./models/intent_classifier/model",
    tokenizer="./models/intent_classifier/model",
)


In [None]:
# Classify one sample utterance and report the top label with its score.
text = "請暫時儲存這個地址"
result = classifier(text)[0]

print(f"Intent: {result['label']} (Confidence: {result['score']:.2f})")


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset, load_dataset
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# Step 1: Define memory-type labels and mappings
# The position of a label in this list is the integer class id used in training.
labels = ["WORKING", "SEMANTIC", "PERSONAL"]
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Step 2: Load training data
raw_dataset = load_dataset(
    "csv", data_files="./models/mem_type_classifier/training_data.csv"
)
training_dataset = raw_dataset["train"].to_list()

# Fail with a readable message (instead of a bare KeyError) when the CSV
# contains a label that is not listed above, e.g. a typo or an empty cell.
unknown_labels = {row["label"] for row in training_dataset} - set(label2id)
if unknown_labels:
    raise ValueError(
        f"Unknown labels in training data: {sorted(map(repr, unknown_labels))}"
    )

# Convert labels to IDs
for data in training_dataset:
    data["label"] = label2id[data["label"]]

# Step 3: Convert to HuggingFace Dataset
dataset = Dataset.from_list(training_dataset)
# Fixed seed: without it the held-out 30% changes on every run, so evaluation
# metrics from different runs are not comparable.
train_test = dataset.train_test_split(test_size=0.3, seed=42)

# Step 4: Loading
model_name = "ckiplab/bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)


# Step 5: Tokenize
def tokenize(batch):
    """Encode the ``"text"`` field of ``batch`` with the global ``tokenizer``.

    Truncation is enabled and padding uses the ``"max_length"`` strategy.
    Whatever the tokenizer returns is passed back unchanged.
    """
    encode_options = {"truncation": True, "padding": "max_length"}
    return tokenizer(batch["text"], **encode_options)

# batched=True passes `tokenize` a batch of rows (batch["text"] is a list of
# strings) instead of calling it once per example. Every sequence is padded to
# the same fixed length either way, so the encoded output is unchanged.
tokenized = train_test.map(tokenize, batched=True)


# Step 6: Define training args
training_args = TrainingArguments(
    # Output locations.
    output_dir="./models/mem_type_classifier",
    logging_dir="./logs",
    # Evaluate and checkpoint on the same schedule: once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

# Optional: Evaluation function
def compute_metrics(eval_pred):
    """Score one evaluation pass; passed to ``Trainer`` as ``compute_metrics``.

    ``eval_pred`` is unpacked as ``(logits, labels)``. The predicted class of
    each row is the argmax of the logits along axis 1.

    Returns:
        A dict with ``"accuracy"`` and the ``"weighted"``-average ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average="weighted")
    return metrics


# Step 7: Train
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)

# Train
trainer.train()

# Save the fine-tuned model and the tokenizer into the same directory, which
# the inference cell passes as both `model` and `tokenizer`.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./models/mem_type_classifier/model")

In [None]:
from transformers import pipeline

# Load the fine-tuned memory-type model and its tokenizer from the saved directory.
# The deprecated `return_all_scores=False` argument is dropped: it only
# triggered a deprecation warning, and the pipeline already returns just the
# top-scoring label by default. Use `top_k=None` to get every class score.
classifier = pipeline(
    "text-classification",
    model="./models/mem_type_classifier/model",
    tokenizer="./models/mem_type_classifier/model",
)

In [None]:
# Run inference
# This classifier was loaded from ./models/mem_type_classifier/model, whose
# classes are WORKING / SEMANTIC / PERSONAL, so the printed label is a memory
# type rather than an intent.
text = "台灣總統是誰？"
result = classifier(text)[0]

print(f"Memory type: {result['label']} (Confidence: {result['score']:.2f})")