# 🍳 CookBot Training Notebook

Train chatbot AI riêng cho CookShare - **KHÔNG CẦN API KEY!**

## Mục tiêu:
1. Fine-tune model GPT-2 với dataset món ăn Việt Nam
2. Upload model lên Hugging Face (FREE)
3. Sử dụng miễn phí, hoàn toàn độc lập

## Yêu cầu:
- Google Colab (FREE GPU)
- Tài khoản Hugging Face (FREE)


In [None]:
# 1. C√†i ƒë·∫∑t th∆∞ vi·ªán
!pip install transformers datasets accelerate peft -q
!pip install huggingface_hub -q

import torch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


In [None]:
# 2. Upload the dataset files from the local machine
from google.colab import files

# Tell the user which files the loading cell expects (message is in Vietnamese:
# "Upload file: ... and ..."). The text was mis-encoded and has been repaired.
print("Upload file: dataset_cookbot.jsonl và dataset_cookbot_part2.jsonl")
# The result is kept in `uploaded`; the loading cell opens the two files by
# name, so they presumably end up in the current working directory.
uploaded = files.upload()


In [None]:
# 3. Load and prepare the data
import json
import os
from datasets import Dataset

def load_jsonl_files(files):
    """Load records from one or more JSON Lines files into a single list.

    Each non-blank line of every existing file is parsed as one JSON value.
    Files are read as UTF-8, in the order given.

    Args:
        files: Iterable of file paths to read.

    Returns:
        A list with the parsed records of all files, in reading order.
        Paths that do not exist contribute nothing; a ``UserWarning`` is
        emitted for each so a missing upload does not go unnoticed.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the line number within that file.
    """
    import warnings  # local import keeps this cell self-contained

    data = []
    for path in files:
        if not os.path.exists(path):
            # Best-effort loading is kept, but silently training on a partial
            # dataset is easy to miss, so make the skip visible.
            warnings.warn(f"JSONL file not found, skipping: {path}", stacklevel=2)
            continue
        with open(path, 'r', encoding='utf-8') as handle:
            for line_no, line in enumerate(handle, start=1):
                if not line.strip():
                    continue  # tolerate blank lines between records
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as err:
                    # The decoder only sees one line, so its own position is
                    # always "line 1"; add the real file and line number.
                    raise json.JSONDecodeError(
                        f"{path}, line {line_no}: {err.msg}", err.doc, err.pos
                    ) from err
    return data

def format_conversation(example):
    """Render one chat record as a single training string.

    Every message whose role is ``system``, ``user`` or ``assistant`` becomes
    ``<|role|>content</s>`` followed by a newline; messages with any other
    role are left out.

    Args:
        example: Mapping with a ``'messages'`` list; each message is a mapping
            with ``'role'`` and ``'content'`` keys.

    Returns:
        The concatenated turns, or an empty string when there are none.
    """
    known_roles = ('system', 'user', 'assistant')
    turns = []
    for turn in example['messages']:
        speaker, body = turn['role'], turn['content']
        if speaker in known_roles:
            turns.append(f"<|{speaker}|>{body}</s>\n")
    return "".join(turns)

# Load both dataset files and turn every record into one training string.
files_list = ['dataset_cookbot.jsonl', 'dataset_cookbot_part2.jsonl']
data = load_jsonl_files(files_list)
# load_jsonl_files skips paths that do not exist, so an empty result most
# likely means the upload step was missed. Stop here with a clear message
# instead of failing later inside tokenization/training.
if not data:
    raise ValueError(
        f"No training samples were loaded from {files_list}; "
        "upload the dataset files before running this cell."
    )
formatted = [{'text': format_conversation(d)} for d in data]
dataset = Dataset.from_list(formatted)
print(f"Loaded {len(dataset)} samples")


In [None]:
# 4. Load model GPT-2
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# Base checkpoint to fine-tune.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# Register the chat role markers used by the training text as special tokens.
tokenizer.add_special_tokens({'additional_special_tokens': ['<|system|>', '<|user|>', '<|assistant|>']})

# Load the weights in float32. Mixed precision is switched on by fp16=True in
# the training cell; combining that with float16 master weights can leave the
# trainable parameters in float16 and abort training with
# "Attempting to unscale FP16 gradients" (depends on the installed PEFT version).
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, device_map="auto")
# The vocabulary grew by the three role markers, so resize the embeddings to match.
model.resize_token_embeddings(len(tokenizer))
# NOTE(review): LoRA below only targets c_attn/c_proj, so the embedding rows of
# the newly added tokens are presumably never trained - confirm whether they
# should be made trainable as well.

# LoRA config: rank-16 adapters on the listed projection modules.
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["c_attn", "c_proj"], lora_dropout=0.1, task_type="CAUSAL_LM")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


In [None]:
# 5. Tokenize and train
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

def tokenize_fn(examples):
    """Tokenize a batch of formatted conversations.

    Args:
        examples: Batch mapping whose ``'text'`` entry holds the strings to
            encode.

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        512 tokens per sample.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, max_length=512, padding='max_length')
    return encoded

# Tokenize the whole dataset and drop the raw text column.
tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=['text'])
# Hold out 10% of the samples; the fixed seed makes the split reproducible
# across notebook runs.
split = tokenized.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./cookbot-model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 samples per optimizer step per device
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    # Disable external experiment trackers: otherwise an installed tracker
    # (e.g. Weights & Biases) may interrupt training to ask for an API key.
    report_to="none",
)

# NOTE(review): an eval_dataset is supplied but no evaluation strategy is set
# above, so evaluation presumably never runs during training - confirm intent.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split['train'],
    eval_dataset=split['test'],
    # mlm=False selects causal (next-token) language modelling.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Run the fine-tuning.
trainer.train()


In [None]:
# 6. Test model
def generate(prompt):
    """Generate CookBot's reply to a single user message.

    The message is wrapped in the same turn format the training text uses
    (``<|system|>``, ``<|user|>``, ``<|assistant|>`` turns closed by ``</s>``),
    up to 200 new tokens are sampled, and only the first assistant turn is
    returned.

    Args:
        prompt: The user's message as plain text.

    Returns:
        The generated reply with surrounding whitespace stripped.
    """
    # System prompt (Vietnamese): "You are CookBot - a food-advice AI."
    # The literal was mis-encoded; it is restored to proper UTF-8 here.
    inp = f"<|system|>Bạn là CookBot - AI tư vấn món ăn.</s>\n<|user|>{prompt}</s>\n<|assistant|>"
    inputs = tokenizer(inp, return_tensors="pt").to(model.device)
    # After training the model is still in train mode; switch to eval so
    # dropout does not perturb generation.
    model.eval()
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=200, temperature=0.7, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    # Decode only the newly generated tokens, so nothing from the prompt
    # (including markers typed by the user) can be mistaken for the reply.
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=False)
    # Keep the text up to the first end-of-turn, EOS or role marker.
    stop_markers = ('</s>', tokenizer.eos_token, '<|system|>', '<|user|>', '<|assistant|>')
    for marker in stop_markers:
        if marker:  # eos_token could be unset; str.split(None) would split on whitespace
            response = response.split(marker)[0]
    return response.strip()

# Smoke-test the fine-tuned model with a few Vietnamese messages
# ("Hello", "What should I eat today", "I want to lose weight").
# The literals were mis-encoded; they are restored to proper UTF-8 so the
# model receives real Vietnamese text.
for q in ["Xin chào", "Hôm nay ăn gì", "Tôi muốn giảm cân"]:
    print(f"User: {q}")
    print(f"CookBot: {generate(q)}\n")


In [None]:
# 7. Upload to Hugging Face
from huggingface_hub import login

# Authenticate first; enter a token from https://huggingface.co/settings/tokens
login()

# Merge the adapter into the model, then write model and tokenizer into the
# same local directory (model first, tokenizer second).
merged = model.merge_and_unload()
for artifact in (merged, tokenizer):
    artifact.save_pretrained("./cookbot-final")

# Push both to the same Hub repository - username: uduptit
HF_USER = "uduptit"  # <-- updated
for artifact in (merged, tokenizer):
    artifact.push_to_hub(f"{HF_USER}/cookbot-vietnamese")
print(f"Done! Model: https://huggingface.co/{HF_USER}/cookbot-vietnamese")
