In [2]:
import os

from dotenv import load_dotenv

# Configure the process environment BEFORE importing torch / transformers /
# datasets. Libraries may read the offline flags once at import time, and the
# set of visible CUDA devices is fixed once CUDA is initialised, so assigning
# these variables after the imports can silently have no effect.
load_dotenv()
os.environ['HF_DATASETS_OFFLINE'] = '0'
os.environ['TRANSFORMERS_OFFLINE'] = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Root directory for every artefact written by this notebook.
# Raises KeyError when WORK_DIR is not defined in the environment or .env file.
WORK_DIR = os.path.join(os.environ['WORK_DIR'], 'tw_llama_tags')

In [3]:
# Hugging Face Hub id of the base chat model that is fine-tuned below.
MODEL_ID = 'yentinglin/Taiwan-LLM-7B-v2.1-chat'

# `torch_dtype` and `device_map` are model-loading options. Passing them to the
# tokenizer does nothing useful, and the extra keyword arguments are kept with
# the tokenizer's init kwargs, where a torch.dtype value later breaks
# `tokenizer.save_pretrained` ("Object of type dtype is not JSON serializable").
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=os.environ['HUGGINGFACE_TOKEN'],
)

# Half-precision weights, placed automatically on the visible device(s).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    token=os.environ['HUGGINGFACE_TOKEN'],
    device_map='auto',
)

# Enable gradient checkpointing on the model before training.
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Sanity check: display the device the model reports after loading with device_map='auto'.
model.device

device(type='cuda', index=0)

In [5]:
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.float16, device_map='auto')

# # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
# messages = [
#     {
#         "role": "system",
#         "content": "你是一個人工智慧助理",
#     },
#     {   
#         "role": "user", 
#         "content": "東北季風如何影響台灣氣候？"
#     },
# ]
# prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
# print(outputs[0]["generated_text"])

In [6]:
# Load the local CSV file `tags.csv` as a single `train` split.
# The recorded output below shows two columns, `review` and `tags`, and 227 rows.
dataset = load_dataset('csv', data_files='tags.csv', split='train')
dataset

Dataset({
    features: ['review', 'tags'],
    num_rows: 227
})

In [7]:
# Reuse the end-of-sequence token as the padding token and pad on the right.
# NOTE(review): with pad == EOS, a data collator that masks padding in the labels
# would also mask genuine EOS tokens — confirm the model still learns to stop.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right' # Fix weird overflow issue with fp16 training

In [8]:
# def generate_training_prompt(example):
#     if example['tags'] == None: 
#         example['tags'] = ''
#     proc_str = ''.join([
#         '你是一個人工智慧助理,',
#         '你的任務是根據以下幾個對某間餐廳的評論來產生5~10個有關這間餐廳的標籤,',
#         '每個標籤大約三到四字, 輸出得遵守以下格式, 不需要額外的訊息:',
#         '#tag1 #tag2 #tag3 #tag4 #tag5',
#         '</s>USER:',
#         '\n\n'.join(eval(example['review'])).strip(), 
#         '</s>ASSISTANT:', 
#         example['tags']
#     ])
    
#     return proc_str

In [9]:
def generate_text(example):
    """Build the supervised fine-tuning text for one dataset row.

    Args:
        example: Mapping with a ``review`` entry (the string form of a Python
            list of review strings, e.g. ``"['good', 'cheap']"``) and a
            ``tags`` entry (the target tag string, or ``None`` when missing).

    Returns:
        A new dict with the original ``review``, the ``tags`` value (``None``
        replaced by ``''``) and ``text``: instruction + ``</s>USER:`` + the
        reviews joined by blank lines + ``</s>ASSISTANT:`` + tags.

    Raises:
        ValueError, SyntaxError: if ``review`` is not a valid Python literal.
    """
    # Function-scope import so this cell does not depend on another cell for `ast`.
    import ast

    # Do not write back into `example`; the caller's row is left untouched.
    tags = '' if example['tags'] is None else example['tags']

    # The column holds data, not code: literal_eval parses list/str literals
    # only, whereas eval() would execute whatever expression the CSV contains.
    reviews = ast.literal_eval(example['review'])

    proc_str = ''.join([
        '你是一個人工智慧助理,',
        '你的任務是根據以下幾個對某間餐廳的評論來產生5~10個有關這間餐廳的標籤,',
        '每個標籤大約三到四字, 輸出得遵守以下格式, 不需要額外的訊息:',
        '#tag1 #tag2 #tag3 #tag4 #tag5',
        '</s>USER:',
        '\n\n'.join(reviews).strip(),
        '</s>ASSISTANT:',
        tags,
    ])

    return {
        'review': example['review'],
        'tags': tags,
        'text': proc_str,
    }

In [10]:
def get_processed_dataset():
    """Return the global `dataset`, shuffled with seed 42 and mapped through `generate_text`."""
    shuffled = dataset.shuffle(seed=42)
    with_text = shuffled.map(generate_text)
    # .remove_columns() is intentionally not applied; all columns are kept.
    return with_text

In [11]:
# Rebind `dataset` to the shuffled copy that also carries the `text` column.
# NOTE(review): this overwrites the raw dataset that get_processed_dataset() reads,
# so re-running this cell shuffles already-shuffled data and changes the row order.
dataset = get_processed_dataset()
dataset

Dataset({
    features: ['review', 'tags', 'text'],
    num_rows: 227
})

In [12]:
from peft import LoraConfig, TaskType
from trl import SFTTrainer

In [13]:
# Load LoRA configuration
# Rank-16 adapter for causal language modelling; every option not listed here
# is left at the LoraConfig default.
peft_config = LoraConfig(r=16, task_type="CAUSAL_LM")

In [14]:
# Directory that receives the training output for the LoRA adapter.
adapter_output_dir = os.path.join(WORK_DIR, 'llama_tags')

# Training hyper-parameters. Batch size 1 with 8 accumulation steps gives
# 8 examples per optimizer step on each device.
training_arguments = TrainingArguments(
    output_dir=adapter_output_dir,
    seed=42,
    num_train_epochs=15,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
    logging_steps=50,
    # NOTE(review): -1 presumably disables periodic checkpoints — confirm.
    save_steps=-1,
    save_safetensors=True,
    # report_to='tensorboard',
)

# Supervised fine-tuning trainer: trains on the `text` column and applies the
# LoRA configuration to the base model.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field='text',
    max_seq_length=2048,
    # packing=False,
)

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [15]:
# Fine-tune, then persist the LoRA adapter explicitly. No other visible cell
# writes it, yet a later cell reloads the adapter from this same output
# directory, so a fresh Restart & Run All would otherwise find nothing to load.
# The adapter is saved through the PEFT model rather than trainer.save_model()
# so that only the adapter files are written (the tokenizer is not re-saved).
train_result = trainer.train()
trainer.model.save_pretrained(trainer.args.output_dir, safe_serialization=True)
train_result

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
50,1.5359
100,1.3022
150,1.2487
200,1.2372
250,1.2157
300,1.2051
350,1.1979
400,1.1945


TrainOutput(global_step=420, training_loss=1.2636299224126906, metrics={'train_runtime': 3773.5643, 'train_samples_per_second': 0.902, 'train_steps_per_second': 0.111, 'total_flos': 1.6392363391790285e+17, 'train_loss': 1.2636299224126906, 'epoch': 14.8})

In [36]:
import ast  # stdlib; imported here so this cell runs without edits to the import cell

# `model` and `tokenizer` are already instantiated (dtype and device placement
# were fixed when the model was loaded), so no loading options are passed here.
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Use the first row of the processed training set as a smoke-test input.
# Row indexing avoids materialising the whole `review` column for one value.
review = dataset[0]['review']

# The `review` column stores the string form of a list of reviews. Parse it as
# a literal instead of eval(), which would execute arbitrary expressions.
review_texts = ast.literal_eval(review)

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
# NOTE(review): the training text built by generate_text() starts with
# '你是一個人工智慧助理,' and uses literal '</s>USER:' / '</s>ASSISTANT:' markers,
# while this prompt comes from the chat template. The recorded output shows no
# 'ASSISTANT:' marker and no generated tags — confirm that the inference prompt
# matches the training format.
messages = [
    {
        'role': 'system',
        'content': ''.join([# '你是一個人工智慧助理,',
                            '你的任務是根據以下幾個對某間餐廳的評論來產生5~10個有關這間餐廳的標籤,',
                            '每個標籤大約三到四字, 輸出得遵守以下格式, 不需要額外的訊息:',
                            '#tag1 #tag2 #tag3 #tag4 #tag5']),
    },
    {
        'role': 'user',
        'content': '\n\n'.join(review_texts).strip()
    },
]

prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=50, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]['generated_text'])

你的任務是根據以下幾個對某間餐廳的評論來產生5~10個有關這間餐廳的標籤,每個標籤大約三到四字, 輸出得遵守以下格式, 不需要額外的訊息:#tag1 #tag2 #tag3 #tag4 #tag5</s>USER: 印象中這應該是二訪

第一次點牛丼飯印象不深刻，只覺得好像份量少偏鹹

這次想說看到評價說蛋好吃，因此決定咖喱和滑蛋一次機會
所以點的是招牌雞飯

首先看到包裝方式分開裝覺得很用心，這樣就不會要吃的時候覺得肉和飯都濕濕的粘膩

雞的話是類似唐揚雞，覺得炸得很剛好很不錯，也不會有皮太多的問題，印象中是七塊左右

滑蛋也是真的好吃量很多！
咖喱的話不好辣也覺得好吃
飯煮的比較粒粒分明，搭配著吃很不錯

雖然看到評論很多人說很貴，確實但就包裝的方式會覺得好像很空虛，但其實在吃其他家丼飯之後會發現其實價格是差不多的

大概最主要的差別會是這邊如果內用就是在露天的車庫吃，所以用餐環境很難比不上連鎖丼飯店

所以整體來說是推薦的！但如果很餓的話，大胃口的人有可能就不會飽，然後夏天不建議內用，冬天可能可以試看看

直接講結論好了
單論味道絕對值得一吃 :double exclamation mark:  :double exclamation mark: 
可以解解想吃丼飯的纏
但就是吃不太飽 :grinning face with sweat: 

我本次是點了炸豬排丼飯，我覺得整體而言還不錯，滑蛋的水準真的十分頂，配上些許點綴的海苔粉，真的是超級下飯，但問題來了，飯量根本不夠，吃完感覺會有點空虛，大食量的男生一定吃不飽，此外價位cp值也是一大硬傷，份量不夠就算了，如果能附一碗味噌湯或許感受就會好很多。

阿如果是夏天要來的要考慮一下時間，因為用餐環境算是半露天的，夏日炙熱的艷陽絕對會讓人吃完滿頭大汗。

吃起來就是咖哩塊溶於水，沒有其他調味，炸雞肉粉還蠻多的，吃起來有點空虛，以這樣來說160元一份有點overpriced。這也可以看出新竹物價比台北高很多，在台北用160元可以吃到way better的咖哩飯，但老闆人還不錯就是了

位在寶山路巷子內的咖喱飯！
咖喱味道偏甘甜 沒有辣味！
豬排炸得恰到好處！
價格感受因人而意 看大家能不能接受！

 :round pushpin: 豬排咖哩飯$140
很喜歡豬排，超嫩！
只是份量有點偏少
老闆非常的親切～～～</s>

In [19]:
# Display the PEFT-wrapped model held by the trainer to see where LoRA layers were injected.
trainer.model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(
                in_featur

In [23]:
from peft import AutoPeftModelForCausalLM

# Reload the fine-tuned adapter (and its base model) from the training output directory.
adapter_dir = os.path.join(WORK_DIR, 'llama_tags')
trained_model = AutoPeftModelForCausalLM.from_pretrained(adapter_dir, low_cpu_mem_usage=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
# Merge the adapter into the base model via merge_and_unload(), then write the
# resulting model to disk with safetensors serialization.
merged_dir = os.path.join(WORK_DIR, 'llama_tags_merged')
merged_model = trained_model.merge_and_unload()
merged_model.save_pretrained(save_directory=merged_dir, safe_serialization=True)

In [38]:
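# Disabled: saving the tokenizer raised
# `TypeError: Object of type dtype is not JSON serializable` (see the output below),
# most likely because a torch dtype was stored among the tokenizer's init kwargs.
# Re-enable once the tokenizer is loaded without `torch_dtype` / `device_map`.
# tokenizer.save_pretrained(save_directory=os.path.join(WORK_DIR, 'llama_tags_merged'), 
#                           safe_serialization=True)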

TypeError: Object of type dtype is not JSON serializable