In [1]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM

import pandas as pd
import pickle

from dotenv import load_dotenv
import os 

# Load variables from a .env file into the process environment before
# anything below reads os.environ.
load_dotenv()

# Set the Hugging Face offline flags to '0' and restrict CUDA to device 0.
# NOTE(review): torch and transformers are already imported at this point --
# confirm these flags are still honoured when set after import.
os.environ.update({
    'HF_DATASETS_OFFLINE': '0',
    'TRANSFORMERS_OFFLINE': '0',
    'CUDA_VISIBLE_DEVICES': '0',
})

# Project working directory. WORK_DIR must be present in the environment
# (or in .env); otherwise this lookup raises KeyError.
WORK_DIR = os.path.join(os.environ['WORK_DIR'], 'tw_llama_tags')

2023-12-11 13:22:21.684319: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 13:22:21.684368: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 13:22:21.685726: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-11 13:22:21.693266: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def auto_args(x, y):
    """Load ``y`` via ``x.from_pretrained`` in float16 with automatic device mapping.

    Args:
        x: A class exposing ``from_pretrained`` (e.g. ``AutoModelForCausalLM``).
        y: Model identifier or local checkpoint path.

    Returns:
        Whatever ``x.from_pretrained`` returns for the given arguments.
    """
    return x.from_pretrained(y, torch_dtype=torch.float16, device_map='auto')


# Checkpoint directory 'llama_tags_merged' under WORK_DIR
# (presumably the merged fine-tuned weights -- confirm).
model = auto_args(AutoModelForCausalLM, os.path.join(WORK_DIR, 'llama_tags_merged'))

# torch_dtype / device_map are model-loading options, so the tokenizer is
# loaded without them instead of going through auto_args.
tokenizer = AutoTokenizer.from_pretrained('yentinglin/Taiwan-LLM-7B-v2.1-chat')

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [3]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only open
# files that were produced by a trusted step of this project.
with open('proc_results.pickle', 'rb') as f:
    results = pickle.load(f)

# Display a short preview instead of echoing every record into the notebook
# output; `results` itself still holds the full list.
results[:3]

[{'name': '肥仔龍無煙鐵板燒',
  'rating': 3.2,
  'distance': 1.2064086848736124,
  'reviews': ['2023/11\n很難想到現在還有120元可以吃到的鐵板燒\n店位在清華夜市裡\n旁邊有停車場（只能悠遊卡付款）\n店裡有兩大區料理平台\n看起來舊舊的\n平日中午1點左右到\n大概只坐了1/3\n\n點了學生套餐-牛肉（120元）包含豆芽菜、高麗菜、鯛魚片、牛肉片（不是學生也可以點學生套餐，另外有豬肉、羊肉可以選）\n點完餐後店家會送上白飯、濃湯、飲料杯\n濃湯、飲料喝到飽\n餐點味道都算正常\n豆芽菜、高麗菜都很脆口\n鯛魚片略乾\n牛肉片是一般黑胡椒口味\n每個人的餐點都集中放在一個盤子裡\n師傅料理完就直接堆疊起來\n會有味道互相干擾的情形\n個人比較不喜歡\n\n現場看不管是加辣、不吃豆芽菜換高麗菜等，都能配合客製調整，服務真的不錯\n\n整體來說CP值超高\n靠近學校的學生美食\n有順路經過值得一試',
   '便宜\n吃的很飽\n只是有點油膩\n畢竟要服務的人太多了\n一個學生餐，可能把附近餐飲打趴\n值得來試試CP值超高的本土鐵板燒',
   '可以吃得飽的地方\n還有120的學生餐，就有肉跟魚片\n來吃過就知道，什麼叫做吃飽\n店家真是佛心來的\n\n與對面的大埔完全不一樣的風格\n值得一試……',
   '每每經過人潮洶湧，絡繹不絕的～But.就是很不錯鐵板燒餐廳，雖然環境有太多不乾淨，但是呢，但是呢\n他們餐點是真的很好吃 :face savoring food: ，價格算可以，真的很好吃這一家也是跟同學甲粗飽唷！\n價格甜，餐點甜，吃飽也甜。\n這家店家甚麼都ok唯獨環境就～～需要改善了。',
   '有兩隻很可愛的小熊維尼師傅在炒菜\n只能說很油很鹹然後又很不乾淨\n但坦白來說在鐵板燒界這個價格真的算便宜了～\n\n120元（羊肉+豆芽+高麗菜+飯）另外濃湯紅茶喝到飽！非常划算！但沒有到很好吃\n\n看到把放在地上的桶子拿到鐵板上來放\n然後口罩沒戴好⋯\n\n肥仔龍\n吃了可能會變不健康的小肥仔']},
 {'name': '經濟部專業人員研究中心餐廳',
  'rating': 0,
  'distance': 0.7045454962066899,
  'rev

In [6]:
# Text-generation pipeline wrapping the already-loaded model and tokenizer.
# return_full_text=False asks the pipeline to return only the newly generated
# text, without the prompt.
# NOTE(review): `model` is already instantiated in float16 with
# device_map='auto'; torch_dtype / device_map here may be redundant -- confirm.
pipe_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map='auto',
    return_full_text=False,
)
pipe = pipeline(task='text-generation', **pipe_kwargs)

def generate_prompt(reviews):
    """Build the chat-formatted prompt asking the model to summarise ``reviews``.

    The reviews are joined with blank lines into a single user message,
    preceded by a fixed system instruction, and rendered with the tokenizer's
    chat template
    (see https://huggingface.co/docs/transformers/main/en/chat_templating).

    Args:
        reviews: Iterable of review strings for one restaurant.

    Returns:
        The templated prompt as text (``tokenize=False``) with the generation
        prompt appended (``add_generation_prompt=True``).
    """
    # System instruction (Traditional Chinese): summarise the reviews within
    # 100 characters, focus on the restaurant's characteristics, and leave out
    # remarks about its location.
    system_prompt = (
        '你的任務是根據以下幾個對某間餐廳的評論，'
        '以100字以內總結這些評論。'
        '主要針對餐廳的特點去做摘要，不要把有關餐廳地點的論述包括進來。'
    )
    user_prompt = '\n\n'.join(reviews).strip()

    chat = [
        dict(role='system', content=system_prompt),
        dict(role='user', content=user_prompt),
    ]

    chat_tokenizer = pipe.tokenizer
    return chat_tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    
def get_tags_from_llama(reviews, *, max_new_tokens=300, temperature=0.6,
                        top_k=50, top_p=0.95):
    """Generate a summary of ``reviews`` with the text-generation pipeline.

    Reviews that are ``None``, empty, or whitespace-only are dropped first. If
    nothing usable remains, an empty string is returned without calling the
    model, so the model is never asked to summarise an empty message.

    Args:
        reviews: List of review strings for one restaurant (may be empty or None).
        max_new_tokens: Upper bound on generated tokens passed to the pipeline.
        temperature: Sampling temperature passed to the pipeline.
        top_k: Top-k sampling cutoff passed to the pipeline.
        top_p: Nucleus sampling cutoff passed to the pipeline.

    Returns:
        The ``generated_text`` of the first pipeline output, or ``''`` when
        there is no usable review text.
    """
    usable_reviews = [review for review in (reviews or [])
                      if review and review.strip()]
    if not usable_reviews:
        return ''

    # do_sample=True: output is sampled, so repeated calls can differ.
    outputs = pipe(generate_prompt(usable_reviews),
                   max_new_tokens=max_new_tokens,
                   do_sample=True,
                   temperature=temperature,
                   top_k=top_k,
                   top_p=top_p)
    return outputs[0]['generated_text']

In [7]:
# Summarise each restaurant's reviews, collecting the bare summaries in `tags`
# and review/summary pairs in `data`.
data, tags = [], []

for idx, result in enumerate(results):
    reviews = result['reviews']
    tags_from_llama = get_tags_from_llama(reviews)

    # Echo each summary as soon as it is produced so progress is visible.
    print(idx, tags_from_llama)

    tags += [tags_from_llama]
    data += [dict(reviews=reviews, summarization=tags_from_llama)]