In [None]:
## UDN news GPT: extract named entities (NER) from UDN news articles with a GPT chat model

In [None]:
import pandas as pd
import os
import re
import time
import tiktoken
import numpy as np
import openai
import jieba
from openai.embeddings_utils import get_embedding, cosine_similarity
from TCSP import read_stopwords_list
from pandarallel import pandarallel
from tqdm import tqdm
from time import sleep
from tqdm import tqdm

In [None]:
# Configure pandarallel: 5 workers, progress bars enabled, verbose=0.
# NOTE(review): none of the cells visible here call `parallel_apply`, so this setup may be unused.
pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=5)

# Register tqdm's pandas integration; its progress bars are labelled 'GPT Processing'.
tqdm.pandas(desc='GPT Processing')

In [None]:
def num_tokens_from_messages(messages, model="gpt-4-32k-0314"):
    """Estimate the prompt token count of a chat ``messages`` list.

    Args:
        messages: Iterable of message dicts (e.g. ``{"role": ..., "content": ...}``);
            every value of every dict is tokenized and counted.
        model: Model name used to pick the tokenizer and the per-message overhead.
            Unpinned "gpt-3.5-turbo*" / "gpt-4*" names are counted as their
            ``-0613`` snapshot after printing a warning.

    Returns:
        int: Tokens for all message values, plus the per-message and per-name
        overheads, plus 3 tokens that prime the assistant's reply.

    Raises:
        NotImplementedError: If ``model`` is not one of the recognised families.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Snapshots that share the same fixed chat-format overhead.
    pinned_snapshots = {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    }
    if model in pinned_snapshots:
        per_message, per_name = 3, 1
    elif model == "gpt-3.5-turbo-0301":
        # Every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # when a name is present the role is omitted, hence the -1.
        per_message, per_name = 4, -1
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    total = 0
    for message in messages:
        total += per_message
        total += sum(len(encoding.encode(text)) for text in message.values())
        if "name" in message:
            total += per_name
    # Every reply is primed with <|start|>assistant<|message|>.
    return total + 3

In [None]:
def send_message(content, model_name, max_response_tokens=1000):
    """Send one article table to the chat model and return the raw API response.

    The OpenAI client is configured from environment variables so that no
    credentials live in the notebook: ``OPENAI_API_TYPE``, ``OPENAI_API_KEY``,
    ``OPENAI_API_BASE`` and ``OPENAI_API_VERSION``.

    Args:
        content: The article rendered as text (the callers pass a markdown table);
            it is interpolated into the system prompt.
        model_name: Passed to the API as ``engine``.
        max_response_tokens: Upper bound on tokens generated for the reply.

    Returns:
        The response object returned by ``openai.ChatCompletion.create``.

    Raises:
        KeyError: If one of the four environment variables is not set.
    """
    # Read the API settings from the environment instead of hardcoding them.
    openai.api_type = os.environ["OPENAI_API_TYPE"]
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.api_base = os.environ["OPENAI_API_BASE"]
    openai.api_version = os.environ["OPENAI_API_VERSION"]

    # Below is your prompt engineering
    question = f'''
                [INSERT YOUR OWN PROMPT ENGINEERING]
                '''

    prompt = f"你是一位專業的NLP工程師,請你參照表格內容請你幫我執行NER任務，表格：{content}，根據{question}處理"

    # The question is sent twice on purpose: embedded in the system prompt and
    # again as the user turn.
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": question},
    ]

    # Show the estimated prompt size before the request is made.
    print(num_tokens_from_messages(messages))
    response = openai.ChatCompletion.create(
        engine=model_name,
        messages=messages,
        temperature=0,
        max_tokens=max_response_tokens,
        top_p=0.9,
        frequency_penalty=0,
        presence_penalty=0,
    )

    return response

In [None]:
def c2d(title,herf,full,date,tag):
    """Render one article as a single-row markdown table.

    Args:
        title: Article title (column ``title``).
        herf: Article URL; stored in column ``href`` wrapped in an HTML anchor
            whose link text is the URL itself.
        full: Article body (column ``full``).
        date: Publication date (column ``Date``).
        tag: Article tag (column ``tag``).

    Returns:
        str: The markdown text produced by ``DataFrame.to_markdown()``
        (pandas needs the optional ``tabulate`` package for this).
    """
    anchor = f'<a href="{herf}">{herf}</a>'
    fields = {'title': title, 'href': anchor, 'full': full, 'Date': date, 'tag': tag}
    # One-element lists -> a frame with exactly one row, columns in the order above.
    single_row = pd.DataFrame({column: [value] for column, value in fields.items()})
    return single_row.to_markdown()

In [None]:
# Chat model identifier; generate_content passes it to send_message, which uses it as `engine`.
CHAT_MODEL_NAME = "gpt4-32k"
# Embedding model identifier; not referenced by any cell visible here.
EMBED_MODEL_NAME = "ada002"
# Token budget for each model reply (forwarded to the API as `max_tokens`).
MAX_RESPONSE_TOKENS = 10000
# presumably the model's total context window in tokens — TODO confirm
overall_max_tokens = 32000
# Tokens left for the prompt after reserving the reply budget.
# NOTE(review): computed but not enforced by any cell visible here.
prompt_max_token = overall_max_tokens - MAX_RESPONSE_TOKENS

In [None]:
# Field labels searched for in the model's reply, in the order they are stored.
_GENERATED_INFO_FIELDS = (
    "罪名",
    "觸犯法規",
    "姓名",
    "年齡",
    "頭銜",
    "職/產業",
    "新聞報導地點",
    "發生地點",
    "內文摘要",
)


def extract_generated_info(response_message):
    """Parse ``"<label>: <value>"`` lines out of the model's reply.

    For each known label, the first occurrence of ``"<label>: "`` is located and
    the rest of that line is taken as the value. A field on the last line is
    captured even when the reply does not end with a newline.

    Args:
        response_message: Text content of the chat completion.

    Returns:
        dict: Maps each label that was found to its value string. Labels that do
        not appear in the reply are omitted. Keys follow the fixed label order.

    Side effects:
        Prints the raw reply and the extracted dict.
    """
    extracted_info = {}

    for field in _GENERATED_INFO_FIELDS:
        # `.` never matches a newline, so `(.*)` stops at the end of the line
        # (or at the end of the string when there is no trailing newline).
        match = re.search(rf"{re.escape(field)}: (.*)", response_message)
        if match:
            extracted_info[field] = match.group(1)

    print(response_message)
    print(extracted_info)

    return extracted_info

In [None]:
def generate_content(content):
    """Run one article row through the chat model and parse the reply.

    Args:
        content: A mapping-like row whose ``'md'`` entry holds the article text
            to send (the notebook passes DataFrame rows).

    Returns:
        dict: The fields extracted from the first choice of the model response.
    """
    reply = send_message(content['md'], CHAT_MODEL_NAME, MAX_RESPONSE_TOKENS)
    first_choice = reply['choices'][0]
    return extract_generated_info(first_choice['message']['content'])

In [None]:
res = pd.read_csv('udn_news.csv')
# Work on rows 15-19 only. `.copy()` makes an independent frame, so adding
# columns below does not trigger SettingWithCopyWarning on a slice of `res`.
content = res[['title','href','content',"Date",'tag']][15:20].copy()
# Add an 'md' column: each article rendered as a one-row markdown table.
content['md'] = [
    c2d(row['title'], row['href'], row['content'], row['Date'], row['tag'])
    for _, row in content.iterrows()
]

# Time the GPT calls themselves (one request per article).
start_time = time.time()
generated_info = [
    generate_content(row)
    for _, row in tqdm(content.iterrows(), total=len(content), desc='GPT Processing')
]
end_time = time.time()
execution_time = end_time - start_time

print(f"GPT總結{len(content)}篇新聞所需時間：{execution_time:.6f} 秒")

# Expand the per-article dicts into columns. Use the union of keys over all
# rows (first-seen order) and `.get`, so a reply that lacks a field yields an
# empty cell instead of raising KeyError.
info_columns = list(dict.fromkeys(key for info in generated_info for key in info))
for col in info_columns:
    content[col] = [info.get(col) for info in generated_info]

# The markdown rendering was only needed as model input.
content = content.drop(columns=['md'])

content.to_csv('udn_news_Categorized.csv',index=False, encoding="utf-8_sig")