In [16]:
import os
import re
from utils import read_json
from langchain.document_loaders import PyPDFLoader

from langchain.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage

In [12]:
# Read settings through the project helper `read_json`; the two keys below are
# the only entries this cell uses.
configs = read_json('configs.json')
DATA_PATH = configs["DATA_PATH"]
DB_PATH = configs['DB_PATH']
# os.listdir() returns entries in arbitrary order, but later cells select files
# by position (files_list[0] ... files_list[4]), so sort to keep those indices
# stable across runs and machines. Sub-directories are skipped.
files_list = sorted(
    file
    for file in os.listdir(DATA_PATH)
    if os.path.isfile(os.path.join(DATA_PATH, file))
)

def read_pdf(files_list, data_path=DATA_PATH):
    """Load one or more PDFs and return their text as a single string.

    Args:
        files_list: Iterable of file names located under ``data_path``.
        data_path: Directory that contains the files. Defaults to ``DATA_PATH``.

    Returns:
        The ``page_content`` of every page of every listed file, concatenated
        in order, with all newline characters removed. An empty ``files_list``
        yields an empty string.
    """
    page_texts = []
    for file in files_list:
        # os.path.join is correct whether or not data_path ends with a path
        # separator (plain string concatenation silently built a wrong path
        # when it did not).
        loader = PyPDFLoader(os.path.join(data_path, file))
        page_texts.extend(page.page_content for page in loader.load())

    # Text is accumulated across *all* files; previously the accumulator was
    # reset for each file, so only the last file's text was returned, and an
    # empty files_list raised UnboundLocalError.
    return "".join(page_texts).replace('\n', '')

In [6]:
# Display the discovered file names; later cells select files by index into this list.
files_list

['民政-108內正1.pdf',
 '測量重劃-102內正17.pdf',
 '都市計畫-102內正22.pdf',
 '都市計畫-102內正30.pdf',
 '都市計畫-107內正2.pdf']

In [13]:
# Read the first five discovered files one at a time (one read_pdf call per
# file, in list order) and bind each result to its own name.
(
    pdf_108_1,
    pdf_102_17,
    pdf_102_22,
    pdf_102_30,
    pdf_107_2,
) = [read_pdf(files_list=[files_list[idx]]) for idx in range(5)]

In [9]:
# LangChain ChatOllama client for the 'llama3' model, with verbose enabled.
llm = ChatOllama(model='llama3', verbose=True)

In [18]:
def extract_content(input, pattern):
    """Return the stripped first capture group of ``pattern`` found in ``input``.

    Args:
        input: Text to search. The name shadows the builtin but is kept so
            keyword callers continue to work.
        pattern: Regular expression containing at least one capture group.
            It is applied with ``re.S`` so ``.`` also matches newlines.

    Returns:
        ``match.group(1)`` with leading/trailing whitespace removed, or
        ``None`` when the pattern does not match.
    """
    match = re.search(pattern, input, re.S)

    if match is None:
        # Previously this path fell through to `return result` with `result`
        # never assigned, raising UnboundLocalError right after the message.
        print("No match found")
        return None

    result = match.group(1).strip()
    print("Extraction succeed.")
    return result

In [19]:
# Capture the text that sits between the "案由：" marker (optional whitespace
# allowed between the two characters) and the "參、事實與理由：" marker.
pattern = r"案\s*由：(.*?)參、事實與理由："
(
    pdf_108_1_reason,
    pdf_102_17_reason,
    pdf_102_22_reason,
    pdf_102_30_reason,
    pdf_107_2_reason,
) = [
    extract_content(pdf_text, pattern)
    for pdf_text in (pdf_108_1, pdf_102_17, pdf_102_22, pdf_102_30, pdf_107_2)
]

Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.


In [23]:
# Ask the chat model to summarise the section extracted from the first PDF.
system_message = SystemMessage(
    content="你是一個專業的中文AI助手，用繁體中文回答所有的問題，即使問題是英文也要用中文回答"
)
summary_request = HumanMessage(
    content=f"對以下文件作內容摘要整理:{pdf_108_1_reason},用繁體中文回覆"
)
messages = [system_message, summary_request]
model_response = llm.invoke(messages)

In [24]:
# Show the complete response object returned by llm.invoke, not just its text.
model_response

AIMessage(content='以下為內容摘要整理：\n\n**問題：** 內政部制定殯葬管理條例時，未能務實通盤瞭解殯葬業之問題。\n\n**結果：** 導致相關之規範及函釋未盡周延，歷時 10多年竟未能對症化解爭議，滋生諸多民怨，確有怠失。\n\n**需求：** 根據法定程序，提案糾正問題，提高殯葬管理的實質性和效率。', response_metadata={'model': 'llama3', 'created_at': '2024-05-10T06:55:23.836676Z', 'message': {'role': 'assistant', 'content': ''}, 'done': True, 'total_duration': 39828089500, 'load_duration': 4187500, 'prompt_eval_count': 12, 'prompt_eval_duration': 2895808000, 'eval_count': 128, 'eval_duration': 36919362000}, id='run-1d44c357-6899-4628-9bef-5bb859c7fe54-0')

In [30]:
# Ask the chat model to describe the extracted section as subject/relation triples.
system_message = SystemMessage(
    content="你是一個專業的中文AI助手，用繁體中文回答所有的問題，即使問題是英文也要用中文回答"
)
triple_request = HumanMessage(
    content=f"閱讀以下內容並以三元組方式描述以上內容中的主體與關係:{pdf_108_1_reason} \n 以繁體中文回答。"
)
messages = [system_message, triple_request]
model_response = llm.invoke(messages)

In [33]:
# Save the model's reply text. The content is Chinese, so the encoding is
# pinned to UTF-8 rather than left to the platform default, which can raise
# UnicodeEncodeError or produce a file other tools cannot read.
with open('三元組test.txt', 'w', encoding='utf-8') as file:
    file.write(model_response.content)

In [34]:
# Feed the previous reply back to the model and ask for it as
# (主體, 關係, 對象) JSON.
system_message = SystemMessage(
    content="你是一個專業的中文AI助手，用繁體中文回答所有的問題，即使問題是英文也要用中文回答"
)
json_request = HumanMessage(
    content=f"將以下三元組用（主體，關係，對象）json格式來撰寫:\n{model_response.content} \n 以繁體中文回答。"
)
messages = [system_message, json_request]
tuple_json = llm.invoke(messages)

In [36]:
# Inspect the raw text of the JSON-conversion reply.
tuple_json.content

'Here are the three tuples written in JSON format with the labels (主體, 關係, 對象) and answers in Traditional Chinese:\n\n```\n[\n  {\n    "主體": "內政部 (Ministry of the Interior)",\n    "關係": "為殯葬業的管理者，制定殯葬管理條例以規範殯葬業的運作。",\n    "對象": "殯葬業"\n  },\n  {\n    "主體": "殯葬業",\n    "關係": "需要法制的預期和掌握，以便在運作中能夠順暢地運行。",\n    "對象": "法制"\n  },\n  {\n    "主體": "法制",\n    "關係": "是內政部和殯葬業之間的關係，透過法制來規範殯葬業的運作。",\n    "對象": "內政部, 殯葬業"\n  }\n]\n```'

In [37]:
# Save the JSON-formatted reply text. The content is Chinese, so the encoding
# is pinned to UTF-8 rather than left to the platform default, which can raise
# UnicodeEncodeError or produce a file other tools cannot read.
with open('三元組_json_test.txt', 'w', encoding='utf-8') as file:
    file.write(tuple_json.content)

In [39]:
from llama_cpp import Llama

# NOTE(review): this rebinds `llm`, which an earlier cell bound to a ChatOllama
# instance; any `llm.invoke(...)` cell re-run after this one sees the Llama object.
# Optional Llama() arguments left disabled here:
#   n_gpu_layers=-1  -> use GPU acceleration
#   seed=1337        -> set a specific seed
#   n_ctx=2048       -> increase the context window
llm = Llama(model_path="./models/7B/llama-model.gguf")

# Generate a completion (create_completion can also be called).
#   max_tokens: generate up to 32 tokens; None generates up to the end of the context window
#   stop:       stop just before the model would generate a new question
#   echo:       echo the prompt back in the output
output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=32,
    stop=["Q:", "\n"],
    echo=True,
)
print(output)

ValueError: Model path does not exist: ./models/7B/llama-model.gguf