In [11]:
import os
import re
import json
from utils import read_json
from langchain.document_loaders import PyPDFLoader

from langchain.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
configs = read_json('configs.json')
DATA_PATH = configs["DATA_PATH"]
DB_PATH = configs['DB_PATH']
files_list = [file for file in os.listdir(DATA_PATH) if os.path.isfile(os.path.join(DATA_PATH, file))]

def read_pdf(files_list, data_path=DATA_PATH):
    
    for file in files_list:
        loader = PyPDFLoader(data_path + file)
        content = loader.load()
        whole_pdf = ""
        for i in range(len(content)):
            whole_pdf += content[i].page_content

    whole_pdf = whole_pdf.replace('\n', '')
    return whole_pdf

In [3]:
files_list

['民政-108內正1.pdf',
 '測量重劃-102內正17.pdf',
 '都市計畫-102內正22.pdf',
 '都市計畫-102內正30.pdf',
 '都市計畫-107內正2.pdf']

In [4]:
pdf_108_1= read_pdf(files_list=[files_list[0]])
pdf_102_17= read_pdf(files_list=[files_list[1]])
pdf_102_22= read_pdf(files_list=[files_list[2]])
pdf_102_30= read_pdf(files_list=[files_list[3]])
pdf_107_2= read_pdf(files_list=[files_list[4]])

In [8]:
def extract_content(input, pattern):

    # Search using the pattern
    match = re.search(pattern, input, re.S)

    if match:
        result = match.group(1).strip()  # Use strip() to remove any leading/trailing whitespace
        print("Extraction succeed.")
    else:
        print("No match found")

    return result

In [9]:
pattern = r"案\s*由：(.*?)參、事實與理由："
pdf_108_1_reason = extract_content(pdf_108_1, pattern)
pdf_102_17_reason = extract_content(pdf_102_17, pattern)
pdf_102_22_reason = extract_content(pdf_102_22, pattern)
pdf_102_30_reason = extract_content(pdf_102_30, pattern)
pdf_107_2_reason = extract_content(pdf_107_2, pattern)

Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.
Extraction succeed.


# Llama 3

In [5]:
llm = ChatOllama(
    model = 'llama3',
    verbose = True
)

In [23]:
messages = [
    SystemMessage(
        content="你是一個專業的中文AI助手，用繁體中文回答所有的問題，即使問題是英文也要用中文回答"
        ), 
    HumanMessage(
        content=f"對以下文件作內容摘要整理:{pdf_108_1_reason},用繁體中文回覆"
    )
]
model_response = llm.invoke(messages)

In [24]:
model_response

AIMessage(content='以下為內容摘要整理：\n\n**問題：** 內政部制定殯葬管理條例時，未能務實通盤瞭解殯葬業之問題。\n\n**結果：** 導致相關之規範及函釋未盡周延，歷時 10多年竟未能對症化解爭議，滋生諸多民怨，確有怠失。\n\n**需求：** 根據法定程序，提案糾正問題，提高殯葬管理的實質性和效率。', response_metadata={'model': 'llama3', 'created_at': '2024-05-10T06:55:23.836676Z', 'message': {'role': 'assistant', 'content': ''}, 'done': True, 'total_duration': 39828089500, 'load_duration': 4187500, 'prompt_eval_count': 12, 'prompt_eval_duration': 2895808000, 'eval_count': 128, 'eval_duration': 36919362000}, id='run-1d44c357-6899-4628-9bef-5bb859c7fe54-0')

In [30]:
messages = [
    SystemMessage(
        content="你是一個專業的中文AI助手，用繁體中文回答所有的問題，即使問題是英文也要用中文回答"
        ), 
    HumanMessage(
        content=f"閱讀以下內容並以三元組方式描述以上內容中的主體與關係:{pdf_108_1_reason} \n 以繁體中文回答。"
    )
]
model_response = llm.invoke(messages)

In [33]:
with open('三元組test.txt', 'w') as file:
    file.write(model_response.content)

In [13]:
messages = [
    SystemMessage(
        content="你是一個專業的中文AI助手，用繁體中文回答所有的問題，即使問題是英文也要用中文回答"
        ), 
    HumanMessage(
        content=f"""三元組指的是：（實體）--關係--（實體）
（實體）通常是特定的人、組織或物體，（關係）則是描述兩個實體之間的關係，例如 （孫悟空）--徒弟--（唐僧）、（孫悟空）--師兄弟--（豬八戒）、（幸運者）--得到--（大獎）或（立法委員）--制定--（法律）
將以下這些内容以三元組方式描述並以Json格式呈現:\n{pdf_108_1_reason} \n 。"""
    )
]
tuple_json = llm.invoke(messages)

In [14]:
tuple_json.content

'以下是所給内容以三元組方式描述並以Json格式呈現：\n\n```\n[\n  {\n    "實體1": "內政部",\n    "關係": "制定",\n    "實體2": "殯葬管理條例"\n  },\n  {\n    "實體1": "",\n    "關係": "未能務實通盤瞭解",\n    "實體2": "殯葬業問題"\n  },\n  {\n    "實體1": "",\n    "關係": "預為釐清因應",\n    "實體2": ""\n  },\n  {\n    "實體1": "",\n    "關係": "未能掌握問題之所在",\n    "實體2": ""\n  },\n  {\n    "實體1": "",\n    "關係": "致相關之規範及函釋未盡周延",\n    "實體2": ""\n  },\n  {\n    "實體1": "",\n    "關係": "歷時 10多年",\n    "實體2": ""\n  },\n  {\n    "實體1": "",\n    "關係": "竟未能對症化解爭議",\n    "實體2": ""\n  },\n  {\n    "實體1": "",\n    "關係": "滋生諸多民怨",\n    "實體2": ""\n  },\n  {\n    "實體1": "",\n    "關係": "確有怠失",\n    "實體2": ""\n  },\n  {\n    "實體1": "",\n    "關係": "爰依法提案糾正",\n    "實體2": ""\n  }\n]\n```'

In [15]:
with open('三元組_json_test.txt', 'w') as file:
    file.write(tuple_json.content)

In [4]:
# from llama_cpp import Llama

# llm = Llama(
#       model_path="models/Meta-Llama-3-8B-Instruct.gguff",
#       # n_gpu_layers=-1, # Uncomment to use GPU acceleration
#       # seed=1337, # Uncomment to set a specific seed
#       # n_ctx=2048, # Uncomment to increase the context window
# )
# output = llm(
#       "Q: Name the planets in the solar system? A: ", # Prompt
#       max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
#       stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
#       echo=True # Echo the prompt back in the output
# ) # Generate a completion, can also call create_completion
# print(output)

# TAIDE

In [5]:
api_key = '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'

tokenizer = AutoTokenizer.from_pretrained("taide/Llama3-TAIDE-LX-8B-Chat-Alpha1", use_auth_token=api_key)
model = AutoModelForCausalLM.from_pretrained("taide/Llama3-TAIDE-LX-8B-Chat-Alpha1", use_auth_token=api_key)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:26<00:00,  6.58s/it]


In [11]:
# Load the model
eos_token=None
eos_token_id = tokenizer.convert_tokens_to_ids(eos_token) if eos_token else tokenizer.eos_token_id
print('Model loaded')
llm = pipeline("text-generation",
                        model=model,
                        max_length=50000,
                        tokenizer=tokenizer,
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=eos_token_id,
                        )

Model loaded


In [10]:
# create input

insts = [{'role': 'system', 'content': '你你是一個專業的中文AI助手，用繁體中文回答所有的問題。'},
         {'role': 'user', 'content': f"""三元組指的是：（實體）--關係--（實體）
（實體）通常是特定的人、組織或物體，（關係）則是描述兩個實體之間的關係，例如 （孫悟空）--徒弟--（唐僧）、（孫悟空）--師兄弟--（豬八戒）、（幸運者）--得到--（大獎）或（立法委員）--制定--（法律）
將以下這些内容以三元組方式描述並以Json格式呈現:\n{pdf_108_1_reason} \n 。"""}
]
prompt = tokenizer.apply_chat_template(insts, 
                                        tokenize=False,
                                        add_generation_prompt=True)
prompt

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n你你是一個專業的中文AI助手，用繁體中文回答所有的問題。<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n三元組指的是：（實體）--關係--（實體）\n（實體）通常是特定的人、組織或物體，（關係）則是描述兩個實體之間的關係，例如 （孫悟空）--徒弟--（唐僧）、（孫悟空）--師兄弟--（豬八戒）、（幸運者）--得到--（大獎）或（立法委員）--制定--（法律）\n將以下這些内容以三元組方式描述並以Json格式呈現:\n內政部制定殯葬管理條例時， 顯未能務實通盤瞭解殯葬業之問題，在法制上預為釐清因應，其後亦未能掌握問題之所在，致相關之規範及函釋未盡周延，歷時 10多年竟未能對症化解爭議，滋生諸多民怨，確有怠失，爰依法提案糾正 。 \n 。<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [13]:
result = llm(prompt)

In [14]:
result

[{'generated_text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n你你是一個專業的中文AI助手，用繁體中文回答所有的問題。<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n三元組指的是：（實體）--關係--（實體）\n（實體）通常是特定的人、組織或物體，（關係）則是描述兩個實體之間的關係，例如 （孫悟空）--徒弟--（唐僧）、（孫悟空）--師兄弟--（豬八戒）、（幸運者）--得到--（大獎）或（立法委員）--制定--（法律）\n將以下這些内容以三元組方式描述並以Json格式呈現:\n內政部制定殯葬管理條例時， 顯未能務實通盤瞭解殯葬業之問題，在法制上預為釐清因應，其後亦未能掌握問題之所在，致相關之規範及函釋未盡周延，歷時 10多年竟未能對症化解爭議，滋生諸多民怨，確有怠失，爰依法提案糾正 。 \n 。<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n：\n{\n"實體1": "內政部",\n"關係": "制定",\n"實體2": "殯葬管理條例"}\n、\n{\n"實體1": "殯葬業",\n"關係": "問題",\n"實體2": "內政部"}\n、\n{\n"實體1": "立法機關",\n"關係": "爭議",\n"實體2": "民怨"}\n、\n{\n"實體1": "相關單位",\n"關係": "函釋",\n"實體2": "規範"}\n、\n{\n"實體1": "歷年官員",\n"關係": "怠失",\n"實體2": "未能對症化解"}\n。'}]

In [22]:
result = [{'generated_text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n你你是一個專業的中文AI助手，用繁體中文回答所有的問題。<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n三元組指的是：（實體）--關係--（實體）\n（實體）通常是特定的人、組織或物體，（關係）則是描述兩個實體之間的關係，例如 （孫悟空）--徒弟--（唐僧）、（孫悟空）--師兄弟--（豬八戒）、（幸運者）--得到--（大獎）或（立法委員）--制定--（法律）\n將以下這些内容以三元組方式描述並以Json格式呈現:\n內政部制定殯葬管理條例時， 顯未能務實通盤瞭解殯葬業之問題，在法制上預為釐清因應，其後亦未能掌握問題之所在，致相關之規範及函釋未盡周延，歷時 10多年竟未能對症化解爭議，滋生諸多民怨，確有怠失，爰依法提案糾正 。 \n 。<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n：\n{\n"實體1": "內政部",\n"關係": "制定",\n"實體2": "殯葬管理條例"}\n、\n{\n"實體1": "殯葬業",\n"關係": "問題",\n"實體2": "內政部"}\n、\n{\n"實體1": "立法機關",\n"關係": "爭議",\n"實體2": "民怨"}\n、\n{\n"實體1": "相關單位",\n"關係": "函釋",\n"實體2": "規範"}\n、\n{\n"實體1": "歷年官員",\n"關係": "怠失",\n"實體2": "未能對症化解"}\n。'}]

In [23]:
parts = result[0]['generated_text'].split("<|start_header_id|>assistant<|end_header_id|>", 1)[1]

In [25]:
cleaned_parts = re.sub(r'[：。\n]', '', parts)  # Remove specific characters using a regular expression
cleaned_parts = cleaned_parts.replace("實體", "entity").replace("關係", "relationship").split("、")
cleaned_parts

['{"entity1": "內政部","relationship": "制定","entity2": "殯葬管理條例"}',
 '{"entity1": "殯葬業","relationship": "問題","entity2": "內政部"}',
 '{"entity1": "立法機關","relationship": "爭議","entity2": "民怨"}',
 '{"entity1": "相關單位","relationship": "函釋","entity2": "規範"}',
 '{"entity1": "歷年官員","relationship": "怠失","entity2": "未能對症化解"}']

In [15]:
cleaned_parts = ['{"entity1": "內政部","relationship": "制定","entity2": "殯葬管理條例"}',
 '{"entity1": "殯葬業","relationship": "問題","entity2": "內政部"}',
 '{"entity1": "立法機關","relationship": "爭議","entity2": "民怨"}',
 '{"entity1": "相關單位","relationship": "函釋","entity2": "規範"}',
 '{"entity1": "歷年官員","relationship": "怠失","entity2": "未能對症化解"}']

In [2]:
from neo4j import GraphDatabase

In [3]:
URI = "bolt://localhost:7689"
AUTH = ("neo4j", "neo4j")

with GraphDatabase.driver(URI, auth=AUTH, encrypted=False) as driver:
    driver.verify_connectivity()

In [4]:
def get_nodes():
    query = "MATCH (n) RETURN n LIMIT 10"
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            print(record["n"])  # Adjust depending on your data structure

get_nodes()

  with driver.session() as session:


<Node element_id='4:bb6dac4e-87ad-4c29-aad6-e05b6d5c0823:0' labels=frozenset({'Movie'}) properties={'tagline': 'Welcome to the Real World', 'title': 'The Matrix', 'released': 1999}>
<Node element_id='4:bb6dac4e-87ad-4c29-aad6-e05b6d5c0823:1' labels=frozenset({'Person'}) properties={'born': 1964, 'name': 'Keanu Reeves'}>
<Node element_id='4:bb6dac4e-87ad-4c29-aad6-e05b6d5c0823:2' labels=frozenset({'Person'}) properties={'born': 1967, 'name': 'Carrie-Anne Moss'}>
<Node element_id='4:bb6dac4e-87ad-4c29-aad6-e05b6d5c0823:3' labels=frozenset({'Person'}) properties={'born': 1961, 'name': 'Laurence Fishburne'}>
<Node element_id='4:bb6dac4e-87ad-4c29-aad6-e05b6d5c0823:4' labels=frozenset({'Person'}) properties={'born': 1960, 'name': 'Hugo Weaving'}>
<Node element_id='4:bb6dac4e-87ad-4c29-aad6-e05b6d5c0823:5' labels=frozenset({'Person'}) properties={'born': 1967, 'name': 'Lilly Wachowski'}>
<Node element_id='4:bb6dac4e-87ad-4c29-aad6-e05b6d5c0823:6' labels=frozenset({'Person'}) properties={'bor

In [26]:
def create_relationships(tx, data):
    for item in data:
        tx.run("MERGE (a:Entity {name: $entity1}) "
               "MERGE (b:Entity {name: $entity2}) "
               "MERGE (a)-[r:RELATIONSHIP {type: $relationship}]->(b)",
               entity1=item["entity1"], relationship=item["relationship"], entity2=item["entity2"])

In [27]:
with GraphDatabase.driver(uri=URI, auth = ("neo4j", "neo4j")) as driver:
    with driver.session() as session:
        session.write_transaction(create_relationships, cleaned_parts)


  session.write_transaction(create_relationships, cleaned_parts)


TypeError: string indices must be integers

In [28]:
import json
from neo4j import GraphDatabase

cleaned_parts = [
    '{"entity1": "內政部","relationship": "制定","entity2": "殯葬管理條例"}',
    '{"entity1": "殯葬業","relationship": "問題","entity2": "內政部"}',
    '{"entity1": "立法機關","relationship": "爭議","entity2": "民怨"}',
    '{"entity1": "相關單位","relationship": "函釋","entity2": "規範"}',
    '{"entity1": "歷年官員","relationship": "怠失","entity2": "未能對症化解"}'
]

def create_relationships(tx, data):
    for item in data:
        item_dict = json.loads(item)  # Convert the string to a dictionary
        tx.run("MERGE (a:Entity {name: $entity1}) "
               "MERGE (b:Entity {name: $entity2}) "
               "MERGE (a)-[r:RELATIONSHIP {type: $relationship}]->(b)",
               entity1=item_dict["entity1"], relationship=item_dict["relationship"], entity2=item_dict["entity2"])

with GraphDatabase.driver(uri=URI, auth=("neo4j", "neo4j")) as driver:
    with driver.session() as session:
        session.write_transaction(create_relationships, cleaned_parts)


  session.write_transaction(create_relationships, cleaned_parts)


In [29]:
def fetch_data(tx):
    result = tx.run("MATCH (n:Entity)-[r:RELATIONSHIP]->(m:Entity) "
                    "RETURN n.name, r.type, m.name")
    for record in result:
        print(f"{record['n.name']} {record['r.type']} {record['m.name']}")

with GraphDatabase.driver(uri=URI, auth=("neo4j", "neo4j")) as driver:  # Replace auth credentials as necessary
    with driver.session() as session:
        session.read_transaction(fetch_data)


  session.read_transaction(fetch_data)


殯葬業 問題 內政部
內政部 制定 殯葬管理條例
立法機關 爭議 民怨
相關單位 函釋 規範
歷年官員 怠失 未能對症化解
