In [131]:
import os

from langchain_community.graphs import Neo4jGraph

os.environ["NEO4J_URI"] = "bolt://localhost:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "2wsx3edc"

database = os.environ.get('NEO4J_DATABASE')
graph = Neo4jGraph(database=database)

In [132]:
from graphdatascience import GraphDataScience 

gds = GraphDataScience( 
    os.environ[ "NEO4J_URI" ], 
    auth=(os.environ[ "NEO4J_USERNAME" ], os.environ[ "NEO4J_PASSWORD" ]) 
)

In [133]:
gds.graph.drop("communities")
G, result = gds.graph.project(
    "communities",  #  Graph name
    "__Entity__",  #  Node projection
    {
        "_ALL_": {
            "type": "*",
            "orientation": "UNDIRECTED",
            "properties": {"weight": {"property": "*", "aggregation": "COUNT"}},
        }
    },
)

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

In [134]:
wcc = gds.wcc.stats(G)
print(f"Component count: {wcc['componentCount']}")
print(f"Component distribution: {wcc['componentDistribution']}")

Component count: 299
Component distribution: {'min': 1, 'p5': 1, 'max': 179, 'p999': 179, 'p99': 13, 'p1': 1, 'p10': 1, 'p90': 1, 'p50': 1, 'p25': 1, 'p75': 1, 'p95': 2, 'mean': 1.8528428093645486}


In [121]:
gds.leiden.write(
    G,
    writeProperty="communities",
    includeIntermediateCommunities=True,
    relationshipWeightProperty="weight",
)

Leiden:   0%|          | 0/100 [00:00<?, ?%/s]

writeMillis                                                            947
nodePropertiesWritten                                                  554
ranLevels                                                                2
didConverge                                                           True
nodeCount                                                              554
communityCount                                                         309
communityDistribution    {'min': 1, 'p5': 1, 'max': 45, 'p999': 45, 'p9...
modularity                                                        0.726493
modularities                      [0.6643996755002705, 0.7264926987560844]
postProcessingMillis                                                     3
preProcessingMillis                                                      5
computeMillis                                                          823
configuration            {'writeProperty': 'communities', 'theta': 0.01...
Name: 0, dtype: object

In [135]:
graph.query("""
MATCH (e:`__Entity__`)
WHERE NOT EXISTS {
    MATCH (e)-[:IN_COMMUNITY]->(:__Community__)
}
UNWIND range(0, size(e.communities) - 1 , 1) AS index
CALL {
  WITH e, index
  WITH e, index
  WHERE index = 0
  MERGE (c:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET c.level = index
  MERGE (e)-[:IN_COMMUNITY]->(c)
  RETURN count(*) AS count_0
}
CALL {
  WITH e, index
  WITH e, index
  WHERE index > 0
  MERGE (current:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET current.level = index
  MERGE (previous:`__Community__` {id: toString(index - 1) + '-' + toString(e.communities[index - 1])})
  ON CREATE SET previous.level = index - 1
  MERGE (previous)-[:IN_COMMUNITY]->(current)
  RETURN count(*) AS count_1
}
RETURN count(*)
""")

[{'count(*)': 0}]

In [136]:
graph.query("""
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(:__Entity__)<-[:HAS_ENTITY]-(d:__Chunk__)
WITH c, count(distinct d) AS rank
SET c.rank = rank;
""")

[]

In [137]:
import pandas as pd
import numpy as np

community_size = graph.query(
    """
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(e:__Entity__)
WITH c, count(distinct e) AS entities
RETURN split(c.id, '-')[0] AS level, entities
"""
)
community_size_df = pd.DataFrame.from_records(community_size)
percentiles_data = []
for level in community_size_df["level"].unique():
    subset = community_size_df[community_size_df["level"] == level]["entities"]
    num_communities = len(subset)
    percentiles = np.percentile(subset, [25, 50, 75, 90, 99])
    percentiles_data.append(
        [
            level,
            num_communities,
            percentiles[0],
            percentiles[1],
            percentiles[2],
            percentiles[3],
            percentiles[4],
            max(subset)
        ]
    )

# Create a DataFrame with the percentiles
percentiles_df = pd.DataFrame(
    percentiles_data,
    columns=[
        "Level",
        "Number of communities",
        "25th Percentile",
        "50th Percentile",
        "75th Percentile",
        "90th Percentile",
        "99th Percentile",
        "Max"
    ],
)
percentiles_df

Unnamed: 0,Level,Number of communities,25th Percentile,50th Percentile,75th Percentile,90th Percentile,99th Percentile,Max
0,0,325,1.0,1.0,1.0,2.0,16.8,31
1,1,309,1.0,1.0,1.0,1.0,20.76,62
2,2,309,1.0,1.0,1.0,1.0,26.52,62


In [None]:
community_info = graph.query("""
MATCH (c:`__Community__`)<-[:IN_COMMUNITY*]-(e:__Entity__)
WITH c, collect(e ) AS nodes
WHERE size(nodes) > 1 and c.summary is null
CALL apoc.path.subgraphAll(nodes[0], {
 whitelistNodes:nodes
})
YIELD relationships
RETURN c.id AS communityId, 
       [n in nodes | apoc.map.removeKeys(n{.*, type: [el in labels(n) WHERE el <> '__Entity__'][0]}, ['embedding', 'wcc', 'communities'])] AS nodes,
       [r in relationships | {start: startNode(r).id, type: type(r), end: endNode(r).id, description: r.description}] AS rels
""")
community_info[0]

In [107]:
import os

from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_openai import ChatOpenAI
from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    temperature=0
)

In [108]:
# from langchain_experimental.llms.ollama_functions import OllamaFunctions
# model_name = 'qwen2:72b-instruct-q8_0'
# llm = OllamaFunctions(model=model_name, temperature=0)
# llm

In [109]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

community_template = """
{community_info}
Summary:"""  # noqa: E501

community_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """請根據提供同社區的資訊包含 nodes 與 relationships, 產生同社區的自然語言的摘要資訊, 
當提供多個來源(sources)資訊時，需將共同的資訊和各來源獨立提供的資訊分別呈現；對於共同資訊部分，統一匯集與展示；對於各個來源的獨立資訊，則按來源分開顯示。此外，如果僅有單一來源資訊，則只需呈現來源檔案名稱即可. No pre-amble.""",
        ),
        (
            "system", "使用繁體中文回應"
        ),
        ("human", community_template),
    ]
)

community_chain = community_prompt | llm | StrOutputParser()

In [110]:

def prepare_string(data):
    nodes_str = "Nodes are:\n"
    for node in data['nodes']:
        desc_list = []
        for key in node:
            if node[key] is not None:
                desc_list.append(f"{key}: {node[key]}")
        nodes_str += ', '.join(desc_list)

    rels_str = "Relationships are:\n"
    for rel in data['rels']:
        start = rel['start']
        end = rel['end']
        rel_type = rel['type']
        if 'description' in rel and rel['description']:
            description = f", description: {rel['description']}"
        else:
            description = ""
        rels_str += f"({start})-[:{rel_type}]->({end}){description}\n"

    return nodes_str + "\n" + rels_str

def process_community(community):
    stringify_info = prepare_string(community)
    summary = community_chain.invoke({'community_info': stringify_info})
    return {"community": community['communityId'], "summary": summary}

process_community(community_info[0])

{'community': '0-66',
 'summary': '共同資訊:\n- 來源: 台灣人壽龍實在住院醫療健康保險附約.pdf, 台灣人壽新住院醫療保險附約.pdf\n  - 節點:\n    - 疾病: 被保險人自本附約生效日（或復效日）起所發生之疾病\n    - 被保險人: 本附約被保險人為主契約之被保險人或其配偶時，其續保之保險年齡最高為八十五歲。\n    - 傷害: 被保險人於本附約有效期間內，遭受意外傷害事故，因而蒙受之傷害\n    - 住院: 被保險人經醫師診斷其疾病或傷害必須入住醫院，且正式辦理住院手續並確實在醫院接受診療者。但不包含全民健康保險法第五十一條所稱之日間住院及精神衛生法第三十五條所稱之日間留院\n    - 醫院: 依照醫療法規定領有開業執照並設有病房收治病人之公、私立及醫療法人醫院\n    - 住院診療\n    - 保單年度: 同一保單年度之門診手術費用保險金最高給付次數以6次為限\n    - 全民健康保險: 全民健康保險之保險對象身分接受外科手術診療\n    - 出院之日\n  - 關係:\n    - (被保險人)-[:因]->(傷害)\n    - (被保險人)-[:因]->(疾病)\n    - (被保險人)-[:接受]->(住院診療)\n    - (被保險人)-[:遭受]->(傷害)\n    - (被保險人)-[:患有]->(疾病)\n    - (被保險人)-[:保險對象]->(全民健康保險)\n    - (住院)-[:ENDS_ON]->(出院之日)\n    - (住院診療)-[:在]->(保單年度)\n\n台灣人壽龍實在住院醫療健康保險附約.pdf:\n  - 節點:\n    - 門診手術費用保險金: 實支實付之給付\n    - 附表二的特定處置\n    - 特定處置\n    - 保單週年日\n    - 入院日\n    - 健康證明文件\n  - 關係:\n    - (被保險人)-[:接受]->(附表二的特定處置)\n    - (被保險人)-[:接受]->(門診手術費用保險金)\n    - (被保險人)-[:NOT_REQUIRE]->(健康證明文件), description: 申請時不具任何健康證明文件\n    - (住院)-[:STARTS_ON]->(入院日)

In [111]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

def process_community_with_llm(community_info, max_retry=0):
    '''
    params:
        community_info: [ { 
                            'communityId': str, 'nodes': [{'id': str, 'description': str|None, 'type': str}, ...], 
                            'rels': [{'start': str, 'description': str|None, 'type': str, 'end': 'str}, ...]
                          },
                          ... ]
        max_retry: 最多嘗試次數, 假設為2, 則最多遞迴執行 2+1=3次
    '''
    summaries = []
    faild_communities = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(process_community, community): community for community in community_info}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing communities"):
            try:
                summaries.append(future.result())
            except Exception as e:
                community = futures[future]
                print(f'process community faild!:{community}, error:\n{e}')
                faild_communities.append(community)
    if len(faild_communities) > 0 and max_retry > 0:
        time.sleep(30) # 防止超出限制的情境, 等待一段時間後再嘗試
        summaries.extend(process_community_with_llm(faild_communities, max_retry=max_retry-1))
    return summaries
summaries = process_community_with_llm(community_info, max_retry=3)

Processing communities:  99%|█████████▉| 90/91 [01:32<00:01,  1.33s/it]

process community faild!:{'communityId': '0-545', 'nodes': [{'id': '截肢術', 'type': 'Procedure', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '手指', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '足趾', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '股關節', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '大腿', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '小腿', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '前臂', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '上臂', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '腕關節', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}, {'id': '踝關節', 'type': 'Body_part', 'sources': ['台灣人壽新住院醫療保險附約.pdf']}], 'rels': [{'start': '截肢術', 'description': '9%', 'type': 'REMOVES', 'end': '足趾'}, {'start': '截肢術', 'description': '13%', 'type': 'REMOVES', 'end': '腕關節'}, {'start': '截肢術', 'description': '13%', 'type': 'REMOVES', 'end': '小腿'}, {'start': '截肢術', 'de

Processing communities: 100%|██████████| 91/91 [01:34<00:00,  1.04s/it]
Processing communities: 100%|██████████| 1/1 [00:02<00:00,  2.86s/it]


In [112]:
summaries

[{'community': '1-270',
  'summary': '共同資訊:\n- 受益人: 本附約各項保險金之受益人為被保險人本人，本公司不受理其指定及變更。被保險人身故時，如本附約保險金尚未給付或未完全給付，則以該被保險人之法定繼承人為該部分保險金之受益人。前項法定繼承人之順序及應得保險金之比例適用民法繼承編相關規定。\n- 受益人的身分證明\n- 保險金申請書\n- 醫療費用收據\n\n台灣人壽龍實在住院醫療健康保險附約.pdf:\n- 醫療診斷書\n- 住院證明\n- 醫療費用明細\n\n台灣人壽新住院醫療保險附約.pdf:\n- 保險單或其謄本\n- 醫療診斷書或住院證明: 但要保人或被保險人為醫師時，不得為被保險人出具診斷書或住院證明。'},
 {'community': '0-161',
  'summary': '共同資訊:\n- 受益人: 本附約各項保險金之受益人為被保險人本人，本公司不受理其指定及變更。被保險人身故時，如本附約保險金尚未給付或未完全給付，則以該被保險人之法定繼承人為該部分保險金之受益人。前項法定繼承人之順序及應得保險金之比例適用民法繼承編相關規定。\n- 保險金申請書: 文件\n- 醫療費用收據: 文件\n- 受益人的身分證明: 文件\n\n台灣人壽龍實在住院醫療健康保險附約.pdf:\n- 醫療診斷書: 文件\n- 住院證明: 文件\n- 醫療費用明細: 文件\n\n台灣人壽新住院醫療保險附約.pdf:\n- 保險單或其謄本: 文件\n- 醫療診斷書或住院證明: 但要保人或被保險人為醫師時，不得為被保險人出具診斷書或住院證明。'},
 {'community': '0-316',
  'summary': '共同資訊:\n- 保險金: 依第五條及第六條之約定所給付\n- 保險金與限額有關聯\n- 保險金與一次住院有關聯\n\n台灣人壽龍實在住院醫療健康保險附約.pdf:\n- 續保保險費\n\n台灣人壽新住院醫療保險附約.pdf:\n- 限額\n- 一次住院'},
 {'community': '0-66',
  'summary': '共同資訊:\n- 來源: 台灣人壽龍實在住院醫療健康保險附約.pdf, 台灣人壽新住院醫療保險附約.pdf\n  - 節點:\n    - 疾病: 被保險人自本附約生效日（或復效日）起

In [113]:
graph.query("""
UNWIND $data AS row
MERGE (c:__Community__ {id:row.community})
SET c.summary = row.summary
""", params={"data": summaries})

[]

In [114]:
graph.query('''
MATCH (n:__Entity__)
WHERE n.description IS NOT NULL and apoc.meta.cypher.type(n.description) <> 'STRING'
REMOVE n.embedding
SET n.description = apoc.text.join(n.description, "\n---\n")
RETURN n.description
''')

[{'n.description': '依主保險契約要保人之申請，並經本公司同意，附加於主契約訂定之。\n---\n本附約係依主保險契約要保人之申請，並經本公司同意，附加於主契約訂定之。'},
 {'n.description': '90,000\n---\n120,000\n---\n180,000'},
 {'n.description': '被保險人因第三條之約定而以全民健康保險之保險對象身分住院診療時，本公司按被保險人住院期間內所發生，且依全民健康保險規定其保險對象應自行負擔及不屬全民健康保險給付範圍之費用核付。\n---\n被保險人因第三條之約定而以全民健康保險之保險對象身分住院或接受門診外科手術診療時，本公司按被保險人住院期間內或接受門診外科手術時所發生，且依全民健康保險規定其保險對象應自行負擔及不屬全民健康保險給付範圍之費用核付。'},
 {'n.description': '1,000\n---\n1,500'},
 {'n.description': '600\n---\n900'},
 {'n.description': '並附心臟科專科醫師診斷證明或心電圖檢查認定須剖腹產者\n---\n嚴重心律不整，並附心臟科專科醫師診斷證明或心電圖檢查認定須剖腹產者'},
 {'n.description': '160,000\n---\n180,000'},
 {'n.description': '40,000\n---\n60,000'},
 {'n.description': '2,000\n---\n3,000'},
 {'n.description': '給付比例：20%\n---\n給付比例：8%'},
 {'n.description': '50%\n---\n32%'}]

In [115]:
graph.query(
    """
MATCH (n:`__Community__`)<-[:IN_COMMUNITY]-()<-[:HAS_ENTITY]-(c)
WITH n, count(distinct c) AS chunkCount
SET n.weight = chunkCount"""
)

[]