In [11]:
import os

from langchain_community.graphs import Neo4jGraph

import getpass

# Connection settings for Neo4j. Values already present in the environment win;
# only the non-secret settings fall back to local-development defaults.
os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")

# Never commit the password with the notebook: take it from the environment, or
# prompt for it interactively when it has not been provided.
if not os.environ.get("NEO4J_PASSWORD"):
    os.environ["NEO4J_PASSWORD"] = getpass.getpass("Neo4j password: ")

# NEO4J_DATABASE is optional; None is passed through when it is unset.
database = os.environ.get('NEO4J_DATABASE')
graph = Neo4jGraph(database=database)

In [12]:
from graphdatascience import GraphDataScience 

# Graph Data Science client, built from the same connection settings as `graph`.
# The optional NEO4J_DATABASE is forwarded as well so that both handles operate on
# the same database; when it is unset, None keeps the client's default behaviour.
gds = GraphDataScience(
    os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
    database=os.environ.get("NEO4J_DATABASE"),
)

In [13]:
# Drop the "communities" projection before projecting it again below, so the cell
# can be re-run.
gds.graph.drop("communities")

# Relationship projection: all relationship types ("*"), treated as undirected,
# with a "weight" property computed using the COUNT aggregation.
all_relationships = {
    "_ALL_": {
        "type": "*",
        "orientation": "UNDIRECTED",
        "properties": {"weight": {"property": "*", "aggregation": "COUNT"}},
    }
}

# Arguments: graph name, node projection (label), relationship projection.
G, result = gds.graph.project("communities", "__Entity__", all_relationships)

In [14]:
# Weakly-connected-components statistics for the projected graph: report the
# number of components and the distribution of their sizes.
wcc = gds.wcc.stats(G)
for label, key in (
    ("Component count", "componentCount"),
    ("Component distribution", "componentDistribution"),
):
    print(f"{label}: {wcc[key]}")

Component count: 26
Component distribution: {'min': 1, 'p5': 1, 'max': 70, 'p999': 70, 'p99': 70, 'p1': 1, 'p10': 1, 'p90': 11, 'p50': 2, 'p25': 1, 'p75': 3, 'p95': 15, 'mean': 5.423076923076923}


In [15]:
# Run Leiden community detection on the projection and write the result to the
# "communities" node property. Intermediate communities are included, so the
# property holds one community id per level (it is indexed per level further down).
leiden_config = {
    "writeProperty": "communities",
    "includeIntermediateCommunities": True,
    "relationshipWeightProperty": "weight",
}
gds.leiden.write(G, **leiden_config)

writeMillis                                                             26
nodePropertiesWritten                                                  141
ranLevels                                                                3
didConverge                                                           True
nodeCount                                                              141
communityCount                                                          31
communityDistribution    {'min': 1, 'p5': 1, 'max': 17, 'p999': 17, 'p9...
modularity                                                        0.808789
modularities             [0.7226193932355931, 0.8027471331107563, 0.808...
postProcessingMillis                                                    33
preProcessingMillis                                                      0
computeMillis                                                          114
configuration            {'writeProperty': 'communities', 'theta': 0.01...
Name: 0, dtype: object

In [16]:
# Materialise the community hierarchy as `__Community__` nodes.
# For every entity, iterate over the indexes of its `communities` list (one entry per level):
#   * first subquery (index = 0): MERGE the level-0 community, identified by
#     "<level>-<community id>", and link the entity to it with IN_COMMUNITY;
#   * second subquery (index > 0): MERGE the community at this level and the one at
#     the previous level, then link previous -> current with IN_COMMUNITY.
# `level` is only set when a community node is first created (ON CREATE SET).
# The final count(*) is the number of (entity, level) rows processed.
graph.query("""
MATCH (e:`__Entity__`)
UNWIND range(0, size(e.communities) - 1 , 1) AS index
CALL {
  WITH e, index
  WITH e, index
  WHERE index = 0
  MERGE (c:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET c.level = index
  MERGE (e)-[:IN_COMMUNITY]->(c)
  RETURN count(*) AS count_0
}
CALL {
  WITH e, index
  WITH e, index
  WHERE index > 0
  MERGE (current:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET current.level = index
  MERGE (previous:`__Community__` {id: toString(index - 1) + '-' + toString(e.communities[index - 1])})
  ON CREATE SET previous.level = index - 1
  MERGE (previous)-[:IN_COMMUNITY]->(current)
  RETURN count(*) AS count_1
}
RETURN count(*)
""")

[{'count(*)': 423}]

In [17]:
# Rank each community by the number of distinct chunks that mention any entity
# belonging to it (directly or through lower-level communities).
community_rank_query = """
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(:__Entity__)<-[:HAS_ENTITY]-(d:__Chunk__)
WITH c, count(distinct d) AS rank
SET c.rank = rank;
"""
graph.query(community_rank_query)

[]

In [18]:
import pandas as pd
import numpy as np

# Number of distinct entities under every community, tagged with the community's
# level (the part of the community id before the first '-').
community_size = graph.query(
    """
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(e:__Entity__)
WITH c, count(distinct e) AS entities
RETURN split(c.id, '-')[0] AS level, entities
"""
)
# Naming the columns explicitly keeps the frame well-formed when the query returns
# no rows; without it the "level" lookup below raises KeyError on an empty result.
community_size_df = pd.DataFrame.from_records(
    community_size, columns=["level", "entities"]
)

# One summary row per level. A single groupby pass replaces re-filtering the whole
# frame for every level; sort=False keeps the levels in first-seen order.
percentiles_data = []
for level, entities in community_size_df.groupby("level", sort=False)["entities"]:
    p25, p50, p75, p90, p99 = np.percentile(entities, [25, 50, 75, 90, 99])
    percentiles_data.append(
        [level, len(entities), p25, p50, p75, p90, p99, entities.max()]
    )

# Create a DataFrame with the percentiles
percentiles_df = pd.DataFrame(
    percentiles_data,
    columns=[
        "Level",
        "Number of communities",
        "25th Percentile",
        "50th Percentile",
        "75th Percentile",
        "90th Percentile",
        "99th Percentile",
        "Max"
    ],
)
percentiles_df

Unnamed: 0,Level,Number of communities,25th Percentile,50th Percentile,75th Percentile,90th Percentile,99th Percentile,Max
0,0,45,2.0,3.0,3.0,5.0,11.0,11
1,1,33,1.0,3.0,6.0,11.0,15.36,16
2,2,31,1.0,2.0,5.0,14.0,16.7,17


In [30]:
# Collect, for every community, the material the LLM will summarise.
#   * `nodes`: all entities reachable through one or more IN_COMMUNITY hops;
#     communities with a single entity are skipped (size(nodes) > 1).
#   * `rels`: relationships yielded by apoc.path.subgraphAll, started from the first
#     collected entity and restricted to the community's own entities (whitelistNodes).
#     NOTE(review): presumably entities not reachable from nodes[0] inside the
#     community contribute no relationships - confirm against the APOC docs.
# Each returned row has the keys `communityId`, `nodes` and `rels`; a node's `type`
# is its first label other than '__Entity__'.
community_info = graph.query("""
MATCH (c:`__Community__`)<-[:IN_COMMUNITY*]-(e:__Entity__)
WITH c, collect(e ) AS nodes
WHERE size(nodes) > 1
CALL apoc.path.subgraphAll(nodes[0], {
 whitelistNodes:nodes
})
YIELD relationships
RETURN c.id AS communityId, 
       [n in nodes | {id: n.id, description: n.description, source: n.sources, type: [el in labels(n) WHERE el <> '__Entity__'][0]}] AS nodes,
       [r in relationships | {start: startNode(r).id, type: type(r), end: endNode(r).id, description: r.description}] AS rels
""")
# Show the first row as a sample of the structure.
community_info[0]

{'communityId': '0-46',
 'nodes': [{'id': '首期保險費',
   'source': ['個人保險首續期繳費暨保單貸款還款息作業手冊(113年7月版)_業務通路版_p3_p7.pdf'],
   'description': None,
   'type': '概念'},
  {'id': '新契約投保',
   'source': ['個人保險首續期繳費暨保單貸款還款息作業手冊(113年7月版)_業務通路版_p3_p7.pdf'],
   'description': None,
   'type': 'Event'},
  {'id': '核保通過',
   'source': ['個人保險首續期繳費暨保單貸款還款息作業手冊(113年7月版)_業務通路版_p3_p7.pdf'],
   'description': None,
   'type': '概念'},
  {'id': '保險',
   'source': ['個人保險首續期繳費暨保單貸款還款息作業手冊(113年7月版)_業務通路版_p3_p7.pdf'],
   'description': None,
   'type': '概念'},
  {'id': '續期繳費方式',
   'source': ['個人保險首續期繳費暨保單貸款還款息作業手冊(113年7月版)_業務通路版_p3_p7.pdf'],
   'description': None,
   'type': '概念'},
  {'id': '保險費信用卡付款授權書',
   'source': ['個人保險首續期繳費暨保單貸款還款息作業手冊(113年7月版)_業務通路版_p3_p7.pdf'],
   'description': None,
   'type': 'Document'},
  {'id': '自動轉帳',
   'source': ['個人保險首續期繳費暨保單貸款還款息作業手冊(113年7月版)_業務通路版_p3_p7.pdf'],
   'description': None,
   'type': 'Event'}],
 'rels': [{'start': '新契約投保',
   'description': '新契約投保時之自動轉帳及信用卡付款授權書',
   'ty

In [31]:
import os

from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_openai import ChatOpenAI
from langchain_openai import AzureChatOpenAI

# Azure OpenAI chat model used for the community summaries. Endpoint, deployment
# name and API version are all read from the environment (a missing variable
# raises KeyError); temperature is fixed at 0.
azure_settings = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    "openai_api_version": os.environ["AZURE_OPENAI_API_VERSION"],
}
llm = AzureChatOpenAI(**azure_settings, temperature=0)

In [41]:
# from langchain_experimental.llms.ollama_functions import OllamaFunctions
# model_name = 'qwen2:72b-instruct-q8_0'
# llm = OllamaFunctions(model=model_name, temperature=0)
# llm

In [51]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Human turn: the stringified community followed by a "Summary:" cue.
community_template = "\n{community_info}\nSummary:"

# Two system turns: the summarisation task itself, then the response-language rule.
system_messages = [
    (
        "system",
        "請根據提供同社區的資訊包含 nodes 與 relationships, 產生同社區的自然語言的摘要資訊, 並提供來源檔名(source). No pre-amble.",
    ),
    ("system", "使用繁體中文回應"),
]

community_prompt = ChatPromptTemplate.from_messages(
    [*system_messages, ("human", community_template)]
)

# prompt -> chat model -> plain string
community_chain = community_prompt | llm | StrOutputParser()

In [52]:
def prepare_string(data):
    """Render one community's nodes and relationships as plain text for the prompt.

    Args:
        data: Mapping with a ``'nodes'`` list and a ``'rels'`` list.
            Each node needs ``'id'`` and ``'type'``; ``'description'`` and
            ``'source'`` are optional. Each relationship needs ``'start'``,
            ``'end'`` and ``'type'``; ``'description'`` is optional.
            A missing or ``None`` ``'nodes'`` / ``'rels'`` entry is treated as empty.

    Returns:
        str: a "Nodes are:" section with one line per node, a blank line, then a
        "Relationships are:" section with one line per relationship. Falsy
        descriptions are omitted; the source is omitted only when the node has no
        ``'source'`` key at all.
    """
    node_lines = ["Nodes are:"]
    for node in data.get('nodes') or []:
        description = (
            f", description: {node['description']}" if node.get('description') else ""
        )
        # Nodes without a 'source' key are tolerated instead of raising KeyError.
        source = f", source: {node['source']}" if 'source' in node else ""
        node_lines.append(f"id: {node['id']}, type: {node['type']}{description}{source},")

    rel_lines = ["Relationships are:"]
    for rel in data.get('rels') or []:
        description = (
            f", description: {rel['description']}" if rel.get('description') else ""
        )
        rel_lines.append(f"({rel['start']})-[:{rel['type']}]->({rel['end']}){description}")

    # Every line is newline-terminated and the two sections are separated by one
    # blank line.
    return "\n".join(node_lines) + "\n\n" + "\n".join(rel_lines) + "\n"

def process_community(community):
    """Summarise a single community with the LLM chain.

    The community is rendered to text with ``prepare_string`` and passed to
    ``community_chain`` as ``community_info``. Returns a dict with the community's
    id under ``"community"`` and the generated text under ``"summary"``.
    """
    prompt_input = {'community_info': prepare_string(community)}
    community_summary = community_chain.invoke(prompt_input)
    return dict(community=community['communityId'], summary=community_summary)

# Try the chain on a single community (the first query result) and display the result.
process_community(community_info[0])

{'community': '0-46',
 'summary': '在《個人保險首續期繳費暨保單貸款還款息作業手冊(113年7月版)_業務通路版》中，描述了新契約投保的相關流程和條件。新契約投保包括保險費信用卡付款授權書和自動轉帳，並且需要滿足首期保險費和核保通過的條件才能生效保險。此外，續期繳費方式也與新契約投保有約定關係。'}

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def process_community_with_llm(community_info, max_retry=0, max_workers=5):
    """Summarise many communities concurrently, optionally retrying failures.

    Args:
        community_info: Iterable of community dicts shaped like::

            {
                'communityId': str,
                'nodes': [{'id': str, 'description': str | None,
                           'source': ..., 'type': str}, ...],
                'rels': [{'start': str, 'description': str | None,
                          'type': str, 'end': str}, ...],
            }

        max_retry: Maximum number of extra passes over the communities that
            failed. With ``max_retry=2`` a community is attempted at most
            2 + 1 = 3 times.
        max_workers: Number of worker threads calling ``process_community`` in
            parallel.

    Returns:
        list: the dicts returned by ``process_community`` for every community that
        eventually succeeded, in completion order (first pass first, then each
        retry pass). Communities that still fail after the last pass are reported
        on stdout and left out of the result.
    """
    summaries = []
    pending = list(community_info)
    while True:
        failed_communities = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Map each future back to its community so failures can be re-queued.
            futures = {
                executor.submit(process_community, community): community
                for community in pending
            }

            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing communities"):
                try:
                    summaries.append(future.result())
                except Exception as e:
                    # Best effort: one failing community must not abort the batch.
                    community = futures[future]
                    print(f'process community failed!:{community}, error:\n{e}')
                    failed_communities.append(community)

        if not failed_communities or max_retry <= 0:
            break
        # Retry only what failed, with one fewer retry left.
        pending = failed_communities
        max_retry -= 1
    return summaries
# Summarise every community. With max_retry=0 nothing is retried: a community whose
# LLM call raises is printed and is absent from `summaries`.
summaries = process_community_with_llm(community_info, max_retry=0)

Processing communities:  99%|█████████▊| 75/76 [02:11<00:01,  1.76s/it]


In [45]:
# Display the generated summaries (a list of {'community', 'summary'} dicts).
summaries

[{'community': '0-6',
  'summary': '中信ATM繳費適用於傳統型台幣商品，使用者可以在全台中信ATM繳費專區進行繳費，並可依需求選擇多筆保單一次繳款或特定保單繳款。繳交保險費金額上限為單筆單次最高新台幣200萬元，當日累計不可超過新台幣300萬元，但仍須視各銀行規定。此外，使用中國信託銀行金融卡進行交易不須支付手續費，但使用其他銀行的金融卡則需支付跨行交易手續費。'},
 {'community': '0-8',
  'summary': '在同一社區中，主要概念是「續期保險費繳交方式及相關規定」，其下包含多個子概念。這些子概念包括：\n\n1. 續期保險費扣款不成通知方式\n2. 續期保險費-採信用卡扣款作業\n3. 續期保險費-採金融機構轉帳作業\n4. 高齡關懷電訪作業\n5. 續期保險費-採自行繳費作業\n\n這些子概念詳細描述了不同的續期保險費繳交方式及相關的操作規定，涵蓋了從信用卡扣款、金融機構轉帳、自行繳費到扣款不成的通知方式，以及針對高齡客戶的關懷電訪作業。'},
 {'community': '0-437',
  'summary': '在這個社區中，有三個概念節點和兩個關係。節點包括：\n\n1. 業務/保經/保代通路：這是一個概念，描述為「適用」。\n2. 電話行銷通路：這是一個概念，描述為「適用：請詳閱『新契約個人保險投保規則手冊-核保及行政篇』」。\n3. 生效日期之認定原則：這是一個概念，描述為「依繳費方式或商品性質做認定」。\n\n關係如下：\n\n1. 「生效日期之認定原則」適用於「電話行銷通路」。\n2. 「生效日期之認定原則」適用於「業務/保經/保代通路」。'},
 {'community': '0-2',
  'summary': '在這個社區中，主要涉及到核印失敗、請款失敗及新契約於核保中主動異動繳費管道等事件。這些事件的生效日認定均依據「新契約首期保險費繳費管道異動之繳費日認定原則」辦理。\n\n具體來說，「新契約首期保險費繳費管道異動之繳費日認定原則」適用於核印失敗或扣款失敗後變更繳費管道，以及核保中主動變更繳費管道的情況。當發生核印失敗、新契約於核保中主動異動繳費管道或請款失敗時，均需依據這一原則來認定其生效日。\n\n總結來說，這些事件和概念之間的關係緊密，所有的生效日認定都依賴於「新

In [46]:
# Store each generated summary on its `__Community__` node; rows are matched (or
# created) by community id via MERGE.
store_summaries_query = """
UNWIND $data AS row
MERGE (c:__Community__ {id:row.community})
SET c.summary = row.summary
"""
graph.query(store_summaries_query, params={"data": summaries})

[]