In [1]:
import os
from getpass import getpass

from langchain_community.graphs import Neo4jGraph

# Connection defaults for a local Neo4j instance. `setdefault` keeps any value
# that is already exported in the environment instead of overwriting it.
os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")

# The password must not be stored in the notebook: take it from the
# environment, and only prompt interactively when it is missing.
if not os.environ.get("NEO4J_PASSWORD"):
    os.environ["NEO4J_PASSWORD"] = getpass("Neo4j password: ")

# Neo4jGraph() is called without arguments, so it picks up the NEO4J_*
# environment variables set above.
graph = Neo4jGraph()

In [2]:
from graphdatascience import GraphDataScience 

# Build the Graph Data Science client from the same NEO4J_* environment
# variables that configure the LangChain graph connection.
neo4j_uri = os.environ["NEO4J_URI"]
neo4j_auth = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])

gds = GraphDataScience(neo4j_uri, auth=neo4j_auth)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:

# Name under which the projection is registered in the GDS graph catalog.
COMMUNITY_GRAPH_NAME = "communities"

# project() fails if a graph with this name is already in the catalog (for
# example when this cell is re-run), so drop any earlier projection first.
if gds.graph.exists(COMMUNITY_GRAPH_NAME)["exists"]:
    gds.graph.drop(gds.graph.get(COMMUNITY_GRAPH_NAME))

G, result = gds.graph.project(
    COMMUNITY_GRAPH_NAME,  #  Graph name
    "__Entity__",  #  Node projection
    {
        # Single relationship projection covering every relationship type,
        # treated as undirected. Parallel relationships are aggregated with
        # COUNT into a `weight` property, which the Leiden step reads later.
        "_ALL_": {
            "type": "*",
            "orientation": "UNDIRECTED",
            "properties": {"weight": {"property": "*", "aggregation": "COUNT"}},
        }
    },
)

In [8]:
# Weakly-connected-component statistics for the projected graph; print the two
# fields of interest from the returned result.
wcc = gds.wcc.stats(G)
for label, field in (
    ("Component count", "componentCount"),
    ("Component distribution", "componentDistribution"),
):
    print(f"{label}: {wcc[field]}")

Component count: 3
Component distribution: {'min': 1, 'p5': 1, 'max': 21, 'p999': 21, 'p99': 21, 'p1': 1, 'p10': 1, 'p90': 21, 'p50': 2, 'p25': 1, 'p75': 21, 'p95': 21, 'mean': 8.0}


In [9]:
# Leiden community detection, written back to the database.
# - results go to the `communities` node property
# - intermediate levels are kept, so later cells can index into the property
# - the `weight` relationship property from the projection is used as weight
leiden_settings = {
    "writeProperty": "communities",
    "includeIntermediateCommunities": True,
    "relationshipWeightProperty": "weight",
}
gds.leiden.write(G, **leiden_settings)

writeMillis                                                             19
nodePropertiesWritten                                                   24
ranLevels                                                                2
didConverge                                                           True
nodeCount                                                               24
communityCount                                                           6
communityDistribution    {'min': 1, 'p5': 1, 'max': 11, 'p999': 11, 'p9...
modularity                                                        0.323232
modularities                     [0.30394857667584946, 0.3232323232323232]
postProcessingMillis                                                     8
preProcessingMillis                                                      1
computeMillis                                                           90
configuration            {'writeProperty': 'communities', 'theta': 0.01...
Name: 0, dtype: object

In [10]:
# Materialize the community hierarchy as `__Community__` nodes.
#
# For every `__Entity__` node the query walks the indices of its `communities`
# list (one entry per level) and runs two subqueries per (entity, index) row:
#   * index = 0: MERGE the community node with id "<index>-<community id>",
#     set its `level` on creation, and link the entity to it via IN_COMMUNITY.
#   * index > 0: MERGE the community at this level and the one at the previous
#     level, then link previous -> current via IN_COMMUNITY.
# The repeated `WITH e, index` inside each CALL lets a WHERE filter follow the
# subquery's importing WITH clause.
# The final `RETURN count(*)` reports how many rows were processed.
graph.query("""
MATCH (e:`__Entity__`)
UNWIND range(0, size(e.communities) - 1 , 1) AS index
CALL {
  WITH e, index
  WITH e, index
  WHERE index = 0
  MERGE (c:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET c.level = index
  MERGE (e)-[:IN_COMMUNITY]->(c)
  RETURN count(*) AS count_0
}
CALL {
  WITH e, index
  WITH e, index
  WHERE index > 0
  MERGE (current:`__Community__` {id: toString(index) + '-' + toString(e.communities[index])})
  ON CREATE SET current.level = index
  MERGE (previous:`__Community__` {id: toString(index - 1) + '-' + toString(e.communities[index - 1])})
  ON CREATE SET previous.level = index - 1
  MERGE (previous)-[:IN_COMMUNITY]->(current)
  RETURN count(*) AS count_1
}
RETURN count(*)
""")

[{'count(*)': 48}]

In [11]:
# Rank each community by document coverage: count the distinct `Chunk` nodes
# that have a HAS_ENTITY relationship to any entity reachable from the
# community through one or more IN_COMMUNITY hops, and store that count on the
# community node as `community_rank`.
graph.query("""
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(:__Entity__)<-[:HAS_ENTITY]-(d:Chunk)
WITH c, count(distinct d) AS rank
SET c.community_rank = rank;
""")

[]

In [14]:
import pandas as pd
import numpy as np

# Number of distinct entities under each community (following IN_COMMUNITY
# over any number of hops), tagged with the level prefix of the community id.
community_size = graph.query(
    """
MATCH (c:__Community__)<-[:IN_COMMUNITY*]-(e:__Entity__)
WITH c, count(distinct e) AS entities
RETURN split(c.id, '-')[0] AS level, entities
"""
)
community_size_df = pd.DataFrame.from_records(community_size)

# Percentile cut-offs reported for the community-size distribution.
percentile_points = [25, 50, 75, 90, 99]
summary_columns = [
    "Level",
    "Number of communities",
    "25th Percentile",
    "50th Percentile",
    "75th Percentile",
    "90th Percentile",
    "99th Percentile",
    "Max"
]

# One summary row per level, in order of first appearance in the query result.
percentiles_data = []
for level in community_size_df["level"].unique():
    level_sizes = community_size_df.loc[community_size_df["level"] == level, "entities"]
    percentiles_data.append(
        [
            level,
            len(level_sizes),
            *np.percentile(level_sizes, percentile_points),
            max(level_sizes),
        ]
    )

# Create a DataFrame with the percentiles
percentiles_df = pd.DataFrame(percentiles_data, columns=summary_columns)
percentiles_df

Unnamed: 0,Level,Number of communities,25th Percentile,50th Percentile,75th Percentile,90th Percentile,99th Percentile,Max
0,0,7,2.0,2.0,3.0,6.2,10.52,11
1,1,6,2.0,2.5,4.5,8.0,10.7,11


In [16]:
# Collect, per community, the member entities and the relationships between
# them, as input for the summarization prompt.
#
# - Only communities whose `level` is in [0, 1, 4] are considered.
#   NOTE(review): the Leiden run above reports ranLevels = 2, so only levels 0
#   and 1 exist here and `4` matches nothing - confirm whether it is intended.
# - Communities with a single entity are skipped (`size(nodes) > 1`).
# - `apoc.path.subgraphAll` starts from the first collected entity and is
#   restricted to the community's own nodes via `whitelistNodes`; its
#   `relationships` output supplies the `rels` list.
# - Each node is returned with its id, description and the first label other
#   than `__Entity__` as `type`; each relationship with start/end ids, type
#   and description.
community_info = graph.query("""
MATCH (c:`__Community__`)<-[:IN_COMMUNITY*]-(e:__Entity__)
WHERE c.level IN [0,1,4]
WITH c, collect(e ) AS nodes
WHERE size(nodes) > 1
CALL apoc.path.subgraphAll(nodes[0], {
 whitelistNodes:nodes
})
YIELD relationships
RETURN c.id AS communityId, 
       [n in nodes | {id: n.id, description: n.description, type: [el in labels(n) WHERE el <> '__Entity__'][0]}] AS nodes,
       [r in relationships | {start: startNode(r).id, type: type(r), end: endNode(r).id, description: r.description}] AS rels
""")
community_info

[{'communityId': '0-19',
  'nodes': [{'id': '保險金', 'description': None, 'type': 'Concept'},
   {'id': '年複利方式', 'description': None, 'type': 'Concept'},
   {'id': '抵繳保險費', 'description': None, 'type': 'Action'},
   {'id': '各年度之增值回饋分享金', 'description': None, 'type': 'Benefit'},
   {'id': '增額繳清保險金額', 'description': None, 'type': 'Benefit'},
   {'id': '本公司', 'description': None, 'type': 'Organization'},
   {'id': '本契約終止', 'description': None, 'type': 'Event'},
   {'id': '要保人', 'description': None, 'type': 'Person'},
   {'id': '宣告利率', 'description': None, 'type': 'Concept'},
   {'id': '儲存生息', 'description': None, 'type': 'Concept'},
   {'id': '受益人', 'description': None, 'type': 'Person'}],
  'rels': [{'start': '各年度之增值回饋分享金',
    'description': None,
    'type': 'USED_FOR',
    'end': '抵繳保險費'},
   {'start': '各年度之增值回饋分享金',
    'description': None,
    'type': 'RELATED_TO',
    'end': '要保人'},
   {'start': '各年度之增值回饋分享金',
    'description': None,
    'type': 'OPTION',
    'end': '增額繳清保險金額'},
   

In [17]:
import os

from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_openai import ChatOpenAI
from langchain_openai import AzureChatOpenAI

# Azure OpenAI chat model; endpoint, deployment name and API version are all
# read from the environment (a missing variable raises KeyError here).
azure_openai_settings = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    "openai_api_version": os.environ["AZURE_OPENAI_API_VERSION"],
}
llm = AzureChatOpenAI(**azure_openai_settings)

In [20]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# System instruction sent ahead of every community summary request.
community_system_message = (
    "Given an input triples, generate the information summary. No pre-amble."
)

# Human message: the stringified community (nodes + relationships) is
# substituted for {community_info}.
community_template = """請根據提供同社區的資訊包含 nodes 與 relationships,
產生自然語言的摘要 同社區的資訊:
{community_info}

Summary:"""  # noqa: E501

community_messages = [
    ("system", community_system_message),
    ("human", community_template),
]
community_prompt = ChatPromptTemplate.from_messages(community_messages)

# prompt -> chat model -> plain string
community_chain = community_prompt | llm | StrOutputParser()

In [21]:
def prepare_string(data):
    """Render one community's nodes and relationships as plain text.

    Args:
        data: Mapping with a ``'nodes'`` list (items with ``'id'``, ``'type'``
            and optionally ``'description'``) and a ``'rels'`` list (items
            with ``'start'``, ``'end'``, ``'type'`` and optionally
            ``'description'``).

    Returns:
        A string made of a "Nodes are:" section and a "Relationships are:"
        section separated by a blank line, one item per line. A description
        is appended only when it is present and truthy.
    """

    def _description_suffix(item):
        # Missing, None and empty descriptions all render as nothing.
        text = item.get('description')
        return f", description: {text}" if text else ""

    node_lines = [
        f"id: {node['id']}, type: {node['type']}{_description_suffix(node)}\n"
        for node in data['nodes']
    ]
    rel_lines = [
        f"({rel['start']})-[:{rel['type']}]->({rel['end']}){_description_suffix(rel)}\n"
        for rel in data['rels']
    ]

    nodes_section = "Nodes are:\n" + "".join(node_lines)
    rels_section = "Relationships are:\n" + "".join(rel_lines)
    return nodes_section + "\n" + rels_section

def process_community(community):
    """Summarize a single community with the LLM chain.

    Args:
        community: One community record with ``'communityId'``, ``'nodes'``
            and ``'rels'`` keys.

    Returns:
        A dict with the community id under ``"community"`` and the generated
        text under ``"summary"``.
    """
    prompt_inputs = {'community_info': prepare_string(community)}
    generated_summary = community_chain.invoke(prompt_inputs)
    return dict(community=community['communityId'], summary=generated_summary)

# Try the summarization on the first community only; the returned dict is
# displayed as the cell output.
process_community(community_info[0])

{'community': '0-19',
 'summary': '在同一個社區中，主要的概念和相關行動圍繞著「各年度之增值回饋分享金」展開。以下是各節點和關係的摘要：\n\n- **各年度之增值回饋分享金**：這是核心概念，與多個節點有關。其用途包括：\n  - 用於抵繳保險費和增額繳清保險金額。\n  - 使用年複利方式和宣告利率進行計算。\n  - 受益人可以受領該金額。\n  - 要保人會收到有關增值回饋分享金的通知，並且可以選擇增額繳清保險金額。\n  - 在本契約終止時給付增值回饋分享金給要保人。\n  - 包含了保險金的概念。\n  - 該金額由本公司主動給付並管理，並且可能會被退還給要保人。\n\n- **要保人**：這是一個關鍵人物，與增值回饋分享金有多種互動。要保人可以選擇增額繳清保險金額，並會收到增值回饋分享金的通知。\n\n- **本公司**：作為一個組織，本公司管理和退還增值回饋分享金，並使用宣告利率來計算回饋。\n\n- **受益人**：受益人可以受領增值回饋分享金。\n\n- **儲存生息**：這個概念包含了各年度之增值回饋分享金。\n\n以上是同社區中各節點及其關係的摘要。'}

In [22]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Summarize every community concurrently: each summary is an independent
# chain invocation, so the calls are fanned out over a thread pool.
# Results are appended in completion order, not in `community_info` order.
summaries = []
with ThreadPoolExecutor() as executor:
    # Map each future back to its community so a failure can be attributed.
    futures = {executor.submit(process_community, community): community for community in community_info}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing communities"):
        try:
            summaries.append(future.result())
        except Exception as exc:
            # Still fail fast, but say which community broke; the original
            # exception stays attached as the cause.
            failed_id = futures[future]["communityId"]
            raise RuntimeError(f"Failed to summarize community {failed_id}") from exc

Processing communities: 100%|██████████| 11/11 [00:06<00:00,  1.64it/s]


In [23]:
# Display the collected community summaries (list of {'community', 'summary'} dicts).
summaries

[{'community': '0-16', 'summary': '現金給付是一種利益，它在保單年度開始給付。'},
 {'community': '0-14',
  'summary': '台灣人壽是一個組織，提供了一個名為「美鑫美利美元利率變動型終身壽險」的產品。這個產品包含了特定的承保範圍。'},
 {'community': '1-4',
  'summary': '台灣人壽是一家提供保險產品的組織，其中包括一種名為「美鑫美利美元利率變動型終身壽險」的產品。這款保險產品包含了特定的承保範圍。'},
 {'community': '1-6',
  'summary': '身故保險金和喪葬費用保險金兩者之間存在一種變更關係。兩者皆屬於保險概念，且喪葬費用保險金可由身故保險金進行變更。'},
 {'community': '1-1',
  'summary': '在民國九十九年二月三日，有一份契約訂立。該契約涉及到被保險人和未成年人。被保險人在此契約中達到了保險年齡，而未成年人與被保險人之間存在身故的關係。本契約是由未成年人訂立的。'},
 {'community': '0-7',
  'summary': '本社區包含兩個節點和一個關係。節點包括一個標識為「未成年人」的個人節點和一個標識為「本契約」的概念節點。這兩個節點之間存在一個關係，其中「本契約」有一個[:訂立]的關係指向「未成年人」。'},
 {'community': '0-2',
  'summary': '在同一社區內，有兩個概念節點：「身故保險金」和「喪葬費用保險金」。這兩個節點之間存在一個變更關係，其中「身故保險金」變更為「喪葬費用保險金」。'},
 {'community': '1-5', 'summary': '現金給付是一種福利，並且在保單年度開始給付。'},
 {'community': '0-19',
  'summary': '在這個社區中，"各年度之增值回饋分享金" 是一個中心概念，與多個節點和關係相關聯。它被用於抵繳保險費、增額繳清保險金額，並使用年複利方式和宣告利率。這些增值回饋分享金會主動由本公司給付，並包含於保險金中。要保人和受益人都是這些增值回饋分享金的相關人，其中要保人可以選擇將這些金額增額繳清保險金額，並且會被通知這些回饋金的信息。本公司不僅管理和返回這些增值回饋分

In [24]:
# Persist the generated summaries: for each {'community', 'summary'} row passed
# in as the `$data` parameter, MERGE the `__Community__` node with that id
# (creating it if no such node exists) and set its `summary` property.
graph.query("""
UNWIND $data AS row
MERGE (c:__Community__ {id:row.community})
SET c.summary = row.summary
""", params={"data": summaries})

[]