<a href="https://colab.research.google.com/github/e11106013/LLM/blob/main/LLM_to_Graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
#@title Install packages
# Colab setup: langchain_community provides the JSONLoader used below and pyvis renders
# the final interactive graph; jq is presumably what JSONLoader needs for `jq_schema`.
!pip install langchain_community jq pyvis



In [42]:
#@title 下載文件
!gdown '1EDcqa8OmfR7Y-wmoiDauYg30HGbhkkZQ' -O ./input/
#!gdown '1wFEw-M80BFIcvy2a8j22r4xwR6eoK7zK'
# 列出目前目錄下所有的檔案
# 確認檔案下載成功
print("📄 input 資料夾內容：", os.listdir("./input/"))

Downloading...
From: https://drive.google.com/uc?id=1EDcqa8OmfR7Y-wmoiDauYg30HGbhkkZQ
To: /content/input/「賽德克．巴萊」重現的霧社事件(17).json
  0% 0.00/3.65k [00:00<?, ?B/s]100% 3.65k/3.65k [00:00<00:00, 10.8MB/s]
📄 input 資料夾內容： ['「賽德克．巴萊」重現的霧社事件(17).json']


In [45]:
#@title Setup folder
import os
from pathlib import Path

## Input data directory (the download cell writes the source JSON here)
data_dir = "./input/"
inputdirectory = Path(data_dir)
## This is where the output csv files will be written
out_dir = "./output/"
outputdirectory = Path(out_dir)
# mkdir(exist_ok=True) replaces the separate exists()/makedirs() pair: it is a single
# call, creates missing parents, and is safe to re-run when the folder already exists.
outputdirectory.mkdir(parents=True, exist_ok=True)

In [46]:
#@title Load file
from pprint import pprint
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import JSONLoader
# Name of the JSON file (inside data_dir) to load
file='「賽德克．巴萊」重現的霧社事件(17).json'

## JSON Loader
# The jq expression '.articles[]' selects every element of the top-level "articles" array;
# presumably each element becomes one loaded document - confirm against the JSONLoader docs.
loader = JSONLoader(
    file_path=data_dir+file,
    jq_schema='.articles[]')
data = loader.load()

# Split the loaded article text into chunks of at most 1500 units as measured by len()
# (characters), with 150 units of overlap between neighbouring chunks. Note that this is
# size-based chunking, not one chunk per line.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

# `segmentation` is the list of chunks consumed by the dataframe cell below.
segmentation = splitter.split_documents(data)
#print("Number of chunks = ", len(segmentation))


In [47]:
#@title Create a dataframe of all the chunks
import uuid
import pandas as pd
import numpy as np

def documents2Dataframe(documents) -> pd.DataFrame:
    """Turn a sequence of document chunks into a DataFrame with one row per chunk.

    Args:
        documents: iterable of objects exposing ``page_content`` (the chunk text)
            and ``metadata`` (a dict that is expanded into additional columns).

    Returns:
        A DataFrame with a ``text`` column, one column per metadata key, and a
        ``chunk_id`` column holding a fresh random 32-character hex id per row.
        ``chunk_id`` is written last, so it wins over a metadata key of the same name.
    """
    # Build every row in one pass; the previous `rows = rows + [row]` copied the whole
    # list on each iteration, which is quadratic in the number of chunks.
    rows = [
        {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]
    return pd.DataFrame(rows)

# Build the chunk table from the split documents produced by the loader cell.
df = documents2Dataframe(segmentation)
#print(df.shape)
# df.head()
# Persist the chunks as a pipe-separated CSV (no index column) in the output folder.
df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)

## Setup the LLM

In [48]:
# import the packages
import os
from getpass import getpass

from openai import OpenAI

# Keys are created at https://console.groq.com/keys
# Never hard-code the key in the notebook: reuse GROQ_API_KEY when the environment
# already provides it, otherwise prompt for it without echoing. The previous assignment
# of the 'gsk_' placeholder overwrote any real key that was already configured.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")
model_name = "llama-3.3-70b-versatile"

# The OpenAI client is pointed at Groq's OpenAI-compatible endpoint.
client = OpenAI(
        base_url="https://api.groq.com/openai/v1",
        api_key=os.environ["GROQ_API_KEY"]
        )

In [49]:

#@title function to call the model to generate
def generate(sys_input: str, user_input: str, temp: float) -> list:
    """Send one system + user message pair to the chat model and return its reply.

    Args:
        sys_input: content of the system message.
        user_input: content of the user message.
        temp: sampling temperature forwarded to the API. It is a number (callers
            pass 0.0); the earlier ``str`` annotation was wrong.

    Returns:
        A one-element list holding the message content of the first choice.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {'role': 'system', 'content': sys_input},
            {'role': 'user', 'content': user_input},
        ],
        temperature=temp,
    )
    return [response.choices[0].message.content]


def df2Graph(dataframe: pd.DataFrame) -> list:
    """Run the extraction prompt on every chunk and flatten the results.

    Args:
        dataframe: chunk table with at least the ``text`` and ``chunk_id`` columns.

    Returns:
        One flat list containing the relation dicts of all chunks, in row order.
        Chunks for which ``Prompt`` returned ``None`` (unparseable model output) are
        skipped. An empty list is returned when there are no rows or every chunk
        failed; the earlier ``np.concatenate`` based flatten raised in that case.
    """
    concept_list = []
    for text, chunk_id in zip(dataframe["text"], dataframe["chunk_id"]):
        extracted = Prompt(text, {"chunk_id": chunk_id})
        if extracted is None:
            # invalid json from the model: nothing to add for this chunk
            continue
        concept_list.extend(extracted)
    return concept_list

def graph2Df(nodes_list) -> pd.DataFrame:
    """Convert the flat list of relation dicts into a cleaned edge DataFrame.

    Args:
        nodes_list: list of dicts, normally with ``node_1``, ``node_2``, ``edge``
            and ``chunk_id`` keys.

    Returns:
        A DataFrame in which rows with a missing or single-space ``node_1`` /
        ``node_2`` are removed and both node columns are lower-cased strings.
        An empty input gives an empty frame that still has the two node columns.
    """
    graph_dataframe = pd.DataFrame(nodes_list)
    # Guarantee the node columns exist so an empty or partial extraction yields an
    # empty result instead of a KeyError in dropna below.
    for column in ("node_1", "node_2"):
        if column not in graph_dataframe.columns:
            graph_dataframe[column] = np.nan
    ## Remove all NaN entities (a lone space is treated as missing)
    graph_dataframe = graph_dataframe.replace(" ", np.nan)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
    # str() first: the model may emit a number as a node, and int has no .lower()
    for column in ("node_1", "node_2"):
        graph_dataframe[column] = graph_dataframe[column].map(lambda x: str(x).lower())

    return graph_dataframe

## Extract Concepts, Prompt

In [50]:
import json
def Prompt(input: str, metadata=None):
    """Ask the LLM to extract (node_1, node_2, edge) relations from one text chunk.

    Args:
        input: the chunk text; it is embedded between ``` fences in the user message.
        metadata: optional dict merged into every extracted relation (callers pass
            ``{"chunk_id": ...}``). Defaults to no metadata.

    Returns:
        A list of relation dicts, each extended with ``metadata``, or ``None`` when
        the model reply cannot be parsed as a JSON list of objects. Failed replies
        are appended to ``error_file.log`` in the working directory.
    """
    # None instead of a shared mutable `{}` default; a missing chunk_id no longer raises.
    metadata = {} if metadata is None else metadata
    chunk_id = metadata.get("chunk_id", "")
    print(chunk_id)
    SYS_PROMPT = (
        "你是一位network graph maker，需要提取文章中的實體(entities)及其關係(relation)。"
        "提供給你一段文章（由```分隔），任務是找出文章裡本體(ontology)"
        "找出文章中的關鍵術語(term)，這些實體(entities)的有以下分類:"
        "[人物、人名、地點、地名、物件、文件、事件、單位組織、雜項、數字單位、縮寫、概念、條件]\n"
        "步驟1：比對整段文章的同時，仔細思考(think about)其中提到的關鍵術語(term)。\n"
        "\t關鍵術語包括人物、人名、地點、地名、物件、文件、事件、單位組織、雜項、數字單位、縮寫、概念、條件等\n\n"
        "步驟2：仔細思考(think about)如何將關鍵術語(term)與其他術語(term)建立一對一的關係。\n"
        "\t在同一段文章中出現的關鍵術語(term)通常有相關(relation)。\n\n"
        "步驟3：找出每一對相關術語(term)之間的關係(relation)。\n"
        "\t以關鍵術語(term)的前後句子的動詞作為關係\n\n"
        "輸出務必以標準json格式，不需要多餘解釋說明，以下範例：\n"
        "["
        "   {"
        '       "node_1": "從提取的本體(ontology)中提取的實體(entities)",'
        '       "node_2": "從提取的本體(ontology)中提取的與node_1 有關(relation)的實體(entities)",'
        # The closing quote on the next line was missing, which made the JSON example
        # shown to the model malformed.
        '       "edge": "node_1 和node_2 這兩個實體(entities)的關係(relation)，使用一個動詞(verb)簡化這個句子"'
        "   }, {...}"
        "]"
    )

    USER_PROMPT = f"content: ```{input}``` \n\n output: "
    response = generate(sys_input=SYS_PROMPT, user_input=USER_PROMPT, temp = 0.0)
    print(response[0])
    try:
        result = json.loads(response[0])
        result = [dict(item, **metadata) for item in result]
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError/ValueError also cover a
        # reply that parses but is not a list of objects. Other errors still propagate.
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
        # Append so that earlier failures are kept; mode 'w' left only the last one.
        with open('error_file.log', 'a', encoding='utf-8') as file:
            file.write(f"{chunk_id}{response[0]}\n")
    return result

In [51]:
#@title To regenerate the graph with LLM, set this to True
regenerate = True #False
if not regenerate:
    # Reuse the relations saved by an earlier run instead of calling the LLM again.
    dfg1 = pd.read_csv(outputdirectory/"dfg1.csv", sep="|")
else:
    # Extract relations from every chunk with the LLM ...
    concepts_list = df2Graph(df)
    # ... turn them into an edge table and save it for later runs.
    dfg1 = graph2Df(concepts_list)
    dfg1.to_csv(outputdirectory/"dfg1.csv", sep="|", index=False)

# Treat empty strings as missing, keep only complete (node_1, node_2, edge) rows,
# and give every LLM-extracted relation a fixed count of 4.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", 'edge'])
    .assign(count=4)
)

d7f7dd56111646ddb93ede8dc3e6498d
[
  {
    "node_1": "霧社事件",
    "node_2": "德哥塔雅原住民",
    "edge": "發生"
  },
  {
    "node_1": "霧社事件",
    "node_2": "日本帝國主義",
    "edge": "反抗"
  },
  {
    "node_1": "德哥塔雅原住民",
    "node_2": "莫那魯道",
    "edge": "率領"
  },
  {
    "node_1": "莫那魯道",
    "node_2": "霧社公校",
    "edge": "衝入"
  },
  {
    "node_1": "霧社公校",
    "node_2": "日本人",
    "edge": "殺死"
  },
  {
    "node_1": "台灣總督府",
    "node_2": "霧社事件",
    "edge": "採取"
  },
  {
    "node_1": "台灣總督府",
    "node_2": "飛機",
    "edge": "使用"
  },
  {
    "node_1": "飛機",
    "node_2": "化學毒氣",
    "edge": "投擲"
  },
  {
    "node_1": "莫那魯道",
    "node_2": "自縊",
    "edge": "死亡"
  },
  {
    "node_1": "德哥塔雅原住民",
    "node_2": "滅族",
    "edge": "遭受"
  }
]
200379d963e348a288089198d6bbf0ee
[
  {
    "node_1": "霧社事件",
    "node_2": "日本",
    "edge": "忽視"
  },
  {
    "node_1": "日本",
    "node_2": "泰雅族",
    "edge": "忽視"
  },
  {
    "node_1": "日本",
    "node_2": "原住民",
    "edge": "徵召"
  },
  {
    "node_1": "日本警官

In [52]:
## print the graph.csv
# print(dfg1.shape)
# dfg1.head()
#dfg1.tail()

In [53]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that occur in the same text chunk.

    Args:
        df: edge table with ``chunk_id``, ``node_1`` and ``node_2`` columns.
            It is not modified.

    Returns:
        A DataFrame with columns ``node_1``, ``node_2``, ``chunk_id`` (comma-joined
        ids of the chunks where the pair co-occurs), ``count`` (number of
        co-occurrences) and ``edge`` (always ``"contextual proximity"``).
        Pairs that co-occur only once are dropped.
    """
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    dfg2 = dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2 = dfg2.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. The explicit copy makes the column assignment below act
    # on an independent frame instead of a filtered slice (SettingWithCopyWarning).
    dfg2 = dfg2[dfg2["count"] != 1].copy()
    dfg2["edge"] = "contextual proximity"
    return dfg2


In [54]:
# Derive the co-occurrence ("contextual proximity") edges. The merge cell that follows
# concatenates dfg1 with dfg2, so leaving this step commented out makes a fresh
# Restart & Run All stop with a NameError on `dfg2`.
# dfg1 is taken from memory rather than re-read from dfg1.csv, because that CSV is
# written before the `count` column is added to dfg1.
dfg2 = contextual_proximity(dfg1)
dfg2.to_csv(outputdirectory/"dfg2.csv", sep="|", index=False)
#dfg2.tail()

In [56]:
# Stack the LLM-extracted relations on top of the co-occurrence edges, then collapse
# duplicate (node_1, node_2) pairs: chunk ids and edge labels are comma-joined and the
# counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
    .reset_index()
)

# Save the merged edge list as a pipe-separated CSV.
dfg.to_csv(outputdirectory / "dfg.csv", sep="|", index=False)
#dfg.tail()

In [59]:
# Reload the merged edge list from disk.
dfg = pd.read_csv(outputdirectory / "dfg.csv", sep="|")
# Distinct node names in order of first appearance: all node_1 values, then node_2.
nodes = pd.concat(
    [dfg[column] for column in ("node_1", "node_2")],
    axis=0,
).unique()
#nodes.tail()
# Entities (NER-style) that carry knowledge-graph meaning
print(nodes)

['人見次郎' '人類文化' '人類生活歷史' '化學毒氣' '原住民' '原住民抗日精神' '台灣總督府' '史詩電影' '國家檔案' '導演'
 '巫金墩' '德哥塔雅原住民' '日本' '日本人' '日本帝國主義' '日本帝國議會' '日本警官' '泰雅族' '滅族' '理蕃政策'
 '石塚英藏' '種族衝突' '端倪' '總務長官人見次郎' '總督府' '總督石塚英藏' '自然界' '自縊' '莫那魯道' '軍國主義天皇思想'
 '部落婦女' '電影' '霧社事件' '霧社公校' '飛機' '總務長官' '抗日精神' '總督' '威信' '台灣反抗行動']


In [60]:
import networkx as nx

# Undirected graph holding every extracted entity and relation.
G = nx.Graph()

## Add nodes to the graph (names are stringified so non-string values become labels)
G.add_nodes_from(str(name) for name in nodes)

## Add edges to the graph: the edge label becomes the title and count/4 the weight
G.add_edges_from(
    (str(source), str(target), {"title": label, "weight": count / 4})
    for source, target, label, count in zip(
        dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
    )
)

In [63]:
# 畫圖社群
import matplotlib.pyplot as plt

def draw_communities(G, communities, title, seed=None):
    """Plot the graph with one colour per community.

    Args:
        G: the networkx graph to draw.
        communities: iterable of node collections, one per community.
        title: figure title.
        seed: optional seed passed to the spring layout so the picture can be
            reproduced; ``None`` keeps the previous unseeded behaviour.
    """
    pos = nx.spring_layout(G, seed=seed)
    plt.figure(figsize=(8, 6))
    for i, community in enumerate(communities):
        nx.draw_networkx_nodes(
            G,
            pos,
            nodelist=community,
            # Without an explicit colour every community used the same default colour,
            # so the legend entries were indistinguishable. "C0".."C9" cycle through
            # matplotlib's default colour cycle.
            node_color=f"C{i % 10}",
            label=f'Community {i+1}',
        )
    nx.draw_networkx_edges(G, pos)
    plt.title(title)
    plt.axis('off')
    plt.legend()
    plt.show()


# Detect the community structure of the graph with the Girvan-Newman algorithm.
# The generator is advanced twice, in this order: the first partition is kept in
# top_level_communities and the second one in next_level_communities.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
# Use the second partition, with the members of each community sorted and the
# communities themselves sorted, as the final community list.
communities = sorted(map(sorted, next_level_communities))
#print("Number of Communities = ", len(communities))
#print(communities)
#draw_communities(G, sorted(map(sorted, next_level_communities)), "Girvan-Newman Communities")

In [65]:
import seaborn as sns
import random

# Name of the seaborn palette used for community colours: "hls" stands for hue,
# lightness and saturation.
palette = "hls"

## Map every node to the colour and group number of its community
def colors2Community(communities) -> pd.DataFrame:
    """Assign one palette colour and one 1-based group number to each community.

    Args:
        communities: sequence of node collections, one per community.

    Returns:
        A DataFrame with one row per node and the columns ``node``, ``color``
        (hex string shared by the whole community) and ``group``.
    """
    # One hex colour per community, in shuffled order
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)

    records = []
    for group, members in enumerate(communities, start=1):
        # Colours are consumed from the end of the shuffled list
        shade = shades.pop()
        records.extend(
            {"node": member, "color": shade, "group": group} for member in members
        )
    # One row per node with its assigned colour and group
    return pd.DataFrame(records)

# Build the colour table for the detected communities.
colors = colors2Community(communities)
#colors

# Copy group and colour onto each graph node; the node size is its degree.
for entry in colors.to_dict("records"):
    node_name = entry["node"]
    G.nodes[node_name].update(
        group=entry["group"],
        color=entry["color"],
        size=G.degree[node_name],
    )

In [66]:
from pyvis.network import Network
# Importing the submodule binds `IPython` and guarantees that `IPython.display` is
# loaded; this replaces the two plain `import IPython` lines.
import IPython.display

# Path of the HTML file the interactive graph is written to
graph_output_directory = "./index.html"

net = Network(
    notebook=True,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Load nodes, edges and their attributes from the networkx graph
net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

# Write the HTML before displaying it. With `net.show(...)` commented out nothing in
# this cell produced index.html, so the display call below showed a stale file from an
# earlier session or failed on a fresh runtime.
net.save_graph(graph_output_directory)
IPython.display.HTML(filename=graph_output_directory)