In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

'\nCopyright (c) Microsoft Corporation.\n'

In [12]:
import os

import nest_asyncio
import pandas as pd
import tiktoken

from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

## Global Search example

The global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but it often gives good responses for questions that require an understanding of the dataset as a whole (e.g., "What are the most significant values of the herbs mentioned in this notebook?").

### LLM setup

In [13]:
# The API key comes from the environment so it never appears in the notebook;
# a missing ZHIPUAI_API_KEY variable raises KeyError on this line.
api_key = os.environ["ZHIPUAI_API_KEY"]

# Keyword arguments for the chat client, gathered in one mapping so the
# endpoint, model and retry budget can be read (and tuned) in one place.
llm_settings = {
    "api_key": api_key,
    "model": "glm-4",
    "api_base": "https://open.bigmodel.cn/api/paas/v4",
    # Either OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI.
    "api_type": OpenaiApiType.OpenAI,
    "max_retries": 20,
}
llm = ChatOpenAI(**llm_settings)

# Token encoder handed to the context builder and the search engine further down.
token_encoder = tiktoken.get_encoding("cl100k_base")

### Load community reports as context for global search

- Load all community reports in the `create_final_community_reports` table from the indexing engine, to be used as context data for global search.
- Load entities from the `create_final_nodes` and `create_final_entities` tables from the indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the `rank` attribute in the community reports table for context ranking).

In [14]:
# Directory holding the parquet files generated by the indexing pipeline.
# Deliberately written without a trailing slash: every reader in this notebook
# builds its path as f"{INPUT_DIR}/{TABLE}.parquet", so a trailing slash here
# would put a doubled "//" into each path.
INPUT_DIR = "./coplay_analysis_2024_02_graph/output/20240716-142712/artifacts"

# Parquet file stems (table names) read from INPUT_DIR.
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
TEXT_UNIT_TABLE = "create_final_text_units"

# Community level in the Leiden community hierarchy from which the community
# reports are loaded. A higher value means reports from more fine-grained
# communities are used (at the cost of more computation).
COMMUNITY_LEVEL = 2

In [15]:
# Read the node, community-report and entity-embedding tables (in that order)
# from the parquet artifacts under INPUT_DIR.
entity_df, report_df, entity_embedding_df = (
    pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")
    for table_name in (ENTITY_TABLE, COMMUNITY_REPORT_TABLE, ENTITY_EMBEDDING_TABLE)
)

# Convert the raw tables into the report / entity objects consumed by the
# global-search context builder, restricted by COMMUNITY_LEVEL.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Report records: 1398


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,1460,# The Withdrawn Message and Its Community Impa...,4,7.5,The Withdrawn Message and Its Community Impact,The impact severity rating is high due to the ...,This community is centered around a withdrawn ...,[{'explanation': 'The withdrawn message is a p...,"{\n ""title"": ""The Withdrawn Message and Its...",cd8bd1a6-35ed-4c19-b566-84970ad2b44c
1,1462,# The 'Family' and 'Common Interests' Communit...,4,6.5,The 'Family' and 'Common Interests' Community,The community's impact severity rating is mode...,This community is centered around the topics o...,[{'explanation': 'The entity 'family' plays a ...,"{\n ""title"": ""The 'Family' and 'Common Inte...",906e0e6c-5748-4a5d-9297-91dbb81e100b
2,1463,# Emoji Usage in Digital Communication\n\nThe ...,4,7.0,Emoji Usage in Digital Communication,The impact severity rating is high due to the ...,The community revolves around the use of emoji...,[{'explanation': 'The entity '表情图片' is central...,"{\n ""title"": ""Emoji Usage in Digital Commun...",f33e0cd4-87ca-4c20-90a7-8f572310e228
3,1464,# Image Sharing and Text Semantics Community\n...,4,7.5,Image Sharing and Text Semantics Community,The community has a high impact severity ratin...,This community is centered around the entities...,[{'explanation': 'Image Sharing serves as a cr...,"{\n ""title"": ""Image Sharing and Text Semant...",4ff0a442-8d91-4a55-b02d-e84267969a62
4,1465,# 快感转移社区\n\n该社区围绕“快感转移”这一核心概念展开，涉及个体在社交互动中情感体验...,4,7.0,快感转移社区,该社区的影响严重性评分为7.0，因为快感转移在社交互动中的广泛应用和其对个体行为的影响。,该社区围绕“快感转移”这一核心概念展开，涉及个体在社交互动中情感体验的传递和转换。主要实体包...,[{'explanation': '快感转移是指个体在社交互动中，将快乐或愉悦的情感体验通过...,"{\n ""title"": ""\u5feb\u611f\u8f6c\u79fb\u793...",577b3ecd-84a3-4334-bbbc-e83878d036d9


In [16]:
# Read the relationship table and convert it into relationship objects;
# the raw frame's first rows are shown as a preview.
relationship_path = f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet"
relationship_df = pd.read_parquet(relationship_path)
relationships = read_indexer_relationships(relationship_df)

relationship_df.head()

Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,"""露ᥫᩣ""","""资源文件""",6.0,"The entities ""\u9732\u196b\u1a63"" and ""\u8d44\...","[2f9cd9703c27ddff0fa9566dd51cf050, 7e26ee101d0...",ea3e1090830f4dd487275c5b26431017,0,1004,14,1018
1,"""露ᥫᩣ""","""预期落空""",5.0,"The entity known as ""隐形"" (let's romanize it as...","[0334afcf65b714b055b1053620bdebf0, 1be0c7e8eb4...",3be7fb95d9d44770bff83ce1cd97da29,1,1004,134,1138
2,"""露ᥫᩣ""","""汪汪？""",1.0,"""露ᥫᩣ通过模仿狗叫来回应张毛峰，表现出一种互动行为。""",[139bacf9bda1d187c2b598ee00a4a15c],53b08a50449b4ad68d97b06d2cc34552,2,1004,1,1005
3,"""露ᥫᩣ""","""＝""",1.0,"""露ᥫᩣ的情绪状态反映了对话的紧张气氛。""",[139bacf9bda1d187c2b598ee00a4a15c],5f28d75699ca4adb9457b1d3ef616992,3,1004,1,1005
4,"""露ᥫᩣ""","""切""",1.0,"""露ᥫᩣ的情绪状态是对张毛峰意图的直接反应。""",[139bacf9bda1d187c2b598ee00a4a15c],26f48172f1ae4a3cad86b27d532ffde2,4,1004,1,1005


In [17]:
# Read the text-unit table and convert it into text-unit objects.
text_unit_path = f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet"
text_unit_df = pd.read_parquet(text_unit_path)
text_units = read_indexer_text_units(text_unit_df)

# Report how many rows were loaded, then preview the raw frame.
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 7987


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,df45c0960a2757f041adf9f4b5ef10ac,\n\n\n### 00_个人ip介绍\n> 昵称：露ᥫᩣ\n>\n> 资源文件(msg-8...,300,[014f54feee58fac2034a36c59f2793bc],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[ea3e1090830f4dd487275c5b26431017, 6f4aced1d97..."
1,bb0f578b1cec306d6fa524b919e750ab,身体状况的非理性关心。\n- **偶然性**：对话中插入的表情符号和突然改变话题（如从吃饭到...,300,[014f54feee58fac2034a36c59f2793bc],"[b45241d70f0e43fca764df95b2b81f77, 19a7f254a5d...","[3be7fb95d9d44770bff83ce1cd97da29, bd0bc746087..."
2,6ee515a5c906d798c2f853edd50f1cea,�究领域相关。\n10. **卢曼理论与社会学的交叉点**：这段对话是研究数字时代人际互动的...,300,[014f54feee58fac2034a36c59f2793bc],"[e1fd0e904a53409aada44442f23a51cb, de988724cfd...","[27d8a54157ab4b00844ef0dcfc8aeb0d, 01ac4b169e3..."
3,b7a921cb076f250b71696e5932098e13,各种观点，并在社交媒体的背景下进行应用和研究。\n```\n\n#### 03-故事场景生成...,300,[014f54feee58fac2034a36c59f2793bc],"[e1fd0e904a53409aada44442f23a51cb, de988724cfd...","[27d8a54157ab4b00844ef0dcfc8aeb0d, 042d8102a81..."
4,0f77df9e5f0e50ebcc703c7533499e6a,做的菜，但他一直要求我带他一起，我当然没答应。他甚至开玩笑说给我买东西，但我的心情并没有因此...,300,[014f54feee58fac2034a36c59f2793bc],"[de988724cfdf45cebfba3b13c43ceede, 254770028d7...","[216e518f1ff2428eb475c31d06809fa2, 67cb560129f..."


#### Build global context based on community reports

In [18]:
# Context builder over the loaded community reports.
# `entities` is optional: leave it out (it defaults to None) if community
# weights should not be used for ranking.
context_builder = GlobalCommunityContext(
    token_encoder=token_encoder,
    community_reports=reports,
    entities=entities,
)

#### Perform global search

In [19]:
# Options forwarded to the context builder.
context_builder_params = dict(
    # False -> use full community reports; True -> use the short community summaries.
    use_community_summary=False,
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    # Change this based on the token limit of your model
    # (with an 8k-limit model, a good setting could be 5000).
    max_tokens=12_000,
    context_name="Reports",
)

# LLM call parameters for the map step.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.1,
    response_format=dict(type="json_object"),
)

# LLM call parameters for the reduce step.
reduce_llm_params = dict(
    # Change this based on the token limit of your model
    # (with an 8k-limit model, a good setting could be 1000-1500).
    max_tokens=2000,
    temperature=0.1,
)

In [20]:
# Assemble the global (map-reduce) search engine from the pieces built above.
search_engine = GlobalSearch(
    # Model, context and tokenizer.
    llm=llm,
    context_builder=context_builder,
    context_builder_params=context_builder_params,
    token_encoder=token_encoder,
    # Change this based on the token limit of your model
    # (with an 8k-limit model, a good setting could be 5000).
    max_data_tokens=12_000,
    # Per-step LLM parameters.
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    # Set to False if your LLM model does not support JSON mode.
    # NOTE(review): the recorded run of this notebook logs repeated
    # "Error parsing search response json" / JSONDecodeError messages, so
    # glm-4 may not be honoring JSON mode - confirm and consider False.
    json_mode=True,
    # True adds an instruction encouraging the LLM to incorporate general
    # knowledge in the response, which may increase hallucinations but could
    # be useful in some use cases.
    allow_general_knowledge=False,
    concurrent_coroutines=32,
    # Free-form text describing the response type and format; can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs,
    # multiple-page report.
    response_type="Multiple Paragraphs",
)

In [21]:
# `nest_asyncio` is imported in the notebook's top import cell.
# Applying it allows nested use of the already-running event loop;
# presumably needed because the synchronous `search` call drives its own
# event loop inside Jupyter - TODO confirm against the installed graphrag.
nest_asyncio.apply()

# The question put to the global search engine.
query = "张毛峰说了哪些情话"
result = search_engine.search(query)

print(result.response)

Error parsing search response json
Traceback (most recent call last):
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 194, in _map_response_single_batch
    processed_response = self.parse_search_response(search_response)
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 233, in parse_search_response
    parsed_elements = json.loads(search_response)["points"]
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1

根据提供的数据，没有找到张毛峰所说的具体情话。


Error parsing search response json
Traceback (most recent call last):
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 194, in _map_response_single_batch
    processed_response = self.parse_search_response(search_response)
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 233, in parse_search_response
    parsed_elements = json.loads(search_response)["points"]
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1

根据提供的数据，张毛峰在社区中表达了一些情感，但并没有明确记录他所说的“情话”。在数据中，张毛峰与“不带他玩游戏”和“想我”两个实体存在直接关系，这反映了他在情感和社交活动中的矛盾心理。这可能表明他在社交互动中有所保留或存在某种不确定性。然而，没有具体的“情话”记录。


Error parsing search response json
Traceback (most recent call last):
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 194, in _map_response_single_batch
    processed_response = self.parse_search_response(search_response)
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 233, in parse_search_response
    parsed_elements = json.loads(search_response)["points"]
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1

什么都没有
什么都没有。


Error parsing search response json
Traceback (most recent call last):
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 194, in _map_response_single_batch
    processed_response = self.parse_search_response(search_response)
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 233, in parse_search_response
    parsed_elements = json.loads(search_response)["points"]
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1

什么情话都没有说


Error parsing search response json
Traceback (most recent call last):
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 194, in _map_response_single_batch
    processed_response = self.parse_search_response(search_response)
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 233, in parse_search_response
    parsed_elements = json.loads(search_response)["points"]
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1

什么都没有```json{  "points": [    {"description": "The data provided does not contain any information about what Zhang Maofeng said about love.", "score": 0}  ]}


Error parsing search response json
Traceback (most recent call last):
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 194, in _map_response_single_batch
    processed_response = self.parse_search_response(search_response)
  File "/media/gpt4-pdf-chatbot-langchain/graphrag/graphrag/query/structured_search/global_search/search.py", line 233, in parse_search_response
    parsed_elements = json.loads(search_response)["points"]
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/dmeck/.pyenv/versions/3.10.0/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1

根据提供的数据，张毛峰在对话中提到了多个浪漫元素，包括“爱人的手”、“浪漫的气息”、“玫瑰”和“白雪”。这些元素不仅象征着爱情和亲密关系，还反映了张毛峰在情感表达和氛围营造方面的特点。具体来说：1. 爱人的手：张毛峰用“爱人的手”来传达深情和亲密关系，这个符号在社区中的地位凸显了其在情感表达方面的重要性。[Data: Entities (5471), Relationships (2773, 4579, 12069)]2. 浪漫的气息：张毛峰在对话中体验到的情感状态，这种情感状态影响了他的情绪。这种浪漫气氛不仅体现在对话中，还涵盖了讨论主题周围的情感氛围。[Data: Entities (5981), Relationships (2774, 12077)]3. 玫瑰：张毛峰用玫瑰来表达爱意。玫瑰不仅在对谈中被提及，还与爱情和美丽紧密相关。[Data: Entities (5469), Relationships (2771, 4578, 12069, 12070)]4. 白雪：张毛峰用白雪来传达爱情的纯洁和宁静。白雪作为自然景观，不仅增强了浪漫氛围，还在情感表达方面具有象征意义。[Data: Entities (5470), Relationships (2772, 12070, 12071)]这些浪漫元素的使用显示了张毛峰在情感表达和氛围营造方面的特点。
张毛峰在社交媒体上对露表达了他的情感，包括直接表达爱意和关心 [Data: Reports (747, 600, 586, 545, 300, +more)]。


In [30]:
# Inspect the report rows that were used to build the context for the LLM responses.
df = result.context_data["reports"]

# Sort by 'id' numerically: the sort key casts the ids to int, so "10" is
# ordered after "9" rather than before "2".
df_extended = df.sort_values(by='id', key=lambda x: x.astype(int))

# Keep only the rows whose 'id' is one of the ids listed here.
ids_to_print = ["5503"]
filtered_df = df_extended[df_extended['id'].isin(ids_to_print)]

# A requested id may not be part of the context for this particular query;
# guard the lookup so the cell reports that instead of raising IndexError.
if filtered_df.empty:
    first_content = None
    print(f"No report with id in {ids_to_print} was used in the context")
else:
    first_content = filtered_df['content'].iloc[0]

first_content

Unnamed: 0,id,title,occurrence weight,content,rank
406,3,Problem Solving and Communication Dynamics,0.001845,# Problem Solving and Communication Dynamics\n...,4.5
649,5,Image Popup and JavaScript Functionality,0.001845,# Image Popup and JavaScript Functionality\n\n...,3.0
981,10,忙碌的工作与享受游戏时光,0.000923,# 忙碌的工作与享受游戏时光\n\n该社区主要由两个关键实体组成，即忙碌的工作状态和享受游戏...,4.5
744,18,张毛峰与一心二用行为,0.000923,# 张毛峰与一心二用行为\n\n该社区围绕张毛峰声称的能够同时进行两项活动的行为构建。张毛峰...,3.0
55,19,Game Time and Shared Relaxation,0.001845,# Game Time and Shared Relaxation\n\nThis comm...,3.5
...,...,...,...,...,...
68,1262,The Dad and His Community,0.011070,# The Dad and His Community\n\nThis community ...,6.5
762,1264,'露露的爸爸' and '炖大骨头',0.001845,# '露露的爸爸' and '炖大骨头'\n\nThis community is cent...,4.5
809,1265,Pet Discussion with Zhang Mudong,0.001384,# Pet Discussion with Zhang Mudong\n\nThis com...,3.0
646,1266,日常琐事与情感反应社区,0.003229,# 日常琐事与情感反应社区\n\n该社区围绕日常琐事这一主题展开，涉及到张毛峰、露ᥫᩣ等人的...,4.5


In [33]:
# Number of rows in the community-report table loaded earlier.
report_df.shape[0]

1398

In [25]:

# Build a DataFrame from the loaded entity records.
df_entities = pd.DataFrame(entities)

# Keep only the rows whose 'short_id' is one of the ids listed here.
ids_to_print = ["481"]
short_id_match = df_entities['short_id'].isin(ids_to_print)
filtered_df = df_entities.loc[short_id_match]
filtered_df

Unnamed: 0,id,short_id,title,type,description,description_embedding,name_embedding,graph_embedding,community_ids,text_unit_ids,document_ids,rank,attributes
1566,a1773cac7d4c4939aec965660e5015fe,481,"""日常点滴""","""CONVERSATION_TOPIC""","""日常点滴是指患者分享的日常生活细节，这些细节可能与其情绪状态和问题有关。""","[-0.0018155464, 0.026184125, -0.027611624, -0....",,,[232],[3ecd619f711e0171353229fae1a60e4e],,2,


In [37]:



# Build a DataFrame from the loaded relationship records.
df_relationships = pd.DataFrame(relationships)

# Keep only the rows whose 'short_id' is one of the ids listed here.
ids_to_print = ["1697", "2257"]
short_id_match = df_relationships['short_id'].isin(ids_to_print)
filtered_df = df_relationships.loc[short_id_match]
filtered_df


Unnamed: 0,id,short_id,source,target,weight,description,description_embedding,text_unit_ids,document_ids,attributes
1697,bd2978a9aff1462f9fa5e82f6a826407,1697,"""安全的空间""","""日常点滴""",1.0,"""在安全的空间中，张毛峰被鼓励分享他的日常点滴，这些细节可能有助于理解他的情况。""",,[3ecd619f711e0171353229fae1a60e4e],,{'rank': 11}
2257,97febb21fcdb4b9ebc5a4c01c6581381,2257,"""情绪体验""","""日常点滴""",1.0,"""通过讨论日常点滴，可以引导张毛峰表达他的情绪体验。""",,[3ecd619f711e0171353229fae1a60e4e],,{'rank': 8}


In [55]:

# Build a DataFrame from the loaded text-unit records.
df_text_units = pd.DataFrame(text_units)

# Keep only the rows whose 'id' is one of the ids listed here.
# (The id used to be listed twice; `isin` matches on membership, so a single
# entry selects exactly the same rows.)
ids_to_print = ["3ecd619f711e0171353229fae1a60e4e"]
filtered_df = df_text_units[df_text_units['id'].isin(ids_to_print)]
filtered_df

Unnamed: 0,id,short_id,text,text_embedding,entity_ids,relationship_ids,covariate_ids,n_tokens,document_ids,attributes
184,3ecd619f711e0171353229fae1a60e4e,184,# 故事梗概\n \nNone\n\n\n# 故事情境\n\nNone\n\n\n# 故事场...,,"[b45241d70f0e43fca764df95b2b81f77, 43544b99c3b...","[86355651c9bb4820870218527e5cc981, 979e2a88198...",,300,[2eb13b054b64fc5fccf3124f54f356d2],


In [61]:

# Read the base-documents table from the parquet artifacts under INPUT_DIR.
base_documents_path = f"{INPUT_DIR}/create_base_documents.parquet"
create_base_documents_df = pd.read_parquet(base_documents_path)

# Keep only the rows whose 'id' is one of the ids listed here.
ids_to_print = ["2eb13b054b64fc5fccf3124f54f356d2"]
id_match = create_base_documents_df['id'].isin(ids_to_print)
filtered_df = create_base_documents_df.loc[id_match]

# Show the raw content of the first matching document.
first_content = filtered_df['raw_content'].iloc[0]
first_content

'\n\n\n### 00_个人ip介绍\n> 昵称：张毛峰\n>\n> 资源文件(msg-3_5_keyframe)\n>\n> 字幕文件([msg-3_5_keyframe.csv](/media/gpt4-pdf-chatbot-langchain/InterpretationoDreams/社会交流步骤分析/msg_extract_csv/msg-3_5_keyframe.csv))\n>\n> 存储文件([storage_msg-3_5_keyframe](/media/gpt4-pdf-chatbot-langchain/InterpretationoDreams/社会交流步骤分析/msg_extract_storage/张毛峰/storage_msg-3_5_keyframe))\n\n\n#### 03- 故事情境生成 `story_scenario_context`\n```text\nNone\n```\n\n#### 03-故事场景生成 `scene_monologue_context`\n```text\nNone\n```\n\n#### 04-情感情景引导\n```text\n作为一个心理咨询工作者，尝试使用开放性问题与来访者交流，尝试带入他们的感受和思维，下面内容中已经整理了关于None的`故事梗概`，`故事情境`,`故事场景`，请告诉我如何引导来访者说出自己的问题\n你可以尝试分步思考然后告诉我答案，Step by Step Decomposition\n\n例如\n**Step 1: 建立情感连接**\n开始时，您可以通过表达理解和共鸣来建立情感连接，让来访者感到舒适。您可以说：“我注意到这个对话中有许多愉快的时刻和互动。你对这些时刻有什么特别的感受吗？”\n\n**Step 2: 探索来访者的感受**\n继续引导来访者表达他们的感受。您可以问：“在这个对话中，有哪些瞬间让你感到开心或快乐？”\n\n**Step 3: 询问是否有反感情绪**\n除了积极的情感，也要问询是否有一些负面情感或担忧。您可以说：“除了快乐的瞬间，是否有一些让你感到不安或担忧的地方？”\n\n\n\n# 故事梗概\n \nNone\n\n\n# 故事情境\n\nNone\n\n\n# 故事场景\n\nNone\n\n```\n\n#### 04-情感情景引导