## Mount Google Drive for Colab integration

In [2]:
from google.colab import drive
import os
# Mount Google Drive at /content/drive (Colab-only API).
drive.mount('/content/drive')
# Switch the working directory to the project folder; relative paths used later
# in the notebook (e.g. the dataset path) resolve against it.
%cd /content/drive/MyDrive/Research/TopicSwitch
# Point the Hugging Face cache at a Drive folder. This is set before any
# `transformers` import in the notebook — presumably so downloaded models are
# cached on Drive rather than on the ephemeral Colab disk.
os.environ["HF_HOME"] = "/content/drive/MyDrive/Research/cache/huggingface"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Research/TopicSwitch


## Create the intervention dataset

In [2]:
import json
import random
from tqdm import tqdm, trange
# Seed the global `random` generator so the sampling done below is reproducible.
random.seed(42)

# Dataset file, given relative to the current working directory.
data_path = "data/longmemeval/longmemeval_s.json"
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which is not UTF-8 on every system (e.g. Windows) and can
# corrupt or reject non-ASCII text in the JSON.
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Number of groups sampled per data item.
sample_num_for_each_data = 1
# Size of each group (the anchor item plus the randomly drawn ones).
intervene_data_num = 2

# sample set of data to intervene
def sample_intervene_data(data_len, sample_num_for_each_data, intervene_data_num):
    """Build groups of data indices: one anchor index plus randomly drawn others.

    For every anchor ``i`` in ``range(data_len)``, ``sample_num_for_each_data``
    groups are produced. Each group is a tuple whose first element is ``i``
    followed by ``intervene_data_num - 1`` distinct indices drawn from all
    indices other than ``i`` using the module-level ``random`` generator
    (seed it beforehand for reproducible output).

    Args:
        data_len: Number of data items; indices range over ``range(data_len)``.
        sample_num_for_each_data: Number of groups to create per anchor index.
        intervene_data_num: Size of every group, anchor included.

    Returns:
        A tuple of ``data_len * sample_num_for_each_data`` tuples, each of
        length ``intervene_data_num``, ordered by anchor index.

    Raises:
        ValueError: If ``intervene_data_num`` is smaller than 1, or larger than
            ``data_len`` (there would not be enough distinct indices to fill a
            group, so groups would silently come out malformed).
    """
    if intervene_data_num < 1:
        raise ValueError(
            f"intervene_data_num must be >= 1, got {intervene_data_num}"
        )
    if data_len > 0 and intervene_data_num > data_len:
        raise ValueError(
            f"intervene_data_num ({intervene_data_num}) cannot exceed data_len ({data_len})"
        )

    partner_count = intervene_data_num - 1
    groups = []
    for anchor in range(data_len):
        # Every index except the anchor itself is a candidate partner.
        candidates = [j for j in range(data_len) if j != anchor]
        for _ in range(sample_num_for_each_data):
            # Shuffle in place and take a prefix: yields distinct partners. The
            # shuffle call is kept (rather than random.sample) so that seeded
            # runs draw exactly the same groups as before.
            random.shuffle(candidates)
            groups.append((anchor, *candidates[:partner_count]))
    return tuple(groups)

# Draw the index groups for the whole dataset: each group starts with an anchor
# data index, followed by the randomly chosen other data indices.
intervene_haystack_idx = sample_intervene_data(len(data), sample_num_for_each_data, intervene_data_num)
# shape of intervene_haystack_idx = (len(data) * sample_num_for_each_data, intervene_data_num)

In [3]:
# Inspect the sessions of the first data item. Per the rendered output this is
# a list of sessions, each a list of {'role', 'content'} message dicts.
# NOTE(review): this displays every session of the item; consider slicing to keep the output short.
data[0]["haystack_sessions"]

[[{'role': 'user',
   'content': "The farmer needs to transport a fox, a chicken, and some grain across a river using a boat. The fox cannot be left alone with the chicken, and the chicken cannot be left alone with the grain. The boat can only hold one item at a time, and the river is too dangerous to cross multiple times. Can you help the farmer transport all three items across the river without any of them getting eaten? Remember, strategic thinking and planning are key to solving this puzzle. If you're stuck, try thinking about how you would solve the puzzle yourself, and use that as a starting point. Be careful not to leave the chicken alone with the fox, or the chicken and the grain alone together, as this will result in a failed solution. Good luck!"},
  {'role': 'assistant',
   'content': 'To solve this puzzle, the farmer can follow these steps:\n\n1. First, the farmer should take the chicken across the river using the boat.\n2. Next, the farmer should go back to the original si

In [4]:
sample_session_num = 2
# Sample sessions to intervene for each data
def sample_session(data, intervene_haystack_idx, sample_session_num):
    """Pick session positions for every data index of every group.

    For each data index, positions whose session id contains the substring
    "answer" are selected first; if fewer than ``sample_session_num`` were
    found, the remainder is drawn at random (module-level ``random``) from the
    other positions. The selection is cut to ``sample_session_num`` entries and
    sorted ascending.

    Args:
        data: Sequence of items exposing a "haystack_session_ids" list.
        intervene_haystack_idx: Iterable of groups (iterables of data indices).
        sample_session_num: Number of session positions to keep per data index.

    Returns:
        A list with one tuple per group; each tuple holds one
        ``{"idx": data_index, "session_idx": [positions...]}`` dict per data
        index of that group.
    """
    sampled_groups = []
    for group in tqdm(intervene_haystack_idx):
        entries = []
        for data_idx in group:
            session_ids = data[data_idx]["haystack_session_ids"]
            # Sessions flagged as answer-bearing by their id always come first.
            # TODO: Can change this to direct check with "answer_session_ids"
            chosen = [pos for pos, sid in enumerate(session_ids) if "answer" in sid]
            answer_positions = set(chosen)
            remaining = [pos for pos in range(len(session_ids)) if pos not in answer_positions]
            # Top up with random non-answer sessions when there are too few.
            shortfall = sample_session_num - len(chosen)
            if shortfall > 0:
                chosen += random.sample(remaining, shortfall)
            entries.append({
                "idx": data_idx,
                "session_idx": sorted(chosen[:sample_session_num]),
            })
        sampled_groups.append(tuple(entries))
    return sampled_groups

# Only the first 10 groups are processed here (note the [:10] slice) —
# presumably to keep prototyping fast; drop the slice for a full run.
intervene_sampled_session_data = sample_session(data, intervene_haystack_idx[:10], sample_session_num)

100%|██████████| 10/10 [00:00<00:00, 22357.70it/s]


In [5]:
# concatenate the session data for each data pair
def retrieve_session_data(data, haystack_group):
    """Concatenate the sampled sessions of a group into one interleaved text.

    Sessions are emitted round-robin: first the first sampled session of every
    group member (in group order), then the second of every member, and so on.
    The number of rounds is taken from the first member's "session_idx" list.
    Each session is rendered as "role: content" messages separated by a blank
    line; sessions are separated by four newlines. Leading/trailing whitespace
    of each session and of the final text is stripped.

    Args:
        data: Sequence of items exposing "haystack_sessions" (a list of
            sessions, each a list of dicts with "role" and "content").
        haystack_group: Sequence of ``{"idx": ..., "session_idx": [...]}``
            dicts, as produced by ``sample_session``.

    Returns:
        The concatenated text, or an empty string for an empty group
        (previously an empty group raised IndexError).
    """
    def get_string_from_session(session):
        # join + strip gives the same text as appending "...\n\n" per message
        # and stripping, without repeated string re-allocation.
        return "\n\n".join(
            f"{message['role']}: {message['content']}" for message in session
        ).strip()

    if not haystack_group:
        return ""

    rounds = len(haystack_group[0]["session_idx"])
    rendered_sessions = []
    for position in range(rounds):
        for item in haystack_group:
            session_idx = item["session_idx"][position]
            session = data[item["idx"]]["haystack_sessions"][session_idx]
            rendered_sessions.append(get_string_from_session(session))
    # The separator only ever differs from the old trailing-append form by
    # trailing whitespace, which the final strip() removes either way.
    return "\n\n\n\n".join(rendered_sessions).strip()


# Show the first sampled group, then its concatenated session text
# (the last expression is rendered as the cell's output).
print(intervene_sampled_session_data[0])
retrieve_session_data(data, intervene_sampled_session_data[0])

({'idx': 0, 'session_idx': [3, 52]}, {'idx': 165, 'session_idx': [24, 38]})


'user: Rewrite the script for the bank heist and escape scenes in Heat, but replace Neil McCauley with the Joker from the Dark Knight.\n\nassistant: Title: Heat\'s Dark Knight\n\nCharacters:\n\n* Joker (previously Neil McCauley)\n* Chris Shiherlis\n* Michael Cheritto\n* Waingro\n* Lt. Vincent Hanna\n\n---\n\n[INT. BANK - DAY]\n\nJoker, Chris, Michael, and Waingro, all wearing clown masks, storm into the bank. Waingro waves his gun in the air, while Chris and Michael take out the security guards. The Joker calmly walks to the counter, a wide grin painted on his mask.\n\nJoker: (smiling) Ladies and gentlemen, today\'s your lucky day! We\'re here for your money, not your lives... unless, of course, you try to be a hero. So, let\'s make this quick and easy, shall we?\n\nThe bank employees and customers cower as the robbers begin their heist. Chris and Michael start collecting money from the cashiers, while Waingro watches the hostages.\n\n[EXT. BANK - DAY]\n\nLt. Vincent Hanna, a seasoned 

## test model

### flan-t5

In [None]:
# load flan-T5 for prototyping
# Defines the notebook-level `tokenizer` and `model`. The graph-construction
# cell further down passes `tokenizer` explicitly but reads `model` as a
# global, so this cell must have been run before it.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Llama-3.1-8B-Instruct

In [3]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Pipelines already built in this kernel, keyed by model id. Building one loads
# the full checkpoint, so it must not happen on every generate_response() call.
_PIPELINE_CACHE = {}


def _get_text_generation_pipeline(model_name):
    """Return the text-generation pipeline for ``model_name``, building it at most once."""
    if model_name not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model_name] = transformers.pipeline(
            "text-generation",
            model=model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
        )
    return _PIPELINE_CACHE[model_name]


def generate_response(prompt, input):
    """Run one system+user chat turn through the model and print the reply.

    The pipeline for the module-level ``model_id`` is created on first use and
    reused afterwards (previously it was re-created, and the model re-loaded,
    on every call).

    Args:
        prompt: Text sent as the "system" message.
        input: Text sent as the "user" message. (The name shadows the builtin
            ``input`` inside this function; kept for caller compatibility.)

    Returns:
        None. The last entry of the pipeline's ``generated_text`` for the first
        output is printed.
    """
    pipeline = _get_text_generation_pipeline(model_id)

    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": input},
    ]

    outputs = pipeline(
        messages,
        max_new_tokens=256,
    )
    print(outputs[0]["generated_text"][-1])

In [4]:
# Smoke test: one system prompt plus a single user message; the reply is printed by the function.
generate_response("You are a pirate chatbot who always responds in pirate speak!", "Who are you?")

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': "Arrrr, ye landlubber! I be Captain Chattybeard, a swashbucklin' pirate chatbot with a treasure trove o' knowledge at me disposal! Me and me trusty crew o' code have been sailin' the seven seas o' the internet, answerin' yer questions and helpin' ye find yer treasure. So hoist the sails and set course fer adventure, me hearty! What be bringin' ye to these fair waters today?"}


### prompts

In [None]:
# prompts
# Instruction text for extracting atomic facts and their key elements from a
# chunk of text, one numbered line per fact, capped at 1024 output tokens.
# NOTE(review): requirement 6 describes a comma-separated line format, while
# the example line uses "1. <fact> | <element> | ..." — confirm which one the
# downstream parser should expect.
# NOTE(review): extract_key_elements_and_atomic_facts defines its own local
# copy of this text rather than reading this variable.
extraction_prompt = """You are now an intelligent assistant tasked with meticulously extracting both key elements and atomic facts from a long text.
1. Key Elements: The essential nouns (e.g., characters, times, events, places, numbers), verbs (e.g., actions), and adjectives (e.g., states, feelings) that are pivotal to the text’s narrative.
2. Atomic Facts: The smallest, indivisible facts, presented as concise sentences. These include propositions, theories, existences, concepts, and implicit elements like logic, causality, event sequences, interpersonal relationships, timelines, etc.

Requirements:
#####
1. Ensure that all identified key elements are reflected within the corresponding atomic facts.
2. You should extract key elements and atomic facts comprehensively, especially those that are important and potentially query-worthy and do not leave out details.
3. Whenever applicable, replace pronouns with their specific noun counterparts (e.g., change I, He, She to actual names).
4. Ensure that the key elements and atomic facts you extract are presented in the same language as the original text (e.g., English or Chinese).
5. You should output a total of key elements and atomic facts that do not exceed 1024 tokens.
6. Your answer format for each line should be: [Serial Number], [Atomic Facts], [List of Key Elements, separated with ‘|’]
#####

Example:
#####
User:
One day, a father and his little son ......
Assistant:
1. One day, a father and his little son were going home. | father | little son | going home
2. ......
#####

Please strictly follow the above format. Let’s begin.

"""

rational_plan_prompt = """As an intelligent assistant, your primary objective is to answer the question by gathering supporting facts from a given article. To facilitate this objective, the first step is to make a rational plan based on the question. This plan should outline the step-by-step process to resolve the question and specify the key information required to formulate a comprehensive answer.

Example:
#####
User: Who had a longer tennis career, Danny or Alice?

Assistant: In order to answer this question, we first need to find the length of Danny’s and Alice’s tennis careers, such as the start and retirement of their careers, and then compare the two.
#####

Please strictly follow the above format. Let’s begin.

"""

initial_node_prompt = """As an intelligent assistant, your primary objective is to answer questions based on information contained within a text. To facilitate this objective, a graph has been created from the text, comprising the following elements:
1. Text Chunks: Chunks of the original text.
2. Atomic Facts: Smallest, indivisible truths extracted from text chunks.
3. Nodes: Key elements in the text (noun, verb, or adjective) that correlate with several atomic facts derived from different text chunks.

Your current task is to check a list of nodes, with the objective of selecting the most relevant initial nodes from the graph to efficiently answer the question. You are given the question, the rational plan, and a list of node key elements. These initial nodes are crucial because they are the starting point for searching for relevant information.

Requirements:
#####
1. Once you have selected a starting node, assess its relevance to the potential answer by assigning a score between 0 and 100. A score of 100 implies a high likelihood of relevance to the answer, whereas a score of 0 suggests minimal relevance.
2. Present each chosen starting node in a separate line, accompanied by its relevance score. Format each line as follows: Node: [Key Element of Node], Score: [Relevance Score].
3. Please select at least 10 starting nodes, ensuring they are non-repetitive and diverse.
4. In the user’s input, each line constitutes a node. When selecting the starting node, please make your choice from those provided, and refrain from fabricating your own. The nodes you output must correspond exactly to the nodes given by the user, with identical wording.
#####

Example:
#####
User:
Question: {QUESTION}
Plan: {RATIONAL PLAN}
Nodes: {LIST OF KEY ELEMENTS}

Assistant:{LIST OF SELECTED NODES}
#####

Finally, I emphasize again that you need to select the starting node from the given Nodes, and it must be consistent with the words of the node you selected. Please strictly follow the above format. Let’s begin.


"""


# Instruction text for the "inspect a node's atomic facts" step. The model must
# answer with an updated notebook, a rationale, and exactly one of two actions:
# read_chunk(List[ID]) or stop_and_read_neighbor(). The text deliberately biases
# the model toward read_chunk whenever a fact is even slightly relevant.
# The line breaks inside the string are part of the prompt as written.
explore_atomic_prompt = """As an intelligent assistant, your primary objective is to answer questions based on information
contained within a text. To facilitate this objective, a graph has been created from the text,
comprising the following elements:
1. Text Chunks: Chunks of the original text.
2. Atomic Facts: Smallest, indivisible truths extracted from text chunks.
3. Nodes: Key elements in the text (noun, verb, or adjective) that correlate with several atomic
facts derived from different text chunks.

Your current task is to check a node and its associated atomic facts, with the objective of
determining whether to proceed with reviewing the text chunk corresponding to these atomic facts.
Given the question, the rational plan, previous actions, notebook content, and the current node’s
atomic facts and their corresponding chunk IDs, you have the following Action Options:
#####
1. read_chunk(List[ID]): Choose this action if you believe that a text chunk linked to an atomic
fact may hold the necessary information to answer the question. This will allow you to access
more complete and detailed information.
2. stop_and_read_neighbor(): Choose this action if you ascertain that all text chunks lack valuable
information.
#####

Strategy:
#####
1. Reflect on previous actions and prevent redundant revisiting nodes or chunks.
2. You can choose to read multiple text chunks at the same time.
3. Atomic facts only cover part of the information in the text chunk, so even if you feel that the
atomic facts are slightly relevant to the question, please try to read the text chunk to get more
complete information.
#####

Response format:
#####
*Updated Notebook*: First, combine your current notebook with new insights and findings about
the question from current atomic facts, creating a more complete version of the notebook that
contains more valid information.
*Rationale for Next Action*: Based on the given question, the rational plan, previous actions, and
notebook content, analyze how to choose the next action.
*Chosen Action*: read_chunk(List[ID]) or stop_and_read_neighbor(). (Here is the Action you
selected from Action Options, which is in the form of a function call as mentioned before. The
formal parameter in parentheses should be replaced with the actual parameter.)
#####

Finally, it is emphasized again that even if the atomic fact is only slightly relevant to the
question, you should still look at the text chunk to avoid missing information. You should only
choose stop_and_read_neighbor() when you are very sure that the given text chunk is irrelevant to
the question. Please strictly follow the above format. Let’s begin.


"""


# Instruction text for the "read a text chunk" step. The model must answer with
# an updated notebook, a rationale, and exactly one of four actions:
# search_more(), read_previous_chunk(), read_subsequent_chunk() or termination().
# NOTE(review): the response-format paragraph talks about replacing a formal
# parameter, although none of these four actions takes one.
explore_chunk_prompt = """As an intelligent assistant, your primary objective is to answer questions based on information
within a text. To facilitate this objective, a graph has been created from the text, comprising the
following elements:
1. Text Chunks: Segments of the original text.
2. Atomic Facts: Smallest, indivisible truths extracted from text chunks.
3. Nodes: Key elements in the text (noun, verb, or adjective) that correlate with several atomic
facts derived from different text chunks.

Your current task is to assess a specific text chunk and determine whether the available information
suffices to answer the question. Given the question, rational plan, previous actions, notebook
content, and the current text chunk, you have the following Action Options:
#####
1. search_more(): Choose this action if you think that the essential information necessary to
answer the question is still lacking.
2. read_previous_chunk(): Choose this action if you feel that the previous text chunk contains
valuable information for answering the question.
3. read_subsequent_chunk(): Choose this action if you feel that the subsequent text chunk contains
valuable information for answering the question.
4. termination(): Choose this action if you believe that the information you have currently obtained
is enough to answer the question. This will allow you to summarize the gathered information and
provide a final answer.
#####

Strategy:
#####
1. Reflect on previous actions and prevent redundant revisiting of nodes or chunks.
2. You can only choose one action.
#####

Response format:
#####
*Updated Notebook*: First, combine your previous notes with new insights and findings about the
question from current text chunks, creating a more complete version of the notebook that contains
more valid information.
*Rationale for Next Action*: Based on the given question, rational plan, previous actions, and
notebook content, analyze how to choose the next action.
*Chosen Action*: search_more() or read_previous_chunk() or read_subsequent_chunk() or
termination(). (Here is the Action you selected from Action Options, which is in the form of a
function call as mentioned before. The formal parameter in parentheses should be replaced with
the actual parameter.)
#####

Please strictly follow the above format. Let’s begin.


"""


# Instruction text for the "look at neighboring nodes" step. The model must
# answer with a rationale and exactly one action: read_neighbor_node(<key
# element of one neighbor>) or termination(). Unlike the other exploration
# prompts, the response format here has no "Updated Notebook" section.
explore_neighbor_prompt = """As an intelligent assistant, your primary objective is to answer questions based on information
within a text. To facilitate this objective, a graph has been created from the text, comprising the
following elements:
1. Text Chunks: Segments of the original text.
2. Atomic Facts: Smallest, indivisible truths extracted from text chunks.
3. Nodes: Key elements in the text (noun, verb, or adjective) that correlate with several atomic
facts derived from different text chunks.

Your current task is to assess all neighboring nodes of the current node, with the objective of determining whether to proceed to the next neighboring node. Given the question, rational
plan, previous actions, notebook content, and the neighbors of the current node, you have the
following Action Options:
#####
1. read_neighbor_node(key element of node): Choose this action if you believe that any of the
neighboring nodes may contain information relevant to the question. Note that you should focus
on one neighbor node at a time.
2. termination(): Choose this action if you believe that none of the neighboring nodes possess
information that could answer the question.
#####

Strategy:
#####
1. Reflect on previous actions and prevent redundant revisiting of nodes or chunks.
2. You can only choose one action. This means that you can choose to read only one neighbor
node or choose to terminate.
#####

Response format:
#####
*Rationale for Next Action*: Based on the given question, rational plan, previous actions, and
notebook content, analyze how to choose the next action.
*Chosen Action*: read_neighbor_node(neighbor_node) or termination(). (Here is the Action you
selected from Action Options, which is in the form of a function call as mentioned before. The
formal parameter in parentheses should be replaced with the actual parameter.)
#####

Please strictly follow the above format. Let’s begin.


"""


QA_prompt = """As an intelligent assistant, your primary objective is to answer questions based on information
within a text. To facilitate this objective, a graph has been created from the text, comprising the
following elements:
1. Text Chunks: Segments of the original text.
2. Atomic Facts: Smallest, indivisible truths extracted from text chunks.
3. Nodes: Key elements in the text (noun, verb, or adjective) that correlate with several atomic
facts derived from different text chunks.

You have now explored multiple paths from various starting nodes on this graph, recording key information for each path in a notebook.
Your task now is to analyze these memories and reason to answer the question.

Strategy:
#####
1. You should first analyze each notebook content before providing a final answer.
2. During the analysis, consider complementary information from other notes and employ a
majority voting strategy to resolve any inconsistencies.
3. When generating the final answer, ensure that you take into account all available information.
#####

Example:
#####
User:
Question: Who had a longer tennis career, Danny or Alice?
Notebook of different exploration paths:
1. We only know that Danny’s tennis career started in 1972 and ended in 1990, but we don’t know
the length of Alice’s career.
2. ......

Assistant:
Analyze:
The summary of search path 1 points out that Danny’s tennis career is 1990-1972=18 years.
Although it does not indicate the length of Alice’s career, the summary of search path 2 finds this
information, that is, the length of Alice’s tennis career is 15 years. Then we can get the final
answer, that is, Danny’s tennis career is longer than Alice’s.
Final answer:
Danny’s tennis career is longer than Alice’s.
#####

Please strictly follow the above format. Let’s begin.


"""


## Graph Construction

In [None]:
import torch


def extract_key_elements_and_atomic_facts(text, tokenizer, max_tokens_per_chunk=4096, max_chunks=2):
    """Prototype: split ``text`` into token chunks and run the extraction prompt on them.

    The extraction prompt and ``text`` are tokenized separately; ``text`` is cut
    into chunks so that prompt + chunk is at most ``max_tokens_per_chunk``
    tokens. For the first ``max_chunks`` chunks, prompt and chunk token ids are
    concatenated, passed to ``model.generate`` and the decoded output is
    printed, together with debug shape information.

    ``model`` is not a parameter: it is read from the notebook's global scope
    and must be defined before this function is called.

    Args:
        text: The text to process.
        tokenizer: Object with ``encode(text, return_tensors='pt')`` returning a
            ``(1, n)`` tensor of token ids, and ``decode(ids, skip_special_tokens=True)``.
        max_tokens_per_chunk: Token budget for prompt + chunk; also passed to
            ``generate`` as ``max_length``.
        max_chunks: How many chunks to actually run through the model. Defaults
            to 2 (the limit that used to be hard-coded); ``None`` processes all.

    Returns:
        The placeholder string "Extracted Key Elements and Atomic Facts";
        the model outputs are only printed, not parsed or returned yet.

    Raises:
        ValueError: If the tokenized prompt alone already uses up
            ``max_tokens_per_chunk`` (previously this produced a zero or
            negative range step: a cryptic error or silently no chunks).
    """
    extraction_prompt = """You are now an intelligent assistant tasked with meticulously extracting both key elements and atomic facts from a long text.
1. Key Elements: The essential nouns (e.g., characters, times, events, places, numbers), verbs (e.g., actions), and adjectives (e.g., states, feelings) that are pivotal to the text’s narrative.
2. Atomic Facts: The smallest, indivisible facts, presented as concise sentences. These include propositions, theories, existences, concepts, and implicit elements like logic, causality, event sequences, interpersonal relationships, timelines, etc.

Requirements:
#####
1. Ensure that all identified key elements are reflected within the corresponding atomic facts.
2. You should extract key elements and atomic facts comprehensively, especially those that are important and potentially query-worthy and do not leave out details.
3. Whenever applicable, replace pronouns with their specific noun counterparts (e.g., change I, He, She to actual names).
4. Ensure that the key elements and atomic facts you extract are presented in the same language as the original text (e.g., English or Chinese).
5. You should output a total of key elements and atomic facts that do not exceed 1024 tokens.
6. Your answer format for each line should be: [Serial Number], [Atomic Facts], [List of Key Elements, separated with ‘|’]
#####

Example:
#####
User:
One day, a father and his little son ......
Assistant:
1. One day, a father and his little son were going home. | father | little son | going home
2. ......
#####

Please strictly follow the above format. Let’s begin.

"""
    # NOTE(review): encode() is called with default settings for both prompt and
    # text; if the tokenizer appends special tokens, the prompt's end-of-sequence
    # token ends up in the middle of the concatenated input — confirm.
    prompt_tokens = tokenizer.encode(extraction_prompt, return_tensors='pt')
    tokens = tokenizer.encode(text, return_tensors='pt')

    # Tokens left for the text once the prompt is accounted for.
    max_context_tokens_per_chunk = max_tokens_per_chunk - len(prompt_tokens[0])
    if max_context_tokens_per_chunk <= 0:
        raise ValueError(
            f"max_tokens_per_chunk ({max_tokens_per_chunk}) must be larger than the "
            f"prompt length ({len(prompt_tokens[0])} tokens)"
        )

    # break text into chunks with max_context_tokens_per_chunk tokens each
    token_chunks = []
    for i in trange(0, len(tokens[0]), max_context_tokens_per_chunk):
        # slice the tokens but keep a (1, chunk_len) shape
        chunk = tokens[0][i:i + max_context_tokens_per_chunk].view(1, -1)
        # Check the token dimension: len(chunk) is the batch dimension (always 1).
        if chunk.shape[1] > 0:
            token_chunks.append(chunk)

    selected_chunks = token_chunks if max_chunks is None else token_chunks[:max_chunks]
    for context_tokens in selected_chunks:
        print(f"Processing chunk with {context_tokens.shape}")
        print(f"Prompyt tokens: {prompt_tokens.shape}")
        input_tokens = torch.cat((prompt_tokens, context_tokens), dim=1)
        print(f"Input tokens: {input_tokens.shape}")

        # NOTE(review): how `max_length` relates to the input length depends on
        # the model architecture — confirm it leaves room for the output.
        outputs = model.generate(input_tokens, max_length=max_tokens_per_chunk)
        output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(output_text)

    # Placeholder for actual implementation
    return "Extracted Key Elements and Atomic Facts"

# Smoke test: build the context for the first sampled group and run the
# extraction prototype on it; the `break` stops after that first group.
for issd in intervene_sampled_session_data:
    context = retrieve_session_data(data, issd)
    # The return value (a placeholder string) is ignored; the function prints its results.
    extract_key_elements_and_atomic_facts(context, tokenizer)

    break

100%|██████████| 5/5 [00:00<00:00, 3321.96it/s]


Processing chunk with torch.Size([1, 3708])
Prompyt tokens: torch.Size([1, 388])
Input tokens: torch.Size([1, 4096])
Instagram Stories has become a powerful tool for analyzing social media performance.
Processing chunk with torch.Size([1, 3708])
Prompyt tokens: torch.Size([1, 388])
Input tokens: torch.Size([1, 4096])
You might also want to explore their their
