In [None]:
import yaml
from pathlib import Path
from ollama import Client
import json
import pickle
import os
import pandas as pd
import re
import json
import pynvml
import time

# Ollama client pointed at the local server
client = Client(host='http://localhost:11434')

# Directory holding the YAML prompt templates
prompts_dir = Path('../prompts')

# NER prompt: read the YAML file and keep its 'entity_extraction' section
ner_prompt_path = prompts_dir / 'entity_extraction.yaml'
with ner_prompt_path.open('r', encoding='utf-8') as file:
    ner_prompt_content = yaml.safe_load(file)
ner_prompt = ner_prompt_content['entity_extraction']

# Summarization prompt: read the YAML file and keep its 'article_summarization' section
sum_prompt_path = prompts_dir / 'article_summarization.yaml'
with sum_prompt_path.open('r', encoding='utf-8') as file:
    sum_prompt_content = yaml.safe_load(file)
sum_prompt = sum_prompt_content['article_summarization']

# Example text to process (written line by line so the trailing spaces stay visible)
sample_text = (
    "\n"
    "Apple announced its new iPhone 15 on September 12, 2023. \n"
    "Tim Cook presented the event at Apple Park in Cupertino, California. \n"
    "The event was also streamed live on YouTube, where millions of viewers tuned in.\n"
)

In [18]:
def get_gpu_temeprature(device_index=0):
    """Read the current temperature of one NVIDIA GPU through NVML.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device the notebook originally hard-coded.

    Returns:
        The value reported by ``pynvml.nvmlDeviceGetTemperature`` for the
        ``NVML_TEMPERATURE_GPU`` sensor.

    Raises:
        pynvml.NVMLError: propagated from NVML (no driver, bad index, ...).
    """
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
    finally:
        # This function is polled in loops: pair every nvmlInit() with a
        # shutdown so initializations do not pile up for the kernel's lifetime.
        pynvml.nvmlShutdown()

In [17]:
def gpu_temperature_rest_time(threshold=85, rest_seconds=60):
    """Return how long to pause so the GPU can cool down.

    Args:
        threshold: Temperature at or above which a pause is requested, in the
            unit returned by ``get_gpu_temeprature()`` (NVML documents its
            readings as degrees C). Defaults to 85.
        rest_seconds: Pause length to return when the threshold is reached.
            Defaults to 60.

    Returns:
        ``rest_seconds`` when the current reading is >= ``threshold``,
        otherwise 0.
    """
    if get_gpu_temeprature() >= threshold:
        return rest_seconds
    return 0

In [2]:
def extract_json(text):
    """Extract the first JSON object embedded in a block of text.

    Parsing starts at the first '{' and stops where that JSON object ends, so
    prose or stray braces after the object no longer break decoding (the
    previous greedy first-'{'-to-last-'}' match failed on such input).

    Args:
        text: Raw text that may contain a JSON object, e.g. an LLM reply.
            ``None``, non-string and empty values are treated as "no JSON".

    Returns:
        The decoded object (a dict), or ``None`` when no JSON block is present
        or the block cannot be decoded. A message is printed in both failure
        cases.
    """
    if not isinstance(text, str) or not text:
        print("No JSON block found in the text.")
        return None

    # A candidate block needs an opening brace with a closing brace after it.
    start = text.find('{')
    if start == -1 or text.find('}', start) == -1:
        print("No JSON block found in the text.")
        return None

    try:
        # raw_decode parses one JSON value starting at `start` and ignores
        # whatever follows it.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
        return parsed
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        return None

In [3]:
def get_ollama_summary(article_body, sum_prompt, model="gemma3:27b-it-q8_0"):
    """Ask the Ollama chat model for a summary of an article.

    Args:
        article_body: Article text substituted for the ``{text_to_process}``
            placeholder of the user prompt template.
        sum_prompt: Mapping with the keys ``'system_prompt'`` and
            ``'user_prompt_template'``.
        model: Name of the Ollama model to call.

    Returns:
        The ``content`` of the message in the chat response.
    """
    # System prompt first, then the user prompt with the article inserted
    system_message = {'role': 'system', 'content': sum_prompt['system_prompt']}
    user_text = sum_prompt['user_prompt_template'].replace("{text_to_process}", article_body)
    user_message = {'role': 'user', 'content': user_text}

    # Query the module-level Ollama client
    reply = client.chat(
        model=model,
        messages=[system_message, user_message],
        options={"temperature": 0.4},
    )

    return reply['message']['content']

In [4]:
def get_ollama_entities(article_summary, ner_prompt, model="gemma3:27b-it-q8_0"):
    """Ask the Ollama chat model to extract entities from an article summary.

    Args:
        article_summary: Text substituted for the ``{text_to_process}``
            placeholder of the user prompt template.
        ner_prompt: Mapping with the keys ``'system_prompt'`` and
            ``'user_prompt_template'``.
        model: Name of the Ollama model to call.

    Returns:
        Whatever ``extract_json`` returns for the reply text: the decoded JSON
        object, or ``None`` when no valid JSON could be extracted.
    """
    # System prompt first, then the user prompt with the summary inserted
    system_message = {'role': 'system', 'content': ner_prompt['system_prompt']}
    user_text = ner_prompt['user_prompt_template'].replace("{text_to_process}", article_summary)
    user_message = {'role': 'user', 'content': user_text}

    # Query the module-level Ollama client
    reply = client.chat(
        model=model,
        messages=[system_message, user_message],
        options={"temperature": 0.2},
    )

    # Pull the JSON payload out of the raw reply text
    return extract_json(reply['message']['content'])

In [5]:
# Load the baseline graph from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
graph_pickle_path = "../data/MultiHop_graph_w_sem_embeddings.pkl"
with open(graph_pickle_path, mode="rb") as graph_file:
    G = pickle.load(graph_file)

In [6]:
# Location of the corpus JSON file, relative to the notebook
multihop_corpus_path = os.path.join("..", "data", "Multi-hop_RAG_dataset", "corpus.json")

# Parse the JSON corpus
with open(multihop_corpus_path, mode="r", encoding="utf-8") as corpus_file:
    corpus = json.load(corpus_file)

# Tabular view of the corpus records
corpus_as_df = pd.DataFrame(data=corpus)


In [None]:
# Maximum number of entity-extraction attempts per article before giving up
MAX_NER_ATTEMPTS = 3


def _wait_for_gpu_cooldown():
    """Sleep for as long as gpu_temperature_rest_time() requests a pause."""
    rest_seconds = gpu_temperature_rest_time()
    while rest_seconds != 0:
        # Announce the pause before sleeping so the log reflects what is happening
        print("Pausing code execution to cool down GPU...")
        time.sleep(rest_seconds)
        rest_seconds = gpu_temperature_rest_time()


# Results keyed by article node: {"entities": <parsed JSON or None>, "summary": <str>}
entities_dict = {}

# For each node in the graph...
for node, data in G.nodes(data=True):

    # Only nodes of type "article" are processed
    if data["type"] != 'article':
        continue

    print("Processing article: ", node)

    # Retrieve article body (the node id is used as the row position in the corpus frame)
    article_body = corpus_as_df.iloc[node]["body"]

    # Generate LLM summary of the article for entity extraction
    _wait_for_gpu_cooldown()
    llm_summary = get_ollama_summary(article_body, sum_prompt)

    # Retry entity extraction until the reply contains valid JSON or attempts run out
    json_entities = None
    for _attempt in range(MAX_NER_ATTEMPTS):
        _wait_for_gpu_cooldown()
        json_entities = get_ollama_entities(llm_summary, ner_prompt)
        if json_entities is not None:
            break
    else:
        # Reached only when every attempt failed; a success on the last
        # attempt must not be reported as a failure.
        print("JSON not extracted for: ", node)

    # Store entities (None when extraction failed) together with the summary
    entities_dict[node] = {"entities": json_entities, "summary": llm_summary}

1
Amazon’s Cyber Monday sales are ongoing through November 27th, extending from their Black Friday start on November 17th, and include both carry-over and new deals. The sales cover a wide range of products including Amazon devices (Echo, Fire TV, Kindle), Apple products, TVs, laptops, headphones, tablets, gaming items, speakers, vacuums, kitchen appliances, smart home devices, fitness trackers, beauty tech, drones, cameras, Lego, and gift cards. 

Specific deals highlighted include the Echo Show for under $40, the 10th generation 64GB iPad for $349, a 65-inch Fire TV at a record low price, the Microsoft Surface Laptop Go 3 for $599.99, Bose QuietComfort 45 headphones for under $200, and the Meta Quest 2 with a $50 credit. Many deals are marked with symbols indicating all-time low prices (🔥) or Prime member exclusives (📨).
While count:  1
2
Betashares will acquire Bendigo and Adelaide Bank’s superannuation business, marking its entry into the superannuation sector. The Australian share

In [None]:
# Identify entity citations in chunks

# For each article that went through entity extraction
# (iterating over entities_dict avoids a hard-coded article count)
for article_id, article_record in entities_dict.items():

    # Extraction may have failed for this article ('entities' is None) or the
    # model may have returned JSON without an 'entities' list: skip it.
    extraction = article_record.get('entities')
    article_entities = extraction.get('entities') if isinstance(extraction, dict) else None
    if not isinstance(article_entities, list):
        print("No entities available for article: ", article_id)
        continue

    # Identify all "chunk" nodes derived from this article
    prefix = f"{article_id}_chunk"
    matching_nodes = [node for node in G.nodes if str(node).startswith(prefix)]

    # For each entity found in the article...
    for entity in article_entities:
        if not isinstance(entity, dict):
            continue

        # Every entity gets an 'appears_in' list, even if it has no chunks
        citations = entity.setdefault('appears_in', [])

        # An empty name is a substring of every text, so it is skipped
        entity_name = entity.get('name')
        if not isinstance(entity_name, str) or not entity_name:
            continue

        # For each chunk node of the article...
        for node in matching_nodes:
            # Record the chunk when it mentions the entity; the membership
            # check keeps the cell idempotent when it is re-run.
            if entity_name in G.nodes[node]["text"] and node not in citations:
                citations.append(node)


['0_chunk_0', '0_chunk_1']

In [None]:
#temperature = get_gpu_temeprature()

29

In [36]:
# Write the extracted entities and summaries to disk as a pickle
entities_pickle_path = "../data/extracted_entities.pkl"
with open(entities_pickle_path, mode="wb") as entities_file:
    pickle.dump(entities_dict, entities_file)