In [1]:
import yaml
from pathlib import Path
from ollama import Client
import json
import pickle
import os
import pandas as pd
import re
import json
import pynvml
import time
import subprocess

# Connect to the Ollama server listening on localhost
client = Client(host='http://localhost:11434')

# Read the entity-extraction (NER) prompt template from its YAML file
ner_prompt_path = Path('../prompts', 'entity_extraction.yaml')
with ner_prompt_path.open('r', encoding='utf-8') as file:
    ner_prompt_content = yaml.safe_load(file)
ner_prompt = ner_prompt_content['entity_extraction']

# Read the article-summarization prompt template from its YAML file
sum_prompt_path = Path('../prompts', 'article_summarization.yaml')
with sum_prompt_path.open('r', encoding='utf-8') as file:
    sum_prompt_content = yaml.safe_load(file)
sum_prompt = sum_prompt_content['article_summarization']

# Small example passage for trying the prompts by hand.
# The first two sentences end with a trailing space before the newline.
sample_text = (
    "\n"
    "Apple announced its new iPhone 15 on September 12, 2023. \n"
    "Tim Cook presented the event at Apple Park in Cupertino, California. \n"
    "The event was also streamed live on YouTube, where millions of viewers tuned in.\n"
)

In [2]:
def get_gpu_temperature(gpu_index=0):
    """Return the temperature that ``nvidia-smi`` reports for one GPU.

    Runs ``nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader``,
    which prints one reading per line, and parses the requested line.
    (nvidia-smi documents ``temperature.gpu`` as degrees Celsius.)

    Args:
        gpu_index: Zero-based line of the ``nvidia-smi`` output to read.
            Defaults to 0, i.e. the first GPU listed.

    Returns:
        int: The temperature parsed from the selected output line.

    Raises:
        subprocess.CalledProcessError: If ``nvidia-smi`` exits with a
            non-zero status.
        ValueError: If the output contains no line for ``gpu_index`` or the
            selected line is not an integer.
    """
    # check=True reports an nvidia-smi failure directly instead of letting it
    # resurface later as an opaque "invalid literal for int()" error.
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=temperature.gpu", "--format=csv,noheader"],
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )

    readings = result.stdout.strip().splitlines()
    if not 0 <= gpu_index < len(readings):
        raise ValueError(
            f"nvidia-smi returned {len(readings)} temperature reading(s); "
            f"no reading available for GPU index {gpu_index}"
        )

    return int(readings[gpu_index].strip())

In [3]:
def gpu_temperature_rest_time(threshold=80, rest_seconds=100):
    """Return how long to pause so the GPU can cool down.

    Args:
        threshold: Temperature at or above which a pause is requested.
            Compared against the value returned by ``get_gpu_temperature()``.
        rest_seconds: Pause length returned when the threshold is reached.

    Returns:
        ``rest_seconds`` when the current GPU temperature is greater than or
        equal to ``threshold``; otherwise 0, meaning no pause is needed.
    """
    return rest_seconds if get_gpu_temperature() >= threshold else 0

In [4]:
def extract_json(text):
    """Extract the first JSON object embedded in ``text``.

    Decoding starts at the first ``{`` and stops where that JSON object ends,
    so anything after the object (explanations, closing code fences, further
    braces) is ignored rather than being fed to the parser.

    Args:
        text: String that may contain a JSON object surrounded by other text.

    Returns:
        The decoded object (a ``dict``), or ``None`` when ``text`` has no
        ``{ ... }`` span or the text starting at the first ``{`` is not valid
        JSON. A short diagnostic is printed in both ``None`` cases.
    """
    start = text.find('{')

    # A candidate block needs an opening brace with a closing brace after it.
    if start == -1 or text.rfind('}') < start:
        print("No JSON block found in the text.")
        return None

    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # tolerates trailing characters after it.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        return None

    return parsed

In [5]:
def get_ollama_summary(article_body, sum_prompt, model="gemma3:27b-it-q8_0"):
    """Ask the Ollama chat model to summarize one article.

    Args:
        article_body: Article text substituted for the ``{text_to_process}``
            placeholder of the user prompt template.
        sum_prompt: Mapping providing the ``'system_prompt'`` and
            ``'user_prompt_template'`` strings.
        model: Name of the Ollama model to query.

    Returns:
        The ``content`` field of the message in the chat response.
    """
    system_message = {
        'role': 'system',
        'content': sum_prompt['system_prompt'],
    }

    # Fill the article text into the user prompt template
    user_message = {
        'role': 'user',
        'content': sum_prompt['user_prompt_template'].replace("{text_to_process}", article_body),
    }

    # Send both messages through the module-level Ollama client
    response = client.chat(
        model=model,
        messages=[system_message, user_message],
        options={"temperature": 0.4},
    )

    # Only the text of the reply is of interest
    return response['message']['content']

In [6]:
def get_ollama_entities(article_summary, ner_prompt, model="gemma3:27b-it-q8_0"):
    """Ask the Ollama chat model for the entities mentioned in a summary.

    Args:
        article_summary: Summary text substituted for the
            ``{text_to_process}`` placeholder of the user prompt template.
        ner_prompt: Mapping providing the ``'system_prompt'`` and
            ``'user_prompt_template'`` strings.
        model: Name of the Ollama model to query.

    Returns:
        Whatever ``extract_json`` returns for the text of the model's reply.
    """
    system_message = {
        'role': 'system',
        'content': ner_prompt['system_prompt'],
    }

    # Fill the summary into the user prompt template
    user_message = {
        'role': 'user',
        'content': ner_prompt['user_prompt_template'].replace("{text_to_process}", article_summary),
    }

    # No sampling options are passed here, so the call relies on the defaults
    response = client.chat(
        model=model,
        messages=[system_message, user_message],
    )

    # Pull the JSON payload out of the raw reply text
    raw_reply = response['message']['content']
    return extract_json(raw_reply)

In [7]:
# Restore the baseline graph from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
graph_path = Path("../data/MultiHop_graph_w_sem_embeddings.pkl")
with graph_path.open("rb") as f:
    G = pickle.load(f)

In [8]:
# Location of the Multi-hop RAG corpus file
multihop_corpus_path = os.path.join("..", "data", "Multi-hop_RAG_dataset", "corpus.json")

# Parse the corpus JSON, then wrap the parsed data in a DataFrame
with open(multihop_corpus_path, mode="r", encoding="utf-8") as f:
    corpus = json.load(f)
corpus_as_df = pd.DataFrame(data=corpus)


In [None]:
# Maximum number of entity-extraction attempts per article before giving up
MAX_NER_ATTEMPTS = 3


def _wait_for_gpu_cooldown():
    """Sleep until ``gpu_temperature_rest_time()`` reports that no pause is needed."""
    rest_time = gpu_temperature_rest_time()
    while rest_time != 0:
        print(f"== Pausing code execution to cool down GPU... ({get_gpu_temperature()}) ==")
        time.sleep(rest_time)
        rest_time = gpu_temperature_rest_time()


# Maps each article node id to {"entities": <parsed JSON or None>, "summary": <LLM summary>}
entities_dict = {}

# For each node in the graph...
for node, data in G.nodes(data=True):

    # Only nodes of type "article" are processed; .get() tolerates nodes
    # that carry no "type" attribute at all.
    if data.get("type") != 'article':
        continue

    print("Processing article: ", node)

    # Retrieve article body.
    # NOTE(review): assumes the article node id is the row position of the
    # article in the corpus DataFrame — confirm against the graph builder.
    article_body = corpus_as_df.iloc[node]["body"]

    # Generate LLM summary of article for entity extraction
    _wait_for_gpu_cooldown()
    llm_summary = get_ollama_summary(article_body, sum_prompt)
    print(f"-----> Summary for article {node} finished...")

    # Retry entity extraction until a valid JSON output is obtained or the
    # attempt budget is used up.
    json_entities = None
    for _attempt in range(MAX_NER_ATTEMPTS):
        _wait_for_gpu_cooldown()
        json_entities = get_ollama_entities(llm_summary, ner_prompt)
        if json_entities is not None:
            break

    # Report failure only when every attempt returned no valid JSON; a
    # success on the last attempt is a success.
    if json_entities is None:
        print("----------> JSON not extracted for: ", node)
    else:
        print(f"-----> Entity extraction for article {node} finished...")

    # Include entities and summary in "entities_dict" ("entities" stays None
    # when extraction failed, so the article is still recorded)
    entities_dict[node] = {"entities": json_entities, "summary": llm_summary}

Processing article:  0
-----> Summary for article 0 finished...
-----> Entity extraction for article 0 finished...
Processing article:  1
-----> Summary for article 1 finished...
-----> Entity extraction for article 1 finished...
Processing article:  2
== Pausing code execution to cool down GPU... (83) ==


In [None]:
# Inspect the collected results: one entry per processed article node, holding
# its extracted entities ("entities") and its LLM summary ("summary").
entities_dict

{0: {'entities': {'entities': [{'type': 'Organization',
     'name': 'Amazon',
     'sentences': ['Amazon is holding an 11-day shopping event, beginning November 17th and continuing through Cyber Monday, November 27th, with both Black Friday and Cyber Monday deals available.',
      'with a $50 Amazon credit.']},
    {'type': 'Date',
     'name': 'November 17th',
     'sentences': ['Amazon is holding an 11-day shopping event, beginning November 17th and continuing through Cyber Monday, November 27th, with both Black Friday and Cyber Monday deals available.']},
    {'type': 'Date',
     'name': 'November 27th',
     'sentences': ['Amazon is holding an 11-day shopping event, beginning November 17th and continuing through Cyber Monday, November 27th, with both Black Friday and Cyber Monday deals available.']},
    {'type': 'Organization',
     'name': 'Apple',
     'sentences': ['The article highlights deals across numerous categories including Amazon devices (Echo, Fire TV, Kindle), Appl

In [None]:
# Persist the extracted entities and summaries to disk
entities_a_path = Path("../data/extracted_entities_A.pkl")
with entities_a_path.open("wb") as f:
    pickle.dump(entities_dict, f)

In [None]:
# Identify entity citations in chunks

# Group the "chunk" nodes by the article they derive from, in one pass over
# the graph. Chunk node ids have the form "<article_id>_chunk...", so the
# text before the first "_chunk" is the article id.
chunks_by_article = {}
for node in G.nodes:
    article_key, separator, _ = str(node).partition("_chunk")
    if separator:
        chunks_by_article.setdefault(article_key, []).append(node)

# For each article that went through entity extraction
for article_id, article_record in entities_dict.items():

    # Articles whose extraction failed store None here; nothing to annotate
    extracted = article_record.get('entities')
    if not isinstance(extracted, dict):
        continue

    chunk_nodes = chunks_by_article.get(str(article_id), [])

    # For each entity found in the article...
    for entity in extracted.get('entities') or []:

        # Skip malformed items the LLM may have produced
        if not isinstance(entity, dict):
            continue

        entity_name = entity.get('name')
        has_usable_name = isinstance(entity_name, str) and entity_name != ""

        # Rebuild the list from scratch so that re-running this cell does not
        # append duplicate chunk ids, and so that every entity gets an
        # 'appears_in' key even when its article has no chunk nodes.
        entity['appears_in'] = [
            node
            for node in chunk_nodes
            if has_usable_name and entity_name in G.nodes[node].get("text", "")
        ]


['0_chunk_0', '0_chunk_1']

In [None]:
#temperature = get_gpu_temperature()

29

In [None]:
# Persist the entities again, now including each entity's 'appears_in' chunk list
entities_b_path = Path("../data/extracted_entities_B.pkl")
with entities_b_path.open("wb") as f:
    pickle.dump(entities_dict, f)