In [None]:
#---Installing Libraries and Dependencies---
import subprocess
import sys

def install_dependencies():
    """Install the pip and system packages this notebook relies on.

    Pip packages are installed into the interpreter running the kernel
    (``sys.executable``). System packages are installed with ``apt-get`` when
    that tool is available; on hosts without it the step is skipped with a
    message instead of crashing.

    Raises:
        subprocess.CalledProcessError: If pip or apt-get exits with a
            non-zero status.
    """
    import shutil  # local import: only needed for the apt-get availability check

    # List of pip packages
    packages = [
        "openai",
        "networkx",
        "ipycytoscape>=1.3.1",
        "ipywidgets",
        "pandas",
        "together",
        "rdflib",
        "google-search-results",
        "PyPDF2",
        "pytesseract",
        "pdf2image",
        "Pillow",
        # Used by the search/ranking/extraction cells further down
        # (requests.get, BeautifulSoup, SentenceTransformer).
        "requests",
        "beautifulsoup4",
        "sentence-transformers",
    ]

    # poppler-utils backs pdf2image; tesseract-ocr is the binary pytesseract drives.
    apt_packages = ["poppler-utils", "tesseract-ocr"]

    print("Installing pip packages...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])

    if shutil.which("apt-get") is None:
        print(f"apt-get not found; install these system packages manually: {', '.join(apt_packages)}")
        return

    print(f"Installing apt packages ({', '.join(apt_packages)})...")
    subprocess.check_call(["apt-get", "update"])
    subprocess.check_call(["apt-get", "install", "-y", *apt_packages])

install_dependencies()


Installing pip packages...
Installing apt packages (poppler-utils)...


In [None]:
#---Retrieving Credentials---
def retrieving_credentials():
    """Resolve the Together API credentials and the model name to use.

    An already-exported ``TOGETHER_API_KEY`` is respected; the placeholder
    ``'yourkey'`` is only written when the variable is not set at all, so a
    real key is never overwritten by running this cell.

    Returns:
        tuple: ``(api_key, base_url, llm_model_name)`` where ``base_url`` is
        ``None`` when ``TOGETHER_API_BASE`` is not set.
    """
    import os  # local import so the cell does not depend on an earlier cell

    #---Storing API Credentials---
    #os.environ["OPENAI_API_KEY"] = 'yourkey'
    #os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1"

    # setdefault keeps a key that was exported outside the notebook.
    os.environ.setdefault("TOGETHER_API_KEY", 'yourkey')
    #os.environ["TOGETHER_API_BASE"] = "https://api.together.xyz/v1"

    #os.environ["DEEPSEEK_API_KEY"] = 'yourkey'
    #os.environ["DEEPSEEK_API_BASE"] = "https://api.deepseek.com"

    #---LLM model selection---

    llm_model_name = 'mistralai/Mixtral-8x7B-Instruct-v0.1'

    print(f"Intended LLM model: {llm_model_name}")

    #---Retrieving  Credentials---

    api_key = os.getenv("TOGETHER_API_KEY")
    base_url = os.getenv("TOGETHER_API_BASE")

    # Only report whether the key is set; never print the key itself.
    print(f"Retrieved API key: {'Set' if api_key else 'Not_set'}")
    print(f"Retrieved Base URL: {base_url if base_url else 'Not Set (will use default TogetherAI)'}")

    return api_key, base_url, llm_model_name


In [None]:
#---Client response check---
def client_response_check():
    """Send a one-line test prompt to the Mixtral model and print the reply.

    A ``Together`` client is created with no explicit arguments
    (presumably it picks the key up from the environment - confirm against
    the ``together`` package).

    Returns:
        tuple: ``(client, response)`` - the client that was created and the
        raw chat-completion response.
    """
    from together import Together

    together_client = Together()

    ping_messages = [{"role": "user", "content": "Type LLM model is online only!"}]
    completion = together_client.chat.completions.create(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        messages=ping_messages,
    )

    # Show only the text of the first choice.
    first_choice = completion.choices[0]
    print(first_choice.message.content)

    return together_client, completion

In [None]:
#---Initializing The Client---
def initialize_client(api_key, base_url, client=None):
    """Validate the API key and build a ``Together`` client.

    Args:
        api_key: Together API key; must be non-empty.
        base_url: Optional API base URL, passed through to ``Together``.
        client: Ignored; kept (now optional) so existing three-argument
            calls keep working. A fresh client is always created.

    Returns:
        The newly created ``Together`` client.

    Raises:
        SystemExit: If ``api_key`` is empty, or if constructing the client
            fails (the original exception is attached as ``__cause__``).
    """
    # --- Validate Key and Initialize Client ---
    if not api_key:
        print("Error: TOGETHER_API_KEY environment variable not set or key not provided directly.")
        print("Please set the environment variable (or uncomment/edit the test lines) and restart the kernel.")
        raise SystemExit("API Key configuration failed.")

    # Imported after validation so a missing key is reported as such even
    # when the `together` package is not importable.
    from together import Together

    try:
        client = Together(api_key=api_key, base_url=base_url)
        print("Together client initialized successfully.")
    except Exception as e:
        print(f"Error initializing Together client: {e}")
        print("Check your API key, base URL (if used), and network connection.")
        raise SystemExit("LLM client initialization failed.") from e

    return client


In [None]:
#---Defining the LLM Parameters---
def llm_parameters():
    """Report and return the sampling settings shared by the LLM calls.

    Returns:
        tuple: ``(llm_temperature, llm_max_tokens)``.
    """
    # Response-length cap; adjust to the limits of the chosen model.
    llm_max_tokens = 4096
    # 0.0 keeps extraction output as deterministic and factual as possible.
    llm_temperature = 0.0

    print(
        f"LLM Temperature set to: {llm_temperature}",
        f"LLM Max Tokens set to: {llm_max_tokens}",
        sep="\n",
    )

    return llm_temperature, llm_max_tokens


In [None]:
#---Searching for the relevant documents---
def search_satellite_power_docs(query, num_results=10, api_key=None):
    """Run a Google search through ``GoogleSearch`` and return result links.

    Args:
        query: Search query string.
        num_results: Number of results requested from the engine.
        api_key: Search API key. When omitted, the ``SERPAPI_API_KEY``
            environment variable is used, falling back to the original
            placeholder so existing behaviour is preserved.

    Returns:
        list[str]: The ``link`` of every organic result that has one.
    """
    import os  # local import: the key is resolved from the environment

    # NOTE(review): GoogleSearch is presumably serpapi's client (installed as
    # google-search-results) and imported in another cell - confirm.
    resolved_key = api_key or os.getenv("SERPAPI_API_KEY") or "yourserpapikey"

    params = {
        "engine": "google",
        "q": query,
        "num": num_results,
        "api_key": resolved_key
    }
    search = GoogleSearch(params)
    results = search.get_dict()

    # Skip entries without a link instead of raising KeyError.
    links = [res["link"] for res in results.get("organic_results", []) if res.get("link")]
    return links





In [None]:
#---Ranking the topic links----
def rank_topic_links(links, base_topic = "satellite power system fault diagnosis"):
    """Score each link by similarity of its page text to ``base_topic``.

    Every page is downloaded, its text embedded with the
    ``all-MiniLM-L6-v2`` sentence-transformer, and compared to the topic
    embedding with cosine similarity. Links that cannot be fetched
    (network error or non-2xx status) or processed are reported and skipped.

    Args:
        links: URLs to rank.
        base_topic: Reference topic the pages are compared against.

    Returns:
        list[tuple[str, float]]: ``(link, score)`` pairs sorted by
        descending score. Empty when ``links`` is empty.
    """
    # Nothing to rank: avoid loading the embedding model at all.
    if not links:
        return []

    ranking_model = SentenceTransformer('all-MiniLM-L6-v2')
    base_embed = ranking_model.encode(base_topic, convert_to_tensor=True)
    scored_links = []

    for link in links:
        try:
            response = requests.get(link, timeout=10)
            # Do not score 4xx/5xx error pages as if they were documents.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            paragraphs = soup.get_text()

            # Embed the full text of the page
            doc_embed = ranking_model.encode(paragraphs, convert_to_tensor=True)

            # Calculate cosine similarity
            score = util.pytorch_cos_sim(base_embed, doc_embed).item()

            # Optional: use your Together client for extra LLM verification if needed
            # response = client.chat.completions.create(
            #     model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            #     messages=[
            #         {"role": "user", "content": f"Does this document relate to {base_topic}? Content:\n{paragraphs[:1000]}"}
            #     ]
            # )
            # confidence_verbal = response.choices[0].message.content

            scored_links.append((link, score))

        except Exception as e:
            # Best effort: one bad link must not abort the whole ranking.
            print(f"Failed to process {link}: {e}")

    # Sort links by descending score
    return sorted(scored_links, key=lambda x: x[1], reverse=True)

In [None]:
#---Text Extraction---
def extract_text_from_pdf(pdf_url):
    """Download a PDF and return its text, falling back to OCR.

    The PDF is saved to a uniquely named temporary file that is removed
    afterwards. Text is first read with ``PyPDF2``; if no page yields text,
    every page is rendered to an image and passed through ``pytesseract``.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        str: Page texts (or OCR results), each followed by a newline.

    Raises:
        requests.RequestException: If the download fails, times out, or
            returns a non-2xx status.
    """
    import os
    import tempfile

    # Download the PDF; fail fast on errors instead of parsing an error page.
    response = requests.get(pdf_url, timeout=30)
    response.raise_for_status()

    # A unique temp file avoids clobbering a shared "temp.pdf" in the CWD.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(response.content)
        pdf_path = tmp_file.name

    try:
        # First try to extract text normally
        reader = PyPDF2.PdfReader(pdf_path)
        all_text = ""
        for page in reader.pages:
            text = page.extract_text()
            if text:
                all_text += text + "\n"

        if all_text.strip():
            return all_text

        # If no text found, fallback to OCR on images of each page
        print("No text found in PDF pages, running OCR...")
        images = convert_from_path(pdf_path)
        ocr_text = ""
        for img in images:
            ocr_text += pytesseract.image_to_string(img) + "\n"
        return ocr_text
    finally:
        # Best-effort cleanup; a leftover temp file must not mask the result.
        try:
            os.remove(pdf_path)
        except OSError:
            pass


def extract_text_from_html(url):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: Text of the page as produced by ``BeautifulSoup.get_text()``.

    Raises:
        requests.RequestException: If the request fails, times out, or
            returns a non-2xx status.
    """
    # Same timeout as rank_topic_links so one stalled host cannot hang the run.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text()


In [None]:
# -- Refactored unstructured_text() --
def unstructured_text(para):
    """Echo a paragraph with basic size statistics and hand it back.

    Args:
        para: The paragraph text.

    Returns:
        tuple: ``(para, char_count, word_count)`` where ``word_count`` is the
        number of whitespace-separated tokens.
    """
    divider = "-" * 25

    print("--- Input Text Loaded ---")
    print(para)
    print(divider)

    # Basic statistics: raw length and whitespace-token count.
    n_chars, n_words = len(para), len(para.split())
    for line in (
        f"Total characters: {n_chars}",
        f"Approximate word count: {n_words}",
        divider,
    ):
        print(line)

    return para, n_chars, n_words


In [None]:
#--- Document Collection Pipeline---
def document_collection_pipeline_main():
    """Search, rank, download, and chunk documents on the fixed topic.

    Steps: run the search, rank the links, fetch the top 80% (at least one)
    as PDF or HTML, keep paragraphs longer than 50 characters, then join
    them and split the result into chunks of ``CHUNK_WORDS`` words.

    Returns:
        list[str]: Text chunks of at most 1500 words each.
    """
    from urllib.parse import urlparse  # local import: used for PDF detection only

    query = "satellite power system fault diagnosis PDF"
    links = search_satellite_power_docs(query)
    ranked_links = rank_topic_links(links, base_topic="satellite power system fault diagnosis")

    collected_texts = []
    num_top = max(1, int(len(ranked_links) * 0.8))

    for link, score in ranked_links[:num_top]:
        print(f"\nFetching: {link} | Confidence Score: {score}")
        try:
            # Look at the URL path only, case-insensitively, so links such as
            # ".../paper.PDF" or ".../paper.pdf?download=1" are treated as PDFs.
            is_pdf = urlparse(link).path.lower().endswith(".pdf")
            if is_pdf:
                doc_text = extract_text_from_pdf(link)
            else:
                doc_text = extract_text_from_html(link)

            for para in doc_text.split("\n\n"):
                if len(para.strip()) > 50:
                    # Only the echoed text is needed; the counts are just printed.
                    text, _char_count, _word_count = unstructured_text(para)
                    collected_texts.append(text)

        except Exception as e:
            # Best effort: skip documents that cannot be fetched or parsed.
            print(f"Error processing link: {link}\n{e}")
            continue

    full_text = "\n".join(collected_texts)

    # Token-safe chunking: split into pieces of at most CHUNK_WORDS words
    CHUNK_WORDS = 1500
    words = full_text.split()
    full_text_chunks = [" ".join(words[i:i+CHUNK_WORDS]) for i in range(0, len(words), CHUNK_WORDS)]

    return full_text_chunks


In [None]:
#--- Coreference resolution prompting---
def coref_prompt(text_chunk, client, llm_model_name, llm_temperature, llm_max_tokens):
    """Ask the LLM to rewrite ``text_chunk`` with its pronouns resolved.

    Args:
        text_chunk: Text whose coreferences should be resolved.
        client: Object exposing ``chat.completions.create(...)``.
        llm_model_name: Model identifier passed to the API.
        llm_temperature: Sampling temperature passed to the API.
        llm_max_tokens: Response token cap passed to the API.

    Returns:
        str: The first choice's message content, stripped of surrounding
        whitespace.
    """
    prompt = f"""
    Resolve all coreferences in the following paragraph. Replace all pronouns (like "she", "he", "they") or whatever pronoun is in use, with the actual names they refer to. Do not summarize:

    Text:
    \"\"\"{text_chunk}\"\"\"

    Output:
    """

    conversation = [
        {"role": "system", "content": "You are a helpful AI trained in language understanding."},
        {"role": "user", "content": prompt},
    ]

    completion = client.chat.completions.create(
        model=llm_model_name,
        messages=conversation,
        temperature=llm_temperature,
        max_tokens=llm_max_tokens,
    )

    # Only the first choice is used.
    return completion.choices[0].message.content.strip()


In [None]:
#--- Chunk Configuration---
def chunk_configuration(resolved_text, chunk_size=150, overlap=30):
    """Validate the chunking settings and split the text into words.

    Args:
        resolved_text: Text to be chunked.
        chunk_size: Number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        tuple: ``(chunk_size, overlap, words, total_words)`` where ``words``
        is ``resolved_text.split()`` and ``total_words`` its length.

    Raises:
        SystemExit: If the chunk size or overlap is invalid.
    """
    print(f"Chunk Size set to: {chunk_size} words")
    print(f"Overlap set to: {overlap} words")

    #--- Basic Validation ---
    if chunk_size <= 0:
        print(f"Error: Chunk size ({chunk_size}) must be a positive number of words.")
        raise SystemExit("Chunking configuration error.")
    if overlap < 0 or overlap >= chunk_size:
        print(f"Error: Overlap ({overlap}) must be non-negative and smaller than chunk size ({chunk_size}).")
        raise SystemExit("Chunking configuration error.")
    print("Chunking configuration is valid.")

    words = resolved_text.split()
    total_words = len(words)

    print(f"Text split into {total_words} words.")

    #---Visualization of first 20 words---
    print(f"First 20 words: {words[:20]}")

    return chunk_size, overlap, words, total_words


In [None]:
#---Chunking Execution---
def chunking_execution(chunk_size, overlap, words, total_words):
    """Split ``words`` into overlapping chunks and show them as a table.

    Consecutive chunks start ``chunk_size - overlap`` words apart (at least
    one word, so the loop always advances). Chunking stops as soon as a
    chunk reaches the end of the text, so no trailing chunk is emitted that
    is fully contained in the previous one.

    Args:
        chunk_size: Number of words per chunk.
        overlap: Number of words shared by consecutive chunks.
        words: List of words to chunk.
        total_words: Number of words to consume from ``words``.

    Returns:
        tuple: ``(chunks, chunks_df)``. ``chunks`` is a list of
        ``{"text", "chunk_number"}`` dicts; ``chunks_df`` is a DataFrame with
        columns ``text``, ``chunk_number`` and ``word_count`` (empty when
        there are no chunks).
    """
    #---performing chunking based on the configuration---
    chunks = []
    # Guarantee forward progress even for a degenerate configuration.
    step = max(1, chunk_size - overlap)
    start_index = 0

    print(f"Starting chunking process...")
    while start_index < total_words:
        end_index = min(start_index + chunk_size, total_words)
        chunk_number = len(chunks) + 1
        chunk_text = " ".join(words[start_index:end_index])
        chunks.append({"text": chunk_text, "chunk_number": chunk_number})

        #Printing the detailed log
        print(f" Created chunk {chunk_number}: words {start_index} to {end_index}")

        # The text is exhausted; a further chunk would only repeat its tail.
        if end_index >= total_words:
            break

        start_index += step

    print(f"\nText successfully split into {len(chunks)} chunks.")

    print("--- Chunk Details ---")
    if chunks:
        # Create a DataFrame for better visualization
        chunks_df = pd.DataFrame(chunks)
        chunks_df['word_count'] = chunks_df['text'].apply(lambda x: len(x.split()))
        display(chunks_df[['chunk_number', 'word_count', 'text']])
    else:
        # Always bind chunks_df so the return below cannot raise UnboundLocalError.
        chunks_df = pd.DataFrame(columns=['text', 'chunk_number', 'word_count'])
        print("No chunks were created (the input text is empty).")
    print("-"*25)

    return chunks, chunks_df

In [None]:
#---Extraction Prompting---
def extraction_prompt(chunks):
    """Build the S-P-O extraction prompts, print a preview, and return them.

    Args:
        chunks: List of chunk dicts; only ``chunks[0]['text']`` is read, to
            show an example of a filled-in user prompt.

    Returns:
        tuple: ``(extraction_system_prompt, extraction_user_prompt_template)``.
        The template has a single ``{text_chunk}`` placeholder and is meant
        to be filled with ``str.format``.
    """
    # --- System prompt: sets the role/context for the LLM ---
    system_text = """
    You are an AI expert specialized in knowledge graph extraction.
    Your task is to identify and extract factual Subject-Predicate-Object (SPO) triples from the given text.
    When temporal information (e.g., years, dates, or time periods) is explicitly or implicitly available, include it as an additional "temporal" key in each triple.
    Focus on accuracy and adhere strictly to the JSON output format requested in the user prompt.
    Extract core entities and the most direct relationship.
    """

    # --- User prompt template: literal braces are doubled for str.format ---
    user_template = """
    Please extract Subject-Predicate-Object (S-P-O) triples from the text below.

    **VERY IMPORTANT RULES:**
    1.  **Output Format:** Respond ONLY with a single, valid JSON array. Each element MUST be an object with keys "subject", "predicate", "object". If a time or date is mentioned in relation to the triple, add a fourth key "temporal".
    2.  **JSON Only:** Do NOT include any text before or after the JSON array. Do NOT use markdown ```json ... ``` tags.
    3.  **Concise Predicates:** Keep the 'predicate' value concise (1-3 words, ideally 1-2). Use verbs or short verb phrases (e.g., 'discovered', 'was born in', 'won').
    4.  **Lowercase:** ALL values for 'subject', 'predicate', 'object', and 'temporal' MUST be lowercase.
    5.  **Pronoun Resolution:** Replace pronouns (she, he, it, her, etc.) with the specific lowercase entity name they refer to based on the text context (e.g., 'marie curie').
    6.  **Specificity:** Capture specific details (e.g., 'nobel prize in physics' instead of just 'nobel prize' if specified).
    7.  **Completeness:** Extract all distinct factual relationships mentioned.
    8.  **Temporal Info:** If a specific time, date, or year is mentioned (e.g., "in 1903", "on july 4", "during the 20th century"), include it as a "temporal" key in the JSON object.

    **Text to Process:**
    {text_chunk}

    Required JSON Output Format Example:
    [
      {{ "subject": "marie curie", "predicate": "discovered", "object": "radium", "temporal": "1898" }},
      {{ "subject": "marie curie", "predicate": "won", "object": "nobel prize in physics", "temporal": "1903" }}
    ]

    Your JSON Output (MUST start with '[' and end with ']'):
    You MUST return an array of JSON objects (even if there's only one). Wrap it in [ ] brackets.
    Only include the "temporal" key if time info is actually present or implied.
    """

    rule = "-" * 25

    print("--- System Prompt ---")
    print(system_text)
    print("\n" + rule + "\n")

    print("--- User Prompt Template (Structure) ---")
    # Show the structure with a readable stand-in for the placeholder.
    structure_view = user_template.replace("{text_chunk}", "[... text chunk goes here ...]")
    print(structure_view)
    print("\n" + rule + "\n")

    # Preview the prompt that would actually be sent for the first chunk.
    print("--- Example Filled User Prompt (for Chunk 1) ---")
    if not chunks:
        print("No chunks available to create an example filled prompt.")
    else:
        filled = user_template.format(text_chunk=chunks[0]['text'])
        # Only the first 600 and last 200 characters are shown, for brevity.
        head, tail = filled[:600], filled[-200:]
        print(head + "\n[... rest of chunk text ...]\n" + tail)
    print("\n" + rule)

    return system_text, user_template



In [None]:
#---S-P-O Extraction---
def spo_extraction(extraction_user_prompt_template, extraction_system_prompt, chunks, llm_model_name, llm_temperature, llm_max_tokens, client):
    """Extract Subject-Predicate-Object triples from every chunk via the LLM.

    For each chunk the user prompt is filled in, the LLM is called, the reply
    is parsed as JSON (see ``_parse_triples_json``), and every entry that is a
    dict with string ``subject``/``predicate``/``object`` values is kept and
    tagged with the chunk number under the ``chunk`` key.

    Args:
        extraction_user_prompt_template: ``str.format`` template with a
            ``{text_chunk}`` placeholder.
        extraction_system_prompt: System prompt sent with every request.
        chunks: List of dicts with ``text`` and ``chunk_number`` keys.
        llm_model_name: Model identifier passed to the API.
        llm_temperature: Sampling temperature passed to the API.
        llm_max_tokens: Response token cap passed to the API.
        client: Object exposing ``chat.completions.create(...)``.

    Returns:
        tuple: ``(all_extracted_triples, failed_chunks)``. Each failure is a
        dict with ``chunk_number``, ``error`` and ``response`` keys.
    """
    import json
    import pandas as pd

    required_keys = ['subject', 'predicate', 'object']
    all_extracted_triples = []
    failed_chunks = []

    print(f"Starting triple extraction from {len(chunks)} chunks using model '{llm_model_name}'...")

    for chunk_info in chunks:  # <-- Loop through all chunks
        chunk_text = chunk_info['text']
        chunk_num = chunk_info['chunk_number']

        print(f"\n--- Processing Chunk {chunk_num}/{len(chunks)} ---")

        print("1. Formatting User Prompt...")
        user_prompt = extraction_user_prompt_template.format(text_chunk=chunk_text)

        try:
            print("2. Sending request to LLM...")
            response = client.chat.completions.create(
                model=llm_model_name,
                messages=[
                    {"role": "system", "content": extraction_system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=llm_temperature,
                max_tokens=llm_max_tokens,
                response_format={ "type": "json_object" },
            )
            print("   LLM response received.")

            print("3. Extracting raw response content...")
            llm_output = response.choices[0].message.content.strip()
            print(f"--- Raw LLM Output (Chunk {chunk_num}) ---")
            print(llm_output)
            print("-" * 20)

        except Exception as e:
            # One failing request must not abort the remaining chunks.
            error_message = str(e)
            print(f"   ERROR during API call: {error_message}")
            failed_chunks.append({'chunk_number': chunk_num, 'error': f'API/Processing Error: {error_message}', 'response': ''})
            continue  # Skip parsing and validation for this chunk

        # 4. Parse JSON
        print("4. Attempting to parse JSON from response...")
        parsed_json, parsing_error = _parse_triples_json(llm_output)

        if parsed_json is None:
            print(f"--- JSON Parsing FAILED (Chunk {chunk_num}) --- ")
            print(f"   Final Parsing Error: {parsing_error}")
            print("-" * 20)
            failed_chunks.append({'chunk_number': chunk_num, 'error': f'Parsing Failed: {parsing_error}', 'response': llm_output})
            continue

        print(f"--- Parsed JSON Data (Chunk {chunk_num}) ---")
        print(json.dumps(parsed_json, indent=2))
        print("-" * 20)

        # 5. Validate and store
        print("5. Validating structure and extracting triples...")
        valid_triples_in_chunk = []
        invalid_entries = []

        for item in parsed_json:
            if not (isinstance(item, dict) and all(k in item for k in required_keys)):
                invalid_entries.append({'item': item, 'reason': 'Incorrect structure/keys'})
            elif not all(isinstance(item[k], str) for k in required_keys):
                invalid_entries.append({'item': item, 'reason': 'Non-string value'})
            else:
                item['chunk'] = chunk_num  # remember which chunk produced the triple
                valid_triples_in_chunk.append(item)

        print(f"   Found {len(valid_triples_in_chunk)} valid triples in this chunk.")
        if invalid_entries:
            print(f"   Skipped {len(invalid_entries)} invalid entries.")

        if valid_triples_in_chunk:
            print(f"--- Valid Triples Extracted (Chunk {chunk_num}) ---")
            display(pd.DataFrame(valid_triples_in_chunk))
            print("-" * 20)
            all_extracted_triples.extend(valid_triples_in_chunk)
        else:
            print(f"--- No valid triples extracted from this chunk. ---")
            print("-" * 20)

        print(f"--- Running Total Triples Extracted: {len(all_extracted_triples)} ---")
        print(f"--- Failed Chunks So Far: {len(failed_chunks)} ---")

    print("\n Finished processing all chunks.")
    if failed_chunks:
        print("\n--- Failed Chunks Summary ---")
        for failed in failed_chunks:
            print(f"Chunk #{failed['chunk_number']} failed with error: {failed['error']}")
            print(f"Response:\n{failed['response']}\n")
    else:
        print("\n No failed chunks!")

    return all_extracted_triples, failed_chunks


def _parse_triples_json(llm_output):
    """Turn raw LLM text into a list of candidate triples.

    Accepts a JSON array, a JSON object wrapping exactly one list, a single
    bare triple object, or an array embedded in surrounding text (for
    example inside markdown fences).

    Returns:
        tuple: ``(parsed_list, None)`` on success, ``(None, error_message)``
        on failure.
    """
    import json
    import re

    try:
        parsed_data = json.loads(llm_output)
    except json.JSONDecodeError as json_err:
        print(f"   JSONDecodeError: {json_err}. Trying regex fallback...")
        # Unanchored and greedy: take everything from the first '[' to the
        # last ']' so an array wrapped in prose or ``` fences is recovered.
        match = re.search(r'\[.*\]', llm_output, re.DOTALL)
        if not match:
            print("      ERROR: Regex could not find JSON array structure.")
            return None, "JSONDecodeError and Regex fallback failed."
        try:
            recovered = json.loads(match.group(0))
        except json.JSONDecodeError as nested_err:
            print(f"      ERROR: Regex content is not valid JSON: {nested_err}")
            return None, f"JSONDecodeError after regex: {nested_err}"
        print("      Successfully parsed JSON from regex extraction.")
        return recovered, None

    if isinstance(parsed_data, list):
        print("   Successfully parsed JSON list directly.")
        return parsed_data, None

    if isinstance(parsed_data, dict):
        print("   Detected dictionary response, attempting to extract list...")
        # The request uses response_format json_object, so a chunk holding a
        # single fact may come back as one bare triple object.
        if all(k in parsed_data for k in ('subject', 'predicate', 'object')):
            print("      Treating dictionary as a single triple.")
            return [parsed_data], None
        list_values = [v for v in parsed_data.values() if isinstance(v, list)]
        if len(list_values) == 1:
            print("      Successfully extracted list from dictionary.")
            return list_values[0], None
        parsing_error = "ValueError: JSON object received, but doesn't contain a single list of triples."
    else:
        parsing_error = "ValueError: Parsed JSON is not a list or expected dictionary wrapper."

    print(f"   ERROR: {parsing_error}")
    return None, parsing_error


In [None]:
#---Extraction and simplification of SPO Triples---
def extract_and_simplify_triples(all_extracted_triples, client, llm_model_name, llm_temperature, llm_max_tokens):
    """Split complex triples into atomic ones with two LLM passes.

    Pass 1 asks the LLM which of the given triples are complex; pass 2 asks
    it to simplify those. The result is the untouched simple triples
    followed by the simplified ones, each tagged with a fresh UUID under
    ``chunk_id``.

    Whenever a pass cannot be completed (unparsable reply, or no complex
    triples found) the original triples are returned, tagged with
    ``chunk_id``, so no extracted data is lost. Empty input returns ``[]``
    without calling the LLM.

    Note: triple dicts from ``all_extracted_triples`` are tagged in place.

    Args:
        all_extracted_triples: List of triple dicts with ``subject``,
            ``predicate`` and ``object`` keys.
        client: Object exposing ``chat.completions.create(...)``.
        llm_model_name: Model identifier passed to the API.
        llm_temperature: Sampling temperature passed to the API.
        llm_max_tokens: Response token cap passed to the API.

    Returns:
        list[dict]: Final triples, each carrying a ``chunk_id``.
    """

    def tag_with_ids(triples):
        """Attach a fresh UUID to every dict triple; report anything else."""
        tagged = []
        for triple in triples:
            if isinstance(triple, dict):
                triple['chunk_id'] = str(uuid.uuid4())
                tagged.append(triple)
            else:
                print(" Unexpected triple format (not a dict):", triple)
        return tagged

    def spo_key(triple):
        """Identity of a triple for the simple/complex comparison."""
        return (triple.get('subject'), triple.get('predicate'), triple.get('object'))

    def extract_json_list(text):
        """Parse a JSON list from ``text``: directly, fenced, or embedded."""
        try:
            parsed = json.loads(text)
            if isinstance(parsed, list):
                return parsed
        except (json.JSONDecodeError, TypeError):
            pass  # not plain JSON; try to locate a list inside the text

        if not isinstance(text, str):
            raise ValueError("No valid JSON list found.")

        patterns = (
            ("fenced JSON block", r"```(?:json)?\s*(\[\s*{.*?}\s*\])\s*```"),
            ("loose JSON match", r"(\[\s*{.*?}\s*\])"),
        )
        for label, pattern in patterns:
            found = re.search(pattern, text, re.DOTALL)
            if not found:
                continue
            try:
                return json.loads(found.group(1))
            except json.JSONDecodeError as e:
                print(f" Failed parsing {label}:", e)
                print("Matched text:\n", found.group(1))

        raise ValueError("No valid JSON list found.")

    # Nothing to classify: skip the LLM so the example triple embedded in
    # the prompt below cannot come back as if it were real data.
    if not all_extracted_triples:
        return []

    # STEP 1: Extract complex triples
    complex_system_prompt = """You are an assistant that simplifies complex SPO (Subject–Predicate–Object) triples into smaller, atomic relationships without losing their intended meaning or logical structure.

A triple is considered **complex** and needs simplification if:

1. The **predicate** contains multiple actions, modifiers, or logical connectors (e.g., "owns and operates", "is responsible for and manages").
2. The **subject** or **object** is a compound concept (e.g., "economic and political conditions", "intrinsic metabolic and structural state").
3. The triple **implicitly encodes multiple relationships**, such as ownership, role, category, or part-whole associations.
4. Any **part of the triple** is vague or underspecified (e.g., "None", "this", "that process").
5. There is a **temporal aspect** (e.g., dates, durations, time ranges) that applies to the subject, predicate, or object.

Your task is to:
- Decompose compound predicates into distinct triples, each representing a single relationship.
- Split compound subjects or objects into multiple entities if each part plays a different role.
- Make implicit relationships explicit, such as "is a part of", "is a type of", or "belongs to".
- Replace vague entities (e.g., "None", "this") with inferred concrete references where context allows.
- Identify any **temporal expressions** (e.g., "in 1995", "since 2020", "during the war") and extract them as separate triples using predicates such as:
  - "happened in"
  - "started in"
  - "ended in"
  - "occurred during"
  - "was active during"

###  Examples of complex triples and how to simplify them:

**Example 1 (compound predicate):**
- Complex: { "subject": "john", "predicate": "owns and operates", "object": "two factories" }
- Simplified:
  - { "subject": "john", "predicate": "owns", "object": "two factories" }
  - { "subject": "john", "predicate": "operates", "object": "two factories" }

**Example 2 (compound subject):**
- Complex: { "subject": "economic and political conditions", "predicate": "influence", "object": "public policy" }
- Simplified:
  - { "subject": "economic conditions", "predicate": "influence", "object": "public policy" }
  - { "subject": "political conditions", "predicate": "influence", "object": "public policy" }

**Example 3 (implicit parts):**
- Complex: { "subject": "biological cell", "predicate": "has a membrane and nucleus", "object": null }
- Simplified:
  - { "subject": "biological cell", "predicate": "has", "object": "membrane" }
  - { "subject": "biological cell", "predicate": "has", "object": "nucleus" }

**Example 4 (role inference):**
- Complex: { "subject": "employee", "predicate": "is the manager of", "object": "team" }
- Simplified:
  - { "subject": "employee", "predicate": "has role", "object": "manager" }
  - { "subject": "manager", "predicate": "manages", "object": "team" }

**Example 5 (vague subject):**
- Complex: { "subject": "none", "predicate": "damages", "object": "machine" }
- Simplified:
  - Try to resolve or infer the missing subject from context if possible.
  - If not resolvable, flag for human review.

**Example 6 (temporal factor):**
- Complex: { "subject": "marie curie", "predicate": "won", "object": "nobel prize in physics in 1903" }
- Simplified:
  - { "subject": "marie curie", "predicate": "won", "object": "nobel prize in physics" }
  - { "subject": "marie curie winning nobel prize in physics", "predicate": "happened in", "object": "1903" }

Make sure every triple is atomic, clear, and temporally enriched if time context is available.
"""

    complex_user_prompt = f"""Here is a list of triples extracted from a document.

Your job is to **select only the complex triples** from this list, using the rules described (including compound, implicit, vague, and temporal relationships):

{json.dumps(all_extracted_triples, indent=2)}

Return only the ones that need simplification in JSON.

These are the complex triples:
[
  {{ "subject": "john", "predicate": "owns and operates", "object": "two factories" }},
  ...
]
"""

    response = client.chat.completions.create(
        model=llm_model_name,
        messages=[
            {"role": "system", "content": complex_system_prompt},
            {"role": "user", "content": complex_user_prompt}
        ],
        temperature=llm_temperature,
        max_tokens=llm_max_tokens
    )

    # STEP 1.5: Extract JSON from response safely using regex
    try:
        complex_triples = extract_json_list(response.choices[0].message.content)
        if not isinstance(complex_triples, list):
            raise ValueError("No valid JSON list found.")
    except Exception as e:
        print("\n LLM Output for Complex Triple Detection:")
        print("===========================================")
        print(response.choices[0].message.content)
        print("\n JSON Extraction Error:", e)
        # Classification failed: keep every original triple unchanged.
        return tag_with_ids(all_extracted_triples)

    # Ignore stray non-dict entries so the steps below cannot crash on them.
    complex_triples = [t for t in complex_triples if isinstance(t, dict)]
    complex_triples_df = pd.DataFrame(complex_triples)
    display(complex_triples_df)

    # Nothing flagged as complex: all triples are already final.
    if not complex_triples:
        return tag_with_ids(all_extracted_triples)

    # STEP 2: Identify simple triples
    complex_set = {spo_key(t) for t in complex_triples}
    simple_triples = [
        t for t in all_extracted_triples
        if not isinstance(t, dict) or spo_key(t) not in complex_set
    ]

    # STEP 3: Simplify complex triples
    simplification_prompt = f"""
You are a knowledge extraction assistant. Your goal is to simplify SPO (Subject-Predicate-Object) triples into clear, atomic, and semantically accurate representations — without overcomplicating them.

Follow these guidelines carefully:

1. Retain the **core meaning** of each triple, but rephrase or reformat for clarity if needed.
2. Avoid using meta-language like "has a property", "has a source", or "can cause damage to" unless absolutely essential to preserve meaning.
3. If a **subject or object contains multiple distinct elements** (e.g., "Martin and Jake", "Spoon or Disc"), **split them into separate triples**, preserving the predicate.
4. If multiple actions or modifiers are present in a triple, **break them down** into atomic units.
5. Use **natural predicates** when possible (e.g., use "is damaged by" instead of "has a property").
6. For each simplified triple, include a `"complexity_type"` to indicate:
   - `"simple"`: A straightforward, atomic triple with one clear subject, predicate, and object.
   - `"compound"`: A triple with compound elements or multiple linked ideas.
   - `"implicit"`: A triple based on inferred meaning not explicitly stated in the original.
   - `"temporal"`: A triple that includes or implies time-based relationships.

Input:
Complex Triples:
{json.dumps(complex_triples, indent=2)}

Output:
Simplified Triples (JSON list format). Each triple must follow this structure:
{{
  "subject": "...",
  "predicate": "...",
  "object": "...",
  "complexity_type": "simple" | "compound" | "implicit" | "temporal"
}}
"""

    simplification_response = client.chat.completions.create(
        model=llm_model_name,
        messages=[
            {"role": "system", "content": "You are an assistant that simplifies logical triples for knowledge extraction."},
            {"role": "user", "content": simplification_prompt}
        ],
        temperature=llm_temperature,
        max_tokens=llm_max_tokens
    )

    try:
        simplified_triples = extract_json_list(simplification_response.choices[0].message.content)
        if not isinstance(simplified_triples, list):
            raise ValueError("No valid JSON list found.")

        # Flatten in case the model nested some lists
        flattened_triples = []
        for t in simplified_triples:
            if isinstance(t, list):
                flattened_triples.extend(t)
            else:
                flattened_triples.append(t)
        simplified_triples = flattened_triples

    except Exception as e:
        print("Failed to extract simplified triples JSON:")
        print(simplification_response.choices[0].message.content)
        print("JSON Extraction Error:", e)
        # Simplification failed: fall back to the original triples rather
        # than discarding everything that was extracted.
        return tag_with_ids(all_extracted_triples)

    print(f"here is complex cousins {complex_triples}")
    print(f"here is simplified dudes {simplified_triples}")

    # STEP 4: Add chunk_id and return
    all_final_triples = tag_with_ids(simple_triples + simplified_triples)

    print(f"all_final_triples is as follows {all_final_triples}")

    return all_final_triples


In [None]:
#---Preview of extracted triples---
def preview_spo_extraction(chunks, all_final_triples, failed_chunks):
    """Print an extraction summary and preview all extracted triples.

    Args:
        chunks: Collection of text chunks that were sent for extraction; only
            its length is used here.
        all_final_triples: List of triple dicts to preview. May be empty.
        failed_chunks: List of failure records. Each record must provide
            ``'chunk_number'`` and ``'error'``; ``'response'`` is optional and
            may be ``None``.

    Returns:
        tuple: ``(processed_chunks, all_triples_df)``. ``processed_chunks`` is
        ``len(chunks) - len(failed_chunks)``. ``all_triples_df`` is a DataFrame
        built from ``all_final_triples``; it is an empty DataFrame when no
        triples were extracted.
    """
    print(f"\n--- Overall Extraction Summary---")
    print(f"Total chunks defined: {len(chunks)}")
    # Failed chunks were still attempted, so "attempted" adds them back in.
    processed_chunks = len(chunks) - len(failed_chunks)
    print(f"Chunks processed (attempted): {processed_chunks + len(failed_chunks)}")
    print(f"Total valid triples extracted across all processed chunks: {len(all_final_triples)}")
    print(f"Number of chunks that failed API call or parsing: {len(failed_chunks)}")

    if failed_chunks:
        print("\nDetails of Failed Chunks:")
        for failure in failed_chunks:
            print(f"  Chunk {failure['chunk_number']}: Error: {failure['error']}")
            # The stored response can be missing or None (e.g. the call never
            # returned), so coerce to text before slicing.
            response_preview = str(failure.get('response') or '')[:100]
            print(f"    Response (start): {response_preview}...")
    print("-" * 25)

    # Build the frame unconditionally so the return value is always defined,
    # even when nothing was extracted.
    all_triples_df = pd.DataFrame(all_final_triples)

    print("\n--- All Extracted Triples (Before Normalization) ---")
    if all_final_triples:
        display(all_triples_df)
    else:
        print("No triples were successfully extracted.")

    print("-" * 25)

    return processed_chunks, all_triples_df

In [None]:
#---Normalization of triples---
def triplet_normalization_execution(all_final_triples):
    """Normalize, filter and de-duplicate extracted subject/predicate/object triples.

    Each triple's subject, predicate and object are stripped and lower-cased
    (internal whitespace runs in the predicate are collapsed to one space).
    Triples with a non-string, missing or empty component are dropped, as are
    exact duplicates of an already-kept normalized triple. Progress for the
    first five triples is printed as worked examples.

    Args:
        all_final_triples: List of triples. Entries are expected to be dicts
            with ``'subject'``, ``'predicate'`` and ``'object'`` keys; entries
            that are not dicts are counted as invalid and skipped.

    Returns:
        tuple: ``(normalized_triples, normalized_df)``. ``normalized_triples``
        is a list of dicts with keys ``'subject'``, ``'predicate'``,
        ``'object'`` and ``'source_chunk'``. ``normalized_df`` is the same data
        as a DataFrame with those four columns; it is empty (but still has the
        columns) when no triple survives.
    """
    output_columns = ['subject', 'predicate', 'object', 'source_chunk']

    normalized_triples = []
    seen_triples = set()  # Tracks (subject, predicate, object) tuples already kept
    original_count = len(all_final_triples)
    empty_removed_count = 0
    duplicate_removed_count = 0

    print(f"Starting normalization and de-duplication of {original_count} triples")

    print("Processing triples for normalization (showing first 5 examples):")
    example_limit = 5
    processed_count = 0

    for i, triple in enumerate(all_final_triples):
        show_example = (i < example_limit)
        processed_count += 1

        # A non-dict entry has no components to read; treating it as an empty
        # record routes it to the "non-string or missing" discard branch.
        record = triple if isinstance(triple, dict) else {}

        if show_example:
            print(f"\n--- Example {i+1} ---")
            print(f"Original Triple (Chunk {record.get('chunk', '?')}): {triple}")

        subject_raw = record.get('subject')
        predicate_raw = record.get('predicate')
        object_raw = record.get('object')
        # NOTE(review): the upstream step visible in this notebook tags triples
        # with 'chunk_id', not 'chunk' - confirm which key should feed
        # 'source_chunk'; as written it falls back to 'unknown' when 'chunk'
        # is absent.
        chunk_num = record.get('chunk', 'unknown')

        if not (isinstance(subject_raw, str)
                and isinstance(predicate_raw, str)
                and isinstance(object_raw, str)):
            empty_removed_count += 1  # Non-string/missing counts as removed
            if show_example:
                print("Status: Discarded (Non-string or missing component)")
            continue

        # 1. Normalize
        normalized_sub = subject_raw.strip().lower()
        normalized_pred = re.sub(r'\s+', ' ', predicate_raw.strip().lower()).strip()
        normalized_obj = object_raw.strip().lower()
        if show_example:
            print(f"Normalized: SUB='{normalized_sub}', PRED='{normalized_pred}', OBJ='{normalized_obj}'")

        # 2. Filter empty components
        if not (normalized_sub and normalized_pred and normalized_obj):
            empty_removed_count += 1
            if show_example:
                print("Status: Discarded (Empty component after normalization)")
            continue

        # 3. De-duplicate on the normalized (S, P, O) tuple
        triple_identifier = (normalized_sub, normalized_pred, normalized_obj)
        if triple_identifier in seen_triples:
            duplicate_removed_count += 1
            if show_example:
                print("Status: Discarded (Duplicate)")
            continue

        seen_triples.add(triple_identifier)
        normalized_triples.append({
            'subject': normalized_sub,
            'predicate': normalized_pred,
            'object': normalized_obj,
            'source_chunk': chunk_num
        })
        if show_example:
            print("Status: Kept (New Unique Triple)")

    print(f"\n... Finished processing {processed_count} triples.")

    # --- Summary of Normalization ---
    print(f"\n--- Normalization & De-duplication Summary ---")
    print(f"Original extracted triple count: {original_count}")
    print(f"Triples removed (empty/invalid components): {empty_removed_count}")
    print(f"Duplicate triples removed: {duplicate_removed_count}")
    final_count = len(normalized_triples)
    print(f"Final unique, normalized triple count: {final_count}")
    print("-" * 25)

    # Always build the frame so the return value is defined even when every
    # triple was discarded.
    normalized_df = pd.DataFrame(normalized_triples, columns=output_columns)

    print("\n--- Final Normalized Triples ---")
    if normalized_triples:
        display(normalized_df)
    else:
        print("No valid triples remain after normalization.")

    print("-" * 25)

    return normalized_triples, normalized_df



In [None]:
#---Creation of the Knowledge Graph---
def KG_creation(normalized_triples):
    """Build a directed NetworkX graph from normalized triples.

    Every triple becomes one directed edge ``subject -> object`` whose
    ``label`` attribute holds the predicate. Graph statistics are printed for
    the empty graph, after every fifth triple, and after the last triple.

    Args:
        normalized_triples: List of dicts with ``'subject'``, ``'predicate'``
            and ``'object'`` keys. May be empty.

    Returns:
        tuple: ``(knowledge_graph, added_edges_count)`` - the ``nx.DiGraph``
        and the number of ``add_edge`` calls made.
    """

    def _print_graph_info(graph):
        # nx.info() is not available in every NetworkX release (it was removed
        # in 3.0), so fall back to printing the counts by hand.
        try:
            print(nx.info(graph))
        except AttributeError:
            print(f"Type: {type(graph).__name__}")
            print(f"Number of nodes: {graph.number_of_nodes()}")
            print(f"Number of edges: {graph.number_of_edges()}")

    knowledge_graph = nx.DiGraph()
    print("Initialized an empty NetworkX DiGraph.")

    print("--- Initial Graph Info ---")
    _print_graph_info(knowledge_graph)
    print("-" * 25)

    print("Adding triples to the NetworkX graph...")

    added_edges_count = 0
    update_interval = 5  # Print graph info after every N-th triple

    if normalized_triples:
        for position, triple in enumerate(normalized_triples, start=1):
            subject_node = triple['subject']
            object_node = triple['object']
            predicate_label = triple['predicate']

            # add_edge creates both endpoint nodes on demand.
            knowledge_graph.add_edge(subject_node, object_node, label=predicate_label)
            added_edges_count += 1

            at_interval = position % update_interval == 0
            if at_interval or position == len(normalized_triples):
                print(f"\n--- Graph Info after adding Triple #{position} --- ({subject_node} -> {object_node})")
                _print_graph_info(knowledge_graph)
    else:
        print("Warning: No normalized triples to add to the graph.")

    print(f"\nFinished adding triples. Processed {added_edges_count} edges.")

    return knowledge_graph, added_edges_count

In [None]:
#---Summarization of the knowledge graph---
def KG_summary(knowledge_graph, added_edges_count):
    """Print summary statistics and samples for the knowledge graph.

    Args:
        knowledge_graph: Graph object exposing ``number_of_nodes()``,
            ``number_of_edges()``, ``nodes()`` and ``edges(data=True)``.
        added_edges_count: Number of edges the caller attempted to add; used
            only to report when the graph holds fewer edges than were added.

    Returns:
        dict: ``num_nodes``, ``num_edges``, ``nodes_sample`` (up to 10 nodes),
        ``edges_sample`` (up to 10 dicts with ``Source``/``Target``/``Label``),
        ``density`` and ``is_weakly_connected``. The last two are ``None`` when
        the graph is empty or when the metric could not be calculated.
    """
    num_nodes = knowledge_graph.number_of_nodes()
    num_edges = knowledge_graph.number_of_edges()

    print(f"\n--- Final NetworkX Graph Summary ---")
    print(f"Total unique nodes (entities): {num_nodes}")
    print(f"Total unique edges (relationships): {num_edges}")

    if num_edges != added_edges_count and isinstance(knowledge_graph, nx.DiGraph):
        print(f"Note: Added {added_edges_count} edges, but graph has {num_edges}. DiGraph overwrites edges with same source/target. Use MultiDiGraph if multiple edges needed.")

    # Default both metrics so the returned summary is well-defined even when a
    # calculation below fails; each metric is computed exactly once.
    density = None
    weakly_connected = None

    if num_nodes > 0:
        try:
            density = nx.density(knowledge_graph)
            print(f"Graph density: {density:.4f}")
            weakly_connected = nx.is_weakly_connected(knowledge_graph)
            if weakly_connected:
                print("The graph is weakly connected (all nodes reachable ignoring direction).")
            else:
                num_components = nx.number_weakly_connected_components(knowledge_graph)
                print(f"The graph has {num_components} weakly connected components.")
        except Exception as e:
            # Best-effort reporting: keep going with whatever was computed.
            print(f"Could not calculate some graph metrics: {e}")
    else:
        print("Graph is empty, cannot calculate metrics.")
    print("-" * 25)

    # --- Sample Nodes ---
    nodes_sample = []
    print("\n--- Sample Nodes (First 10) ---")
    if num_nodes > 0:
        nodes_sample = list(knowledge_graph.nodes())[:10]
        display(pd.DataFrame(nodes_sample, columns=['Node Sample']))
    else:
        print("Graph has no nodes.")

    # --- Sample Edges ---
    edges_sample = []
    print("\n--- Sample Edges (First 10 with Labels) ---")
    if num_edges > 0:
        edges_sample = [
            {'Source': u, 'Target': v, 'Label': data.get('label', 'N/A')}
            for u, v, data in list(knowledge_graph.edges(data=True))[:10]
        ]
        display(pd.DataFrame(edges_sample))
    else:
        print("Graph has no edges.")
    print("-" * 25)

    return {
        "num_nodes": num_nodes,
        "num_edges": num_edges,
        "nodes_sample": nodes_sample,
        "edges_sample": edges_sample,
        "density": density,
        "is_weakly_connected": weakly_connected
    }


In [None]:
#---Interactive Visualization---
def interactive_visualization(knowledge_graph):
    """Convert a NetworkX graph into Cytoscape-style node and edge records.

    Args:
        knowledge_graph: The graph to convert. Anything that is not an
            ``nx.Graph`` instance, or a graph with zero nodes, is reported and
            yields an empty result.

    Returns:
        tuple: ``(cytoscape_graph_data, can_visualize)``.
        ``cytoscape_graph_data`` is ``{'nodes': [...], 'edges': [...]}`` where
        every entry is ``{'data': {...}}``; node data carries ``id``, ``label``,
        ``degree``, ``size`` and ``tooltip_text``, edge data carries ``id``,
        ``source``, ``target``, ``label`` and ``tooltip_text``.
        ``can_visualize`` is ``True`` only when the graph was valid and
        non-empty.
    """
    print("Preparing interactive visualization...")

    # --- Check graph validity for visualization ---
    # (The argument is always bound inside the function, so only its type and
    # size need checking.)
    can_visualize = False
    if not isinstance(knowledge_graph, nx.Graph):
        print("Error: 'knowledge_graph' not found or is not a NetworkX graph.")
    elif knowledge_graph.number_of_nodes() == 0:
        print("NetworkX Graph is empty. Cannot visualize.")
    else:
        print(f"Graph seems valid for visualization ({knowledge_graph.number_of_nodes()} nodes, {knowledge_graph.number_of_edges()} edges).")
        can_visualize = True

    if not can_visualize:
        print("Skipping data conversion as graph is not valid for visualization.")
        return {'nodes': [], 'edges': []}, can_visualize

    print("Converting nodes...")
    # Node size scales linearly with degree relative to the busiest node.
    node_degrees = dict(knowledge_graph.degree())
    max_degree = max(node_degrees.values()) if node_degrees else 1

    cytoscape_nodes = []
    for node_id in knowledge_graph.nodes():
        degree = node_degrees.get(node_id, 0)
        # max_degree is 0 when the graph has nodes but no edges; use the base size.
        node_size = 15 + (degree / max_degree) * 50 if max_degree > 0 else 15

        cytoscape_nodes.append({
            'data': {
                'id': str(node_id),  # ID must be a string
                'label': str(node_id).replace(' ', '\n'),  # One word per line
                'degree': degree,
                'size': node_size,
                'tooltip_text': f"Entity: {str(node_id)}\nDegree: {degree}"
            }
        })
    print(f"Converted {len(cytoscape_nodes)} nodes.")

    print("Converting edges...")
    cytoscape_edges = []
    for edge_index, (u, v, data) in enumerate(knowledge_graph.edges(data=True)):
        predicate_label = data.get('label', '')
        cytoscape_edges.append({
            'data': {
                'id': f"edge_{edge_index}",  # Unique edge ID
                'source': str(u),
                'target': str(v),
                'label': predicate_label,
                'tooltip_text': f"Relationship: {predicate_label}"
            }
        })
    print(f"Converted {len(cytoscape_edges)} edges.")

    cytoscape_graph_data = {'nodes': cytoscape_nodes, 'edges': cytoscape_edges}

    # Show the first few converted records as a sanity check.
    print("\n--- Sample Cytoscape Node Data (First 2) ---")
    print(json.dumps(cytoscape_graph_data['nodes'][:2], indent=2))
    print("\n--- Sample Cytoscape Edge Data (First 2) ---")
    print(json.dumps(cytoscape_graph_data['edges'][:2], indent=2))
    print("-" * 25)

    return cytoscape_graph_data, can_visualize

In [None]:
#---Widget Creation---
def ipycytoscape_widget_creation(knowledge_graph, can_visualize, cytoscape_graph_data):
    """Create an ipycytoscape widget, load the graph data and apply styling.

    Args:
        knowledge_graph: Not used by this function; kept in the signature for
            its callers.
        can_visualize: When falsy, no widget is created.
        cytoscape_graph_data: Node/edge JSON structure passed to
            ``add_graph_from_json``.

    Returns:
        The styled ``CytoscapeWidget``, or ``None`` when ``can_visualize`` is
        falsy.
    """
    if not can_visualize:
        print("Skipping widget creation.")
        print("Skipping style definition.")
        return None

    print("Creating ipycytoscape widget...")
    cyto_widget = ipycytoscape.CytoscapeWidget()
    print("Widget created.")

    print("Loading graph data into widget...")
    cyto_widget.graph.add_graph_from_json(cytoscape_graph_data, directed=True)
    print("Data loaded.")

    if not cyto_widget:
        print("Skipping style definition.")
        return cyto_widget

    print("Defining enhanced colorful and interactive visual style...")

    # --- Node appearance -------------------------------------------------
    node_base = {
        'label': 'data(label)',
        'width': 'data(size)',
        'height': 'data(size)',
        'background-color': '#3498db',  # Bright blue
        'background-opacity': 0.9,
        'color': '#ffffff',             # White text
        'font-size': '12px',
        'font-weight': 'bold',
        'text-valign': 'center',
        'text-halign': 'center',
        'text-wrap': 'wrap',
        'text-max-width': '100px',
        'text-outline-width': 2,
        'text-outline-color': '#2980b9',  # Matching outline
        'text-outline-opacity': 0.7,
        'border-width': 3,
        'border-color': '#1abc9c',      # Turquoise border
        'border-opacity': 0.9,
        'shape': 'ellipse',
        'transition-property': 'background-color, border-color, border-width, width, height',
        'transition-duration': '0.3s',
        'tooltip-text': 'data(tooltip_text)'
    }
    # NOTE(review): 'data(size) * 1.2' is passed through verbatim - confirm
    # that Cytoscape.js accepts arithmetic inside a data() mapper value.
    node_selected = {
        'background-color': '#e74c3c',  # Pomegranate red
        'border-width': 4,
        'border-color': '#c0392b',
        'text-outline-color': '#e74c3c',
        'width': 'data(size) * 1.2',
        'height': 'data(size) * 1.2'
    }
    node_hover = {
        'background-color': '#9b59b6',  # Purple
        'border-width': 4,
        'border-color': '#8e44ad',
        'cursor': 'pointer',
        'z-index': 999
    }

    # --- Edge appearance -------------------------------------------------
    edge_base = {
        'label': 'data(label)',
        'width': 2.5,
        'curve-style': 'bezier',
        'line-color': '#2ecc71',         # Green
        'line-opacity': 0.8,
        'target-arrow-color': '#27ae60',
        'target-arrow-shape': 'triangle',
        'arrow-scale': 1.5,
        'font-size': '10px',
        'font-weight': 'normal',
        'color': '#2c3e50',
        'text-background-opacity': 0.9,
        'text-background-color': '#ecf0f1',
        'text-background-shape': 'roundrectangle',
        'text-background-padding': '3px',
        'text-rotation': 'autorotate',
        'edge-text-rotation': 'autorotate',
        'transition-property': 'line-color, width, target-arrow-color',
        'transition-duration': '0.3s',
        'tooltip-text': 'data(tooltip_text)'
    }
    edge_selected = {
        'line-color': '#f39c12',         # Yellow-orange
        'target-arrow-color': '#d35400',
        'width': 4,
        'text-background-color': '#f1c40f',
        'color': '#ffffff',              # White text
        'z-index': 998
    }
    edge_hover = {
        'line-color': '#e67e22',         # Orange
        'width': 3.5,
        'cursor': 'pointer',
        'target-arrow-color': '#d35400',
        'z-index': 997
    }

    # --- Class-based highlight -------------------------------------------
    center_node_class = {
        'background-color': '#16a085',   # Teal
        'background-opacity': 1,
        'border-width': 4,
        'border-color': '#1abc9c',       # Turquoise border
        'border-opacity': 1
    }

    visual_style = [
        {'selector': 'node', 'style': node_base},
        {'selector': 'node:selected', 'style': node_selected},
        {'selector': 'node:hover', 'style': node_hover},
        {'selector': 'edge', 'style': edge_base},
        {'selector': 'edge:selected', 'style': edge_selected},
        {'selector': 'edge:hover', 'style': edge_hover},
        {'selector': '.center-node', 'style': center_node_class},
    ]

    print("Setting enhanced visual style on widget...")
    cyto_widget.set_style(visual_style)

    # Force-directed 'cose' layout with animation.
    layout_options = {
        'name': 'cose',
        'nodeRepulsion': 5000,
        'nodeOverlap': 40,
        'idealEdgeLength': 120,
        'edgeElasticity': 200,
        'nestingFactor': 6,
        'gravity': 90,
        'numIter': 2500,
        'animate': True,
        'animationDuration': 1000,
        'initialTemp': 300,
        'coolingFactor': 0.95,
    }
    cyto_widget.set_layout(**layout_options)

    # Highly connected nodes (degree above 10) get their own per-id override.
    if len(cyto_widget.graph.nodes) > 0:
        hub_ids = [
            graph_node.data['id']
            for graph_node in cyto_widget.graph.nodes
            if graph_node.data.get('degree', 0) > 10
        ]

        visual_style.extend(
            {
                'selector': f'node[id = "{hub_id}"]',
                'style': {
                    'background-color': '#9b59b6',   # Purple
                    'background-opacity': 0.95,
                    'border-width': 4,
                    'border-color': '#8e44ad',       # Darker purple border
                    'border-opacity': 1,
                    'text-outline-width': 3,
                    'text-outline-color': '#8e44ad',
                    'font-size': '14px'
                }
            }
            for hub_id in hub_ids
        )

        # Re-apply so the per-hub overrides take effect.
        cyto_widget.set_style(visual_style)

    print("Enhanced colorful and interactive style applied successfully.")

    return cyto_widget

In [None]:
#---Layout Setting for the widget---
def setting_layout(cyto_widget):
    """Apply the 'cose' layout to the widget, or report that there is none.

    Args:
        cyto_widget: Widget exposing ``set_layout(**kwargs)``; a falsy value
            (e.g. ``None``) skips the layout step.

    Returns:
        None
    """
    if not cyto_widget:
        print("Skipping layout setting.")
        return

    print("Setting layout algorithm ('cose')...")

    # Tuning values for the cose (Compound Spring Embedder) layout.
    cose_options = {
        'name': 'cose',
        'animate': True,
        'nodeRepulsion': 4000,
        'nodeOverlap': 40,
        'idealEdgeLength': 120,
        'edgeElasticity': 150,
        'nestingFactor': 5,
        'gravity': 100,
        'numIter': 1500,
        'initialTemp': 200,
        'coolingFactor': 0.95,
        'minTemp': 1.0,
    }
    cyto_widget.set_layout(**cose_options)

    print("Layout set. The graph will arrange itself when displayed.")

In [None]:
#---Widget Display Function---
def widget_display_function(cyto_widget):
    """Display the graph widget when one exists, then print a closing banner.

    Args:
        cyto_widget: The widget to show; a falsy value prints a notice instead.

    Returns:
        None
    """
    if not cyto_widget:
        print("No widget to display.")
    else:
        print("Displaying interactive graph widget below...")
        print("Interact: Zoom (scroll), Pan (drag background), Move Nodes (drag nodes), Hover for details.")
        display(cyto_widget)

    # Closing separator, printed whether or not a widget was shown.
    rule = "-" * 25
    print(f"\n{rule}\nEnd of Visualization Step.\n{rule}")

Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
#---RDF Graph Creation---
def rdf_graph(knowledge_graph, output_path="kg.ttl"):
    """Export the knowledge graph to a Turtle file and reload it as an RDF graph.

    Every edge ``source -[label]-> target`` becomes one RDF triple whose
    subject, predicate and object are all URIs under ``http://kg.local/``,
    built from the URL-quoted string form of each value. Edges without a
    ``label`` attribute use the predicate ``relatedTo``.

    Args:
        knowledge_graph: Graph exposing ``edges(data=True)``.
        output_path: Destination of the Turtle serialization. Defaults to
            ``"kg.ttl"`` in the current working directory.

    Returns:
        rdflib.Graph: A fresh graph parsed back from ``output_path``, which
        doubles as a check that the written file is valid Turtle.
    """
    from rdflib import Graph, Namespace
    from urllib.parse import quote

    EX = Namespace("http://kg.local/")

    def _to_uri(value):
        # Indexing a Namespace yields a URIRef; quoting keeps spaces and other
        # unsafe characters out of the URI.
        return EX[quote(str(value))]

    export_graph = Graph()
    export_graph.bind("ex", EX)

    for source, target, data in knowledge_graph.edges(data=True):
        predicate = data.get("label", "relatedTo")
        export_graph.add((_to_uri(source), _to_uri(predicate), _to_uri(target)))

    export_graph.serialize(output_path, format="turtle")

    # --- Validation of the Turtle file: parse it back into a new graph ---
    reloaded_graph = Graph()
    reloaded_graph.parse(output_path, format="turtle")
    print(f"Number of triples in graph: {len(reloaded_graph)}")

    return reloaded_graph

In [None]:
#---SPARQL Query Generation---
def sparql_query(rdf):
    """Run a sample SPARQL SELECT over the RDF graph and show the rows.

    The query selects up to 16 ``?subject ?predicate ?object`` bindings. Each
    row is printed, followed by the rows as a DataFrame.

    Args:
        rdf: Object exposing ``query(sparql_text)`` that returns an iterable of
            rows with ``subject``, ``predicate`` and ``object`` attributes.

    Returns:
        pandas.DataFrame: The result rows with columns ``subject``,
        ``predicate`` and ``object`` (empty when the query matches nothing).
    """
    query = """
    SELECT ?subject ?predicate ?object
    WHERE {
      ?subject ?predicate ?object
    }
    LIMIT 16
    """

    # Materialize once so the rows can be both printed and tabulated without
    # relying on the result object supporting repeated iteration.
    rows = list(rdf.query(query))

    for row in rows:
        print(f"Subject: {row.subject}, Predicate: {row.predicate} Object {row.object}")

    df = pd.DataFrame(rows, columns=["subject", "predicate", "object"])
    print(df)

    # Return the data rather than this function object.
    return df


In [None]:
#---Installing Dependencies---
# install_dependencies() is also invoked in the cell that defines it, so
# running this cell repeats the pip and apt installation steps.

install_dependencies()


Installing pip packages...
Installing apt packages (poppler-utils)...


In [None]:
import openai              # For interacting with OpenAI LLMs
import json                # For parsing and serializing LLM responses
import networkx as nx      # To create, manipulate, and analyze graph data structures
import ipycytoscape        # For interactive Cytoscape graph visualization in Jupyter notebooks
import ipywidgets          # To create and manage interactive widgets in Jupyter notebooks
import pandas as pd        # For data manipulation and tabular data structures (DataFrames)
import os                  # For accessing system environment variables like API keys
import math                # For basic and advanced mathematical operations
import re                  # For regular expression operations in text cleaning and pattern matching
import warnings            # To filter or suppress warning messages, such as deprecations
import together            # For interacting with Together AI's language models
import uuid                # For generating unique identifiers, such as chunk IDs
import re                  # (Duplicate) Used for pattern matching and text processing
from serpapi import GoogleSearch  # To perform web search queries using SerpAPI
import requests            # For making HTTP requests to fetch HTML or PDF files
from bs4 import BeautifulSoup     # For parsing and extracting content from HTML
from sentence_transformers import SentenceTransformer, util  # For semantic text embedding and similarity scoring
import PyPDF2              # For reading and extracting text from PDF files
from pdf2image import convert_from_path  # To convert PDF pages into images (for OCR)
import pytesseract         # For performing OCR (Optical Character Recognition) on images using Tesseract


def initialize_environment():
    """Silence deprecation warnings and widen pandas display limits.

    Returns:
        None
    """
    # Suppress DeprecationWarning noise in notebook output.
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)

    # Show more rows and wider cell text in rendered pandas tables.
    display_options = {
        'display.max_rows': 100,
        'display.max_colwidth': 150,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)

    print("Libraries installed successfully.")

In [None]:
def KG_pipeline():
    """Run the notebook's knowledge-graph pipeline end to end.

    Stages, in call order: environment and client setup, document search and
    text extraction, document collection, per-chunk coreference resolution,
    chunking, SPO triple extraction and simplification, normalization, graph
    construction and summary, interactive visualization, and RDF export with a
    sample SPARQL query.

    Most helpers called here are defined in other cells of the notebook; their
    exact behavior is not restated in this docstring.

    Returns:
        None
    """
    # --- Setup environment and client ---
    initialize_environment()
    api_key, base_url, llm_model_name = retrieving_credentials()
    client_response_check()
    client = initialize_client(api_key, base_url, client="together")

    # --- LLM parameters ---
    llm_temperature, llm_max_tokens = llm_parameters()

    # --- Document search ---
    query = "satellite power system fault diagnosis PDF"
    links = search_satellite_power_docs(query, num_results=10)
    scored_links = rank_topic_links(links, base_topic="satellite power system fault diagnosis")

    # --- PDF text extraction (first link ending in .pdf) ---
    pdf_links = [link for link in links if link.lower().endswith('.pdf')]
    if pdf_links:
        pdf_url = pdf_links[0]  # pick first PDF link
        all_text = extract_text_from_pdf(pdf_url)
    else:
        all_text = ""
        print("No PDF link found.")

    # --- HTML text extraction (first non-PDF link) ---
    html_links = [link for link in links if not link.lower().endswith('.pdf')]
    if html_links:
        url = html_links[0]
        html_text = extract_text_from_html(url)
    else:
        html_text = ""
        print("No HTML link found.")

    # Prefer PDF text if available, otherwise HTML text
    para = all_text if all_text.strip() else html_text

    if para:
        para, char_count, word_count = unstructured_text(para)

    # --- Main document processing pipeline ---
    # NOTE(review): `para` above is not passed on; the text used from here
    # comes from document_collection_pipeline_main() - confirm this is intended.
    full_text = document_collection_pipeline_main()

    # --- Coreference resolution on each chunk ---
    # NOTE(review): this iterates `full_text` element by element, so it is
    # presumably a list of chunks rather than one string - verify against
    # document_collection_pipeline_main().
    resolved_chunks = []
    for i, chunk in enumerate(full_text):
        print(f"Resolving corefs in chunk {i+1}/{len(full_text)}...")
        try:
            resolved = coref_prompt(chunk, client, llm_model_name, llm_temperature, llm_max_tokens)
            resolved_chunks.append(resolved)
        except Exception as e:
            print(f"Coref resolution failed on chunk {i+1}: {e}")
            resolved_chunks.append(chunk)  # fallback to original

    # Combine resolved text
    resolved_text = "\n".join(resolved_chunks)

    # --- Text chunking ---
    chunk_size, overlap, words, total_words = chunk_configuration(resolved_text)
    chunks, chunks_df = chunking_execution(chunk_size, overlap, words, total_words)

    # --- SPO extraction ---
    extraction_system_prompt, extraction_user_prompt_template = extraction_prompt(chunks)
    all_extracted_triples, failed_chunks = spo_extraction(
        extraction_user_prompt_template, extraction_system_prompt, chunks,
        llm_model_name, llm_temperature, llm_max_tokens, client
    )

    all_final_triples = extract_and_simplify_triples(all_extracted_triples, client, llm_model_name, llm_temperature, llm_max_tokens)
    # preview_spo_extraction returns (processed_chunks, all_triples_df);
    # unpack both so `all_triples_df` really is the DataFrame.
    processed_chunks, all_triples_df = preview_spo_extraction(chunks, all_final_triples, failed_chunks)

    # --- Knowledge graph creation ---
    normalized_triples, normalized_df = triplet_normalization_execution(all_final_triples)
    knowledge_graph, added_edges_count = KG_creation(normalized_triples)
    summary = KG_summary(knowledge_graph, added_edges_count)

    # --- Interactive visualization ---
    cytoscape_graph_data, can_visualize = interactive_visualization(knowledge_graph)
    cyto_widget = ipycytoscape_widget_creation(knowledge_graph, can_visualize, cytoscape_graph_data)

    setting_layout(cyto_widget)
    widget_display_function(cyto_widget)

    # --- Create RDF graph and run SPARQL query ---
    rdf = rdf_graph(knowledge_graph)
    sparql_query(rdf)


In [None]:
#---Enabling third-party widget support (Google Colab only)---
# `google.colab` is only importable inside a Colab runtime, so this cell is Colab-specific.
from google.colab import output
# Turns on Colab's custom widget manager; per the message Colab prints, support for
# third-party widgets then stays active for the rest of the session.
# NOTE(review): presumably required so the ipycytoscape widget built inside KG_pipeline
# can render - keep this cell before the KG_pipeline() call; confirm if reordering cells.
output.enable_custom_widget_manager()

Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
#---calling KG_pipeline---
# Runs the whole pipeline end to end. The visible stages are: coreference resolution,
# text chunking, SPO triple extraction, triple normalization, knowledge-graph creation,
# interactive ipycytoscape visualization, and RDF graph creation with a SPARQL query.
# The SPO extraction stage sends an LLM request for every chunk and prints verbose
# per-chunk logs (the recorded run below processed 62 chunks), so expect a long run
# with a large amount of output.
KG_pipeline()

Libraries installed successfully.
Intended LLM model: mistralai/Mixtral-8x7B-Instruct-v0.1
Retrieved API key: Set
Retrieved Base URL: Not Set (will use default TogetherAI)
 It seems like you are referring to the LLM (Master of Laws) model of education, which is typically offered online by many universities and institutions. Online education has become increasingly popular in recent years, as it allows students to access quality education from anywhere in the world, at their own pace and convenience.

Online LLM programs offer a range of benefits, such as flexibility, accessibility, and cost-effectiveness. They allow students to balance their work, family, and other commitments while pursuing their education. Additionally, online programs often offer a wide range of courses and specializations, giving students the opportunity to tailor their studies to their interests and career goals.

However, it is important to note that online learning requires a certain level of discipline, time ma

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

--- Input Text Loaded ---
Fault Detection and Diagnosis in Spacecraft Electrical
Power Systems
Marc A. Carbone∗
NASA Glenn Research Center, Cleveland, Ohio 44135
and
Kenneth A. Loparo†
Case Western Reserve University, Cleveland, Ohio 44106
https://doi.org/10.2514/1.I011136
The ability to accurately identify and isolate failures in the electrical power system (EPS) is critical to ensure the
reliability of spacecraft. This paper proposes a novel solution to the problem of fault detection and diagnosis in direct
current (DC) electric power systems for spacecraft. Autonomous operation becomes essential during deep spacemissions that lack the ability to monitor and control the spacecraft from ground locations. The current state of EPS
fault supervision is insufficient to guarantee highly reliable operation. To solve this issue, a combination of model-
based and knowledge-based techniques are used in a hierarchical framework to improve the diagnostic performanceof the system. Noise, disturba

Unnamed: 0,chunk_number,word_count,text
0,1,150,"[2405.17339] Physics-Informed Real NVP for Satellite Power System Fault Detection We gratefully acknowledge support from the Simons Foundation, me..."
1,2,150,"conditions and limited accessibility, raise the need for robust and reliable techniques to identify and prevent satellite faults. Fault detection ..."
2,3,150,"approach outperforms existing methods of fault detection, demonstrating its suitability for addressing the unique challenges of satellite EPS sub-..."
3,4,150,"the paper titled Physics-Informed Real NVP for Satellite Power System Fault Detection, by Carlo Cena and 4 other authorsView PDFHTML (experimental..."
4,5,150,is MathJax?) subscribe to arXiv mailingsClick here to subscribe Subscribe arXiv Operational Status Get status notifications via email or slack Acc...
5,6,150,Ray ID: 956ec00dad1f44e1Client IP: 35.190.141.199© ResearchGate GmbH. All rights reserved. ScienceDirect Skip to main content ScienceDirectHelp Th...
6,7,150,"CA 94035 Phone: (650) 604-4199 Ole.J.Mengshoel@nasa.gov Adnan Darwiche Computer Science Department University of California Los Angeles, CA 90095 ..."
7,8,150,"a rich variety of failure modes. This paper discusses electrical power system fault diagnosis by means of probabilistic techniques. Specically, w..."
8,9,150,developed for ADAPT along with successful experimental results. Keywords: Bayesian networks; arithmetic circuits; un- certainty; model-based diagn...
9,10,150,"tant issues that arise in engineering diagnostic applications in this area, namely the modelling challenge and the real- time reasoning challenge ..."


-------------------------
--- System Prompt ---

    You are an AI expert specialized in knowledge graph extraction.
    Your task is to identify and extract factual Subject-Predicate-Object (SPO) triples from the given text.
    When temporal information (e.g., years, dates, or time periods) is explicitly or implicitly available, include it as an additional "temporal" key in each triple.
    Focus on accuracy and adhere strictly to the JSON output format requested in the user prompt.
    Extract core entities and the most direct relationship.
    

-------------------------

--- User Prompt Template (Structure) ---

    Please extract Subject-Predicate-Object (S-P-O) triples from the text below.

    **VERY IMPORTANT RULES:**
    1.  **Output Format:** Respond ONLY with a single, valid JSON array. Each element MUST be an object with keys "subject", "predicate", "object". If a time or date is mentioned in relation to the triple, add a fourth key "temporal".
    2.  **JSON Only:** Do NO

Unnamed: 0,subject,predicate,object,temporal,chunk
0,carlo cena,and colleagues,submitted,2024-05-27 (v1),1
1,carlo cena,and colleagues,last revised,2024-12-02 (v2),1
2,fault detection methods,required,space sector,,1


--------------------
--- Running Total Triples Extracted: 3 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 2/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 2) ---
[
  { "subject": "this paper", "predicate": "proposes", "object": "an AI-based fault detection methodology" },
  { "subject": "our study", "predicate": "focuses", "object": "on the application of a PI Real NVP model for fault detection in space systems" },
  { "subject": "our physics-informed approach", "predicate": "outperforms", "object": "existing methods of fault detection" },
  { "subject": "the use of a physics-informed loss", "predicate": "has a competitive advantage", "object": "in addressing satellite EPS sub-system faults" }
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 2) ---
[
  {
    "subject": "this pape

Unnamed: 0,subject,predicate,object,chunk
0,this paper,proposes,an AI-based fault detection methodology,2
1,our study,focuses,on the application of a PI Real NVP model for fault detection in space systems,2
2,our physics-informed approach,outperforms,existing methods of fault detection,2
3,the use of a physics-informed loss,has a competitive advantage,in addressing satellite EPS sub-system faults,2


--------------------
--- Running Total Triples Extracted: 7 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 3/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 3) ---
[
  {
    "subject": "physics-informed real nvp",
    "predicate": "outperforms",
    "object": "existing methods of fault detection"
  },
  {
    "subject": "physics-informed real nvp",
    "predicate": "demonstrates",
    "object": "suitability for addressing unique challenges of satellite eps sub-system faults"
  },
  {
    "subject": "physics-informed loss in ai models",
    "predicate": "has",
    "object": "competitive advantage for space needs",
    "temporal": "2024"
  },
  {
    "subject": "physics-informed loss in ai models",
    "predicate": "addresses",
    "object": "specific space needs",
    "temporal": "2024"
  },
  {
    "subject": "specific space needs",
    "predicate": "include",
    "obje

Unnamed: 0,subject,predicate,object,chunk,temporal
0,physics-informed real nvp,outperforms,existing methods of fault detection,3,
1,physics-informed real nvp,demonstrates,suitability for addressing unique challenges of satellite eps sub-system faults,3,
2,physics-informed loss in ai models,has,competitive advantage for space needs,3,2024.0
3,physics-informed loss in ai models,addresses,specific space needs,3,2024.0
4,specific space needs,include,"robustness, reliability, and power constraints",3,2024.0
5,physics-informed real nvp for satellite power system fault detection,presented,2024 international conference on advanced intelligent mechatronics (aim),3,2024.0


--------------------
--- Running Total Triples Extracted: 13 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 4/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 4) ---
[
  {"subject": "carlo cena", "predicate": "co-authored", "object": "physics-informed real nvp for satellite power system fault detection"},
  {"subject": "arxivlabs", "predicate": "collaborates", "object": "experimental projects"},
  {"subject": "arxivlabs", "predicate": "embraces", "object": "openness"},
  {"subject": "arxivlabs", "predicate": "embraces", "object": "community"},
  {"subject": "arxivlabs", "predicate": "embraces", "object": "excellence"},
  {"subject": "arxivlabs", "predicate": "embraces", "object": "user data privacy"},
  {"subject": "arxiv", "predicate": "committed", "object": "values"},
  {"subject": "arxiv", "predicate": "works", "object": "with partners"},
  {"subject": "arxivlabs", "p

Unnamed: 0,subject,predicate,object,chunk
0,carlo cena,co-authored,physics-informed real nvp for satellite power system fault detection,4
1,arxivlabs,collaborates,experimental projects,4
2,arxivlabs,embraces,openness,4
3,arxivlabs,embraces,community,4
4,arxivlabs,embraces,excellence,4
5,arxivlabs,embraces,user data privacy,4
6,arxiv,committed,values,4
7,arxiv,works,with partners,4
8,arxivlabs,allows,develop and share new arxiv features,4


--------------------
--- Running Total Triples Extracted: 22 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 5/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 5) ---
[]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 5) ---
[]
--------------------
5. Validating structure and extracting triples...
   Found 0 valid triples in this chunk.
--- No valid triples extracted from this chunk. ---
--------------------
--- Running Total Triples Extracted: 22 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 6/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 6) ---
[
  {
    "subject": "ole j. mengshoel",
    "predicate": "works at",
    "object": "usra/riacs nasa ames research center

Unnamed: 0,subject,predicate,object,temporal,chunk
0,ole j. mengshoel,works at,usra/riacs nasa ames research center,present,6
1,ole j. mengshoel,phone,(650) 604-4199,,6
2,ole j. mengshoel,email,ole.j.mengshoel@nasa.gov,,6
3,adnan darwiche,works at,computer science department university of california los angeles,present,6
4,adnan darwiche,phone,(310) 206-5201,,6
5,adnan darwiche,email,darwiche@cs.ucla.edu,,6
6,keith cascio,works at,computer science department university of california,present,6


--------------------
--- Running Total Triples Extracted: 29 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 7/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 7) ---
[
  {"subject": "electrical power systems", "predicate": "play", "object": "critical role", "temporal": "present"},
  {"subject": "electrical power systems", "predicate": "exhibit", "object": "rich variety of failure modes", "temporal": "present"},
  {"subject": "this paper", "predicate": "discusses", "object": "electrical power system fault diagnosis", "temporal": "present"},
  {"subject": "this paper", "predicate": "develops", "object": "diagnostic capability for electrical power system faults", "temporal": "present"},
  {"subject": "diagnostic capability", "predicate": "based on", "object": "probabilistic techniques", "temporal": "present"}
]
--------------------
4. Attempting to parse JSON from response.

Unnamed: 0,subject,predicate,object,temporal,chunk
0,electrical power systems,play,critical role,present,7
1,electrical power systems,exhibit,rich variety of failure modes,present,7
2,this paper,discusses,electrical power system fault diagnosis,present,7
3,this paper,develops,diagnostic capability for electrical power system faults,present,7
4,diagnostic capability,based on,probabilistic techniques,present,7


--------------------
--- Running Total Triples Extracted: 34 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 8/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 8) ---
[
  {"subject": "we", "predicate": "discussed", "object": "development of a diagnostic capability for an electrical power system testbed ADAPT"},
  {"subject": "we", "predicate": "emphasized", "object": "tackled challenges regarding modelling and real-time performance"},
  {"subject": "we", "predicate": "discussed", "object": "bayesian network modeling approach for electrical power systems"},
  {"subject": "we", "predicate": "built", "object": "arithmetic circuits for real-time systems"},
  {"subject": "we", "predicate": "discussed", "object": "probabilistic diagnostic models for ADAPT"},
  {"subject": "we", "predicate": "achieved", "object": "real-time performance", "temporal": "recent"}
]
-----------------

Unnamed: 0,subject,predicate,object,chunk,temporal
0,we,discussed,development of a diagnostic capability for an electrical power system testbed ADAPT,8,
1,we,emphasized,tackled challenges regarding modelling and real-time performance,8,
2,we,discussed,bayesian network modeling approach for electrical power systems,8,
3,we,built,arithmetic circuits for real-time systems,8,
4,we,discussed,probabilistic diagnostic models for ADAPT,8,
5,we,achieved,real-time performance,8,recent


--------------------
--- Running Total Triples Extracted: 40 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 9/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 9) ---
[
  {
    "subject": "researchers from adapt",
    "predicate": "developed",
    "object": "bayesian networks and arithmetic circuits for uncertainty and model-based diagnosis",
    "temporal": "recently"
  },
  {
    "subject": "researchers from adapt",
    "predicate": "discussed",
    "object": "use of bayesian networks and arithmetic circuits in diagnosis and health management for electrical power systems",
    "temporal": "recently"
  },
  {
    "subject": "researchers from adapt",
    "predicate": "addressed",
    "object": "modelling challenge and real-time reasoning challenge in engineering diagnostic applications",
    "temporal": "recently"
  },
  {
    "subject": "researchers from adapt",
    "pre

Unnamed: 0,subject,predicate,object,temporal,chunk
0,researchers from adapt,developed,bayesian networks and arithmetic circuits for uncertainty and model-based diagnosis,recently,9
1,researchers from adapt,discussed,use of bayesian networks and arithmetic circuits in diagnosis and health management for electrical power systems,recently,9
2,researchers from adapt,addressed,modelling challenge and real-time reasoning challenge in engineering diagnostic applications,recently,9
3,researchers from adapt,utilized,samiam and ace tools,recently,9


--------------------
--- Running Total Triples Extracted: 44 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 10/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 10) ---
[
  { "subject": "advanced diagnostics and prognostics testbed", "predicate": "has", "object": "capabilities for power generation, power storage, and power distribution" },
  { "subject": "advanced diagnostics and prognostics testbed", "predicate": "is", "object": "a fully operational electrical power system", "temporal": "present" },
  { "subject": "advanced diagnostics and prognostics testbed", "predicate": "is", "object": "representative of such systems in aircraft and spacecraft" },
  { "subject": "probabilistic diagnostic application", "predicate": "is developed for", "object": "the advanced diagnostics and prognostics testbed" },
  { "subject": "probabilistic diagnostic application", "predicate": "is

Unnamed: 0,subject,predicate,object,chunk,temporal
0,advanced diagnostics and prognostics testbed,has,"capabilities for power generation, power storage, and power distribution",10,
1,advanced diagnostics and prognostics testbed,is,a fully operational electrical power system,10,present
2,advanced diagnostics and prognostics testbed,is,representative of such systems in aircraft and spacecraft,10,
3,probabilistic diagnostic application,is developed for,the advanced diagnostics and prognostics testbed,10,
4,probabilistic diagnostic application,is,an operational prototype,10,
5,probabilistic diagnostic application,works on,real-world data from ADAPT,10,
6,electrical power systems,are,critical for the proper functioning of various systems,10,


--------------------
--- Running Total Triples Extracted: 51 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 11/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 11) ---
[
  {"subject": "electrical power systems", "predicate": "are critical for", "object": "proper operation of aircraft and spacecraft", "temporal": "present"},
  {"subject": "electrical power systems loads", "predicate": "include", "object": "crucial subsystems such as avionics, propulsion, life support, and thermal management systems"},
  {"subject": "reliance on electrical power systems", "predicate": "is increasing", "object": "in all-electric aircraft and spacecraft designs"},
  {"subject": "electrical power systems", "predicate": "play", "object": "central roles in other parts of society"}
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- 

Unnamed: 0,subject,predicate,object,temporal,chunk
0,electrical power systems,are critical for,proper operation of aircraft and spacecraft,present,11
1,electrical power systems loads,include,"crucial subsystems such as avionics, propulsion, life support, and thermal management systems",,11
2,reliance on electrical power systems,is increasing,in all-electric aircraft and spacecraft designs,,11
3,electrical power systems,play,central roles in other parts of society,,11


--------------------
--- Running Total Triples Extracted: 55 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 12/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 12) ---
[
  {
    "subject": "we",
    "predicate": "discussed",
    "object": "challenges of developing a diagnostic reasoner for a real-world EPS"
  },
  {
    "subject": "the modelling challenge",
    "predicate": "concerns",
    "object": "how to model an EPS by means of Bayesian networks"
  },
  {
    "subject": "our use of Bayesian networks",
    "predicate": "motivated by",
    "object": "the combination of deterministic and stochastic behavior seen in EPS systems"
  },
  {
    "subject": "the modelling challenge",
    "predicate": "includes",
    "object": "constructing an EPS diagnostic model that captures deterministic and stochastic behavior"
  },
  {
    "subject": "the modelling challenge",
    "predi

Unnamed: 0,subject,predicate,object,chunk
0,we,discussed,challenges of developing a diagnostic reasoner for a real-world EPS,12
1,the modelling challenge,concerns,how to model an EPS by means of Bayesian networks,12
2,our use of Bayesian networks,motivated by,the combination of deterministic and stochastic behavior seen in EPS systems,12
3,the modelling challenge,includes,constructing an EPS diagnostic model that captures deterministic and stochastic behavior,12
4,the modelling challenge,includes,modelling the EPS in sufficient detail to ensure high diagnostic accuracy,12
5,the diagnostic model,should be,robust and easy,12


--------------------
--- Running Total Triples Extracted: 61 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 13/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 13) ---
[
  {"subject": "modeling eps", "predicate": "ensures", "object": "high diagnostic accuracy"},
  {"subject": "diagnostic model", "predicates": ["developed", "for"], "object": "particular eps"},
  {"subject": "diagnostic model", "predicate": "should be", "object": "robust, extendable, and general"},
  {"subject": "ai components", "predicate": "embedded", "object": "hard real-time systems"},
  {"subject": "decision support", "predicate": "of interest", "object": "nasa for manned missions"},
  {"subject": "unmanned missions", "predicate": "use", "object": "avionics with rtos"},
  {"subject": "embedded diagnostic engine", "predicate": "designed", "object": "within rtos framework"},
  {"subject": "rtos task", "

Unnamed: 0,subject,predicate,object,chunk
0,modeling eps,ensures,high diagnostic accuracy,13
1,diagnostic model,should be,"robust, extendable, and general",13
2,ai components,embedded,hard real-time systems,13
3,decision support,of interest,nasa for manned missions,13
4,unmanned missions,use,avionics with rtos,13
5,embedded diagnostic engine,designed,within rtos framework,13
6,rtos task,declares,worst-case execution time,13


--------------------
--- Running Total Triples Extracted: 68 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 14/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 14) ---
[
  {"subject": "vehicle's avionics", "predicate": "should be designed", "object": "within RTOS framework"},
  {"subject": "RTOS task", "predicate": "needs to declare", "object": "worst-case execution time"},
  {"subject": "Bayesian Network (BN) infer- ence problems", "predicate": "are inherently computationally hard", "temporal": "Cooper 1990; Shimony 1994; Park & Darwiche"},
  {"subject": "BN", "predicate": "consists of", "object": "nodes"},
  {"subject": "EPS health", "predicate": "is represented by", "object": "HE"},
  {"subject": "HE", "predicate": "belongs to", "object": "X"},
  {"subject": "HE", "predicate": "is composed of", "object": "HC"},
  {"subject": "HC", "predicate": "represents", "object": 

Unnamed: 0,subject,predicate,object,chunk,temporal
0,vehicle's avionics,should be designed,within RTOS framework,14,
1,RTOS task,needs to declare,worst-case execution time,14,
2,BN,consists of,nodes,14,
3,EPS health,is represented by,HE,14,
4,HE,belongs to,X,14,
5,HE,is composed of,HC,14,
6,HC,represents,health of an EPS excluding its sensors,14,
7,HC,belongs to,X,14,
8,EPS sensors' health,is represented by,HS,14,
9,HS,belongs to,X,14,


--------------------
--- Running Total Triples Extracted: 79 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 15/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 15) ---
[
  {"subject": "the BN", "predicate": "contains", "object": "nodes representing other parts of an EPS subsystem"},
  {"subject": "the BN", "predicate": "has", "object": "input or evidence nodes E", "temporal": "given text"},
  {"subject": "E", "predicate": "belongs to", "object": "X"},
  {"subject": "the BN", "predicate": "has", "object": "Bayesian Inference Command nodes Health nodes Sensor nodes", "temporal": "given text"},
  {"subject": "the BN", "predicate": "has as input", "object": "sensor readings for sensor nodes and observed commands for command nodes"},
  {"subject": "the BN", "predicate": "has as output", "object": "query nodes that provide the health status of sensors and EPS components"},
  {

Unnamed: 0,subject,predicate,object,chunk,temporal
0,the BN,contains,nodes representing other parts of an EPS subsystem,15,
1,the BN,has,input or evidence nodes E,15,given text
2,E,belongs to,X,15,
3,the BN,has,Bayesian Inference Command nodes Health nodes Sensor nodes,15,given text
4,the BN,has as input,sensor readings for sensor nodes and observed commands for command nodes,15,
5,the BN,has as output,query nodes that provide the health status of sensors and EPS components,15,
6,EC,represents,command,15,
7,EC,belongs to,X,15,


--------------------
--- Running Total Triples Extracted: 87 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 16/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 16) ---
[
  {"subject": "nodes", "predicate": "represent", "object": "components of EPS"},
  {"subject": "nodes", "predicate": "serve", "object": "as output query nodes"},
  {"subject": "health status", "predicate": "provide", "object": "of sensors and EPS components"},
  {"subject": "EC", "predicate": "represent", "object": "commands from a user to the EPS"},
  {"subject": "ES", "predicate": "used", "object": "to input sensor readings from the EPS"},
  {"subject": "status nodes S", "predicate": "reflect", "object": "the EPS structure"},
  {"subject": "HC", "predicate": "equal", "object": "fHealthRelay"},
  {"subject": "HS", "predicate": "equal", "object": "fHealthSensor"},
  {"subject": "EC", "predicate": "equal"

Unnamed: 0,subject,predicate,object,chunk
0,nodes,represent,components of EPS,16
1,nodes,serve,as output query nodes,16
2,health status,provide,of sensors and EPS components,16
3,EC,represent,commands from a user to the EPS,16
4,ES,used,to input sensor readings from the EPS,16
5,status nodes S,reflect,the EPS structure,16
6,HC,equal,fHealthRelay,16
7,HS,equal,fHealthSensor,16
8,EC,equal,fCommandRelay,16
9,ES,equal,fFeedbackSensor,16


--------------------
--- Running Total Triples Extracted: 98 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 17/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 17) ---
[
  {
    "subject": "the small bayesian network (bn)",
    "predicate": "represents",
    "object": "an eps component, namely a relay"
  },
  {
    "subject": "the status of a relay",
    "predicate": "depends on",
    "object": "the command given to it, commandrelay",
    "temporal": "the given context"
  },
  {
    "subject": "the feedback message from the relay",
    "predicate": "depends on",
    "object": "the relay's status and the sensor's health",
    "temporal": "the given context"
  },
  {
    "subject": "an adapt bn or an arithmetic circuit compiled from it",
    "predicate": "is used",
    "object": "to compute a diagnosis",
    "temporal": "the given context"
  },
  {
    "subject": "the baye

Unnamed: 0,subject,predicate,object,chunk,temporal
0,the small bayesian network (bn),represents,"an eps component, namely a relay",17,
1,the status of a relay,depends on,"the command given to it, commandrelay",17,the given context
2,the feedback message from the relay,depends on,the relay's status and the sensor's health,17,the given context
3,an adapt bn or an arithmetic circuit compiled from it,is used,to compute a diagnosis,17,the given context
4,the bayesian network,contains,over 400 variables,17,
5,probabilistic queries,are executed,over the health variables he,17,the given context


--------------------
--- Running Total Triples Extracted: 104 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 18/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 18) ---
[
  {"subject": "circuit", "predicate": "compiled from", "object": "it"},
  {"subject": "bayesian network", "predicate": "developed", "object": "contains over 400 variables"},
  {"subject": "we", "predicate": "executed", "object": "probabilistic queries over health variables HE"},
  {"subject": "ace", "predicate": "used to compute", "object": "mpes and marginals/mlvs"},
  {"subject": "ace", "predicate": "computed", "object": "map mpe(he,e) and map mlv(he, e)"},
  {"subject": "samiam", "predicate": "used to compute", "object": "map(he,e)"},
  {"subject": "results", "predicate": "provided", "object": "in table 2 and figure 4"},
  {"subject": "health variables he", "predicate": "contains over 120 nodes"},
  

Unnamed: 0,subject,predicate,object,chunk
0,circuit,compiled from,it,18
1,bayesian network,developed,contains over 400 variables,18
2,we,executed,probabilistic queries over health variables HE,18
3,ace,used to compute,mpes and marginals/mlvs,18
4,ace,computed,"map mpe(he,e) and map mlv(he, e)",18
5,samiam,used to compute,"map(he,e)",18
6,results,provided,in table 2 and figure 4,18
7,table 2,shows,variables deemed to be non-healthy in different fault scenarios,18
8,map mpe(he; e),results,0.71 0.80 0.61 0.81 0.62 0.84 0.918.047.978.238.958.688.249.55,18
9,"map mlv(he, e)",computed by ace,304 305 306,18


--------------------
--- Running Total Triples Extracted: 114 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 19/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 19) ---
[
  {
    "subject": "adapt testbed",
    "predicate": "used",
    "object": "2 hz sampling rate",
    "temporal": "given text"
  },
  {
    "subject": "diagnostic results",
    "predicate": "turned out to be",
    "object": "same",
    "temporal": "given text"
  },
  {
    "subject": "results",
    "predicate": "shown",
    "object": "figure 4",
    "temporal": "given text"
  },
  {
    "subject": "queries",
    "predicate": "consolidated",
    "object": "table 2",
    "temporal": "given text"
  }
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 19) ---
[
  {
    "subject": "adapt testbed",
    "predicate": "used",


Unnamed: 0,subject,predicate,object,temporal,chunk
0,adapt testbed,used,2 hz sampling rate,given text,19
1,diagnostic results,turned out to be,same,given text,19
2,results,shown,figure 4,given text,19
3,queries,consolidated,table 2,given text,19


--------------------
--- Running Total Triples Extracted: 118 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 20/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 20) ---
[
  {"subject": "he", "predicate": "consolidated", "object": "map(he,e)"},
  {"subject": "adapt", "predicate": "uses", "object": "2 hz sampling rate"},
  {"subject": "execution time statistics", "predicate": "based on", "object": "execution times for all probabilistic queries during an experimental run"},
  {"subject": "each execution time", "predicate": "for", "object": "an entire inference step"},
  {"subject": "inference times for the arithmetic circuits", "predicate": "are", "object": "fast and predictable"},
  {"subject": "these factors", "predicate": "are important", "object": "for real-time applications including electrical power system health management"}
]
--------------------
4. Attempting to pa

Unnamed: 0,subject,predicate,object,chunk
0,he,consolidated,"map(he,e)",20
1,adapt,uses,2 hz sampling rate,20
2,execution time statistics,based on,execution times for all probabilistic queries during an experimental run,20
3,each execution time,for,an entire inference step,20
4,inference times for the arithmetic circuits,are,fast and predictable,20
5,these factors,are important,for real-time applications including electrical power system health management,20


--------------------
--- Running Total Triples Extracted: 124 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 21/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 21) ---
[
  {"subject": "arithmetic circuits", "predicate": "have", "object": "fast and predictable inference times", "temporal": "see figure 4"},
  {"subject": "arithmetic circuits", "predicate": "are important", "object": "for real-time applications including electrical power system health management"},
  {"subject": "electrical power systems", "predicate": "are crucially important", "object": "in spacecraft and aircraft"},
  {"subject": "we", "predicate": "have presented", "object": "a probabilistic approach to fault diagnosis in electrical power systems"},
  {"subject": "adapt", "predicate": "is", "object": "an electrical power system testbed at nasa ames"},
  {"subject": "adapt", "predicate": "can be represe

Unnamed: 0,subject,predicate,object,temporal,chunk
0,arithmetic circuits,have,fast and predictable inference times,see figure 4,21
1,arithmetic circuits,are important,for real-time applications including electrical power system health management,,21
2,electrical power systems,are crucially important,in spacecraft and aircraft,,21
3,we,have presented,a probabilistic approach to fault diagnosis in electrical power systems,,21
4,adapt,is,an electrical power system testbed at nasa ames,,21
5,adapt,can be represented,as a bayesian network,,21
6,bayesian network,is,the basis for answering diagnostic queries,,21
7,two challenges,are highlighted,the modelling and real-time reasoning challenges,,21
8,the modelling challenge,is overcome,by discussing how the eps bn is structured in a component-based and causal manner,,21


--------------------
--- Running Total Triples Extracted: 133 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 22/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 22) ---
[
  {"subject": "our setting", "predicate": "overcomes", "object": "aircraft modelling challenges"},
  {"subject": "EPS BN", "predicate": "is structured", "object": "component-based and causal manner"},
  {"subject": "our approach", "predicate": "meets", "object": "real-time challenge", "temporal": "associated with RTOSs used in spacecraft and aircraft"},
  {"subject": "approach", "predicate": "compiles", "object": "into arithmetic circuit"},
  {"subject": "inference", "predicate": "is fast and predictable", "object": "in arithmetic circuit"},
  {"subject": "BN-based fault diagnosis methodology", "predicate": "has been evaluated", "object": "through experiments using real-world data from ADAPT EPS"},
  {"

Unnamed: 0,subject,predicate,object,chunk,temporal
0,our setting,overcomes,aircraft modelling challenges,22,
1,EPS BN,is structured,component-based and causal manner,22,
2,our approach,meets,real-time challenge,22,associated with RTOSs used in spacecraft and aircraft
3,approach,compiles,into arithmetic circuit,22,
4,inference,is fast and predictable,in arithmetic circuit,22,
5,BN-based fault diagnosis methodology,has been evaluated,through experiments using real-world data from ADAPT EPS,22,
6,faults affecting Gizzi spacecraft health and status,can be catastrophic,if un-addressed with proper diagnosis in timely manner,22,
7,intelligent on board fault diagnosis,is crucial,for Gizzi spacecraft,22,


--------------------
--- Running Total Triples Extracted: 141 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 23/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 23) ---
[
  {
    "subject": "gizzi 1",
    "predicate": "presented",
    "object": "at",
    "temporal": "36thAnnual Small Satellite Conference"
  },
  {
    "subject": "our system",
    "predicate": "tested",
    "object": "in three experiments",
    "temporal": null
  },
  {
    "subject": "our system",
    "predicate": "evaluated",
    "object": "against alternative baseline approaches",
    "temporal": null
  },
  {
    "subject": "we",
    "predicate": "demonstrate",
    "object": "algorithm using three sets of realistic spacecraft telemetry data",
    "temporal": null
  },
  {
    "subject": "we",
    "predicate": "compare",
    "object": "against baseline methods",
    "temporal": null
  },
  {
    "subje

Unnamed: 0,subject,predicate,object,temporal,chunk
0,gizzi 1,presented,at,36thAnnual Small Satellite Conference,23
1,our system,tested,in three experiments,,23
2,our system,evaluated,against alternative baseline approaches,,23
3,we,demonstrate,algorithm using three sets of realistic spacecraft telemetry data,,23
4,we,compare,against baseline methods,,23
5,we,demonstrate,tractability of the algorithm on realistic hardware,,23
6,research in autonomous fault recovery,grown,significantly in recent years,,23


--------------------
--- Running Total Triples Extracted: 148 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 24/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 24) ---
[
  {"subject": "wang et al.", "predicate": "employ", "object": "deep learning", "temporal": "recent years"},
  {"subject": "wang et al.", "predicate": "use", "object": "stacked autoencoders", "temporal": "recent years"},
  {"subject": "stacked autoencoders", "predicate": "on", "object": "preprocessed power data", "temporal": "recent years"},
  {"subject": "deep learning neural network", "predicate": "initialized", "object": "using stacked autoencoders", "temporal": "recent years"},
  {"subject": "deep learning neural network", "predicate": "classify", "object": "type of fault", "temporal": "recent years"},
  {"subject": "fang et al.", "predicate": "use", "object": "unsupervised learning", "temporal": "re

Unnamed: 0,subject,predicate,object,temporal,chunk
0,wang et al.,employ,deep learning,recent years,24
1,wang et al.,use,stacked autoencoders,recent years,24
2,stacked autoencoders,on,preprocessed power data,recent years,24
3,deep learning neural network,initialized,using stacked autoencoders,recent years,24
4,deep learning neural network,classify,type of fault,recent years,24
5,fang et al.,use,unsupervised learning,recent years,24
6,fang et al.,use,supervised learning,recent years,24
7,unsupervised learning,involve,denoising autoencoder in training,recent years,24
8,supervised learning,involve,deep neural network,recent years,24
9,both unsupervised and supervised learning,extract,fault features,recent years,24


--------------------
--- Running Total Triples Extracted: 159 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 25/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 25) ---
[
  {"subject": "deep neural network", "predicate": "learns", "object": "extract fault features from EPS telemetry"},
  {"subject": "carbone et al.", "predicate": "develop", "object": "method for online monitoring of EPS data"},
  {"subject": "carbone et al.'s method", "predicate": "examines", "object": "short circuits and sensor failures in telemetry"},
  {"subject": "carbone et al.'s method", "predicate": "uses", "object": "modeling in a Kalman Filter-based approach"},
  {"subject": "daigle et al.", "predicate": "leverage", "object": "model-based approach based on residuals"},
  {"subject": "daigle et al.'s approach", "predicate": "extends", "object": "established QED method"},
  {"subject": "daigle et 

Unnamed: 0,subject,predicate,object,chunk
0,deep neural network,learns,extract fault features from EPS telemetry,25
1,carbone et al.,develop,method for online monitoring of EPS data,25
2,carbone et al.'s method,examines,short circuits and sensor failures in telemetry,25
3,carbone et al.'s method,uses,modeling in a Kalman Filter-based approach,25
4,daigle et al.,leverage,model-based approach based on residuals,25
5,daigle et al.'s approach,extends,established QED method,25
6,daigle et al.'s approach,examines,systems level approach to a subsystems (QED-PC - Possible Conflicts) approach,25
7,ahn et al.,use,semi-supervised approach for anomalous fault detection,25
8,ahn et al.'s approach,uses,variational autoencoder and Generative Adversarial Network model,25
9,ahn et al.'s approach,provides,reconstruction error values used to quantify norm deviation,25


--------------------
--- Running Total Triples Extracted: 170 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 26/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 26) ---
[
  {"subject": "gao et al.", "predicate": "use", "object": "semi-supervised approach for anomalous fault detection"},
  {"subject": "semi-supervised approach for anomalous fault detection", "predicate": "uses", "object": "variational autoencoder and generative adversarial network model"},
  {"subject": "variational autoencoder and generative adversarial network model", "predicate": "provides", "object": "reconstruction error values"},
  {"subject": "reconstruction error values", "predicate": "used", "object": "to quantify norm deviation"},
  {"subject": "gao et al.", "predicate": "use", "object": "principle component analysis for complexity reduction"},
  {"subject": "principle component analysis for com

Unnamed: 0,subject,predicate,object,chunk
0,gao et al.,use,semi-supervised approach for anomalous fault detection,26
1,semi-supervised approach for anomalous fault detection,uses,variational autoencoder and generative adversarial network model,26
2,variational autoencoder and generative adversarial network model,provides,reconstruction error values,26
3,reconstruction error values,used,to quantify norm deviation,26
4,gao et al.,use,principle component analysis for complexity reduction,26
5,principle component analysis for complexity reduction,via,feature extraction,26
6,feature extraction,fed,into binary support vector machine,26
7,binary support vector machine,classifies,faults,26
8,fault vectors,processed,through multi-class svm,26
9,multi-class svm,determines,type of fault,26


--------------------
--- Running Total Triples Extracted: 184 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 27/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 27) ---
[
  {"subject": "rudolf e. kálmán", "predicate": "developed", "object": "kalman filters", "temporal": "1960"},
  {"subject": "kalman filters", "predicate": "used", "object": "navigation", "temporal": "present day"},
  {"subject": "kalman filter", "predicate": "linearizes", "object": "satellite telemetry data"},
  {"subject": "kalman filter", "predicate": "predicts", "object": "expected mean (x)", "temporal": "present day"},
  {"subject": "kalman filter", "predicate": "predicts", "object": "covariance (σ)", "temporal": "present day"}
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 27) ---
[
  {
    "subject": "rudolf

Unnamed: 0,subject,predicate,object,temporal,chunk
0,rudolf e. kálmán,developed,kalman filters,1960,27
1,kalman filters,used,navigation,present day,27
2,kalman filter,linearizes,satellite telemetry data,,27
3,kalman filter,predicts,expected mean (x),present day,27
4,kalman filter,predicts,covariance (σ),present day,27


--------------------
--- Running Total Triples Extracted: 189 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 28/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 28) ---
[
  {"subject": "kalman filter", "predicate": "capable of", "object": "error detection", "temporal": "previous studies"},
  {"subject": "kalman filter", "predicate": "involves", "object": "reshaping gaussian curve"},
  {"subject": "kalman filter", "predicate": "uses", "object": "noisy sensor data"},
  {"subject": "prediction", "predicate": "is measured as", "object": "state"},
  {"subject": "state", "predicate": "equals", "object": "HkXk+rk"},
  {"subject": "Hk", "predicate": "is", "object": "measurement model matrix"},
  {"subject": "rk", "predicate": "is", "object": "measurement noise"},
  {"subject": "updating process", "predicate": "can be described as"},
  {"subject": "vk", "predicate": "equals", "ob

Unnamed: 0,subject,predicate,object,temporal,chunk
0,kalman filter,capable of,error detection,previous studies,28
1,kalman filter,involves,reshaping gaussian curve,,28
2,kalman filter,uses,noisy sensor data,,28
3,prediction,is measured as,state,,28
4,state,equals,HkXk+rk,,28
5,Hk,is,measurement model matrix,,28
6,rk,is,measurement noise,,28
7,vk,equals,yk−Hk,,28
8,S,equals,HkΣ′ kHT k+R,,28
9,K,equals,Σ′ kHT kS−1,,28


--------------------
--- Running Total Triples Extracted: 201 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 29/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 29) ---
[
  {"subject": "kalman filters", "predicate": "capable of", "object": "error detection"},
  {"subject": "autoencoders", "predicate": "are", "object": "unsupervised representation learners"},
  {"subject": "autoencoders", "predicate": "enforce", "object": "reconstruction loss"},
  {"subject": "network", "predicate": "learns", "object": "internal representation"},
  {"subject": "network", "predicate": "maintains", "object": "information bottleneck"},
  {"subject": "network", "predicate": "maps", "object": "high-dimensional input data to a lower dimensional representation space"},
  {"subject": "network", "predicate": "split into", "object": "encoder and decoder networks"}
]
--------------------
4. Attempti

Unnamed: 0,subject,predicate,object,chunk
0,kalman filters,capable of,error detection,29
1,autoencoders,are,unsupervised representation learners,29
2,autoencoders,enforce,reconstruction loss,29
3,network,learns,internal representation,29
4,network,maintains,information bottleneck,29
5,network,maps,high-dimensional input data to a lower dimensional representation space,29
6,network,split into,encoder and decoder networks,29


--------------------
--- Running Total Triples Extracted: 208 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 30/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 30) ---
[
  {"subject": "network", "predicate": "maps", "object": "high-dimensional input data to a lower dimensional representation space"},
  {"subject": "network", "predicate": "is split into", "object": "encoder f:Rn→Rm and decoder networks g:Rm→Rn"},
  {"subject": "encoder f:Rn→Rm", "predicate": "takes", "object": "data sample x∈Rn as input"},
  {"subject": "encoder f:Rn→Rm", "predicate": "produces", "object": "representation y"},
  {"subject": "decoder networks g:Rm→Rn", "predicate": "takes", "object": "representation y as input"},
  {"subject": "decoder networks g:Rm→Rn", "predicate": "produces", "object": "reconstructed sample x′"},
  {"subject": "reconstruction loss θ", "predicate": "is calculated as", "

Unnamed: 0,subject,predicate,object,chunk
0,network,maps,high-dimensional input data to a lower dimensional representation space,30
1,network,is split into,encoder f:Rn→Rm and decoder networks g:Rm→Rn,30
2,encoder f:Rn→Rm,takes,data sample x∈Rn as input,30
3,encoder f:Rn→Rm,produces,representation y,30
4,decoder networks g:Rm→Rn,takes,representation y as input,30
5,decoder networks g:Rm→Rn,produces,reconstructed sample x′,30
6,reconstruction loss θ,is calculated as,E((x−x′)2),30
7,reconstruction loss θ,corresponds with,mean square error between our input and our reconstructed output,30


--------------------
--- Running Total Triples Extracted: 216 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 31/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 31) ---
[
  {
    "subject": "causal graphs",
    "predicate": "have",
    "object": "strong theoretical foundation"
  },
  {
    "subject": "causal structure",
    "predicate": "enables",
    "object": "dynamic change of causal relationships in real-time"
  },
  {
    "subject": "causal graphs",
    "predicate": "represent",
    "object": "direction and strength of causal relationship"
  },
  {
    "subject": "causal graphs",
    "predicate": "suit",
    "object": "method for diagnosing spacecraft telemetry faults"
  },
  {
    "subject": "partial correlation algorithm",
    "predicate": "utilized",
    "object": "compute causal graph"
  },
  {
    "subject": "time series Xt",
    "predicate": "consist of",
    

Unnamed: 0,subject,predicate,object,chunk
0,causal graphs,have,strong theoretical foundation,31
1,causal structure,enables,dynamic change of causal relationships in real-time,31
2,causal graphs,represent,direction and strength of causal relationship,31
3,causal graphs,suit,method for diagnosing spacecraft telemetry faults,31
4,partial correlation algorithm,utilized,compute causal graph,31
5,time series Xt,consist of,N data points of temporal length t,31
6,Xj t,equals,"fj(P(Xj t), nj t)",31
7,fj,is,nonlinear functional dependency,31
8,nj,represents,mutually independent statistical noise,31
9,nodes in the temporal causal graph,represent,variable Xj t at different time points,31


--------------------
--- Running Total Triples Extracted: 227 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 32/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 32) ---
[
  {"subject": "the temporal causal graph", "predicate": "represents", "object": "the variable Xj t at different time points"},
  {"subject": "P(Xj t)", "predicate": "is", "object": "the causal parents of Xj t"},
  {"subject": "a causal link Xi t−τ→Xj t", "predicate": "exists", "object": "if Xi t−τ is a causal parent of Xj t"},
  {"subject": "the PCMCI algorithm", "predicate": "consists", "object": "of two stages"},
  {"subject": "the first stage of the PCMCI algorithm", "predicate": "is", "object": "PC condition selection to identify ˆP(Xj t)"},
  {"subject": "the second stage of the PCMCI algorithm", "predicate": "is", "object": "the momentary conditional independence test (MCI)"},
  {"subject": "MCI",

Unnamed: 0,subject,predicate,object,chunk
0,the temporal causal graph,represents,the variable Xj t at different time points,32
1,P(Xj t),is,the causal parents of Xj t,32
2,a causal link Xi t−τ→Xj t,exists,if Xi t−τ is a causal parent of Xj t,32
3,the PCMCI algorithm,consists,of two stages,32
4,the first stage of the PCMCI algorithm,is,PC condition selection to identify ˆP(Xj t),32
5,the second stage of the PCMCI algorithm,is,the momentary conditional independence test (MCI),32
6,MCI,tests,the condition Xi t−τ→Xj t,32
7,MCI,conditions,on both the parents Xj t and time-shifted parents Xi t−τ,32
8,the PC algorithm,initializes,"the preliminary parent ˆP(Xj t) ={Xt−1, Xt−2,...,X t−τmax} for every variable Xj t",32
9,the initial iteration,checks,if H0: Xj t̸⊥ ⊥Xi t−τ cannot be rejected at pPC,32


--------------------
--- Running Total Triples Extracted: 237 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 33/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 33) ---
[
  {
    "subject": "the preliminary parent ˆP(Xj t)",
    "predicate": "initialize",
    "object": "Xt−1, Xt−2,..., Xt−τmax"
  },
  {
    "subject": "H0: Xj t",
    "predicate": "cannot be rejected at pPC",
    "object": "Xi t−τ"
  },
  {
    "subject": "the parents",
    "predicate": "are then sorted",
    "object": "by their test statistic value"
  },
  {
    "subject": "Xj t",
    "predicate": "conduct",
    "object": "conditional independence tests Xj t⊥ ⊥ Xi t−τ|P"
  },
  {
    "subject": "P",
    "predicate": "are",
    "object": "the strongest parents in ˆP(Xj t)"
  },
  {
    "subject": "the algorithm",
    "predicate": "terminates",
    "object": "if no more conditions can be tested"
  },
  {
 

Unnamed: 0,subject,predicate,object,chunk
0,the preliminary parent ˆP(Xj t),initialize,"Xt−1, Xt−2,..., Xt−τmax",33
1,H0: Xj t,cannot be rejected at pPC,Xi t−τ,33
2,the parents,are then sorted,by their test statistic value,33
3,Xj t,conduct,conditional independence tests Xj t⊥ ⊥ Xi t−τ|P,33
4,P,are,the strongest parents in ˆP(Xj t),33
5,the algorithm,terminates,if no more conditions can be tested,33
6,X1 t−2,→,the conditions ˆP(X3 t),33
7,F,represent,"all data frames in a flight mission from a starting time step 1, to ending time step|F|",33
8,"Fa,b",represents,"a finite set of input frames fa, . . . f b",33


--------------------
--- Running Total Triples Extracted: 246 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 34/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 34) ---
[
  { "subject": "flight mission", "predicate": "starts", "temporal": "time step 1" },
  { "subject": "flight mission", "predicate": "ends", "temporal": "see Table 1" },
  { "subject": "input frames", "predicate": "represent", "object": "telemetry data" },
  { "subject": "frame", "predicate": "occurs", "object": "time step" },
  { "subject": "telemetry mnemonic", "predicate": "has", "object": "range of permissible values" },
  { "subject": "telemetry mnemonic", "predicate": "has", "object": "range of faulting values" },
  { "subject": "value assignment", "predicate": "within", "object": "permissible range", "temporal": "any mnemonic mj" },
  { "subject": "value assignment", "predicate": "outside", "object

Unnamed: 0,subject,predicate,object,chunk,temporal
0,input frames,represent,telemetry data,34,
1,frame,occurs,time step,34,
2,telemetry mnemonic,has,range of permissible values,34,
3,telemetry mnemonic,has,range of faulting values,34,
4,value assignment,within,permissible range,34,any mnemonic mj
5,value assignment,outside,permissible range,34,any mnemonic mj
6,spacecraft,has,telemetry readings,34,
7,telemetry readings,are,nominal,34,if all readings are within permissible range
8,spacecraft,is in,GREEN status,34,if all telemetry readings are nominal


--------------------
--- Running Total Triples Extracted: 255 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 35/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 35) ---
[
  {
    "subject": "spacecraft",
    "predicate": "has",
    "object": "telemetry readings",
    "temporal": "in flight"
  },
  {
    "subject": "spacecraft",
    "predicate": "is",
    "object": "in green status",
    "temporal": "with green mission status"
  },
  {
    "subject": "spacecraft",
    "predicate": "encounters",
    "object": "scenario",
    "temporal": "during flight"
  },
  {
    "subject": "scenario",
    "predicate": "involves",
    "object": "telemetry mnemonics",
    "temporal": "in faulting state"
  },
  {
    "subject": "faulting mnemonics",
    "predicate": "cause",
    "object": "red mission status"
  },
  {
    "subject": "red mission status",
    "predicate": "indicates",
    "

Unnamed: 0,subject,predicate,object,temporal,chunk
0,spacecraft,has,telemetry readings,in flight,35
1,spacecraft,is,in green status,with green mission status,35
2,spacecraft,encounters,scenario,during flight,35
3,scenario,involves,telemetry mnemonics,in faulting state,35
4,faulting mnemonics,cause,red mission status,,35
5,red mission status,indicates,symptomatic faulting mnemonics,,35
6,diagnosis process,operates,over faulting mnemonics and data frames,,35
7,diagnosis process,finds,root cause mnemonics,using diagnosis process,35


--------------------
--- Running Total Triples Extracted: 263 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 36/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 36) ---
[
  {
    "subject": "diagnosis process D",
    "predicate": "considers",
    "object": "single root cause mnemonic",
    "temporal": "research time"
  },
  {
    "subject": "given set of faulting mnemonics S",
    "predicate": "exists",
    "object": "root cause mnemonic r"
  },
  {
    "subject": "root cause mnemonic r",
    "predicate": "found through",
    "object": "diagnosis process D"
  },
  {
    "subject": "diagnosis process D",
    "predicate": "maps",
    "object": "set of symptoms to root cause r"
  },
  {
    "subject": "set of symptoms",
    "predicate": "mapped to",
    "object": "unique cause r",
    "temporal": "with consideration of past data frames Fa,b"
  }
]
--------------------
4. At

Unnamed: 0,subject,predicate,object,temporal,chunk
0,diagnosis process D,considers,single root cause mnemonic,research time,36
1,given set of faulting mnemonics S,exists,root cause mnemonic r,,36
2,root cause mnemonic r,found through,diagnosis process D,,36
3,diagnosis process D,maps,set of symptoms to root cause r,,36
4,set of symptoms,mapped to,unique cause r,"with consideration of past data frames Fa,b",36


--------------------
--- Running Total Triples Extracted: 268 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 37/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 37) ---
[
  {"subject": "the first fault", "predicate": "occurs", "object": "at f3", "temporal": "near-past"},
  {"subject": "the first fault", "predicate": "caused", "object": "a faulty sensor reading"},
  {"subject": "the first fault", "predicate": "symptom", "object": "current"},
  {"subject": "the second fault", "predicate": "occurs", "object": "at f6-f9", "temporal": "near-past"},
  {"subject": "the second fault", "predicate": "triggered", "object": "from a threshold breach in temperature"},
  {"subject": "the second fault", "predicate": "symptom", "object": "temperature"},
  {"subject": "the second fault", "predicate": "cause", "object": "acceleration"},
  {"subject": "the third fault", "predicate": "occurs

Unnamed: 0,subject,predicate,object,temporal,chunk
0,the first fault,occurs,at f3,near-past,37
1,the first fault,caused,a faulty sensor reading,,37
2,the first fault,symptom,current,,37
3,the second fault,occurs,at f6-f9,near-past,37
4,the second fault,triggered,from a threshold breach in temperature,,37
5,the second fault,symptom,temperature,,37
6,the second fault,cause,acceleration,,37
7,the third fault,occurs,at f13-f15,near-past,37
8,the third fault,triggered,due to a reason not specified,,37


--------------------
--- Running Total Triples Extracted: 277 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 38/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 38) ---
[
  {
    "subject": "spacecraft",
    "predicate": "experiences",
    "object": "temperature increase",
    "temporal": "f13-f15"
  },
  {
    "subject": "temperature increase",
    "predicate": "caused by",
    "object": "acceleration breach",
    "temporal": "f13-f15"
  },
  {
    "subject": "acceleration",
    "predicate": "exceeds",
    "object": "threshold",
    "temporal": "f13-f15"
  },
  {
    "subject": "temperature",
    "predicate": "reaches",
    "object": "125°F",
    "temporal": "f13"
  },
  {
    "subject": "temperature",
    "predicate": "reaches",
    "object": "151°F",
    "temporal": "f14"
  }
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JS

Unnamed: 0,subject,predicate,object,temporal,chunk
0,spacecraft,experiences,temperature increase,f13-f15,38
1,temperature increase,caused by,acceleration breach,f13-f15,38
2,acceleration,exceeds,threshold,f13-f15,38
3,temperature,reaches,125°F,f13,38
4,temperature,reaches,151°F,f14,38


--------------------
--- Running Total Triples Extracted: 282 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 39/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 39) ---
[
  {
    "subject": "the top table",
    "predicate": "shows",
    "object": "upper and lower thresholds of each mnemonic mi"
  },
  {
    "subject": "the bottom table",
    "predicate": "shows",
    "object": "15 time steps expressed as time frames f1, ... f15, with their corresponding status values"
  },
  {
    "subject": "the temperature",
    "predicate": "increases",
    "object": "rapidly",
    "temporal": "during the flight mission"
  },
  {
    "subject": "one of the batteries",
    "predicate": "catches",
    "object": "fire"
  },
  {
    "subject": "the event",
    "predicate": "triggers",
    "object": "an increase in temperature"
  },
  {
    "subject": "the increase in temperature",
    "pr

Unnamed: 0,subject,predicate,object,chunk,temporal
0,the top table,shows,upper and lower thresholds of each mnemonic mi,39,
1,the bottom table,shows,"15 time steps expressed as time frames f1, ... f15, with their corresponding status values",39,
2,the temperature,increases,rapidly,39,during the flight mission
3,one of the batteries,catches,fire,39,
4,the event,triggers,an increase in temperature,39,
5,the increase in temperature,causes,a threshold breach in the current,39,
6,the cause of the threshold breach,is,a dead battery,39,


--------------------
--- Running Total Triples Extracted: 289 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 40/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 40) ---
[
  { "subject": "isolated fault", "predicate": "encompasses", "object": "faults resulting from bit flips or broken sensors", "temporal": null },
  { "subject": "isolated fault", "predicate": "considered", "object": "spoofs", "temporal": null },
  { "subject": "fault at F3,3", "predicate": "considered", "object": "isolated fault", "temporal": null },
  { "subject": "known fault", "predicate": "encompasses", "object": "faults that have happened in the past", "temporal": null },
  { "subject": "jammed reaction wheel", "predicate": "is", "object": "known fault", "temporal": null }
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Da

Unnamed: 0,subject,predicate,object,temporal,chunk
0,isolated fault,encompasses,faults resulting from bit flips or broken sensors,,40
1,isolated fault,considered,spoofs,,40
2,"fault at F3,3",considered,isolated fault,,40
3,known fault,encompasses,faults that have happened in the past,,40
4,jammed reaction wheel,is,known fault,,40


--------------------
--- Running Total Triples Extracted: 294 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 41/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 41) ---
[
  {"subject": "jammed reaction wheel", "predicate": "is", "object": "culprit of", "temporal": "past"},
  {"subject": "fault occurring at F6,9", "predicate": "is", "object": "known fault"},
  {"subject": "fault occurring at F13,15", "predicate": "is", "object": "anomalous fault"}
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 41) ---
[
  {
    "subject": "jammed reaction wheel",
    "predicate": "is",
    "object": "culprit of",
    "temporal": "past"
  },
  {
    "subject": "fault occurring at F6,9",
    "predicate": "is",
    "object": "known fault"
  },
  {
    "subject": "fault occurring at F13,15",
    "predi

Unnamed: 0,subject,predicate,object,temporal,chunk
0,jammed reaction wheel,is,culprit of,past,41
1,"fault occurring at F6,9",is,known fault,,41
2,"fault occurring at F13,15",is,anomalous fault,,41


--------------------
--- Running Total Triples Extracted: 297 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 42/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 42) ---
[
  { "subject": "fault at f13,15", "predicate": "is considered", "object": "anomalous fault" },
  { "subject": "kalman construct", "predicate": "represents", "object": "how a particular telemetry mnemonic is performing relative to its own individual past history of performance" },
  { "subject": "ae construct", "predicate": "is", "object": "set of mnemonic names" }
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 42) ---
[
  {
    "subject": "fault at f13,15",
    "predicate": "is considered",
    "object": "anomalous fault"
  },
  {
    "subject": "kalman construct",
    "predicate": "represents",
    "object": "ho

Unnamed: 0,subject,predicate,object,chunk
0,"fault at f13,15",is considered,anomalous fault,42
1,kalman construct,represents,how a particular telemetry mnemonic is performing relative to its own individual past history of performance,42
2,ae construct,is,set of mnemonic names,42


--------------------
--- Running Total Triples Extracted: 300 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 43/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 43) ---
[
  {
    "subject": "the AE construct",
    "predicate": "represents",
    "object": "how a mnemonic is performing over a small window of time, relative to the other mnemonics in the frames",
    "temporal": "independent of other data points in the frame"
  },
  {
    "subject": "the Causality construct",
    "predicate": "represents",
    "object": "the relatedness of mnemonics to one another",
    "temporal": "over a small window of time"
  },
  {
    "subject": "the Kalman construct",
    "predicate": "shows",
    "object": "mnemonic-specific data over the life of the mission"
  },
  {
    "subject": "this candidate value",
    "predicate": "is breaking",
    "object": "its own Kalman value"
  },
  {


Unnamed: 0,subject,predicate,object,temporal,chunk
0,the AE construct,represents,"how a mnemonic is performing over a small window of time, relative to the other mnemonics in the frames",independent of other data points in the frame,43
1,the Causality construct,represents,the relatedness of mnemonics to one another,over a small window of time,43
2,the Kalman construct,shows,mnemonic-specific data over the life of the mission,,43
3,this candidate value,is breaking,its own Kalman value,,43
4,this candidate value,returns,a KNOWN FAULT,,43


--------------------
--- Running Total Triples Extracted: 305 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 44/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 44) ---
[
  {
    "subject": "the value which is most highly related to all faulting mnemonics",
    "predicate": "breaks",
    "object": "its own Kalman value",
    "temporal": "line 13"
  },
  {
    "subject": "the value which is most highly related to all faulting mnemonics",
    "predicate": "returns",
    "object": "KNOWN FAULT",
    "temporal": "line 13"
  },
  {
    "subject": "the mnemonic with its top-1 most related to ˆmAS the new candidate root cause",
    "predicate": "is considered",
    "temporal": "line 15"
  },
  {
    "subject": "the mnemonic with its top-1 most related to ˆmAS the new candidate root cause",
    "predicate": "repeats",
    "object": "Kalman check criterion",
    "temporal": "line

Unnamed: 0,subject,predicate,object,temporal,chunk
0,the value which is most highly related to all faulting mnemonics,breaks,its own Kalman value,line 13,44
1,the value which is most highly related to all faulting mnemonics,returns,KNOWN FAULT,line 13,44
2,the mnemonic with its top-1 most related to ˆmAS the new candidate root cause,repeats,Kalman check criterion,line 15,44
3,the search,does not consider,faulting mnemonics and visited mnemonics,line 16,44
4,the value which is most highly related to all faulting mnemonics,has,substantial individual error,line 18,44


--------------------
--- Running Total Triples Extracted: 310 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 45/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 45) ---
[
  {
    "subject": "the algorithm 2 walkdown diagnosis algorithm",
    "predicate": "uses",
    "object": "a causality matrix"
  },
  {
    "subject": "the causality matrix",
    "predicate": "is denoted as",
    "object": "c"
  },
  {
    "subject": "c",
    "predicate": "is composed of",
    "object": "a pair of matrices (⊙C, ⋆C)"
  },
  {
    "subject": "each frame",
    "predicate": "consists of",
    "object": "n mnemonics"
  },
  {
    "subject": "telemetry mnemonics",
    "predicate": "can break",
    "object": "their Kalman value"
  },
  {
    "subject": "telemetry mnemonics",
    "predicate": "are denoted as",
    "object": "⊙K"
  },
  {
    "subject": "the algorithm",
    "predicate": "maintai

Unnamed: 0,subject,predicate,object,chunk
0,the algorithm 2 walkdown diagnosis algorithm,uses,a causality matrix,45
1,the causality matrix,is denoted as,c,45
2,c,is composed of,"a pair of matrices (⊙C, ⋆C)",45
3,each frame,consists of,n mnemonics,45
4,telemetry mnemonics,can break,their Kalman value,45
5,telemetry mnemonics,are denoted as,⊙K,45
6,the algorithm,maintains,a static list of telemetry mnemonic names,45
7,the static list,is denoted as,⊙,45


--------------------
--- Running Total Triples Extracted: 318 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 46/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 46) ---
[
  {"subject": "system", "predicate": "tested", "object": "accuracy and success of diagnostic algorithm", "temporal": "software-only proof-of-concept setting"},
  {"subject": "system", "predicate": "tested", "object": "diagnostic algorithm", "temporal": "three data sets"},
  {"subject": "three data sets", "predicate": "had", "object": "known ground truth values"},
  {"subject": "one data set", "predicate": "had", "object": "inferred ground truth values"}
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 46) ---
[
  {
    "subject": "system",
    "predicate": "tested",
    "object": "accuracy and success of diagnostic

Unnamed: 0,subject,predicate,object,temporal,chunk
0,system,tested,accuracy and success of diagnostic algorithm,software-only proof-of-concept setting,46
1,system,tested,diagnostic algorithm,three data sets,46
2,three data sets,had,known ground truth values,,46
3,one data set,had,inferred ground truth values,,46


--------------------
--- Running Total Triples Extracted: 322 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 47/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 47) ---
[
  { "subject": "our diagnostic algorithm", "predicate": "was tested", "object": "on three data sets", "temporal": "in a software-only proof-of-concept setting" },
  { "subject": "our diagnostic system", "predicate": "was evaluated", "object": "for on-board tractability testing", "temporal": "in a hardware-in-the-loop setting" },
  { "subject": "testing", "predicate": "evaluated", "object": "the accuracy of our diagnostic system", "temporal": "in the first portion" },
  { "subject": "our system", "predicate": "was tested", "object": "on three data sets", "temporal": "across three separate experiments" },
  { "subject": "data sets", "predicate": "included", "object": "a set of time-synced frames of spacec

Unnamed: 0,subject,predicate,object,temporal,chunk
0,our diagnostic algorithm,was tested,on three data sets,in a software-only proof-of-concept setting,47
1,our diagnostic system,was evaluated,for on-board tractability testing,in a hardware-in-the-loop setting,47
2,testing,evaluated,the accuracy of our diagnostic system,in the first portion,47
3,our system,was tested,on three data sets,across three separate experiments,47
4,data sets,included,a set of time-synced frames of spacecraft telemetry,in each experiment,47
5,each data set,contained,"thresholds, or allowable state-based values",,47
6,telemetry testing,was simplified,by converting all tests into threshold-based testing,,47


--------------------
--- Running Total Triples Extracted: 329 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 48/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 48) ---
[
  { "subject": "continuous telemetry values", "predicate": "were converted", "object": "to decimal values" },
  { "subject": "discrete state-based mnemonics", "predicate": "were converted", "object": "to representative integer values" },
  { "subject": "representative integer values", "predicate": "were converted", "object": "into decimal equivalent values" },
  { "subject": "auto encoder", "predicate": "was trained", "object": "using a data split on our available samples" },
  { "subject": "data split", "predicate": "provided", "object": "50% fault data, and 50% non-faulting data" },
  { "subject": "diagnostic algorithm", "predicate": "used", "object": "threshold-based testing to detect/trigger faults"

Unnamed: 0,subject,predicate,object,chunk
0,continuous telemetry values,were converted,to decimal values,48
1,discrete state-based mnemonics,were converted,to representative integer values,48
2,representative integer values,were converted,into decimal equivalent values,48
3,auto encoder,was trained,using a data split on our available samples,48
4,data split,provided,"50% fault data, and 50% non-faulting data",48
5,diagnostic algorithm,used,threshold-based testing to detect/trigger faults,48
6,diagnostic algorithm,rendered,a root cause,48
7,algorithm,tested,against established baselines,48
8,hyperparameter tuning,performed,comprehensively,48
9,ablation testing,performed,on each component trained individually,48


--------------------
--- Running Total Triples Extracted: 340 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 49/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 49) ---
[
  {"subject": "toy data set", "predicate": "consisted of", "object": "11 flights of basic spacecraft housekeeping and physics data"},
  {"subject": "data set 1", "predicate": "had name", "object": "toy data set"},
  {"subject": "data set 1", "predicate": "consisted of", "object": "11 flights"},
  {"subject": "flights", "predicate": "of", "object": "spacecraft housekeeping and physics data"},
  {"subject": "data set 1", "predicate": "had complexity level", "object": "1"},
  {"subject": "data set 2", "predicate": "had name", "object": "simulation generated data set"},
  {"subject": "data set 2", "predicate": "consisted of", "object": "10 flights of sounding rocket data"},
  {"subject": "data set 2", "pred

Unnamed: 0,subject,predicate,object,chunk
0,toy data set,consisted of,11 flights of basic spacecraft housekeeping and physics data,49
1,data set 1,had name,toy data set,49
2,data set 1,consisted of,11 flights,49
3,flights,of,spacecraft housekeeping and physics data,49
4,data set 1,had complexity level,1,49
5,data set 2,had name,simulation generated data set,49
6,data set 2,consisted of,10 flights of sounding rocket data,49
7,data set 2,generated from,kerbal space program flight simulation,49
8,data set 2,had complexity level,2,49
9,data set 3,had name,sounding rocket data set,49


--------------------
--- Running Total Triples Extracted: 352 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 50/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 50) ---
[
  {"subject": "experiment 1", "predicate": "had", "object": "0.77 accuracy rate", "temporal": "overall"},
  {"subject": "experiment 1", "predicate": "correctly diagnosed", "object": "faults"},
  {"subject": "system", "predicate": "successfully classified", "object": "type of fault", "temporal": "70% of correctly diagnosed faults"},
  {"subject": "system", "predicate": "mis-classified", "object": "2 of 9 faults", "temporal": "as anomalous faults instead of nominal faults"},
  {"subject": "system", "predicate": "not fully successful", "object": "classification", "temporal": "in two missions"},
  {"subject": "mission", "predicate": "had", "object": "current jump"}
]
--------------------
4. Attempting to pa

Unnamed: 0,subject,predicate,object,temporal,chunk
0,experiment 1,had,0.77 accuracy rate,overall,50
1,experiment 1,correctly diagnosed,faults,,50
2,system,successfully classified,type of fault,70% of correctly diagnosed faults,50
3,system,mis-classified,2 of 9 faults,as anomalous faults instead of nominal faults,50
4,system,not fully successful,classification,in two missions,50
5,mission,had,current jump,,50


--------------------
--- Running Total Triples Extracted: 358 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 51/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 51) ---
[
  {
    "subject": "a mission",
    "predicate": "was caused by",
    "object": "a science instrument onboard",
    "temporal": "unspecified"
  },
  {
    "subject": "system",
    "predicate": "performed",
    "object": "walkdown method",
    "temporal": "unspecified"
  },
  {
    "subject": "system",
    "predicate": "was unable to find",
    "object": "root cause",
    "temporal": "unspecified"
  },
  {
    "subject": "science instrument readings",
    "predicate": "were",
    "object": "boolean indicators representing whether the instrument was on or not",
    "temporal": "unspecified"
  },
  {
    "subject": "simplicity of the readings",
    "predicate": "never able to break",
    "object": "its own

Unnamed: 0,subject,predicate,object,temporal,chunk
0,a mission,was caused by,a science instrument onboard,unspecified,51
1,system,performed,walkdown method,unspecified,51
2,system,was unable to find,root cause,unspecified,51
3,science instrument readings,were,boolean indicators representing whether the instrument was on or not,unspecified,51
4,simplicity of the readings,never able to break,its own Kalman filter,unspecified,51
5,another mission,was inaccurately diagnosed,one in which the spacecraft caught on fire,unspecified,51
6,cause,was external,to the satellite telemetry set,unspecified,51


--------------------
--- Running Total Triples Extracted: 365 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 52/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 52) ---
[
  {"subject": "experiment 2", "predicate": "had", "object": "0.7 accuracy rate", "temporal": "for diagnosing" },
  {"subject": "system", "predicate": "was able to", "object": "successfully classify the type of fault in 71% cases", "temporal": "of the correctly diagnosed faults" },
  {"subject": "algorithm", "predicate": "diagnosed", "object": "the change in velocity caused a communication drop", "temporal": "instead of the actual fault" }
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 52) ---
[
  {
    "subject": "experiment 2",
    "predicate": "had",
    "object": "0.7 accuracy rate",
    "temporal": "for diagn

Unnamed: 0,subject,predicate,object,temporal,chunk
0,experiment 2,had,0.7 accuracy rate,for diagnosing,52
1,system,was able to,successfully classify the type of fault in 71% cases,of the correctly diagnosed faults,52
2,algorithm,diagnosed,the change in velocity caused a communication drop,instead of the actual fault,52


--------------------
--- Running Total Triples Extracted: 368 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 53/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 53) ---
[
  { "subject": "the first mission", "predicate": "had", "object": "a change in velocity that caused a communication drop" },
  { "subject": "the first mission", "predicate": "was diagnosed", "object": "the change in the vessel's velocity" },
  { "subject": "the other missions", "predicate": "included", "object": "a mission where changes in pressure caused a navigation SEU" },
  { "subject": "the other missions", "predicate": "included", "object": "a mission where a decoupling on board caused a communication drop" },
  { "subject": "experiment 3", "predicate": "was composed", "object": "of one mission" },
  { "subject": "the ground truth diagnosis", "predicate": "for the nominal fault in skin temperature

Unnamed: 0,subject,predicate,object,chunk
0,the first mission,had,a change in velocity that caused a communication drop,53
1,the first mission,was diagnosed,the change in the vessel's velocity,53
2,the other missions,included,a mission where changes in pressure caused a navigation SEU,53
3,the other missions,included,a mission where a decoupling on board caused a communication drop,53
4,experiment 3,was composed,of one mission,53
5,the ground truth diagnosis,for the nominal fault in skin temperature,was the Z acceleration value,53
6,the algorithm,consistently diagnosed,the Z magnetometer reading,53
7,the algorithm,was unable to catch,Z acceleration as the root cause,53


--------------------
--- Running Total Triples Extracted: 376 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 54/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 54) ---
[
  {
    "subject": "gizzi 9",
    "predicate": "presented",
    "object": "36th annual small satellite conference"
  },
  {
    "subject": "z magnetometer reading",
    "predicate": "indicated",
    "object": "increased speed"
  },
  {
    "subject": "loss in communication with ground control",
    "predicate": "caused by",
    "object": "velocity"
  },
  {
    "subject": "top one-two diagnoses of ppo",
    "predicate": "compared to",
    "object": "ensemble approach"
  },
  {
    "subject": "top one-two diagnoses of ae",
    "predicate": "compared to",
    "object": "ensemble approach"
  },
  {
    "subject": "walkdown method diagnosis",
    "predicate": "compared to",
    "object": "ensemble approach"

Unnamed: 0,subject,predicate,object,chunk
0,gizzi 9,presented,36th annual small satellite conference,54
1,z magnetometer reading,indicated,increased speed,54
2,loss in communication with ground control,caused by,velocity,54
3,top one-two diagnoses of ppo,compared to,ensemble approach,54
4,top one-two diagnoses of ae,compared to,ensemble approach,54
5,walkdown method diagnosis,compared to,ensemble approach,54
6,construct,required,milliseconds per frame across 100 runs,54
7,ae,processed,1.05 ms per frame across 100 runs,54
8,causality,processed,5.05 ms per frame across 100 runs,54


--------------------
--- Running Total Triples Extracted: 385 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 55/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 55) ---
[
  {"subject": "ablation study", "predicate": "tested", "object": "ai components", "temporal": "100 runs"},
  {"subject": "ai components", "predicate": "tested for", "object": "processing time", "temporal": "100 runs"},
  {"subject": "frame sizes", "predicate": "composed of", "object": "mnemonics per frame"},
  {"subject": "batch experiment", "predicate": "tested against", "object": "reinforcement learning based Proximal Policy Optimization algorithm"},
  {"subject": "batch experiment", "predicate": "tested against", "object": "standalone autoencoder approach"}
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 55) --

Unnamed: 0,subject,predicate,object,temporal,chunk
0,ablation study,tested,ai components,100 runs,55
1,ai components,tested for,processing time,100 runs,55
2,frame sizes,composed of,mnemonics per frame,,55
3,batch experiment,tested against,reinforcement learning based Proximal Policy Optimization algorithm,,55
4,batch experiment,tested against,standalone autoencoder approach,,55


--------------------
--- Running Total Triples Extracted: 390 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 56/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 56) ---
[
  { "subject": "our algorithm", "predicate": "outperformed", "object": "top ranking baseline methods", "temporal": "experiments 1 and 2" },
  { "subject": "our algorithm", "predicate": "was tested", "object": "reinforcement learning based Proximal Policy Optimization algorithm", "temporal": "table 3" },
  { "subject": "our algorithm", "predicate": "was tested", "object": "standalone autoencoder approach", "temporal": "table 3" },
  { "subject": "our algorithm", "predicate": "performed", "object": "tractability testing", "temporal": "on a Raspberry Pi 3 B+" },
  { "subject": "our algorithm", "predicate": "isolated", "object": "performance tests on individual components of the framework", "temporal": "as 

Unnamed: 0,subject,predicate,object,temporal,chunk
0,our algorithm,outperformed,top ranking baseline methods,experiments 1 and 2,56
1,our algorithm,was tested,reinforcement learning based Proximal Policy Optimization algorithm,table 3,56
2,our algorithm,was tested,standalone autoencoder approach,table 3,56
3,our algorithm,performed,tractability testing,on a Raspberry Pi 3 B+,56
4,our algorithm,isolated,performance tests on individual components of the framework,as a part of an ablation approach,56


--------------------
--- Running Total Triples Extracted: 395 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 57/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 57) ---
[
  {"subject": "we", "predicate": "isolated", "object": "performance tests", "temporal": "not specified"},
  {"subject": "we", "predicate": "used", "object": "Python’s time module", "temporal": "not specified"},
  {"subject": "each component", "predicate": "was tested", "object": "against 10, 50, 100, and 200 mnemonics per frame", "temporal": "not specified"},
  {"subject": "the top performing component", "predicate": "was", "object": "the Kalman Filter", "temporal": "not specified"},
  {"subject": "the Kalman Filter", "predicate": "performed", "object": "0.026ms worse than the AE initially at 10 mpf", "temporal": "not specified"},
  {"subject": "the Kalman Filter", "predicate": "had", "object": "the sma

Unnamed: 0,subject,predicate,object,temporal,chunk
0,we,isolated,performance tests,not specified,57
1,we,used,Python’s time module,not specified,57
2,each component,was tested,"against 10, 50, 100, and 200 mnemonics per frame",not specified,57
3,the top performing component,was,the Kalman Filter,not specified,57
4,the Kalman Filter,performed,0.026ms worse than the AE initially at 10 mpf,not specified,57
5,the Kalman Filter,had,the smallest growth rate as the number of mnemonics increased,not specified,57
6,Causality,performed,most poorly overall,not specified,57
7,Causality,had,an initial processing time of 0.845ms for 10 mpf,not specified,57
8,Causality,had,a 6.414 growth rate,not specified,57


--------------------
--- Running Total Triples Extracted: 404 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 58/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 58) ---
[
  {"subject": "our algorithm", "predicate": "outperformed", "object": "Kalman and AE", "temporal": "at 200mpf"},
  {"subject": "our algorithm", "predicate": "has", "object": "a growth rate of 6.414"},
  {"subject": "Kalman", "predicate": "has", "object": "a growth rate of 9.38"},
  {"subject": "our algorithm", "predicate": "currently renders", "object": "an×nmatrix of associations at diagnosis time"}
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 58) ---
[
  {
    "subject": "our algorithm",
    "predicate": "outperformed",
    "object": "Kalman and AE",
    "temporal": "at 200mpf"
  },
  {
    "subject": "our al

Unnamed: 0,subject,predicate,object,temporal,chunk
0,our algorithm,outperformed,Kalman and AE,at 200mpf,58
1,our algorithm,has,a growth rate of 6.414,,58
2,Kalman,has,a growth rate of 9.38,,58
3,our algorithm,currently renders,an×nmatrix of associations at diagnosis time,,58


--------------------
--- Running Total Triples Extracted: 408 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 59/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 59) ---
[
  {"subject": "you", "predicate": "requested", "object": "causality vectors"},
  {"subject": "walkdown algorithm", "predicate": "excludes", "object": "queries on symptomatic mnemonics"},
  {"subject": "search", "predicate": "reduced to", "object": "(n−k)×(n−k) for k symptom mnemonics"},
  {"subject": "diagnosis", "predicate": "triggered", "object": "by at least one faulting mnemonic"},
  {"subject": "query time", "predicate": "constant", "temporal": "always"}
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 59) ---
[
  {
    "subject": "you",
    "predicate": "requested",
    "object": "causality vectors"
  },
  {


Unnamed: 0,subject,predicate,object,chunk
0,you,requested,causality vectors,59
1,walkdown algorithm,excludes,queries on symptomatic mnemonics,59
2,search,reduced to,(n−k)×(n−k) for k symptom mnemonics,59
3,diagnosis,triggered,by at least one faulting mnemonic,59


--------------------
--- Running Total Triples Extracted: 412 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 60/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 60) ---
[
  {"subject": "tianshe yang", "predicate": "et al", "object": "data mining- based fault detection and prediction methods for in-orbit satellite", "temporal": "2013"},
  {"subject": "aboul ella hassanien", "predicate": "et al", "object": "machine learning in telemetry data mining of space mission: basics, challenging and future directions", "temporal": "2020"},
  {"subject": "yixing wang", "predicate": "et al", "object": "deep learning neural network for power system fault diagnosis", "temporal": "2016"}
]
--------------------
4. Attempting to parse JSON from response...
   Successfully parsed JSON list directly.
--- Parsed JSON Data (Chunk 60) ---
[
  {
    "subject": "tianshe yang",
    "predicate": "e

Unnamed: 0,subject,predicate,object,temporal,chunk
0,tianshe yang,et al,data mining- based fault detection and prediction methods for in-orbit satellite,2013,60
1,aboul ella hassanien,et al,"machine learning in telemetry data mining of space mission: basics, challenging and future directions",2020,60
2,yixing wang,et al,deep learning neural network for power system fault diagnosis,2016,60


--------------------
--- Running Total Triples Extracted: 415 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 61/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 61) ---
[
  {"subject": "wang", "predicate": "co-authored", "object": "deep learning neural network for power system fault diagnosis", "temporal": "2016"},
  {"subject": "meiqin liu", "predicate": "co-authored", "object": "deep learning neural network for power system fault diagnosis", "temporal": "2016"},
  {"subject": "zhejing bao", "predicate": "co-authored", "object": "deep learning neural network for power system fault diagnosis", "temporal": "2016"},
  {"subject": "hongzheng fang", "predicate": "co-authored", "object": "spacecraft power system fault diagnosis based on dnn", "temporal": "2017"},
  {"subject": "hui shi", "predicate": "co-authored", "object": "spacecraft power system fault diagnosis based on d

Unnamed: 0,subject,predicate,object,temporal,chunk
0,wang,co-authored,deep learning neural network for power system fault diagnosis,2016.0,61
1,meiqin liu,co-authored,deep learning neural network for power system fault diagnosis,2016.0,61
2,zhejing bao,co-authored,deep learning neural network for power system fault diagnosis,2016.0,61
3,hongzheng fang,co-authored,spacecraft power system fault diagnosis based on dnn,2017.0,61
4,hui shi,co-authored,spacecraft power system fault diagnosis based on dnn,2017.0,61
5,yunfan dong,co-authored,spacecraft power system fault diagnosis based on dnn,2017.0,61
6,huanzhen fan,co-authored,spacecraft power system fault diagnosis based on dnn,2017.0,61
7,shuai ren,co-authored,spacecraft power system fault diagnosis based on dnn,2017.0,61
8,marc a carbone,co-authored,a multiple model based approach for deep space power system fault diagnosis,2019.0,61
9,jeffrey t csank,co-authored,a multiple model based approach for deep space power system fault diagnosis,2019.0,61


--------------------
--- Running Total Triples Extracted: 438 ---
--- Failed Chunks So Far: 0 ---

--- Processing Chunk 62/62 ---
1. Formatting User Prompt...
2. Sending request to LLM...
   LLM response received.
3. Extracting raw response content...
--- Raw LLM Output (Chunk 62) ---
[
  {"subject": "jung", "predicate": "published", "object": "deep generative models-based anomaly detection for spacecraft control systems", "temporal": "2020"},
  {"subject": "yu gao", "predicate": "presented", "object": "fault detection and diagnosis for spacecraft using principal component analysis and support vector machines", "temporal": "2012"},
  {"subject": "qin zhang", "predicate": "published", "object": "probabilistic reasoning based on dynamic causality trees/diagrams", "temporal": "1994"},
  {"subject": "jakob runge", "predicate": "published", "object": "detecting and quantifying causal associations in large nonlinear time series datasets", "temporal": "2019"}
]
--------------------
4. Attempt

Unnamed: 0,subject,predicate,object,temporal,chunk
0,jung,published,deep generative models-based anomaly detection for spacecraft control systems,2020,62
1,yu gao,presented,fault detection and diagnosis for spacecraft using principal component analysis and support vector machines,2012,62
2,qin zhang,published,probabilistic reasoning based on dynamic causality trees/diagrams,1994,62
3,jakob runge,published,detecting and quantifying causal associations in large nonlinear time series datasets,2019,62


--------------------
--- Running Total Triples Extracted: 442 ---
--- Failed Chunks So Far: 0 ---

✅ Finished processing all chunks.

✅ No failed chunks!

🧠 LLM Output for Complex Triple Detection:
 Here are the complex triples from the given list:

```json
[
  {
    "subject": "carlo cena",
    "predicate": "and colleagues",
    "object": "submitted",
    "temporal": "2024-05-27 (v1)",
    "chunk": 1
  },
  {
    "subject": "carlo cena",
    "predicate": "and colleagues",
    "object": "last revised",
    "temporal": "2024-12-02 (v2)",
    "chunk": 1
  },
  {
    "subject": "fault detection methods",
    "predicate": "required",
    "object": "space sector",
    "chunk": 1
  },
  {
    "subject": "this paper",
    "predicate": "proposes",
    "object": "an AI-based fault detection methodology",
    "chunk": 2
  },
  {
    "subject": "our study",
    "predicate": "focuses",
    "object": "on the application of a PI Real NVP model for fault detection in space systems",
    "chunk": 2
  

Unnamed: 0,subject,predicate,object,temporal,chunk,chunk_id
0,carlo cena,and colleagues,submitted,2024-05-27 (v1),1,d546631a-7ef0-44af-aa4c-adb4e683d2e3
1,carlo cena,and colleagues,last revised,2024-12-02 (v2),1,3ee67765-81d6-4e74-a73f-76d145e9c273
2,fault detection methods,required,space sector,,1,f96d6724-b016-4a8d-aa9d-3c956d1e02e8
3,this paper,proposes,an AI-based fault detection methodology,,2,bc893975-69b1-414b-a802-72c72c4aa456
4,our study,focuses,on the application of a PI Real NVP model for fault detection in space systems,,2,dfbaaaec-c722-4eaa-9267-3caa5eabc24c
...,...,...,...,...,...,...
437,min-qiang xu,co-authored,fault detection and diagnosis,,61,ebe745b6-0a82-4b9e-a162-2bf75c8d3ab7
438,jung,published,deep generative models-based anomaly detection for spacecraft control systems,2020,62,315f30eb-c062-452a-b2d2-0848d952f49e
439,yu gao,presented,fault detection and diagnosis for spacecraft using principal component analysis and support vector machines,2012,62,ea0c1246-e0f0-4343-b01e-9affddfa0812
440,qin zhang,published,probabilistic reasoning based on dynamic causality trees/diagrams,1994,62,26c4ca14-aab7-4988-9e11-12b629b7b8d5


-------------------------
Starting normalization and re-duplication of 442 triples
Processing triples for normalization (showing first 5 examples):

--- Example 1 ---
Original Triple (Chunk 1): {'subject': 'carlo cena', 'predicate': 'and colleagues', 'object': 'submitted', 'temporal': '2024-05-27 (v1)', 'chunk': 1, 'chunk_id': 'd546631a-7ef0-44af-aa4c-adb4e683d2e3'}
Normalized: SUB='carlo cena', PRED='and colleagues', OBJ='submitted'
Status: Kept (New Unique Triple)

--- Example 2 ---
Original Triple (Chunk 1): {'subject': 'carlo cena', 'predicate': 'and colleagues', 'object': 'last revised', 'temporal': '2024-12-02 (v2)', 'chunk': 1, 'chunk_id': '3ee67765-81d6-4e74-a73f-76d145e9c273'}
Normalized: SUB='carlo cena', PRED='and colleagues', OBJ='last revised'
Status: Kept (New Unique Triple)

--- Example 3 ---
Original Triple (Chunk 1): {'subject': 'fault detection methods', 'predicate': 'required', 'object': 'space sector', 'chunk': 1, 'chunk_id': 'f96d6724-b016-4a8d-aa9d-3c956d1e02e8'}


Unnamed: 0,subject,predicate,object,source_chunk
0,carlo cena,and colleagues,submitted,1
1,carlo cena,and colleagues,last revised,1
2,fault detection methods,required,space sector,1
3,this paper,proposes,an ai-based fault detection methodology,2
4,our study,focuses,on the application of a pi real nvp model for fault detection in space systems,2
...,...,...,...,...
435,min-qiang xu,co-authored,fault detection and diagnosis,61
436,jung,published,deep generative models-based anomaly detection for spacecraft control systems,62
437,yu gao,presented,fault detection and diagnosis for spacecraft using principal component analysis and support vector machines,62
438,qin zhang,published,probabilistic reasoning based on dynamic causality trees/diagrams,62


-------------------------
Initialized an empty NetworkX DiGraph.
--- Initial Graph Info ---
Type: DiGraph
Number of nodes: 0
Number of edges: 0
-------------------------
Adding triples to the NetworkX graph...

--- Graph Info after adding Triple #5 --- (our study -> on the application of a pi real nvp model for fault detection in space systems)
Type: DiGraph
Number of nodes: 9
Number of edges: 5

--- Graph Info after adding Triple #10 --- (physics-informed loss in ai models -> competitive advantage for space needs)
Type: DiGraph
Number of nodes: 17
Number of edges: 10

--- Graph Info after adding Triple #15 --- (arxivlabs -> experimental projects)
Type: DiGraph
Number of nodes: 23
Number of edges: 15

--- Graph Info after adding Triple #20 --- (arxiv -> values)
Type: DiGraph
Number of nodes: 29
Number of edges: 20

--- Graph Info after adding Triple #25 --- (ole j. mengshoel -> ole.j.mengshoel@nasa.gov)
Type: DiGraph
Number of nodes: 35
Number of edges: 25

--- Graph Info after adding 

Unnamed: 0,Node Sample
0,carlo cena
1,submitted
2,last revised
3,fault detection methods
4,space sector
5,this paper
6,an ai-based fault detection methodology
7,our study
8,on the application of a pi real nvp model for fault detection in space systems
9,our physics-informed approach



--- Sample Edges (First 10 with Labels) ---


Unnamed: 0,Source,Target,Label
0,carlo cena,submitted,and colleagues
1,carlo cena,last revised,and colleagues
2,carlo cena,physics-informed real nvp for satellite power system fault detection,co-authored
3,fault detection methods,space sector,required
4,this paper,an ai-based fault detection methodology,proposes
5,this paper,electrical power system fault diagnosis,discusses
6,this paper,diagnostic capability for electrical power system faults,develops
7,our study,on the application of a pi real nvp model for fault detection in space systems,focuses
8,our physics-informed approach,existing methods of fault detection,outperforms
9,the use of a physics-informed loss,in addressing satellite eps sub-system faults,has a competitive advantage


-------------------------
Preparing interactive visualization...
Graph seems valid for visualization (645 nodes, 438 edges).
Converting nodes...
Converted 645 nodes.
Converting edges...
Converted 438 edges.

--- Sample Cytoscape Node Data (First 2) ---
[
  {
    "data": {
      "id": "carlo cena",
      "label": "carlo\ncena",
      "degree": 3,
      "size": 25.0,
      "tooltip_text": "Entity: carlo cena\nDegree: 3"
    }
  },
  {
    "data": {
      "id": "submitted",
      "label": "submitted",
      "degree": 1,
      "size": 18.333333333333332,
      "tooltip_text": "Entity: submitted\nDegree: 1"
    }
  }
]

--- Sample Cytoscape Edge Data (First 2) ---
[
  {
    "data": {
      "id": "edge_0",
      "source": "carlo cena",
      "target": "submitted",
      "label": "and colleagues",
      "tooltip_text": "Relationship: and colleagues"
    }
  },
  {
    "data": {
      "id": "edge_1",
      "source": "carlo cena",
      "target": "last revised",
      "label": "and colleagues",

CytoscapeWidget(cytoscape_layout={'name': 'cose', 'nodeRepulsion': 4000, 'nodeOverlap': 40, 'idealEdgeLength':…


-------------------------
End of Visualization Step.
-------------------------
Number of triples in graph: 438
Subject: http://kg.local/deep%20learning%20neural%20network, Predicate: http://kg.local/classify Object http://kg.local/type%20of%20fault
Subject: http://kg.local/the%20mnemonic%20with%20its%20top-1%20most%20related%20to%20%CB%86mas%20the%20new%20candidate%20root%20cause, Predicate: http://kg.local/repeats Object http://kg.local/kalman%20check%20criterion
Subject: http://kg.local/toy%20data%20set, Predicate: http://kg.local/consisted%20of Object http://kg.local/11%20flights%20of%20basic%20spacecraft%20housekeeping%20and%20physics%20data
Subject: http://kg.local/hk, Predicate: http://kg.local/is Object http://kg.local/measurement%20model%20matrix
Subject: http://kg.local/the%20algorithm%202%20walkdown%20diagnosis%20algorithm, Predicate: http://kg.local/uses Object http://kg.local/a%20causality%20matrix
Subject: http://kg.local/ablation%20testing, Predicate: http://kg.local/per