## Setting up the environment

In [51]:
import os
from dotenv import load_dotenv
load_dotenv()

# Setup embedding model
from llama_index.core import Settings
from llama_index.core.callbacks import CallbackManager

# Turn off the verbose callback handlers
Settings.callback_manager = CallbackManager([])

# Use local embedding models served by LM Studio
# Use fake API key (LM Studio doesn't validate it)
from llama_index.embeddings.openai import OpenAIEmbedding
embed_model = OpenAIEmbedding(
    api_base = os.getenv("LM_STUDIO_API_BASE"),
    api_key = "whatever-it-is",
    model_name = os.getenv("LM_STUDIO_EMBEDDING_MODEL"),
    embed_batch_size = 50
)

# Embedding model verification
Settings.embed_model = embed_model
embed = embed_model.get_text_embedding("The quick brown fox jumps over the lazy dog.")
print(embed[:5])  # Should print a list of floats

[0.07404854893684387, 0.07512331753969193, -0.0032140822149813175, -0.022604335099458694, -0.019349098205566406]


In [52]:
# 使用智谱免费模型 glm-4-flash，提取节点关系
from llama_index.llms.zhipuai import ZhipuAI
llm_extraction = ZhipuAI(
    api_key=os.getenv("ZHIPU_API_KEY"),
    model="glm-4-flash"
)
Settings.llm = llm_extraction

# Verify the LLM
print(llm_extraction.complete("\nBriefly introduce yourself in 20 words."))

I am an AI language model, here to assist with information and answer questions.


## Pipeline 1: Collecting & preparing the documents

In [53]:
# Names for file storage
topic_name = "Solar_cell"
urls_file_name = topic_name + "_urls.txt"

import requests
from bs4 import BeautifulSoup
import re

with open(urls_file_name, 'r') as file:
    urls = [line.strip() for line in file]

print("Read URLs:")
for url in urls:
    print(url)

Read URLs:
https://en.wikipedia.org/wiki/Solar_cell
https://en.wikipedia.org/wiki/1973_oil_crisis
https://en.wikipedia.org/wiki/ARCO
https://en.wikipedia.org/wiki/Absorption_(electromagnetic_radiation)
https://en.wikipedia.org/wiki/Acrylate_polymer
https://en.wikipedia.org/wiki/Albedo
https://en.wikipedia.org/wiki/Albert_Einstein
https://en.wikipedia.org/wiki/Aleksandr_Stoletov
https://en.wikipedia.org/wiki/Alkaline_battery
https://en.wikipedia.org/wiki/Alternating_current
https://en.wikipedia.org/wiki/Aluminium%E2%80%93air_battery
https://en.wikipedia.org/wiki/American_Solar_Challenge
https://en.wikipedia.org/wiki/Amorphous_silicon
https://en.wikipedia.org/wiki/Ion
https://en.wikipedia.org/wiki/Anita_Ho-Baillie
https://en.wikipedia.org/wiki/Anode
https://en.wikipedia.org/wiki/Anomalous_photovoltaic_effect
https://en.wikipedia.org/wiki/Anti-reflective_coating
https://en.wikipedia.org/wiki/Antonio_Luque
https://en.wikipedia.org/wiki/ArXiv
https://en.wikipedia.org/wiki/Atmospheric_pressu

In [54]:
def clean_text(content):
    # Remove references and unwanted characters
    content = re.sub(r'\[\d+\]', '', content)   # Remove references
    content = re.sub(r'[^\w\s\.]', '', content)  # Remove punctuation (except periods)
    return content

def safe_file_name(s):
    # Replace spaces with underscores
    s = s.replace(' ', '_')
    
    # Remove any characters that are not allowed in file names
    safe_str = ''.join(c for c in s if c.isalpha() or c.isdigit() or c in [' ', '.', '_', '-'])
    
    return safe_str

def file_exists_and_has_content(file_path):
    # Check if the file exists
    if not os.path.exists(file_path):
        return False
    
    # Check if the file is not empty
    with open(file_path, 'r', encoding='utf-8') as file:
        first_char = file.read(1)
        if first_char:
            return True
        else:
            return False

def fetch_and_clean(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.content, 'html.parser')

        # Prioritise "mw-parser-output" but fall back to "content" node if not found
        content = soup.find('div', {'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
        if content is None:
            return None
        
        # Remove specific unwanted sections, including nested ones
        for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
            section = content.find('span', id=section_title)
            while section:
                for sib in section.parent.find_next_siblings():
                    sib.decompose()  # Remove the section and its siblings
                section.parent.decompose()  # Remove the section itself
                section = content.find('span', id=section_title)

        # Extract and clean text
        text = content.get_text(separator=' ', strip=True)  # Use space as separator and strip whitespace
        text = clean_text(text)
        return text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None  # Return None if there's an error
    
# Directory to store the output file
output_dir = './documents/'
os.makedirs(output_dir, exist_ok=True)

from tqdm import tqdm

# Processing the URLs and skipping invalid ones
reload = True  # Set to True to reprocess all URLs
if reload==True:
    for url in tqdm(urls):
        article_name = url.split('/')[-1].replace('.html', '')
        filename = os.path.join(output_dir, f"{safe_file_name(article_name)}.txt")

        if file_exists_and_has_content(filename) is True:
            continue
        else:
            clean_article_text = fetch_and_clean(url)
            if clean_article_text:  # Only write if text is not None
                with open(filename, 'w', encoding='utf-8') as file:
                    file.write(clean_article_text)

100%|██████████| 51/51 [00:21<00:00,  2.38it/s]


In [55]:
from llama_index.core import SimpleDirectoryReader

# Load documents
documents = SimpleDirectoryReader("./documents").load_data()
print(documents[0])

Doc ID: 618143ae-6e87-48ef-939e-356255120d2f
Text: American oil company For other uses see Arco disambiguation .
Atlantic Richfield Company Logo used from 19702000 still used as a
secondary logo Trade name ARCO Company type Subsidiary Industry
Petroleum Predecessor Atlantic Petroleum Richfield Oil Corporation
Founded January 3 1966  59 years ago  19660103  Defunct April 18 2000
25 years ago  20...


## Pipeline 2: Creating vector store

In [56]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

# Path for vector store and dataset
# os.environ['ACTIVELOOP_TOKEN'] = os.getenv('ACTIVELOOP_TOKEN')
# database = "hub://honglin/solar_cell_01" # hosted database by active loop
database = "./dataset/solar_cell_01" # local storage
vector_store_path = database
dataset_path = database

# Create an index over the documents
# Overwrites the existing dataset if True
ow = True

if ow==True:
    try:
        vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=ow)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(documents, storage_context, show_progress=True, embed_model=Settings.embed_model)
    except Exception as e:
        print(f"An error occurred: {e}")
        print(f"Error type: {type(e)}")
        print(f"Error traceback: {e.__traceback__}")



Parsing nodes:   0%|          | 0/43 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/323 [00:00<?, ?it/s]

Uploading data to deeplake dataset.


100%|██████████| 323/323 [00:00<00:00, 1157.40it/s]

Dataset(path='./dataset/solar_cell_01', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (323, 1)     str     None   
 metadata     json      (323, 1)     str     None   
 embedding  embedding  (323, 768)  float32   None   
    id        text      (323, 1)     str     None   





In [57]:
import deeplake
ds = deeplake.load(dataset_path)
ds.summary()

./dataset/solar_cell_01 loaded successfully.

Dataset(path='./dataset/solar_cell_01', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (323, 768)  float32   None   
    id        text      (323, 1)     str     None   
 metadata     json      (323, 1)     str     None   
   text       text      (323, 1)     str     None   




In [58]:
import pandas as pd
import numpy as np

# Create a dictionary to hold the data
data = {}

# Iterate through the tensors in the dataset
for tensor_name in ds.tensors:
    tensor_data = ds[tensor_name].numpy()

    # Check if the tensor is multi-dimensional
    if tensor_data.ndim > 1:
        # Flatten multi-dimensional tensors
        data[tensor_name] = [np.array(e).flatten().tolist() for e in tensor_data]
    else:
        # Convert 1D tensors directly to lists and decode text
        if tensor_name == "text":
            data[tensor_name] = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
        else:
            data[tensor_name] = tensor_data.tolist()

# Create a Pandas DataFrame from the dictionary
df = pd.DataFrame(data)

In [59]:
# Function to display a selected record
def display_record(record_number):
    record = df.iloc[record_number]
    display_data = {
        "ID": record.get("id", "N/A"),
        "Metadata": record.get("metadata", "N/A"),
        "Text": record.get("text", "N/A"),
        "Embedding": record.get("embedding", "N/A")
    }

    # Print the ID
    print("ID:")
    print(display_data["ID"])
    print()

    # Print the metadata in a structured format
    print("Metadata:")
    metadata = display_data["Metadata"]
    if isinstance(metadata, list):
        for item in metadata:
            for key, value in item.items():
                print(f"{key}: {value}")
            print()
    else:
        print(metadata)
    print()

    # Print the text
    print("Text:")
    print(display_data["Text"])
    print()

    # Print the embedding
    print("Embedding:")
    print(display_data["Embedding"])
    print()

# Example usage
rec = 0  # Replace with the desired record number
display_record(rec)

ID:
['67490a92-0d7f-4579-9c83-0af631af6d6a']

Metadata:
file_path: /home/ongin/github/RAG-pR/RAG-Wikipedia/documents/ARCO.txt
file_name: ARCO.txt
file_type: text/plain
file_size: 28010
creation_date: 2025-06-11
last_modified_date: 2025-06-11
_node_content: {"id_": "67490a92-0d7f-4579-9c83-0af631af6d6a", "embedding": null, "metadata": {"file_path": "/home/ongin/github/RAG-pR/RAG-Wikipedia/documents/ARCO.txt", "file_name": "ARCO.txt", "file_type": "text/plain", "file_size": 28010, "creation_date": "2025-06-11", "last_modified_date": "2025-06-11"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "618143ae-6e87-48ef-939e-356255120d2f", "node_type": "4", "metadata": {"file_path": "/home/ongin/github/RAG-pR/RAG-Wikipedia/documents/ARCO.txt", "

## Pipeline 3: Knowledge graph index-based RAG

In [60]:
from llama_index.core import Document

# Ensure 'text' column is of type string
df['text'] = df['text'].astype(str)
# Create documents with IDs
documents = [Document(text=row['text'], doc_id=str(row['id'])) for _, row in df.iterrows()]

### Generating the graph index

In [61]:
from llama_index.core import KnowledgeGraphIndex
import time
from typing import List

# Start the timer
start_time = time.time()

class RobustKnowledgeGraphIndex(KnowledgeGraphIndex):
    def _extract_triplets(self, text: str) -> List[tuple]:
        """Extract triplets with error handling to ignore failures."""
        try:
            # Call the parent class's triplet extraction method
            return super()._extract_triplets(text)
        except Exception as e:
            # Log the error (optional) and return an empty list to continue processing
            print(f"Error extracting triplets for text chunk: {e}")
            return []

# Graph index with embeddings
graph_index = RobustKnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=5,
    include_embeddings=True,
    show_progress=False,
)

# Stop the timer
end_time = time.time()

# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Index creation time: {elapsed_time:.4f} seconds")

Error extracting triplets for text chunk: Error code: 400, with error text {"contentFilter":[{"level":1,"role":"user"}],"error":{"code":"1301","message":"系统检测到输入或生成内容可能包含不安全或敏感内容，请您避免输入易产生敏感内容的提示语，感谢您的配合。"}}
Index creation time: 1008.9247 seconds


In [65]:
print(type(graph_index))

# Save the graph index to a file
graph_saving_path = "./dataset/graph_index"
graph_index.storage_context.persist(persist_dir=graph_saving_path)

<class '__main__.RobustKnowledgeGraphIndex'>


In [66]:
# similarity_top_k
k = 3
# temperature
temp = 0.1
# number_output
mt = 1024
graph_query_engine = graph_index.as_query_engine(similarity_top_k=k, temperature=temp, num_output=mt)

### Displaying the graph

In [75]:
## create graph
from pyvis.network import Network

g = graph_index.get_networkx_graph()
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(g)

# Set node and edge properties: colors and sizes
for node in net.nodes:
    node['color'] = 'lightgray'
    node['size'] = 10

for edge in net.edges:
    edge['color'] = 'black'
    edge['width'] = 1

In [77]:


fgraph="Knowledge_graph_"+ topic_name + ".html"
net.write_html(fgraph)
print(fgraph)



Knowledge_graph_Solar_cell.html


## Interacting with the Knowledge graph index

In [5]:
import time
import textwrap

from llama_index.core import StorageContext, load_graph_from_storage

graph_saving_path = "./dataset/graph_index"
storage_context = StorageContext.from_defaults(persist_dir=graph_saving_path)
graph_index = load_graph_from_storage(storage_context, 0)

def execute_query(user_input, k=3, temp=0.1, mt=1024):

    # Start the timer
    start_time = time.time()

    # Execute the query with additional parameters
    response = graph_query_engine.query(user_input)

    # Stop the timer
    end_time = time.time()

    # Calculate and print the execution time
    elapsed_time = end_time - start_time
    print(f"Query execution time: {elapsed_time:.4f} seconds")

    # Print the response, wrapped to 100 characters per line
    print(textwrap.fill(str(response), 100))
    return response

ValueError: 
******
Could not load OpenAI model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

To disable the LLM entirely, set llm=None.
******

In [81]:
user_query="What is the primary goal of bifacial panels? And what solar cells can be used for this type of panel?"

In [82]:
import time
import textwrap
import sys
import io
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 100))



Query execution time: 8.9139 seconds
The primary goal of bifacial panels is to maximize the energy output by utilizing light reflected
from the ground or surrounding surfaces, effectively capturing energy from both the front and back
sides of the solar cell. This is particularly beneficial for installations on roofs or in
agricultural settings where the reflection of sunlight can enhance overall efficiency. Solar cells
that can be used for this type of panel include those with Passivated Emitter Rear Contact (PERC),
Passivated Emitter Rear Locallydiffused (PERL), Passivated Emitter Rear Totally diffused (PERT),
Heterojunction with Intrinsic Thinlayer (HIT), Interdigitated Back Contact (IBC), and HIT, which are
designed to be bifacial and can generate electricity from both the front and rear surfaces.


### Similarity re-rank

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B')

def calculate_cosine_similarity_with_embeddings(text1, text2):
    embeddings1 = model.encode(text1)
    embeddings2 = model.encode(text2)
    similarity = cosine_similarity([embeddings1], [embeddings2])
    return similarity[0][0]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.71k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [2]:
import time
import textwrap
import sys
import io

user_query="Which experts are often associated with Solar Cell theory?"
# Start the timer
start_time = time.time()
# Capture the output
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
response = execute_query(user_query)
# Restore stdout
sys.stdout = old_stdout
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(textwrap.fill(str(response), 100))

NameError: name 'execute_query' is not defined