In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

## Preparing the data for processing

### Pipeline 1: Collecting and preparing the documents

In [None]:
# Name of the graph; reused as the prefix of the URL list file name
graph_name = "Marketing"

# Text file that holds the source URLs
ufilename = f"{graph_name}_urls.txt"

import requests
from bs4 import BeautifulSoup
import re

# Load the URL list, one URL per line. Blank lines (for example a trailing
# empty line at the end of the file) are dropped so that no empty URL is
# handed to the downloader later on.
with open(ufilename, 'r', encoding='utf-8') as file:
    stripped_lines = (line.strip() for line in file)
    urls = [candidate for candidate in stripped_lines if candidate]

print("Read URLs:")
for url in urls:
    print(url)

In [None]:
def clean_text(content):
    """Return ``content`` with reference markers and punctuation removed.

    Two substitutions are applied in order, each deleting its matches:
    bracketed numbers such as ``[12]``, then every character that is not a
    word character, whitespace or a period.
    """
    patterns = (
        r'\[\d+\]',    # numeric reference markers, e.g. [3]
        r'[^\w\s\.]',  # punctuation, keeping periods
    )
    for pattern in patterns:
        content = re.sub(pattern, '', content)
    return content

def safe_file_name(s):
    """Turn ``s`` into a string restricted to file-name-friendly characters.

    Spaces become underscores; after that only letters, digits, periods,
    underscores and hyphens are kept, and every other character is dropped.
    """
    extra_allowed = {'.', '_', '-'}
    kept = []
    for ch in s.replace(' ', '_'):
        if ch.isalpha() or ch.isdigit() or ch in extra_allowed:
            kept.append(ch)
    return ''.join(kept)

def file_exists_and_has_content(file_path):
    """Return True if ``file_path`` is an existing, non-empty regular file.

    The check uses the file size instead of reading and decoding the file,
    so it does not fail on content that is not valid UTF-8, and a directory
    path yields False instead of raising.

    Args:
        file_path: Path to check.

    Returns:
        True when the path is a regular file whose size is greater than
        zero bytes, otherwise False.
    """
    try:
        return os.path.isfile(file_path) and os.path.getsize(file_path) > 0
    except OSError:
        # The file disappeared or became inaccessible between the two checks.
        return False

def fetch_and_clean(url, timeout=10):
    """Download a page and return the cleaned text of its main content.

    The main content is the first ``div`` with class ``mw-parser-output``,
    falling back to the ``div`` with id ``content``. Trailing sections such
    as References are removed before the text is extracted and passed
    through ``clean_text``.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            server that never answers cannot block the caller forever.

    Returns:
        The cleaned text, or None if the request failed (the error is
        printed) or no content container was found.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.content, 'html.parser')

        # Prioritise "mw-parser-output" but fall back to "content" node if not found
        content = soup.find('div', {'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
        if content is None:
            return None

        # Remove specific unwanted sections, including nested ones
        for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
            section = content.find('span', id=section_title)
            while section:
                # NOTE: every following sibling of the heading is removed, i.e.
                # everything after the heading at that nesting level, not only
                # the section that the heading introduces.
                for sib in section.parent.find_next_siblings():
                    sib.decompose()
                section.parent.decompose()  # Remove the heading element itself
                # Look again in case another span with the same id remains
                section = content.find('span', id=section_title)

        # Extract and clean text
        text = content.get_text(separator=' ', strip=True)  # Use space as separator and strip whitespace
        text = clean_text(text)
        return text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they end up here too
        print(f"Error fetching {url}: {e}")
        return None  # Return None if there's an error
    
# Directory where the cleaned article text files are written
output_dir = './data/'
os.makedirs(output_dir, exist_ok=True)

# Set to True to download the articles. URLs whose output file already exists
# with content are skipped, so delete a file to have it fetched again.
reload = False
if reload:
    for url in urls:
        # The last path segment is the article name. Strip a trailing slash
        # first so that ".../Marketing/" does not produce an empty name.
        article_name = url.rstrip('/').split('/')[-1].replace('.html', '')
        filename = os.path.join(output_dir, f"{safe_file_name(article_name)}.txt")

        if file_exists_and_has_content(filename):
            print(f"Existed {filename}")
            continue

        clean_article_text = fetch_and_clean(url)
        if clean_article_text:  # None or empty text: nothing to save
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(clean_article_text)
            print(f"Saved {filename}")
        else:
            # Make skipped URLs visible instead of dropping them silently
            print(f"Skipped {url}")

In [None]:
from llama_index.core import SimpleDirectoryReader

# Read the files under ./data into a list of documents and show the first one
data_reader = SimpleDirectoryReader("./data")
documents = data_reader.load_data()
print(documents[0])

## Pipeline 2: Creating and populating the Deep Lake Vector Store

In [None]:
# Setup embedding model
from llama_index.core import Settings


# # MistralAI embedding
# # rate limit of 1 request per second, set a large batch size to avoid rate limiting...
# from llama_index.embeddings.mistralai import MistralAIEmbedding
# embedding_model_name = "mistral-embed"
# embed_model = MistralAIEmbedding(
#     model_name = embedding_model_name,
#     api_key = os.getenv("MISTRAL_API_KEY"),
#     embed_batch_size = 30
# )


# Embeddings come from a local model served by LM Studio.
# The API key is a placeholder (LM Studio doesn't validate it).
from llama_index.embeddings.openai import OpenAIEmbedding

lm_studio_embedding_options = {
    "api_base": os.getenv("LM_STUDIO_API_BASE"),
    "api_key": "whatever-is-in-lmstudio",
    "model_name": os.getenv("LM_STUDIO_EMBEDDING_MODEL"),
    "embed_batch_size": 10,
}
embed_model = OpenAIEmbedding(**lm_studio_embedding_options)

# Register the model globally, then embed a sample sentence as a smoke test
Settings.embed_model = embed_model
embed = embed_model.get_text_embedding("The quick brown fox jumps over the lazy dog.")
print(embed[:5])  # Should print a list of floats

In [None]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

import traceback

# Path for vector store and dataset
# os.environ['ACTIVELOOP_TOKEN'] = os.getenv('ACTIVELOOP_TOKEN')
# database = "hub://honglin/marketing01" # hosted deeplake database
database = "./dataset/marketing01" # local storage
vector_store_path = database
dataset_path = database

# Set to True to build the index over the documents; the same flag is passed
# as `overwrite`, so an existing dataset at dataset_path is overwritten.
ow = False

if ow:
    try:
        vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=ow)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            show_progress=True,
            embed_model=Settings.embed_model,
        )
    except Exception as e:
        # Best effort: report the failure without stopping the notebook.
        print(f"An error occurred: {e}")
        print(f"Error type: {type(e)}")
        # format_exc() gives the readable stack trace; printing
        # e.__traceback__ would only show the object's repr.
        print(f"Error traceback:\n{traceback.format_exc()}")

In [None]:
import deeplake
# Open the dataset stored at dataset_path and show its summary
ds = deeplake.load(dataset_path)
ds.summary()

In [None]:
import pandas as pd
import numpy as np

# Collect one column per dataset tensor, keyed by tensor name
data = {}

for tensor_name in ds.tensors:
    tensor_data = ds[tensor_name].numpy()

    if tensor_data.ndim > 1:
        # Multi-dimensional tensor: flatten each entry into a plain list
        column = [np.array(e).flatten().tolist() for e in tensor_data]
    elif tensor_name == "text":
        # 1D text tensor: decode each entry's bytes, empty string for falsy entries
        column = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
    else:
        # Any other 1D tensor converts directly to a list
        column = tensor_data.tolist()

    data[tensor_name] = column

# Assemble the columns into a Pandas DataFrame
df = pd.DataFrame(data)

In [None]:
# Print one row of the DataFrame, field by field
def display_record(record_number):
    """Print the ID, metadata, text and embedding of row ``record_number`` of ``df``.

    Each field is looked up by its lower-case column name ("id", "metadata",
    "text", "embedding"); a missing column is shown as "N/A". Metadata that
    is a list is printed item by item as ``key: value`` lines, anything else
    is printed as is.
    """
    row = df.iloc[record_number]
    columns_by_label = (
        ("ID", "id"),
        ("Metadata", "metadata"),
        ("Text", "text"),
        ("Embedding", "embedding"),
    )
    fields = {label: row.get(column, "N/A") for label, column in columns_by_label}

    for label, value in fields.items():
        print(f"{label}:")
        if label == "Metadata" and isinstance(value, list):
            # Structured metadata: one "key: value" line per entry,
            # with a blank line after each item
            for entry in value:
                for key, val in entry.items():
                    print(f"{key}: {val}")
                print()
        else:
            print(value)
        print()

# Example usage: print the row of df at position `rec`
rec = 0  # Replace with the desired record number
display_record(rec)

## Update original documents

In [None]:
from llama_index.core import Document

# Store the 'text' column as strings
df['text'] = df['text'].astype(str)
# Build one Document per row, using the stringified 'id' value as its doc_id
documents = [
    Document(text=text_value, doc_id=str(id_value))
    for id_value, text_value in zip(df['id'], df['text'])
]

## Knowledge Graph Index-based RAG

### Generating the Knowledge Graph Index

In [None]:
from llama_index.core import KnowledgeGraphIndex, Settings

# Using local models served by LM Studio
# from llama_index.llms.openai import OpenAI
# Settings.llm = OpenAI(
#     api_key=os.getenv("MISTRAL_API_KEY"),
#     api_base=os.getenv("LM_STUDIO_API_BASE"),
#     model_name= os.getenv("LM_STUDIO_LLM_MODEL"),
# )


# Use Zhipu's free model glm-4-flash as the LLM
from llama_index.llms.zhipuai import ZhipuAI
Settings.llm = ZhipuAI(
    api_key=os.getenv("ZHIPU_API_KEY"),
    model="glm-4-flash"
)

import time
# Start the timer. perf_counter() is the clock intended for measuring
# intervals; unlike time.time() it is not affected by system clock changes.
start_time = time.perf_counter()

# Graph index with embeddings
# Placeholder value, not a real key. Presumably set so that OpenAI-based
# defaults do not fail on a missing key; TODO confirm it is still needed.
os.environ["OPENAI_API_KEY"] = "sk-anything"
graph_index = KnowledgeGraphIndex.from_documents(
    documents,
    max_triplets_per_chunk=2,
    include_embeddings=True,
    show_progress=True
)

# Stop the timer
end_time = time.perf_counter()

# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Index creation time: {elapsed_time:.4f} seconds")