# 1. Installation of Packages

In [1]:
!pip install -qq kuzu==0.0.6 llama-index==0.9.48 pypdf pyvis  html2text umap-learn plotly

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m26.1 MB

# 2. Import Required Packages


In [26]:
# Regular imports
import os

import kuzu                           # Graph database library
import numpy as np
import pandas as pd

from google.colab import userdata    # Importing specific functionality from google.colab

# Llama index imports
from llama_index import (
    ServiceContext,
    KnowledgeGraphIndex              # Index for knowledge graphs
)
from llama_index.graph_stores import KuzuGraphStore
from llama_index.llms import OpenAI
from llama_index.readers import (
    SimpleWebPageReader,             # Reader for simple web pages
)
from llama_index.storage.storage_context import StorageContext

# Visualization libraries
import plotly.express as px
import umap
from IPython.display import HTML, Markdown, display   # Markdown/display render the query answers
from pyvis.network import Network


# 3. Configure the Environment


In [3]:
# Read the OpenAI key from the Colab secret named "oai" and expose it through the
# environment variable OPENAI_API_KEY for the rest of the notebook.
os.environ["OPENAI_API_KEY"] = userdata.get("oai")

# 4. Prepare the Kùzu Database
Kùzu is an embedded graph database management system designed for high-speed query processing and scalability.

## Create the database and initialise the graph store


In [4]:
# "my_kg" is the database path handed to Kùzu; the resulting handle backs the
# LlamaIndex graph store used when building the knowledge graph.
db = kuzu.Database("my_kg")
graph_store = KuzuGraphStore(database=db)

# 5. Building the Knowledge Graph
We will build the knowledge graph (KG) from a set of websites.

The websites can be replaced with other documents as well.

In [5]:
# URLs to ingest; the reader is configured with html_to_text=True.
websites = [
    "https://neurons-lab.com/",
]
page_reader = SimpleWebPageReader(html_to_text=True)
documents = page_reader.load_data(websites)

## Define LLM


In [6]:
# gpt-3.5-turbo at temperature 0, authenticated with the Colab secret "oai".
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    api_key=userdata.get('oai'),
)

# Bundle the LLM with a 512 chunk size for the index build.
service_context = ServiceContext.from_defaults(chunk_size=512, llm=llm)

## Autogenerate Triplets

1. Utilize LLM to efficiently load data extracted from the website.
2. Employ chunking techniques to organize and structure the loaded data, optimizing it for subsequent processing stages.
3. Leverage LLM to detect and classify entities residing within the nodes of the data.
4. Thoroughly analyze the interconnectedness between these detected entities, unveiling intricate relationships and associations.


## Options to play with
`max_triplets_per_chunk` = 2

In [7]:
# Storage context wired to the Kùzu-backed graph store.
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# NOTE: can take a while!
index = KnowledgeGraphIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    service_context=service_context,
    max_triplets_per_chunk=2,   # the "option to play with"
    include_embeddings=True,
    show_progress=True,
)


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Processing nodes:   0%|          | 0/14 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

# 6. Querying the Knowledge Graph

`tree_summarize`= Concatenate the chunks as much as possible to fit within the context window using the summary_template prompt, and split them if needed (again with a TokenTextSplitter and some text overlap).

In [14]:
# Natural-language question passed to each query engine in this section.
question = "what does neurons labs does best?"

In [15]:
# Query engine using the "tree_summarize" response mode with include_text=True.
query_engine = index.as_query_engine(response_mode="tree_summarize", include_text=True)
response1 = query_engine.query(question)

# Render the answer in bold in the cell output.
answer_markup = f"<b>{response1}</b>"
display(Markdown(answer_markup))

<b>Neurons Lab specializes in developing solutions and is known for its expertise in AI, advanced science, and business. They work with a Talent Pool of over 500 highly skilled individuals, including PhD-level applied scientists, recognized DS/ML/AI Engineers, and MLOps specialists, to quickly assemble teams for developing solutions in various industries such as HealthTech, CleanTech, and RetailTech. Additionally, Neurons Lab aids companies in obtaining grants and other assistance through their advanced-tier AWS partnership and access to a vast VC network.</b>

## Query with embeddings
Query using top 5 triplets plus keywords (duplicate triplets are removed)

![graph](https://raw.githubusercontent.com/goodrahstar/rag_llm_kg/main/img/retrive.png)

In [16]:
# Same response mode as before, now with embedding_mode="hybrid" and similarity_top_k=5.
hybrid_summarize_options = {
    "include_text": True,
    "response_mode": "tree_summarize",
    "embedding_mode": "hybrid",
    "similarity_top_k": 5,
}
query_engine = index.as_query_engine(**hybrid_summarize_options)

response2 = query_engine.query(question)
display(Markdown(f"<b>{response2}</b>"))


<b>Neurons Lab specializes in developing innovative solutions and collaborates with companies across various industries. They offer handbooks of best practices to aid in the implementation of innovation effectively.</b>

In [17]:
# Hybrid embedding query again, but with the "refine" response mode.
hybrid_refine_options = {
    "include_text": True,
    "response_mode": "refine",
    "embedding_mode": "hybrid",
    "similarity_top_k": 5,
}
query_engine = index.as_query_engine(**hybrid_refine_options)

response3 = query_engine.query(question)
display(Markdown(f"<b>{response3}</b>"))


<b>Neurons Lab excels in implementing solutions quickly and efficiently, leveraging expertise in AI, advanced science, and business to help businesses grow and attract clients.</b>

# 7. Knowledge Graph Visualization

In [21]:

# Path of the HTML file the interactive graph is written to
# (the variable name says "directory", but this is a file path).
graph_output_directory = './plot1.html'

# Export the knowledge graph held by the index as a networkx graph.
g = index.get_networkx_graph()

# Configure the pyvis canvas; JS/CSS assets are referenced remotely (cdn_resources="remote").
net = Network(
    notebook=False,
    cdn_resources="remote",
    height="500px",
    width="60%",
    select_menu=True,
    filter_menu=False,
)

# Load the networkx graph into the pyvis network.
net.from_nx(g)

# Lay the nodes out with the ForceAtlas2-based physics solver.
net.force_atlas_2based(central_gravity=0.015, gravity=-31)

# Only write the HTML file. `net.show(..., notebook=False)` would additionally try to
# open a web browser, which is unwanted here because the file is embedded below.
net.save_graph(graph_output_directory)

# Embed the generated HTML in the notebook (last expression of the cell is displayed).
HTML(filename=graph_output_directory)


./plot1.html


## Visualise the embeddings

In [22]:
def get_embeddings_df(index):
    """
    Extract the stored embeddings from the index as a pandas Series.

    The Series is built directly from the ``embedding_dict`` entry of
    ``index.index_struct.to_dict()`` instead of first aligning every field of
    the index struct in a temporary DataFrame. The result therefore no longer
    depends on the shape of unrelated fields.

    Args:
    - index: Object exposing ``index_struct.to_dict()`` whose result contains
      an ``embedding_dict`` mapping (label -> embedding vector).

    Returns:
    - embeddings_df: pandas Series named ``embedding_dict`` (object dtype),
      indexed by the keys of the mapping, with missing values dropped.
      Despite the historical name, this is a Series, not a DataFrame.

    Raises:
    - KeyError: if the index struct has no ``embedding_dict`` entry.
    """
    # Serialise the index struct and pick out the label -> vector mapping.
    index_struct_fields = index.index_struct.to_dict()
    label_to_vector = index_struct_fields['embedding_dict']

    # One row per label; object dtype keeps each vector as a single list-valued cell.
    embeddings_df = pd.Series(label_to_vector, name='embedding_dict', dtype=object)

    # Drop entries whose embedding is missing (None / NaN).
    return embeddings_df.dropna()


# Label -> embedding vector for the index; despite the name, this is a pandas Series.
embeddings_df = get_embeddings_df(index)

In [24]:
# Preview the first few entries rather than dumping the whole Series.
embeddings_df.head()

('Neurons-lab', 'Is', 'Ai solution development experts')                                 [0.001776020391844213, 0.0035660252906382084, ...
('Neurons-lab', 'Located in', 'Berkeley')                                                [-0.005942783784121275, 0.005329221952706575, ...
('Neurons-lab.com', 'Developing', 'Ai-driven medical transcription & billing system')    [-0.013013646006584167, 0.007769237272441387, ...
('Healthtech', 'Explore', 'Story')                                                       [0.013551237992942333, -0.010170500725507736, ...
('Neurons-lab.com', 'Is', 'Website')                                                     [-0.0051589952781796455, 0.008321189321577549,...
Name: embedding_dict, dtype: object

In [27]:

def visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=3):
    """
    Reduce embeddings with UMAP and show them in an interactive Plotly scatter plot.

    Args:
    - embedding_series: pandas Series whose values are equal-length numeric
      vectors; the Series index supplies the hover labels.
    - n_neighbors: forwarded to ``umap.UMAP``.
    - min_dist: forwarded to ``umap.UMAP``.
    - n_components: number of UMAP output dimensions; 3 gives a 3-D scatter
      plot, 2 gives a 2-D scatter plot.

    Returns:
    - None. The figure is displayed via ``fig.show()``.

    Raises:
    - ValueError: if ``n_components`` is not 2 or 3, or if ``embedding_series`` is empty.
    """
    # Validate before running UMAP so bad arguments fail fast with a clear message.
    if n_components not in (2, 3):
        raise ValueError(
            f"n_components must be 2 or 3 to be plotted, got {n_components!r}"
        )
    if len(embedding_series) == 0:
        raise ValueError("embedding_series is empty; there is nothing to visualize")

    # Stack the per-label vectors into an (n_samples, n_dims) matrix. Going through
    # tolist() avoids label-vs-position lookups on the Series.
    embedding_matrix = np.asarray(embedding_series.tolist(), dtype=float)

    # Perform UMAP dimensionality reduction (fixed seed for reproducible layouts).
    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(embedding_matrix)

    # Column names follow n_components instead of being hard-coded to three axes.
    axis_names = [f'UMAP Dimension {i + 1}' for i in range(n_components)]
    umap_df = pd.DataFrame(umap_embedded, columns=axis_names)
    umap_df['Label'] = embedding_series.index

    # Plot the UMAP embedding using Plotly Express.
    shared_plot_args = dict(hover_data=['Label'], title='UMAP Visualization of Embeddings')
    if n_components == 3:
        fig = px.scatter_3d(
            umap_df, x=axis_names[0], y=axis_names[1], z=axis_names[2], **shared_plot_args
        )
    else:
        fig = px.scatter(umap_df, x=axis_names[0], y=axis_names[1], **shared_plot_args)
    fig.show()


# Project the extracted embeddings with UMAP (default parameters) and show the plot.
visualize_embeddings(embeddings_df)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
