In [8]:
%pip install cmem-cmempy llama-index python-dotenv llama-index-vector-stores-postgres

Note: you may need to restart the kernel to use updated packages.


In [9]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [10]:
import os

import openai

openai.api_key = os.environ["OPENAI_API_KEY"]

In [11]:
!docker pull ankane/pgvector
!docker run -d -e POSTGRES_USER=eccenca -e POSTGRES_PASSWORD=eccenca -e POSTGRES_DB=eccenca --name pgvector-eccenca -p 5432:5432 ankane/pgvector

Using default tag: latest
latest: Pulling from ankane/pgvector
Digest: sha256:956744bd14e9cbdf639c61c2a2a7c7c2c48a9c8cdd42f7de4ac034f4e96b90f8
Status: Image is up to date for ankane/pgvector:latest
docker.io/ankane/pgvector:latest
docker: Error response from daemon: Conflict. The container name "/pgvector-eccenca" is already in use by container "96a9465720137f2fd7cc29e12a3fb68a7a18a782ff10af02498e5459d2fc8424". You have to remove (or rename) that container to be able to reuse that name.
See 'docker run --help'.


In [12]:
from IPython.display import Markdown

from llama_index_cmem.readers.cmem import CMEMReader

reader = CMEMReader()
documents = reader.load_data(graph="<http://ld.company.org/prod-inst/>")
display(Markdown(f"### Documents loaded: __{len(documents)}__ "))

### Documents loaded: __20043__ 

In [13]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore

hybrid_vector_store = PGVectorStore.from_params(
    database="eccenca",
    host="localhost",
    password="eccenca",
    port="5432",
    user="eccenca",
    table_name="prod-inst_hybrid",
    embed_dim=1536,  # openai embedding dimension
    hybrid_search=True,
    hnsw_kwargs={
        "hnsw_m": 16,
        "hnsw_ef_construction": 64,
        "hnsw_ef_search": 40,
        "hnsw_dist_method": "vector_cosine_ops",
    },
)

storage_context = StorageContext.from_defaults(vector_store=hybrid_vector_store)
hybrid_index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, show_progress=True
)
hybrid_query_engine = hybrid_index.as_query_engine(vector_store_query_mode="hybrid")

Parsing nodes: 100%|██████████| 20043/20043 [00:03<00:00, 6332.94it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:26<00:00, 77.58it/s] 
PG Setup: Error creating HNSW index: (psycopg2.errors.SyntaxError) syntax error at or near "-"
LINE 1: CREATE INDEX IF NOT EXISTS data_prod-inst_hybrid_embedding_i...
                                            ^

[SQL: CREATE INDEX IF NOT EXISTS data_prod-inst_hybrid_embedding_idx ON public.data_prod-inst_hybrid USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64)]
(Background on this error at: https://sqlalche.me/e/20/f405)
Generating embeddings: 100%|██████████| 2048/2048 [00:22<00:00, 91.16it/s] 
Generating embeddings: 100%|██████████| 2048/2048 [00:24<00:00, 84.82it/s] 
Generating embeddings: 100%|██████████| 2048/2048 [00:20<00:00, 98.15it/s] 
Generating embeddings: 100%|██████████| 2048/2048 [00:20<00:00, 98.84it/s] 
Generating embeddings: 100%|██████████| 2048/2048 [00:28<00:00, 70.87it/s]
Generating embedding

In [14]:
question = "What information do you have about Liese Adam? Give me as many details as possible."
answer = hybrid_query_engine.query(question)
display(Markdown(f"## Question: _{question}_"))
display(Markdown(f"### Answer:\n\n{answer}"))

## Question: _What information do you have about Liese Adam? Give me as many details as possible._

### Answer:

Liese Adam's name is Liese Adam and her email is Liese.Adam@company.org.