In [1]:
import os

import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_groq import ChatGroq
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [None]:
from dotenv import load_dotenv
# Read settings from a local .env file (if one is present); the Pinecone cell
# later in this notebook looks its credentials up with os.getenv.
load_dotenv()

In [None]:
import os

In [None]:
# read the doc
def read_doc(dir):
    """Load the PDF documents found under a directory.

    Args:
        dir: Directory path handed straight to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(dir).load()`` returns.
    """
    return PyPDFDirectoryLoader(dir).load()


In [2]:
# doc = read_doc('documents/')
from langchain.schema import Document
# Sample product catalogue: one dict per product, with the stock code repeated
# under a nested "metadata" entry.
documents = [
    {
        "stockcode": stockcode,
        "description": description,
        "price": price,
        "metadata": {"stockcode": stockcode},
    }
    for stockcode, description, price in (
        ("A123", "High-quality Widget", 19.99),
        ("B456", "Durable Gadget", 29.99),
        ("C789", "Versatile Tool", 39.99),
        ("D012", "Reliable Appliance", 49.99),
        ("E345", "Innovative Device", 59.99),
    )
]

# Turn each catalogue entry into a LangChain Document: the text is the
# description plus the price, and the metadata keeps the stock code and price.
langchain_documents = [
    Document(
        page_content=f"{item['description']} - Price: ${item['price']}",
        metadata={"stockcode": item["stockcode"], "price": item["price"]},
    )
    for item in documents
]



In [3]:
## divide the docs into chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)

In [4]:
# Chunk the product documents using chunk_data's default size and overlap.
splits = chunk_data(langchain_documents)

In [5]:
# Embedding model shared by the Pinecone and pgvector stores below.
embeddings = HuggingFaceEmbeddings(
    model_name="distiluse-base-multilingual-cased-v2",
)
embeddings



HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
), model_name='distiluse-base-multilingual-cased-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
# Smoke test: embed a single sample query.
vectors = embeddings.embed_query(text="how are you?")

In [None]:
# vector search db in pinecone
# All Pinecone settings come from the environment (lower-case variable names).
pinecone_api_key = os.getenv('pinecone_api_key')
pinecone_env = os.getenv('pinecone_environment')
index_name = os.getenv('pinecone_index_name')

# Fail early with a readable message: assigning None into os.environ raises an
# opaque "str expected, not NoneType" TypeError, and a missing index name would
# only surface later when the index handle is created.
missing_settings = [
    name
    for name, value in (
        ('pinecone_api_key', pinecone_api_key),
        ('pinecone_index_name', index_name),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Also export the key under the upper-case name PINECONE_API_KEY.
os.environ['PINECONE_API_KEY'] = pinecone_api_key

pc = Pinecone(
    api_key=pinecone_api_key,
)


In [None]:
# SECURITY: a commented-out Pinecone(...) call containing a literal API key used
# to sit here. It has been removed; treat that key as leaked and rotate it.
# NOTE: the name `index` is rebound later in this notebook by
# `from langchain.indexes import SQLRecordManager, index`, so this Pinecone
# handle is only reachable under this name until that import cell runs.
index = pc.Index(index_name)

In [None]:
# TODO: upsert the data to Pinecone document by document.
# Unfinished stub: `for document in doc`

In [None]:
# Wrap the Pinecone index in a LangChain vector store that uses `embeddings`.
vector_store = PineconeVectorStore(
    embedding=embeddings,
    index=index,
)

In [None]:
from uuid import uuid4

# One random UUID string per chunk, used as the id of the stored document.
uuids = [str(uuid4()) for _ in splits]

vector_store.add_documents(ids=uuids, documents=splits)

In [None]:
# Fetch the single closest match for the query and print its text and metadata.
results = vector_store.similarity_search(
    k=1,
    query="explain the story of Martin Luther King",
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

In [None]:
# (removed) A bare "pip install" with no package name always exits with an error.

In [None]:
!pip install tiktoken

In [6]:
from langchain.vectorstores.pgvector import PGVector
from langchain.indexes import SQLRecordManager, index

# Connection settings for the pgvector database. The connection string can be
# overridden through the PGVECTOR_CONNECTION_STRING environment variable so real
# credentials never need to be written into the notebook; the fallback is the
# local development database.
connection_str = os.getenv(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg2://postgres:test@localhost:5432/vector-db",
)
collection_name = "products"

In [7]:
# Vector store over the `collection_name` collection of the Postgres database,
# embedding with the shared `embeddings` model.
vectorstore = PGVector(
    connection_string=connection_str,
    collection_name=collection_name,
    embedding_function=embeddings,
)

  vectorstore = PGVector(
  vectorstore = PGVector(


In [8]:
# Namespace handed to the record manager, derived from the collection name.
namespace = "pgvector/" + collection_name

# The record manager uses the same database as the vector store.
record_manager = SQLRecordManager(namespace, db_url=connection_str)

In [9]:
# Prepare the record manager's storage before the first index() call
# (presumably creates its tables if they are missing; confirm in the docs).
record_manager.create_schema()

In [10]:
# Show the chunked documents that the next cell indexes.
splits

[Document(metadata={'stockcode': 'A123', 'price': 19.99}, page_content='High-quality Widget - Price: $19.99'),
 Document(metadata={'stockcode': 'B456', 'price': 29.99}, page_content='Durable Gadget - Price: $29.99'),
 Document(metadata={'stockcode': 'C789', 'price': 39.99}, page_content='Versatile Tool - Price: $39.99'),
 Document(metadata={'stockcode': 'D012', 'price': 49.99}, page_content='Reliable Appliance - Price: $49.99'),
 Document(metadata={'stockcode': 'E345', 'price': 59.99}, page_content='Innovative Device - Price: $59.99')]

In [11]:
# First indexing pass: sync `splits` into the vector store, tracking them in the
# record manager and grouping documents by their "stockcode" metadata.
index(splits, record_manager, vectorstore, cleanup="incremental", source_id_key="stockcode")

{'num_added': 5, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [12]:
# Simulate upstream changes before re-indexing. These edits mutate `splits` in
# place, so re-running this cell changes the list again.
splits[1].page_content = "modified"  # edit an existing document
splits.pop(0)  # drop the first document
splits += [Document(page_content="new created", metadata={"stockcode": "A11101"})]  # add a new one

In [13]:
# Second indexing pass over the edited `splits`, with the same settings as the
# first pass.
index(splits, record_manager, vectorstore, cleanup="incremental", source_id_key="stockcode")

{'num_added': 2, 'num_updated': 0, 'num_skipped': 3, 'num_deleted': 1}

In [None]:
from langchain.schema import Document

# `docs` was never assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Build it from independent copies of `splits` so
# the edits below do not leak into the documents used by the indexing cells.
docs = [
    Document(page_content=split.page_content, metadata=dict(split.metadata))
    for split in splits
]

docs[1].page_content = "updated"
del docs[0]
docs.append(Document(page_content="new content", metadata={"source": "important"}))

In [None]:
# NOTE(review): from_documents presumably embeds and inserts `splits` itself; if
# index() has already populated this collection, that would store them a second
# time. Confirm duplicates are acceptable here.
db = PGVector.from_documents(
    documents=splits,
    embedding=embeddings,
    connection_string=connection_str,
    collection_name=collection_name,
)

In [None]:
# Ask for the five closest matches together with their relevance scores.
query = "explain the story of Wright brothers"
db.similarity_search_with_relevance_scores(query=query, k=5)

In [None]:
from langchain.vectorstores.pgvector import PGVector
from langchain.indexes import SQLRecordManager, index
from langchain.schema import Document

# Connection string is overridable via PGVECTOR_CONNECTION_STRING so real
# credentials stay out of the notebook; the fallback is the local dev database.
connection_str = os.getenv(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg2://postgres:test@localhost:5432/vector-db",
)
collection_name = "products"

# Initialize vectorstore
# Use the plain constructor rather than PGVector.from_documents: from_documents
# would insert `splits` outside the indexing API, and the index() call below
# would then insert them a second time without the record manager knowing about
# the first copies.
vectorstore = PGVector(
    embedding_function=embeddings,
    collection_name=collection_name,
    connection_string=connection_str,
)

# Namespace handed to the record manager, derived from the collection name.
namespace = "pgvector/" + collection_name

# Initialize record manager against the same database, then set up its storage.
record_manager = SQLRecordManager(namespace, db_url=connection_str)
record_manager.create_schema()

# Function to perform indexing
def perform_indexing(documents):
    """Sync ``documents`` into ``vectorstore`` through the LangChain indexing API.

    Uses the module-level ``record_manager`` and ``vectorstore`` and runs with
    ``cleanup="full"``.

    Args:
        documents: Documents to index. ``"stockcode"`` is passed as
            ``source_id_key``, so each document's metadata is expected to carry it.

    Returns:
        The value returned by ``index``. Earlier cells of this notebook show it
        as a dict of ``num_added`` / ``num_updated`` / ``num_skipped`` /
        ``num_deleted`` counts. It was previously discarded.
    """
    return index(
        documents,
        record_manager,
        vectorstore,
        cleanup="full",  # Changed from "incremental" to "full"
        source_id_key="stockcode"
    )

# First pass: index the documents as they currently stand.
perform_indexing(splits)

# Simulate upstream changes. These edits mutate `splits` in place, so every
# re-run of this cell changes the list again.
splits[1].page_content = "modified"  # modify an existing document
splits.pop(0)  # delete a document
splits += [Document(page_content="new created", metadata={"stockcode": "A11101"})]  # add a new document

# Second pass: index again with the updated documents.
perform_indexing(splits)

In [None]:
from langchain.vectorstores.pgvector import PGVector
from langchain.indexes import SQLRecordManager, index
from langchain.schema import Document
from sqlalchemy import create_engine, text

# Connection string is overridable via PGVECTOR_CONNECTION_STRING so real
# credentials stay out of the notebook; the fallback is the local dev database.
connection_str = os.getenv(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg2://postgres:test@localhost:5432/vector-db",
)
collection_name = "products"

# Initialize vectorstore
# The constructor is used here, so index() is the only call that adds documents.
vectorstore = PGVector(
    collection_name=collection_name,
    connection_string=connection_str,
    embedding_function=embeddings
)

# Namespace handed to the record manager, derived from the collection name.
namespace = "pgvector/" + collection_name

# Initialize record manager against the same database, then set up its storage.
record_manager = SQLRecordManager(namespace, db_url=connection_str)
record_manager.create_schema()

# Function to perform indexing
def perform_indexing(documents):
    """Sync ``documents`` into ``vectorstore`` through the LangChain indexing API.

    Uses the module-level ``record_manager`` and ``vectorstore`` and runs with
    ``cleanup="incremental"``. An earlier commented-out experiment that wiped the
    tables with string-built SQL has been dropped; index() is the only writer.

    Args:
        documents: Documents to index. ``"stockcode"`` is passed as
            ``source_id_key``, so each document's metadata is expected to carry it.

    Returns:
        The value returned by ``index``. Earlier cells of this notebook show it
        as a dict of ``num_added`` / ``num_updated`` / ``num_skipped`` /
        ``num_deleted`` counts. It was previously discarded.
    """
    # index() is handed both the vector store and the record manager.
    return index(
        documents,
        record_manager,
        vectorstore,
        cleanup="incremental",
        source_id_key="stockcode"
    )

# First pass: index the documents as they currently stand.
perform_indexing(splits)

# Simulate upstream changes. These edits mutate `splits` in place, so every
# re-run of this cell changes the list again.
splits[1].page_content = "modified"  # modify an existing document
splits.pop(0)  # delete a document
splits += [Document(page_content="new created", metadata={"stockcode": "A11101"})]  # add a new document

# Second pass: index again with the updated documents.
perform_indexing(splits)