In [None]:
!pip install langchain  openai tqdm pinecone langchain-community gradio

In [26]:
import os

# --- Configuration -----------------------------------------------------------
# SECURITY: API keys must never be committed to a notebook. They are read from
# environment variables instead (set them in the shell that launches Jupyter,
# or with `%env NAME=value` in a cell that is not saved). Any key that was
# previously hard-coded here should be treated as compromised and rotated.

# Non-secret Azure OpenAI settings.
# NOTE(review): the openai `AzureOpenAI` client normally expects `azure_endpoint`
# to be the resource base URL (https://<resource>.openai.azure.com/); this value
# also carries the deployment path and an api-version query string. Confirm that
# this is intended before relying on it.
AZURE_OPENAI_ENDPOINT = "https://mobi-dev-openai.openai.azure.com/openai/deployments/insurance-text-embedding-3-small/embeddings?api-version=2023-05-15"
AZURE_OPENAI_API_VERSION = "2023-05-15"

# Secrets: None when the corresponding environment variable is not set.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Name of the Pinecone index shared by the cells below.
index_name = "gst-chat-agent"

# Surface missing credentials here rather than as an opaque client error later.
_missing_keys = [
    name
    for name, value in (
        ("AZURE_OPENAI_API_KEY", AZURE_OPENAI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GROQ_API_KEY", GROQ_API_KEY),
    )
    if not value
]
if _missing_keys:
    print(f"WARNING: missing environment variables: {', '.join(_missing_keys)}")

In [33]:
import os
from typing import List, Dict
from openai import AzureOpenAI
from pinecone import Pinecone
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm
import time


class DocumentEmbedder:
    """Embed PDF documents with Azure OpenAI and store/query them in Pinecone.

    The pipeline is: load a PDF page by page, split the pages into overlapping
    character chunks, embed the chunks in batches, and upsert the vectors (with
    the chunk text, source and page as metadata) into a Pinecone index.

    The constructor reads the module-level names ``AZURE_OPENAI_API_KEY``,
    ``AZURE_OPENAI_API_VERSION``, ``AZURE_OPENAI_ENDPOINT`` and
    ``PINECONE_API_KEY``, so the configuration cell must have been run first.
    """

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        batch_size: int = 20,
        model: str = "insurance-text-embedding-3-small",  # Your Azure deployment name
        embedding_dimension: int = 1536,
        index_cloud: str = "aws",
        index_region: str = "us-east-1",
        request_delay: float = 0.5,
    ):
        """Create the Azure OpenAI and Pinecone clients.

        Args:
            chunk_size: Maximum number of characters per chunk.
            chunk_overlap: Number of characters shared by consecutive chunks.
            batch_size: Number of chunks embedded and upserted per request.
            model: Azure OpenAI embedding deployment name.
            embedding_dimension: Vector size used when a new index is created.
                It must equal the size of the vectors the deployment returns.
                1536 is the default output size of text-embedding-3-small,
                which the default deployment name suggests (confirm for other
                deployments).
            index_cloud: Cloud provider used when a new serverless index is
                created. NOTE(review): default is an assumption; confirm it
                matches the Pinecone project.
            index_region: Region used when a new serverless index is created.
                NOTE(review): default is an assumption; confirm as above.
            request_delay: Seconds to pause between batches (rate limiting).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # range(..., step=0) raises and a negative step silently does nothing.
            raise ValueError("batch_size must be a positive integer")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.batch_size = batch_size
        self.model = model
        self.embedding_dimension = embedding_dimension
        self.index_cloud = index_cloud
        self.index_region = index_region
        self.request_delay = request_delay

        # Initialize Azure OpenAI client
        self.client = AzureOpenAI(
            api_key=AZURE_OPENAI_API_KEY,
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=AZURE_OPENAI_ENDPOINT
        )

        # Initialize Pinecone with new API
        self.pc = Pinecone(api_key=PINECONE_API_KEY)

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a list of texts using Azure OpenAI.

        Args:
            texts: Strings to embed.

        Returns:
            One embedding per input text, taken from ``response.data`` in the
            order the service returned them. An empty input returns ``[]``
            without calling the service.

        Raises:
            Exception: Any error from the embeddings call is printed and re-raised.
        """
        if not texts:
            return []
        try:
            response = self.client.embeddings.create(
                input=texts,
                model=self.model
            )
            return [data.embedding for data in response.data]
        except Exception as e:
            print(f"Error generating embeddings: {str(e)}")
            raise

    def load_pdf(self, file_path: str) -> List:
        """Load a PDF with ``PyPDFLoader`` and return the loaded page documents."""
        print(f"Loading PDF: {file_path}")
        loader = PyPDFLoader(file_path)
        pages = loader.load()
        return pages

    def chunk_text(self, documents: List) -> List:
        """Split documents into chunks of ``chunk_size`` characters with ``chunk_overlap`` overlap."""
        print("Chunking documents...")
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len
        )
        chunks = splitter.split_documents(documents)
        return chunks

    def _ensure_index(self, index_name: str) -> None:
        """Create the Pinecone index if it does not already exist."""
        if index_name in self.pc.list_indexes().names():
            return

        # Imported here so the class needs no change to the notebook's import cell.
        from pinecone import ServerlessSpec

        print(f"Creating new Pinecone index: {index_name}")
        self.pc.create_index(
            name=index_name,
            dimension=self.embedding_dimension,
            metric='cosine',
            # Current Pinecone clients require a deployment spec for new indexes.
            spec=ServerlessSpec(cloud=self.index_cloud, region=self.index_region),
        )

    def _build_vectors(self, batch: List, embeddings: List[List[float]],
                       start: int, id_prefix: str) -> List[Dict]:
        """Pair each chunk with its embedding in the dict format passed to ``upsert``."""
        return [
            {
                'id': f"{id_prefix}_{start + offset}",
                'values': embedding,
                'metadata': {
                    'text': doc.page_content,
                    'source': doc.metadata.get('source', ''),
                    'page': doc.metadata.get('page', 0),
                },
            }
            for offset, (doc, embedding) in enumerate(zip(batch, embeddings))
        ]

    def process_document(self, file_path: str, index_name: str, id_prefix: str = "doc"):
        """Process a document end-to-end: load, chunk, embed, and store.

        Args:
            file_path: Path of the PDF to index.
            index_name: Pinecone index to upsert into (created if missing).
            id_prefix: Prefix of the vector ids (``"<id_prefix>_<chunk number>"``).
                Ids are only unique per prefix, so indexing a second document
                with the same prefix overwrites vectors of the first one; pass
                a distinct prefix per document to keep both.

        Returns:
            A summary dict with ``'chunks'`` (total chunks), ``'upserted'``
            (vectors stored) and ``'failed_batches'`` (list of
            ``(start, end_exclusive)`` chunk ranges that raised and were skipped).
        """
        # Ensure Pinecone index exists
        self._ensure_index(index_name)

        # Get Pinecone index
        index = self.pc.Index(index_name)

        # Load and chunk document
        pages = self.load_pdf(file_path)
        chunks = self.chunk_text(pages)

        summary = {'chunks': len(chunks), 'upserted': 0, 'failed_batches': []}
        if not chunks:
            print("No chunks produced; nothing to index.")
            return summary

        print(f"Processing {len(chunks)} chunks in batches of {self.batch_size}")

        # Process in batches
        for i in tqdm(range(0, len(chunks), self.batch_size)):
            batch = chunks[i:i + self.batch_size]

            try:
                texts = [doc.page_content for doc in batch]
                embeddings = self.generate_embeddings(texts)
                if len(embeddings) != len(batch):
                    # zip() would otherwise drop chunks silently.
                    raise ValueError(
                        f"expected {len(batch)} embeddings, got {len(embeddings)}"
                    )

                vectors = self._build_vectors(batch, embeddings, i, id_prefix)
                index.upsert(vectors=vectors)
                summary['upserted'] += len(vectors)
            except Exception as e:
                # Best effort: keep going, but remember what was lost so the
                # caller can retry instead of ending up with a partial index.
                print(f"Error processing batch {i}-{i + len(batch)}: {str(e)}")
                summary['failed_batches'].append((i, i + len(batch)))
                continue

            # Respect rate limits (no need to wait after the final batch)
            if self.request_delay > 0 and i + self.batch_size < len(chunks):
                time.sleep(self.request_delay)

        if summary['failed_batches']:
            print(
                f"Finished with {len(summary['failed_batches'])} failed batch(es): "
                f"{summary['failed_batches']}"
            )
        return summary

    def query_similar(self, query: str, index_name: str, top_k: int = 5) -> List[Dict]:
        """Return the ``top_k`` stored chunks most similar to ``query``.

        Args:
            query: Text to embed and search for.
            index_name: Pinecone index to query.
            top_k: Maximum number of matches to request.

        Returns:
            One dict per match with ``'score'``, ``'text'``, ``'page'`` and
            ``'source'``. Metadata fields missing on a match fall back to
            ``''`` (text, source) or ``0`` (page) instead of raising.
        """
        # Generate embedding for the query
        query_embedding = self.generate_embeddings([query])[0]

        # Query Pinecone
        index = self.pc.Index(index_name)
        results = index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )

        # Format results
        similar_docs = []
        for match in results.matches:
            # Vectors written by other tools may lack some or all metadata.
            metadata = match.metadata or {}
            similar_docs.append({
                'score': match.score,
                'text': metadata.get('text', ''),
                'page': metadata.get('page', 0),
                'source': metadata.get('source', '')
            })

        return similar_docs

def main():
    """Index the GST circulars PDF, then run one sample query and print the hits."""
    doc_embedder = DocumentEmbedder(chunk_size=500, chunk_overlap=50, batch_size=20)

    # Where the document lives and which Pinecone index receives it.
    source_pdf = "data/gst_circulars_all_in_one_place.pdf"
    target_index = "gst-chat-agent"

    # Load, chunk, embed and upsert the whole document.
    doc_embedder.process_document(source_pdf, target_index)

    # Smoke-test retrieval with a generic question.
    question = "What is the main topic of the document?"
    hits = doc_embedder.query_similar(question, target_index)

    print("\nQuery Results:")
    rank = 0
    for hit in hits:
        rank += 1
        preview = hit['text'][:200]  # keep the printed text short
        print(f"\n{rank}. Similarity Score: {hit['score']:.4f}")
        print(f"Page: {hit['page']}")
        print(f"Text: {preview}...")


# Run the demo when the cell/script is executed directly.
if __name__ == "__main__":
    main()



TypeError: ChatInterface.__init__() got an unexpected keyword argument 'retry_btn'

In [28]:
# Build an embedder with the same chunking settings used for indexing.
embedder = DocumentEmbedder(chunk_size=500, chunk_overlap=50, batch_size=20)

# Look up the chunks closest to a specific statute reference; `index_name`
# comes from the configuration cell.
query = "Transfer of Property Act, 1882"
results = embedder.query_similar(query, index_name)

# Print every hit with its full text.
print("\nQuery Results:")
for rank, hit in enumerate(results, start=1):
    print(f"\n{rank}. Similarity Score: {hit['score']:.4f}")
    print(f"Page: {hit['page']}")
    print(f"Text: {hit['text']}...")


Query Results:

1. Similarity Score: 0.5560
Page: 39.0
Text: to it in Section 3 of the Transfer of Property Act, 1882 (4 of 1882). 
Section 3 of the Transfer of Property Act, 1882 provides the definition of 
“actionable claim” as below: - 
“"actionable claim" means a claim to any debt, other than a debt 
secured by mortgage of  immovable property or by hypothecation or 
pledge of  movable property,  or to any beneficial interest in movable 
property not in the possession, either actual or  constructive, of the...

2. Similarity Score: 0.5310
Page: 1998.0
Text: Index  
Page 1998 of 2063 
 
Where the property to be provisionally attached consists of the share or interest of the 
concerned taxable person in property belonging to him and another as co-owners, the 
provisional attachment shall be made by order to the concerned person prohibiting him 
from transferring the share or interest or charging it in any way. 
3.8 Property exempt from attachment 
All such property as is by the Code 

In [32]:
from groq import Groq

# Groq chat client, authenticated with the key from the configuration cell.
client = Groq(api_key=GROQ_API_KEY)

# Request a streamed completion for a short scripted conversation: a system
# instruction, one earlier question/answer pair, and the follow-up question.
completion = client.chat.completions.create(
    messages=[
        dict(
            role="system",
            content="YOU ARE AN EXPERT IN GST EXPERT AGENT. ANSWER CUSTOEMR QUERIES. keep the answer short and crt othe point",
        ),
        dict(
            role="user",
            content="what are Category of registered\npersons",
        ),
        dict(
            role="assistant",
            content="Under GST, there are 4 categories of registered persons:\n\n1. Regular taxpayer\n2. Composition taxpayer\n3. Input Service Distributor (ISD)\n4. SEZ (Special Economic Zone) unit/Developer",
        ),
        dict(
            role="user",
            content="what is ISD ?\n",
        ),
    ],
    model="llama-3.3-70b-versatile",
    stream=True,
    stop=None,
    temperature=1,
    top_p=1,
    max_completion_tokens=1024,
)

# Print each streamed piece as it arrives; a piece with no content prints nothing.
for event in completion:
    piece = event.choices[0].delta.content
    print(piece if piece else "", end="")


ISD stands for Input Service Distributor. It is an office that receives invoices for services used by multiple branches/offices of an organization and distributes the input tax credit to those branches/offices.