In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

print("Loading CSV file...")

: 

In [17]:
# ...existing code...
from pathlib import Path
from langchain_community.document_loaders import CSVLoader

def process_all_csvs(csv_directory, *, delimiter=",", encoding="utf-8"):
    """Process all CSV files in a directory"""
    all_documents = []
    csv_dir = Path(csv_directory)

    csv_files = list(csv_dir.glob("**/*.csv"))
    print(f"Found {len(csv_files)} CSV files to process")

    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")
        try:
            # Try newer signature first, then fallback for older versions
            try:
                loader = CSVLoader(
                    file_path=str(csv_file),
                    csv_args={"delimiter": delimiter},
                    encoding=encoding,
                    autodetect_encoding=True,
                )
            except TypeError:
                loader = CSVLoader(
                    file_path=str(csv_file),
                    csv_args={"delimiter": delimiter},
                    encoding=encoding,
                )

            documents = loader.load()

            for i, doc in enumerate(documents, start=1):
                doc.metadata["source_file"] = csv_file.name
                doc.metadata["file_type"] = "csv"
                doc.metadata.setdefault("row", i)

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} rows")

        except Exception as e:
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all CSVs in the data directory (relative path)
all_pdf_documents = process_all_csvs("../data")


Found 1 CSV files to process

Processing: CRM_Donor_Simulation_Dataset.csv
  ✓ Loaded 5000 rows

Total documents loaded: 5000


In [18]:
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import CSVLoader
import chromadb
from chromadb.config import Settings
import uuid
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Any,Tuple,Dict
print("Initializing embedding model...")
  



Initializing embedding model...


In [19]:
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [135]:
chunks=split_documents(all_pdf_documents)
chunks

Split 5000 documents into 5000 chunks

Example chunk:
Content: DonorID: D00001
FirstName: Danielle
LastName: Anderson
Email: tina69@salazar.com
Phone: (716)033-8417x8760
City: New Ethan
State: AK
ZipCode: 9539
LastDonationDate: 9/3/2024
TotalGifts: 10
TotalAmount...
Metadata: {'source': '../data/CRM_Donor_Simulation_Dataset.csv', 'row': 0, 'source_file': 'CRM_Donor_Simulation_Dataset.csv', 'file_type': 'csv'}


[Document(metadata={'source': '../data/CRM_Donor_Simulation_Dataset.csv', 'row': 0, 'source_file': 'CRM_Donor_Simulation_Dataset.csv', 'file_type': 'csv'}, page_content='DonorID: D00001\nFirstName: Danielle\nLastName: Anderson\nEmail: tina69@salazar.com\nPhone: (716)033-8417x8760\nCity: New Ethan\nState: AK\nZipCode: 9539\nLastDonationDate: 9/3/2024\nTotalGifts: 10\nTotalAmountDonated: 8481.55\nEventParticipation: Yes\nEngagementScore: 56'),
 Document(metadata={'source': '../data/CRM_Donor_Simulation_Dataset.csv', 'row': 1, 'source_file': 'CRM_Donor_Simulation_Dataset.csv', 'file_type': 'csv'}, page_content='DonorID: D00002\nFirstName: Angel\nLastName: Hart\nEmail: garciatyler@gmail.com\nPhone: (874)907-2648\nCity: West Ashleymouth\nState: MS\nZipCode: 49618\nLastDonationDate: 5/6/2022\nTotalGifts: 5\nTotalAmountDonated: 14102.15\nEventParticipation: Yes\nEngagementScore: 11'),
 Document(metadata={'source': '../data/CRM_Donor_Simulation_Dataset.csv', 'row': 2, 'source_file': 'CRM_Donor

In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

: 

In [6]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


## initialize the embedding manager

embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x147be4ec0>

In [7]:
import chromadb; 

print(chromadb.__version__)


class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "crm_donors_csv", persist_directory: str = "../data/vector_store_v3"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory,settings=Settings(
        anonymized_telemetry=False,
        allow_reset=True,          # lets you reset if you need to
    ),)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

1.3.4
Vector store initialized. Collection: crm_donors_csv
Existing documents in collection: 10000


<__main__.VectorStore at 0x148170d70>

In [11]:
chunks=split_documents(all_pdf_documents)
chunks

Split 5000 documents into 5000 chunks

Example chunk:
Content: DonorID: D00001
FirstName: Danielle
LastName: Anderson
Email: tina69@salazar.com
Phone: (716)033-8417x8760
City: New Ethan
State: AK
ZipCode: 9539
LastDonationDate: 9/3/2024
TotalGifts: 10
TotalAmount...
Metadata: {'source': '../data/CRM_Donor_Simulation_Dataset.csv', 'row': 0, 'source_file': 'CRM_Donor_Simulation_Dataset.csv', 'file_type': 'csv'}


[Document(metadata={'source': '../data/CRM_Donor_Simulation_Dataset.csv', 'row': 0, 'source_file': 'CRM_Donor_Simulation_Dataset.csv', 'file_type': 'csv'}, page_content='DonorID: D00001\nFirstName: Danielle\nLastName: Anderson\nEmail: tina69@salazar.com\nPhone: (716)033-8417x8760\nCity: New Ethan\nState: AK\nZipCode: 9539\nLastDonationDate: 9/3/2024\nTotalGifts: 10\nTotalAmountDonated: 8481.55\nEventParticipation: Yes\nEngagementScore: 56'),
 Document(metadata={'source': '../data/CRM_Donor_Simulation_Dataset.csv', 'row': 1, 'source_file': 'CRM_Donor_Simulation_Dataset.csv', 'file_type': 'csv'}, page_content='DonorID: D00002\nFirstName: Angel\nLastName: Hart\nEmail: garciatyler@gmail.com\nPhone: (874)907-2648\nCity: West Ashleymouth\nState: MS\nZipCode: 49618\nLastDonationDate: 5/6/2022\nTotalGifts: 5\nTotalAmountDonated: 14102.15\nEventParticipation: Yes\nEngagementScore: 11'),
 Document(metadata={'source': '../data/CRM_Donor_Simulation_Dataset.csv', 'row': 2, 'source_file': 'CRM_Donor

In [12]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 5000 texts...


Batches: 100%|██████████| 157/157 [00:13<00:00, 11.71it/s]


Generated embeddings with shape: (5000, 384)
Adding 5000 documents to vector store...
Successfully added 5000 documents to vector store
Total documents in collection: 15000


In [13]:
class RAGRetriever:
    print("Initializing RAG Retriever...")
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)



Initializing RAG Retriever...


In [14]:
rag_retriever

<__main__.RAGRetriever at 0x148dcfb60>

In [15]:
rag_retriever.retrieve("Jill ")

Retrieving documents for query: 'Jill '
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.96it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)





[{'id': 'doc_a7749ab5_3507',
  'content': 'DonorID: D03508\nFirstName: Laurie\nLastName: Douglas\nEmail: jennifer03@ellis-foster.com\nPhone: 833.311.2979\nCity: Port Jillmouth\nState: AZ\nZipCode: 10142\nLastDonationDate: 7/10/2024\nTotalGifts: 19\nTotalAmountDonated: 9974.58\nEventParticipation: No\nEngagementScore: 64',
  'metadata': {'content_length': 266,
   'source_file': 'CRM_Donor_Simulation_Dataset.csv',
   'source': '../data/CRM_Donor_Simulation_Dataset.csv',
   'file_type': 'csv',
   'doc_index': 3507,
   'row': 3507},
  'similarity_score': 0.06564366817474365,
  'distance': 0.9343563318252563,
  'rank': 1},
 {'id': 'doc_b733db68_3507',
  'content': 'DonorID: D03508\nFirstName: Laurie\nLastName: Douglas\nEmail: jennifer03@ellis-foster.com\nPhone: 833.311.2979\nCity: Port Jillmouth\nState: AZ\nZipCode: 10142\nLastDonationDate: 7/10/2024\nTotalGifts: 19\nTotalAmountDonated: 9974.58\nEventParticipation: No\nEngagementScore: 64',
  'metadata': {'content_length': 266,
   'source':