## Hugging Face embeddings kept failing with an error asking to `pip install sentence-transformers`

In [3]:
from langchain_core.runnables.config import run_in_executor
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document  # Updated import

# Read the cleaned transcript from disk into LangChain documents.
loader = TextLoader("data/text_files/cleaned_transcript.txt")
documents = loader.load()

# Cut the transcript into overlapping chunks so that context which spans a
# chunk boundary is still present in both neighbouring chunks.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(documents)

In [4]:
# Load the KWIC results; their keywords are attached to chunks as metadata.
import pandas as pd

kwic_df = pd.read_csv("data/csv_files/kwic_results.csv")
# Drop missing values first: a NaN keyword is a float and has no .lower(),
# so it would crash the matching loop below.
keywords = kwic_df["keyword"].dropna().unique().tolist()

# Lower-case every keyword once instead of once per chunk.
lowered_keywords = [(kw, kw.lower()) for kw in keywords]

# Record on each chunk the keywords that occur in its text
# (case-insensitive substring match); the original spelling is stored.
for chunk in chunks:
    chunk_text = chunk.page_content.lower()
    chunk_keywords = [kw for kw, kw_lower in lowered_keywords if kw_lower in chunk_text]
    chunk.metadata["keywords"] = chunk_keywords

In [17]:
import spacy
from langchain_core.embeddings import Embeddings

class SpacyEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by a spaCy pipeline.

    A text is embedded by running it through the pipeline and converting the
    resulting ``Doc.vector`` to a plain Python list.

    NOTE(review): per the spaCy model docs the small ("sm") pipelines ship
    without static word vectors, so a "md"/"lg" pipeline presumably gives
    better similarity results -- confirm before relying on retrieval quality.
    """

    def __init__(self, model_name="en_core_web_sm"):
        """Load the spaCy pipeline used for embedding.

        Args:
            model_name: Name of an installed spaCy pipeline. Defaults to
                "en_core_web_sm", the pipeline this class always used before
                the parameter existed.
        """
        self.nlp = spacy.load(model_name)

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per entry of *texts*."""
        # Delegate so documents and queries are embedded the same way.
        return [self.embed_query(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding of a single *text* as a list of floats."""
        return self.nlp(text).vector.tolist()

# FAISS must be imported before it is used here; the file's other FAISS
# import only appears further down, so running top to bottom would otherwise
# fail with NameError on the from_documents call.
from langchain_community.vectorstores import FAISS

# Embed every chunk with spaCy and index the vectors in FAISS.
embeddings = SpacyEmbeddings()
vector_store = FAISS.from_documents(chunks, embeddings)

In [20]:
from langchain.retrievers import EnsembleRetriever
from langchain.vectorstores import FAISS

from langchain.retrievers import BM25Retriever

# Vector retriever over the FAISS index, using Maximal Marginal Relevance
# ("mmr") search with k=5 and fetch_k=20.
mmr_settings = {"k": 5, "fetch_k": 20}
vector_retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_settings,
)

# Optional BM25 retriever over the same chunks, for hybrid search.
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = 3

# Combine both retrievers, weighting the vector retriever 0.7 and BM25 0.3.
hybrid_members = [vector_retriever, bm25_retriever]
ensemble_retriever = EnsembleRetriever(
    retrievers=hybrid_members,
    weights=[0.7, 0.3],
)

In [25]:
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the environment, then fail fast
# with KeyError if the Groq key is still missing.
load_dotenv()
api_key = os.environ["GROQ_API_KEY"]

# Groq-hosted chat model. The key is passed explicitly so the value validated
# above is the one actually used, rather than relying on the client reading
# the environment variable again on its own.
groq_llm = ChatGroq(
    groq_api_key=api_key,
    temperature=0.1,
    model_name="gemma2-9b-it",
    max_tokens=1024,
)

# "stuff" QA chain over the FAISS vector store (k=3 documents per query).
# Source documents are returned so answers can cite where they came from.
# NOTE(review): this uses a plain vector-store retriever, not the
# ensemble_retriever built earlier -- confirm that is intended.
qa_chain = RetrievalQA.from_chain_type(
    llm=groq_llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
)

def answer_query(query):
    """Answer *query* with the module-level ``qa_chain`` and list its sources.

    Args:
        query: Question text, passed to the chain under the "query" key.

    Returns:
        A dict with two keys:
        "answer"  -- the chain's "result" value;
        "sources" -- the distinct "source" metadata values of the returned
                     source documents, in first-seen order.
    """
    # .invoke() replaces calling the chain object directly, which LangChain
    # has deprecated.
    result = qa_chain.invoke({"query": query})

    # dict.fromkeys de-duplicates while keeping retrieval order (a set would
    # return the sources in arbitrary order). Documents without a "source"
    # entry are skipped instead of raising KeyError.
    unique_sources = dict.fromkeys(
        doc.metadata["source"]
        for doc in result["source_documents"]
        if "source" in doc.metadata
    )
    return {
        "answer": result["result"],
        "sources": list(unique_sources),
    }

# Example: ask one question and print the answer with its sources.
response = answer_query(
    "What was discussed about California Burrito?"
)
print(f"Answer: {response['answer']}\nSources: {response['sources']}")

Answer: This text discusses the founding and growth of a Mexican-inspired restaurant chain called California Burrito in India. 


Here are some key points:

* **Founder:** Bert Mueller, who moved to India in 2011.
* **Inspiration:**  Bert wanted to bring authentic Mexican food to India, noticing the lack of options beyond Taco Bell.
* **Challenges:**  Adapting to the Indian market and sourcing ingredients was a major hurdle.
* **Growth:**  The first store opened in 2012 and has since expanded to 103 locations.
* **Financial Success:**  The first store generated $500,000 in its first year. 



Let me know if you have any other questions about California Burrito. 

Sources: ['data/text_files/cleaned_transcript.txt']
