In [27]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load environment variables
load_dotenv()
api_key = os.environ["GROQ_API_KEY"]

# 1. Load your cleaned transcript
with open("data/cleaned_transcript.txt", "r") as f:
    cleaned_text = f.read()

# 2. Configure Groq (uses Mixtral 8x7b by default)
groq_llm = ChatGroq(
    api_key = api_key,
    temperature=0.2,  # Lower = more factual
    model_name="gemma2-9b-it",  # Fastest model
    max_tokens=1024
)

# 3. Split text into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,  # Optimal for Groq's context window
    chunk_overlap=200
)
texts = text_splitter.create_documents([cleaned_text])

# 4. Summarize using map-reduce (best for long docs)
chain = load_summarize_chain(
    groq_llm,
    chain_type="map_reduce",  # Alternatives: "stuff" (short docs), "refine" (highest quality)
    verbose=False  # Set to True to see intermediate steps
)
summary = chain.run(texts)

# 5. Save the summary
with open("data/groq_summary.txt", "w") as f:
    f.write(summary)

print("✅ Summary saved to data/groq_summary.txt")
print("\nSummary Preview:\n", summary[:500] + "...")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

✅ Summary saved to data/groq_summary.txt

Summary Preview:
 Bert Mueller, a 35-year-old American, founded California Burrito, a successful Mexican-inspired restaurant chain in India.  He overcame challenges like sourcing ingredients and adapting flavors to the local palate by investing in his own avocado farm and building a strong supply chain.  With 103 stores and $23 million in revenue, the company aims to expand to 300 stores by 2030 and potentially go public.  Mueller embraces the challenges and rewards of living and working in India, finding the cul...


In [42]:
import spacy
from spacy import displacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")  # You can also use en_core_web_lg for better accuracy

# Read the summarized text from a file (you can also replace this with your own string)
with open("data/groq_summary.txt", "r") as f:
    summary_text = f.read()

# Apply the NLP pipeline
doc = nlp(summary_text)

# Print the named entities
print("Named Entities:\n")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

# Optional: visualize entities in browser
displacy.serve(doc, style="ent",port = 5005)

Named Entities:

Bert Mueller (PERSON)
35-year-old (DATE)
American (NORP)
California Burrito (ORG)
Mexican (NORP)
India (GPE)
103 (CARDINAL)
$23 million (MONEY)
300 (CARDINAL)
2030 (DATE)
Mueller (PERSON)
India (GPE)





Using the 'ent' visualizer
Serving on http://0.0.0.0:5005 ...

Shutting down server on port 5005.


In [44]:
import json
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# --- Required Inputs ---
# Replace these with your actual variables
full_transcript = open("data/cleaned_transcript.txt", "r").read()
summary = open("data/groq_summary.txt", "r").read()  # optional, if you've saved it
chunk_size = 5

# Split and create chunks
sentences = full_transcript.strip().split('.')
transcript_chunks = [
    {
        "chunk_id": i + 1,
        "text": '. '.join(sentences[i:i + chunk_size]).strip()
    }
    for i in range(0, len(sentences), chunk_size)
]

# Extract named entities
doc = nlp(full_transcript)
entities = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]

# Create JSON structure
transcript_json = {
    "transcript_id": "california_burrito_story",
    "summary": summary.strip(),
    "entities": entities,
    "transcript_chunks": transcript_chunks
}

# Save to file
with open("data/transcript_data.json", "w") as f:
    json.dump(transcript_json, f, indent=4)

print("✅ transcript_data.json saved successfully.")

✅ transcript_data.json saved successfully.


In [None]:
# if the data is large then i would use json format in faiss otherwise not

In [None]:
import re
import nltk
import faiss
import numpy as np
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer

# Download punkt tokenizer from nltk
nltk.download('punkt')

# Load the model for sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Or use any other model

# Load transcript
with open("data/cleaned_transcript.txt", "r") as f:
    raw_text = f.read()

# 2. Convert sentences to embeddings
embeddings = model.encode(sentences)

# 3. Convert embeddings to numpy array
embeddings = np.array(embeddings)

# 4. Create the FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance for similarity search
index.add(embeddings)  # Add embeddings to the index

# Save the FAISS index (optional, to reload it later)
faiss.write_index(index, 'data/embeddings/faiss_index.index')

# 5. Perform a Search
def search(query, top_k=3):
    # Convert the query to embeddings
    query_embedding = model.encode([query])

    # Search the FAISS index for the top_k most similar sentences
    distances, indices = index.search(np.array(query_embedding), top_k)

    # Retrieve and print the top_k most similar sentences
    results = []
    for i in range(top_k):
        result = {
            "sentence": sentences[indices[0][i]],  # Get sentence by index
            "distance": distances[0][i]  # Get distance from query
        }
        results.append(result)
    
    return results

# Example search query
query = "Tell me about the restaurant journey"
search_results = search(query)

# Print search results
for result in search_results:
    print(f"Sentence: {result['sentence']}\nDistance: {result['distance']}\n")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guranshchugh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sentence: 
started Mexican restaurants
Distance: 0.9324820041656494

Sentence: 
cuisine to India
Distance: 1.0712473392486572

Sentence:   when I looked at starting a Mexican-inspired restaurant in India, there was just Taco Bell
Distance: 1.1184529066085815

