# L2: Build Your First Semantic Cache

In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

## Load the FAQ Dataset

In [2]:
import pandas as pd
import numpy as np
import time

from cache.faq_data_container import FAQDataContainer

# Build the data container once, then pull out the FAQ table and the
# evaluation queries it exposes.
faq_data = FAQDataContainer()
faq_df, test_df = faq_data.faq_df, faq_data.test_df

Loaded 8 FAQ entries
Loaded 80 test queries


In [3]:
# Preview the first rows of the FAQ table; `.style` wraps them in a pandas Styler for rich display.
faq_df.head().style

Unnamed: 0,id,question,answer
0,0,How do I get a refund?,"To request a refund, visit your orders page and select **Request Refund**. Refunds are processed within 3-5 business days."
1,1,Can I reset my password?,Click **Forgot Password** on the login page and follow the email instructions. Check your spam folder if you don't see the email.
2,2,Where is my order?,Use the tracking link sent to your email after shipping. Orders typically arrive within 2-7 business days depending on your location.
3,3,How long is the warranty?,All electronic products include a 12-month warranty from the purchase date. Extended warranties are available for purchase.
4,4,Do you ship internationally?,"Yes, we ship to over 50 countries worldwide. International shipping fees and delivery times vary by destination."


## Create Embeddings for Semantic Search

In [4]:
from sentence_transformers import SentenceTransformer

# General-purpose sentence encoder used for the in-memory cache below.
encoder = SentenceTransformer("all-mpnet-base-v2")

# Embed every FAQ question once up front; row i of the matrix corresponds to
# row i of faq_df.
faq_questions = faq_df["question"].tolist()
faq_embeddings = encoder.encode(faq_questions)

print(f"Sample (first 10 dimensions): {faq_embeddings[0][:10]}")

Sample (first 10 dimensions): [ 0.02746388  0.04248317 -0.02679512  0.03786724 -0.03876131 -0.00109462
 -0.01158369  0.03070284 -0.00653773 -0.01955782]


## Implement Semantic Search

In [5]:
def cosine_dist(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Compute cosine distance (1 - cosine similarity) between vectors.

    Args:
        a: 2-D array of shape (n, d); each row is one vector.
        b: Either a single vector of shape (d,) or a 2-D array of shape
            (m, d) holding m vectors.

    Returns:
        An array of shape (n,) when ``b`` is 1-D (distance from every row of
        ``a`` to ``b``), or of shape (n, m) when ``b`` is 2-D (pairwise
        distances between rows of ``a`` and rows of ``b``).

    Zero-norm vectors have no defined direction; their similarity is treated
    as 0 (distance 1) instead of producing NaN, because ``np.argmin`` would
    otherwise select the NaN entry as the "closest" match.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    a_norm = np.linalg.norm(a, axis=1)

    if b.ndim == 1:
        dots = a @ b
        denom = a_norm * np.linalg.norm(b)
    else:
        # Pairwise case: transpose b so the feature axes line up, and build
        # the matching (n, m) grid of norm products.
        dots = a @ b.T
        denom = np.outer(a_norm, np.linalg.norm(b, axis=1))

    # Divide only where the denominator is non-zero; remaining entries keep
    # the similarity of 0 they were initialised with.
    sim = np.zeros(dots.shape, dtype=np.result_type(dots, denom))
    np.divide(dots, denom, out=sim, where=denom != 0)
    return 1 - sim


def semantic_search(query: str) -> tuple:
    """Locate the FAQ question whose embedding is closest to ``query``.

    The query is embedded with the module-level ``encoder`` and compared
    against every row of ``faq_embeddings`` using ``cosine_dist``.

    Returns:
        ``(best_idx, best_distance)``: the integer row position of the
        nearest FAQ embedding and its cosine distance to the query.
    """
    query_vec = encoder.encode([query])[0]
    dists = cosine_dist(faq_embeddings, query_vec)

    # Smallest distance == most similar question.
    nearest = int(np.argmin(dists))
    return nearest, dists[nearest]

In [6]:
# Try a paraphrased refund question against the FAQ embeddings.
idx, distance = semantic_search("How long will it take to get a refund for my order?")

best_match = faq_df.iloc[idx]
print(f"Most similar FAQ: {best_match['question']}")
print(f"Answer: {best_match['answer']}")
print(f"Cosine distance: {distance:.3f}")

Most similar FAQ: How do I get a refund?
Answer: To request a refund, visit your orders page and select **Request Refund**. Refunds are processed within 3-5 business days.
Cosine distance: 0.331


## Build a Simple Semantic Cache

In [7]:
def check_cache(query: str, distance_threshold: float = 0.3):
    """Look ``query`` up in the in-memory semantic cache.

    Args:
        query: The user question to match against cached FAQ questions.
        distance_threshold: Largest cosine distance still counted as a hit
            (inclusive).

    Returns:
        On a hit, a dict with the matched ``prompt``, its ``response`` and
        the ``vector_distance`` as a plain float; ``None`` on a miss.
    """
    idx, distance = semantic_search(query)

    # Written as a negated <= so a NaN distance is treated as a miss.
    if not (distance <= distance_threshold):
        return None  # Cache miss

    matched_row = faq_df.iloc[idx]
    return {
        "prompt": matched_row["question"],
        "response": matched_row["answer"],
        "vector_distance": float(distance),
    }

In [8]:
# Two paraphrases of the refund question plus one topic the FAQ does not cover.
test_queries = [
    "Is it possible to get a refund?",
    "I want my money back",
    "What are your business hours?",  # Should miss
]

for query in test_queries:
    result = check_cache(query, distance_threshold=0.3)
    if result:
        # Status markers repaired from mis-decoded bytes to the intended emoji.
        print(f"✅ HIT: '{query}' -> {result['response'][:50]}...")
        print(f"   Distance: {result['vector_distance']:.3f}\n")
    else:
        print(f"❌ MISS: '{query}'\n")

✅ HIT: 'Is it possible to get a refund?' -> To request a refund, visit your orders page and se...
   Distance: 0.262

❌ MISS: 'I want my money back'

❌ MISS: 'What are your business hours?'



### Add entries to the cache

In [9]:
def add_to_cache(question: str, answer: str):
    """Add a new Q&A pair to the simple in-memory cache.

    Extends both the module-level ``faq_df`` DataFrame and the
    ``faq_embeddings`` matrix so that row i of one keeps matching row i of
    the other.

    Args:
        question: The question text; it is embedded with ``encoder``.
        answer: The response to return when this question is matched.

    Only the ``question`` and ``answer`` columns are populated for the new
    row; any other columns already present in ``faq_df`` are left as NaN.
    """
    global faq_df, faq_embeddings

    # Do all the work that can fail (encoding, stacking, concatenation)
    # before touching the globals, so an error cannot leave the DataFrame
    # and the embeddings matrix with different row counts.
    new_embedding = encoder.encode([question])
    updated_embeddings = np.vstack([faq_embeddings, new_embedding])

    new_row = pd.DataFrame({"question": [question], "answer": [answer]})
    updated_df = pd.concat([faq_df, new_row], ignore_index=True)

    # Commit both structures together.
    faq_df, faq_embeddings = updated_df, updated_embeddings

    print(f"✅ Added to cache: '{question}'")

In [10]:
print("Original cache size:", len(faq_df))

# (question, answer) pairs for topics the original FAQ does not cover.
new_entries = [
    (
        "What are your business hours?",
        "We're open Monday-Friday 9 AM to 6 PM EST. Weekend support is available for urgent issues.",
    ),
    (
        "Do you have a mobile app?",
        "Yes! Our mobile app is available on both iOS and Android. Search for 'CustomerApp' in your app store.",
    ),
    (
        "How do I update my payment method?",
        "Go to Account Settings > Payment Methods to add, edit, or remove payment options.",
    ),
]

for question, answer in new_entries:
    # Skip questions that are already cached so that re-running this cell
    # does not append duplicate rows and embeddings.
    if question in faq_df["question"].values:
        print(f"Already in cache, skipping: '{question}'")
        continue
    add_to_cache(question, answer)

print(f"\nCache now has {len(faq_df)} total entries")

Original cache size: 8
✅ Added to cache: 'What are your business hours?'
✅ Added to cache: 'Do you have a mobile app?'
✅ Added to cache: 'How do I update my payment method?'

Cache now has 11 total entries


In [11]:
# Paraphrases of the three questions that were just added to the cache.
test_extended_queries = [
    "What time do you open?",
    "Is there a phone app?",
    "How can I change my payment method?",
]

for query in test_extended_queries:
    result = check_cache(query, distance_threshold=0.3)
    if result:
        # Status markers repaired from mis-decoded bytes to the intended emoji.
        print(f"✅ HIT: '{query}' -> {result['response'][:50]}...")
        print(f"   Distance: {result['vector_distance']:.3f}\n")
    else:
        print(f"❌ MISS: '{query}'\n")

✅ HIT: 'What time do you open?' -> We're open Monday-Friday 9 AM to 6 PM EST. Weekend...
   Distance: 0.289

✅ HIT: 'Is there a phone app?' -> Yes! Our mobile app is available on both iOS and A...
   Distance: 0.265

✅ HIT: 'How can I change my payment method?' -> Go to Account Settings > Payment Methods to add, e...
   Distance: 0.118



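## Setup Redis
The rest of this notebook needs a running Redis Stack server on port 6379. The Homebrew and Docker commands below are alternatives; you only need one of them.

### Uninstall regular Redis if you installed it
```bash
brew uninstall redis
```

### Install Redis Stack (Homebrew)
```bash
brew tap redis-stack/redis-stack
brew install redis-stack
```

### Start Redis Stack (Docker)
```bash
docker run -d -p 6379:6379 --name redis-stack redis/redis-stack:latest
```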

In [12]:
import os

# Connection string for Redis; override by exporting REDIS_URL, otherwise
# fall back to a local server on the default port.
REDIS_URL = os.environ.get("REDIS_URL", "redis://localhost:6379")

In [13]:
import redis

# Fail fast if Redis is unreachable: every later cell depends on this client.
try:
    r = redis.Redis.from_url(REDIS_URL)
    r.ping()
    print("✅ Redis is running and accessible")
except redis.ConnectionError:
    # Print a readable hint, then re-raise so the notebook stops here.
    print("❌ Cannot connect to Redis")
    raise

✅ Redis is running and accessible


### Using a Cache-Optimized Embedding Model (langcache-embed-v1)
Model card: [redis/langcache-embed-v1](https://huggingface.co/redis/langcache-embed-v1)

In [14]:
from redisvl.utils.vectorize import HFTextVectorizer
from redisvl.extensions.cache.embeddings import EmbeddingsCache

# Back the vectorizer with a Redis-hosted embeddings cache (ttl=3600,
# presumably seconds, i.e. one hour; confirm in the redisvl docs).
embeddings_cache = EmbeddingsCache(redis_client=r, ttl=60 * 60)

langcache_embed = HFTextVectorizer(
    model="redis/langcache-embed-v1",
    cache=embeddings_cache,
)

13:19:10 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps
13:19:10 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: redis/langcache-embed-v1


### Create the Redis Semantic Cache

In [15]:
from redisvl.extensions.cache.llm import SemanticCache

# Semantic cache stored in Redis under the name "faq-cache". It embeds prompts
# with `langcache_embed`, talks to Redis through the client `r`, and uses the
# same 0.3 distance threshold as the in-memory `check_cache` above.
cache = SemanticCache(
    name="faq-cache",
    vectorizer=langcache_embed,
    redis_client=r,
    distance_threshold=0.3
)

13:19:12 redisvl.index.index INFO   Index already exists, not overwriting.


### Load the Cache with FAQ Data

In [16]:
# Store every FAQ question/answer pair in the Redis semantic cache.
for faq_question, faq_answer in zip(faq_df["question"], faq_df["answer"]):
    cache.store(prompt=faq_question, response=faq_answer)

In [17]:
# Look up a paraphrased refund question in the Redis-backed cache.
result = cache.check("I need a refund for my purchase")

In [18]:
# Display the lookup result (a bare last expression is rendered as the cell output).
result

[{'entry_id': '60fd55b8527fcd2bf427d81dc3f4c47c4bf8904c9802ffecbcf2c02b38f537ac',
  'prompt': 'How do I get a refund?',
  'response': 'To request a refund, visit your orders page and select **Request Refund**. Refunds are processed within 3-5 business days.',
  'vector_distance': 0.256069898605,
  'inserted_at': 1763846352.57,
  'updated_at': 1763846352.57,
  'key': 'faq-cache:60fd55b8527fcd2bf427d81dc3f4c47c4bf8904c9802ffecbcf2c02b38f537ac'}]

### Implement a TTL (time-to-live) policy to keep the cache fresh

In [19]:
# 24 h * 60 min * 60 s = 86400 (TTL presumably expressed in seconds; confirm in the redisvl docs).
cache.set_ttl(24 * 60 * 60)

## End-to-End LLM Example

In [20]:
from cache.config import load_openai_key
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

# Make the OpenAI API key available before constructing the client.
load_openai_key()

# Kept as a constant because the evaluation loop also passes it to
# perf_eval.record_llm_call.
MODEL_NAME = "gpt-4o-mini"

# Chat model used as the fallback when the cache misses.
llm = ChatOpenAI(
    model=MODEL_NAME,
    temperature=0.1,
    max_tokens=150,
)

> OpenAI API key is already loaded in the environment


In [21]:
def get_llm_response(question: str) -> str:
    """Ask the module-level ``llm`` to answer a customer-support question.

    Args:
        question: The customer's question, inserted verbatim into the prompt.

    Returns:
        The model's reply text with leading and trailing whitespace removed.
    """
    support_prompt = f"""
    You are a helpful customer support assistant. Answer this customer question concisely and professionally:
    
    Question: {question}
    
    Provide a helpful response in 1-2 sentences. If you don't have specific information, give a general helpful response.
    """
    # The whole prompt is sent as a single human message.
    reply = llm.invoke([HumanMessage(content=support_prompt)])
    reply_text = reply.content
    return reply_text.strip()

In [22]:
from cache.evals import PerfEval

# Mix of paraphrases of cached FAQ topics and questions on new topics.
test_questions = [
    "How can I get my money back?",
    "I want a refund please",
    "What's your return policy?",
    "I forgot my password",
    "Can you help me reset my password?",
    "What are your shipping costs?",
    "Do you offer installation services?",
    "Can I schedule a phone call with support?",
    "How do I cancel my subscription?",
    "How much does shipping cost?",
    "I need to cancel my account",
]

# Create the tracker and tell it how many queries the run will contain.
perf_eval = PerfEval()
perf_eval.set_total_queries(len(test_questions))

In [23]:
# For every question: check the semantic cache first and only call the LLM on
# a miss. Timings are recorded under "cache_hit", "cache_miss" and "llm_call".
with perf_eval:
    for i, question in enumerate(test_questions, 1):
        print(f"\n[{i}] Question: '{question}'")

        # Start timing the cache lookup.
        perf_eval.start()

        if cached_result := cache.check(question):
            # Cache HIT: cache.check returns a list of matches; report the first.
            perf_eval.tick("cache_hit")
            hit = cached_result[0]
            print(f"    ✅ CACHE HIT (distance: {hit['vector_distance']:.3f})")
            print(f"    📋 Cached question: {hit['prompt'][:80]}...")
            print(f"    📋 Cached response: {hit['response'][:80]}...")
        else:
            # Cache MISS - call LLM
            perf_eval.tick("cache_miss")  # Time for cache check
            print("    ❌ CACHE MISS")
            print("    🤖 Calling LLM... ", end="")

            # Restart the timer so "llm_call" measures only the LLM request.
            perf_eval.start()
            llm_response = get_llm_response(question)
            perf_eval.tick("llm_call")
            perf_eval.record_llm_call(MODEL_NAME, question, llm_response)
            print(f"    💬 LLM response: {llm_response[:80]}...")
            # Store the fresh answer so later paraphrases can be served from the cache.
            cache.store(prompt=question, response=llm_response)


[1] Question: 'How can I get my money back?'
    ✅ CACHE HIT (distance: 0.249)
    📋 Cached question: How do I get a refund?...
    📋 Cached response: To request a refund, visit your orders page and select **Request Refund**. Refun...

[2] Question: 'I want a refund please'
    ✅ CACHE HIT (distance: 0.161)
    📋 Cached question: How do I get a refund?...
    📋 Cached response: To request a refund, visit your orders page and select **Request Refund**. Refun...

[3] Question: 'What's your return policy?'
    ❌ CACHE MISS
    🤖 Calling LLM... 13:19:14 httpx INFO   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
13:19:14 openai._base_client INFO   Retrying request to /chat/completions in 0.449784 seconds
13:19:16 httpx INFO   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
13:19:16 openai._base_client INFO   Retrying request to /chat/completions in 0.803757 seconds
13:19:17 

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Mean of the durations recorded under the "cache_hit" label (presumably seconds; confirm against PerfEval).
np.mean(perf_eval.durations_by_label['cache_hit'])

np.float64(0.030449322291782925)

In [None]:
# Mean of the durations recorded under the "llm_call" label (presumably seconds; confirm against PerfEval).
np.mean(perf_eval.durations_by_label['llm_call'])

np.float64(0.027960777282714844)

<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"><b>Note:</b> In the above experiment we measure the latency of the cache response and a mocked latency of an LLM call. The mocked LLM call is a dummy function that sleeps for a random amount of time, so the randomness in the results mainly comes from the randomness we introduced to mock the LLM. The results show what we can typically see in practice. If you run this notebook with <code>get_llm_response</code> as defined above, the LLM timings come from real OpenAI API calls rather than the mock.</p>

In [None]:
# Clean up at the end of the lesson (presumably removes the stored entries from Redis; confirm in the redisvl docs).
cache.clear()