In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import singlestoredb as s2
from openai import OpenAI

In [3]:
# Read the API key and connection URI from the environment rather than
# hardcoding them: secrets typed into a notebook are saved with it and leak
# easily when the notebook is shared or committed.
# When a variable is unset, the value falls back to the empty string.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
SINGLESTORE_URI = os.environ.get("SINGLESTORE_URI", "")


In [4]:
# Initialize clients
# OpenAI client authenticated with the key set in the configuration cell.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
# SingleStore connection plus a cursor; both are module-level objects that the
# following cells reuse for every statement and for the final commit.
# NOTE(review): nothing in the visible notebook closes `cursor` or `conn`.
conn = s2.connect(SINGLESTORE_URI)
cursor = conn.cursor()

In [5]:
# Step 1: Create table if not exists
# Schema: integer primary key `id`, TEXT `question` and `answer`, and an
# `embedding` column declared as VECTOR(1536).
# IF NOT EXISTS means that re-running this cell leaves an already existing
# `faqs` table (and its current schema) untouched.
cursor.execute("""
CREATE TABLE IF NOT EXISTS faqs (
    id INT PRIMARY KEY,
    question TEXT,
    answer TEXT,
    embedding VECTOR(1536)
);
""")

0

In [12]:
# Step 2: Sample data
# Each FAQ is written as one (question, answer) pair so the two texts cannot
# drift out of alignment; the pairs are then split into the column-oriented
# dict that the DataFrame is built from.
faq_pairs = [
    ("What is AI?",
     "AI is the simulation of human intelligence in machines."),
    ("What is Pinecone?",
     "Pinecone is a vector database that enables efficient search and retrieval."),
    ("How do embeddings work?",
     "Embeddings are a numerical representation of text that captures semantic meaning."),
    ("What is GPT?",
     "GPT is a transformer-based model that generates human-like text."),
    ("What is machine learning?",
     "Machine learning is a subset of AI where machines learn from data to make decisions."),
]
qa_data = {
    "question": [question_text for question_text, _ in faq_pairs],
    "answer": [answer_text for _, answer_text in faq_pairs],
}
df = pd.DataFrame(qa_data)
# Show one randomly chosen row as a quick sanity check of the frame.
df.sample()

Unnamed: 0,question,answer
2,How do embeddings work?,Embeddings are a numerical representation of t...


In [16]:
import numpy as np

# Seeded generator so the placeholder embeddings are the same on every run.
dummy_rng = np.random.default_rng(42)

# Store one row per FAQ, using a random 1536-element float32 vector as a
# stand-in for a real embedding.
# REPLACE is used instead of INSERT: the ids are the fixed DataFrame index
# values, so a plain INSERT fails with a duplicate-primary-key error as soon
# as this cell is run a second time. REPLACE overwrites the existing row.
for i, row in df.iterrows():
    # Serialized as raw float32 bytes (1536 * 4 bytes per vector).
    dummy_embedding = dummy_rng.random(1536, dtype=np.float32).tobytes()
    cursor.execute(
        "REPLACE INTO faqs (id, question, answer, embedding) VALUES (%s, %s, %s, %s)",
        (i, row['question'], row['answer'], dummy_embedding)
    )
# Make all rows visible in a single commit once the loop has finished.
conn.commit()

In [18]:
def retrieve_similar(query, top_k=3):
    """Return up to ``top_k`` ``(question, answer)`` rows from the ``faqs`` table.

    This is a placeholder retrieval step: ``query`` is accepted but never
    used, and the statement has no ORDER BY, so the rows that come back are
    whichever ones the database yields first, not the most similar ones.

    Args:
        query: The user's question (currently ignored).
        top_k: Maximum number of rows to fetch.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the limited SELECT.
    """
    limit_params = (top_k,)
    cursor.execute("SELECT question, answer FROM faqs LIMIT %s", limit_params)
    rows = cursor.fetchall()
    return rows

def generate_answer(query):
    """Build a simulated answer string for ``query`` without calling a model.

    The rows returned by ``retrieve_similar(query)`` are rendered one per line
    as ``"<question>: <answer>"`` and embedded, together with the original
    question, in a fixed ``[SIMULATED ANSWER]`` template.

    Args:
        query: The user's question.

    Returns:
        The formatted placeholder answer.
    """
    context_lines = []
    for question_text, answer_text in retrieve_similar(query):
        context_lines.append(f"{question_text}: {answer_text}")
    context = "\n".join(context_lines)
    return f"[SIMULATED ANSWER]\nContext used:\n{context}\n\nOriginal Question: {query}"

In [19]:
# Run the simulated pipeline end to end on one sample question and print the
# question together with the generated (placeholder) answer.
query = "What is machine learning - please explain in 300 words"
answer = generate_answer(query)
print(f"Question: {query}\nAnswer: {answer}")

Question: What is machine learning - please explain in 300 words
Answer: [SIMULATED ANSWER]
Context used:
What is GPT?: GPT is a transformer-based model that generates human-like text.
What is Pinecone?: Pinecone is a vector database that enables efficient search and retrieval.
What is AI?: AI is the simulation of human intelligence in machines.

Original Question: What is machine learning - please explain in 300 words


In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import singlestoredb as s2
import numpy as np
from openai import OpenAI

# Read the API key and connection URI from the environment rather than
# hardcoding them: secrets typed into a notebook are saved with it and leak
# easily when the notebook is shared or committed.
# When a variable is unset, the original placeholder values are used.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-openai-api-key")
SINGLESTORE_URI = os.environ.get("SINGLESTORE_URI", "")

# Initialize clients
# OpenAI client authenticated with the key configured just above.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
# SingleStore connection plus a cursor; the rest of this cell uses `cursor`
# for every statement and `conn` for the commit.
# NOTE(review): nothing in the visible notebook closes `cursor` or `conn`.
conn = s2.connect(SINGLESTORE_URI)
cursor = conn.cursor()

# Step 1: Create table if not exists
# Schema: integer primary key `id`, TEXT `question` and `answer`, and an
# `embedding` column declared as BLOB (raw float32 bytes are stored in it
# further down in this cell).
# NOTE(review): an earlier cell of this notebook creates `faqs` with
# `embedding VECTOR(1536)`. Because of IF NOT EXISTS, this BLOB definition is
# skipped when that table already exists - confirm which schema is intended.
cursor.execute("""
CREATE TABLE IF NOT EXISTS faqs (
    id INT PRIMARY KEY,
    question TEXT,
    answer TEXT,
    embedding BLOB
);
""")

# Step 2: Sample data
# FAQs are listed as (question, answer) pairs and then transposed into the
# {"question": [...], "answer": [...]} layout the DataFrame is created from.
faq_pairs = (
    ("What is AI?",
     "AI is the simulation of human intelligence in machines."),
    ("What is Pinecone?",
     "Pinecone is a vector database that enables efficient search and retrieval."),
    ("How do embeddings work?",
     "Embeddings are a numerical representation of text that captures semantic meaning."),
    ("What is GPT?",
     "GPT is a transformer-based model that generates human-like text."),
    ("What is machine learning?",
     "Machine learning is a subset of AI where machines learn from data to make decisions."),
)
qa_data = {
    column: list(values)
    for column, values in zip(("question", "answer"), zip(*faq_pairs))
}
df = pd.DataFrame(qa_data)

# Step 3: Insert data with actual OpenAI embeddings (converted to BLOB)
# For every FAQ the answer text is embedded with text-embedding-ada-002 and
# the vector is stored as raw float32 bytes next to the question and answer.
# REPLACE is used instead of INSERT: the ids are the fixed DataFrame index
# values, so a plain INSERT fails with a duplicate-primary-key error whenever
# the rows already exist (for example after re-running this cell). REPLACE
# overwrites the existing row instead.
for i, row in df.iterrows():
    response = openai_client.embeddings.create(input=[row['answer']], model="text-embedding-ada-002")
    # float32 matches the dtype the retrieval step uses to decode the bytes.
    embedding = np.array(response.data[0].embedding, dtype=np.float32).tobytes()
    cursor.execute(
        "REPLACE INTO faqs (id, question, answer, embedding) VALUES (%s, %s, %s, %s)",
        (i, row['question'], row['answer'], embedding)
    )
# Commit once after all rows have been written.
conn.commit()

# Step 4: Retrieval function using OpenAI embedding (non-indexed fallback)
def retrieve_similar(query, top_k=3):
    """Rank every stored FAQ by cosine similarity to ``query``.

    The query is embedded with text-embedding-ada-002, all rows of ``faqs``
    are fetched, each stored embedding is decoded from float32 bytes, and the
    cosine similarity is computed client-side with NumPy.

    Args:
        query: The user's question.
        top_k: Number of best-scoring rows to return.

    Returns:
        A list of ``("FAQ <id>", answer, score)`` tuples, highest score first.
    """
    embedding_response = openai_client.embeddings.create(input=[query], model="text-embedding-ada-002")
    query_vector = np.array(embedding_response.data[0].embedding, dtype=np.float32)

    def cosine(stored_vec):
        # Dot product divided by the product of the two vector norms.
        return np.dot(stored_vec, query_vector) / (np.linalg.norm(stored_vec) * np.linalg.norm(query_vector))

    cursor.execute("SELECT id, answer, embedding FROM faqs")
    ranked = [
        (cosine(np.frombuffer(emb_blob, dtype=np.float32)), faq_id, answer_text)
        for faq_id, answer_text, emb_blob in cursor.fetchall()
    ]
    # Tuples sort by score first; ties fall back to id, then answer text.
    ranked.sort(reverse=True)

    return [
        (f"FAQ {faq_id}", answer_text, score)
        for score, faq_id, answer_text in ranked[:top_k]
    ]

# Step 5: GPT generation function

def generate_answer(query, top_k=3, model="gpt-4-turbo", max_tokens=500, temperature=0.7):
    """Answer ``query`` with a chat model, grounded on retrieved FAQ entries.

    The best matches from ``retrieve_similar`` are rendered one per line as
    ``"<label>: <answer>"`` and sent as context, together with the question,
    in a single user message.

    Args:
        query: The user's question.
        top_k: Number of retrieved FAQ entries to include as context.
        model: Chat completion model name.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature passed to the model.

    Returns:
        The content of the first choice of the chat completion.
    """
    matches = retrieve_similar(query, top_k=top_k)
    # The similarity score (third tuple element) is not shown to the model.
    context = "\n".join(f"{label}: {answer_text}" for label, answer_text, _ in matches)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Use this context to answer the question:\n{context}\n\nQuestion: {query}"}
    ]
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature
    )
    return response.choices[0].message.content

# Step 6: Test the RAG system
# Runs only when this code executes as the main module: asks one sample
# question, generates an answer with retrieved context, and prints both.
if __name__ == "__main__":
    query = "What is machine learning - please explain in 300 words"
    answer = generate_answer(query)
    print(f"\nQuestion: {query}\n\nAnswer: {answer}")