In [2]:
import os
import numpy as np
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from concurrent.futures import ThreadPoolExecutor, as_completed
from numpy.linalg import norm
from tqdm import tqdm

# Do not hardcode (or blank out) the API key in the notebook: assigning '' here
# would overwrite a key already exported in the environment. Keep an existing
# value and only ask for one, without echoing it, when none is set.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Shared client used by the embedding and chat-completion helpers below.
client = OpenAI()

# RAG (Retrieval-Augmented Generation)

En esta práctica implementaremos un sistema RAG (Retrieval-Augmented Generation).

Como retrieval corpus (o proprietary knowledge base), usaremos los libros en inglés de Harry Potter, con un cambio fundamental: los nombres de los personajes estarán modificados según reza la siguiente lista:


In [4]:
# Directory holding the precomputed artifacts for this exercise.
embeddings_dir = '../data/practica_rag/embeddings/'

# Load both arrays from that directory, embeddings first and texts second.
# Presumably row i of `embeddings` belongs to `texts[i]` (the search helper
# indexes `texts` with positions computed from `embeddings`) - TODO confirm.
embeddings, texts = (
    np.load(os.path.join(embeddings_dir, file_name))
    for file_name in ("embeddings.npy", "texts.npy")
)

In [5]:
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for a single piece of text.

    Args:
        text: String to embed; newline characters are replaced by spaces
            before the request is sent.
        model: Name of the embedding model passed to the API.

    Returns:
        The `embedding` attribute of the first item in the response's `data`.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [6]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    Computed as the dot product divided by the product of the two norms.
    No special handling is applied when either norm is zero.
    """
    dot_product = np.dot(a, b)
    magnitude = norm(a) * norm(b)
    return dot_product / magnitude

def search_embeddings(query, top_k=5):
    """Return the `top_k` stored texts most similar to `query`.

    The query is embedded with `get_embedding` and compared, by cosine
    similarity, against every row of the module-level `embeddings` array.

    Args:
        query: Text to search for.
        top_k: Maximum number of fragments to return. Values <= 0 yield an
            empty list (previously `top_k=0` returned the whole corpus because
            of the `[-0:]` slice).

    Returns:
        A list of entries from `texts`, ordered from most to least similar.
    """
    if top_k <= 0:
        # Nothing requested: skip the embedding API call entirely.
        return []

    query_embedding = np.asarray(get_embedding(query))

    # One matrix-vector product replaces the per-row Python loop.
    # Assumes `embeddings` is a 2-D (n_fragments, dim) array - TODO confirm.
    similarities = (embeddings @ query_embedding) / (
        norm(embeddings, axis=1) * norm(query_embedding)
    )

    # Ascending argsort, reversed, then truncated: best match first.
    top_k_indices = np.argsort(similarities)[::-1][:top_k]
    return [texts[i] for i in top_k_indices]

In [7]:
# Function to query GPT with search results as context
def query_gpt_with_context(query, top_k=10, model="gpt-3.5-turbo", verbose=True):
    """Answer `query` with a chat model, using retrieved fragments as context.

    Args:
        query: Question to answer.
        top_k: Number of fragments requested from `search_embeddings`.
        model: Chat-completion model name. Defaults to "gpt-3.5-turbo", the
            value that used to be hard-coded.
        verbose: When True (the default, matching the previous behaviour),
            print the full prompt including the retrieved context.

    Returns:
        The `content` of the first choice's message in the API response.
    """
    context_results = search_embeddings(query, top_k=top_k)
    context = "\n\n".join(context_results)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
    if verbose:
        print(f"This is the full prompt with the context of selected fragments:\n\n {prompt}")

    messages = [
        {"role": "system", "content": "Answer the question based on the provided context."},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

Reemplacemos ahora los nombres en una lista de preguntas de Harry Potter y ¡pongamos a prueba nuestro RAG!

In [8]:
# Example question; "Villalba" is presumably one of the substituted character
# names described in the notebook text - verify against the name list.
query = "What is the name of Villalba's faithful companion and dog"
# Uses the function's default top_k; the call also prints the full prompt.
answer = query_gpt_with_context(query)
print("Answer:", answer)

This is the full prompt with the context of selected fragments:

 Context:
Villalba was standing outside his hut, one hand on the collar of his enormous black boarhound, Fang. There were several open wooden crates on the ground at his feet, and Fang was whimpering and straining at his collar, apparently keen to investigate the contents more closely. As they drew nearer, an odd rattling noise reached their ears, punctuated by what sounded like minor explosions.

Roberto, Diego, and Carolina had always known that Villalba had an unfortunate liking for large and monstrous creatures. During their first year at Hogwarts he had tried to raise a dragon in his little wooden house, and it would be a long time before they forgot the giant, three- headed dog he'd christened "Fluffy." And if, as a boy, Villalba had heard that a monster was hidden somewhere in the castle, Roberto was sure he'd have gone to any lengths for a glimpse of it. He'd probably thought

Villalba was sitting in his shirtslee