In [1]:
!pip install requests faiss-cpu sentence-transformers python-dotenv

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 30.7/30.7 MB 53.2 MB/s eta 0:00:00
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, faiss-cpu
Successfully installed faiss-cpu-1.10.0 python-dotenv-1.0.1


In [2]:
import torch
import os
import requests
import faiss
from google.colab import userdata
from transformers import pipeline
from sentence_transformers import SentenceTransformer

In [4]:
# Resolve the Groq API key: try the Colab secret store first, then fall back to the
# GROQ_API_KEY environment variable so the error message below describes real options.
try:
    api_key = userdata.get("GROQ_API_KEY")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    # Secret is missing or this notebook was not granted access to it.
    api_key = None
api_key = api_key or os.environ.get("GROQ_API_KEY")
if not api_key:
    raise ValueError(
        "GROQ_API_KEY not found! Add it to Colab Secrets or set the "
        "GROQ_API_KEY environment variable."
    )

# Groq's OpenAI-compatible chat-completions endpoint and the auth headers sent with it.
endpoint = "https://api.groq.com/openai/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

In [14]:
# Shared sentence encoder; the same model embeds both the documents and the queries.
embedding_model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


In [11]:
def load_documents():
    """Return the fixed, in-memory corpus used as the retrieval knowledge base."""
    corpus = (
        "Fast language models improve efficiency in NLP applications by minimizing latency and enhancing user experience.",
        "Machine learning models require high computational power to generate responses quickly in real-time applications.",
        "Retrieval-augmented generation improves response accuracy by combining document retrieval with language models.",
        "FAISS is an efficient library for fast nearest neighbor search, used for document retrieval.",
    )
    # Hand back a new list on every call so callers may mutate their copy freely.
    return list(corpus)

In [12]:
def create_faiss_index(documents):
    """Build an exact L2 FAISS index over the embeddings of ``documents``.

    Args:
        documents: Non-empty sequence of strings to embed and index.

    Returns:
        A tuple ``(index, documents)``: the populated ``faiss.IndexFlatL2`` and the
        same ``documents`` object that was passed in, so that positions returned by
        a search can be mapped back to their texts.

    Raises:
        ValueError: If ``documents`` is ``None`` or empty.
    """
    # Fail early with a clear message; an empty corpus has no embedding dimension.
    if documents is None or len(documents) == 0:
        raise ValueError("Cannot build a FAISS index from an empty document list.")

    # Request NumPy output directly instead of building a tensor and converting it back.
    document_embeddings = embedding_model.encode(documents, convert_to_numpy=True)
    dimension = document_embeddings.shape[1]

    index = faiss.IndexFlatL2(dimension)
    index.add(document_embeddings)

    return index, documents

In [13]:
def retrieve_documents(query, index, documents, k=2):
    """Retrieve the texts of the (at most) ``k`` documents nearest to ``query``.

    Args:
        query: Query string to embed and search with.
        index: FAISS index built over the embeddings of ``documents``.
        documents: Sequence of texts, positionally aligned with the index entries.
        k: Maximum number of documents to return (default 2).

    Returns:
        The retrieved document texts joined by single spaces, nearest first.
        An empty string when ``k`` is not positive or the index is empty.
    """
    # FAISS pads missing results with -1 when k exceeds the number of stored vectors;
    # documents[-1] would then silently repeat the last document. Clamp k up front.
    k = min(k, index.ntotal)
    if k <= 0:
        return ""

    # Request NumPy output directly instead of building a tensor and converting it back.
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)

    # Drop any -1 sentinel ids defensively before mapping ids back to texts.
    retrieved_docs = [documents[i] for i in indices[0] if i >= 0]
    return " ".join(retrieved_docs)

In [15]:

def generate_response(context, query, timeout=60):
    """Send the query with retrieved context to the Groq API and return the response.

    Args:
        context: Retrieved text to ground the answer in.
        query: The user's question or instruction.
        timeout: Seconds to wait for the HTTP request before ``requests`` raises a
            timeout error (default 60). Without it the call could block forever.

    Returns:
        The content of the first choice's message when the API answers with HTTP 200;
        otherwise a string of the form ``"Error <status>: <body>"``.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "llama-3.3-70b-versatile",
        "messages": [
            {
                "role": "user",
                "content": f"Using the following context: '{context}', {query}"
            }
        ]
    }

    # TLS certificate verification is left at its default (enabled): the request
    # carries a Bearer token, so it must never be sent with verify=False.
    response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)

    if response.status_code == 200:
        data = response.json()
        return data["choices"][0]["message"]["content"]
    else:
        return f"Error {response.status_code}: {response.text}"

In [16]:
def main():
    """Run the demo RAG pipeline: load corpus, index it, retrieve context, generate."""
    # Step 1: corpus.
    documents = load_documents()
    print(f"Loaded {len(documents)} predefined documents.")

    # Step 2: vector index over the corpus (the documents come back alongside it).
    faiss_index, documents = create_faiss_index(documents)

    # Step 3: fetch the passages closest to the fixed demo question.
    question = "Explain the importance of fast language models"
    context = retrieve_documents(question, faiss_index, documents)
    print(f"Retrieved Context: {context}")

    # Step 4: ask the LLM to answer using the retrieved passages.
    answer = generate_response(context, question)
    print("Generated Response:", answer)


In [17]:
# Run the demo end to end; prints the retrieved context and the generated response.
main()

Loaded 4 predefined documents.
Retrieved Context: Fast language models improve efficiency in NLP applications by minimizing latency and enhancing user experience. Retrieval-augmented generation improves response accuracy by combining document retrieval with language models.




Generated Response: Fast language models play a vital role in Natural Language Processing (NLP) applications by significantly enhancing efficiency, user experience, and overall performance. The importance of fast language models can be understood from the following aspects:

1. **Minimizing Latency**: Fast language models process and respond to user queries in real-time, minimizing latency and wait times. This is particularly crucial in applications where timely responses are essential, such as customer service chatbots, voice assistants, and real-time language translation.

2. **Enhancing User Experience**: By providing quick and accurate responses, fast language models improve the overall user experience. Users are more likely to engage with an application or system that responds promptly, making it more likely to achieve its intended purpose, whether that's providing information, answering questions, or completing tasks.

3. **Improving Response Accuracy**: When combined with retrie