In [1]:
# First, install required packages
!pip install -q transformers sentence-transformers faiss-cpu torch datasets



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from datasets import load_dataset
import textwrap
import re


In [3]:
# Load a lightweight LLM model for generation
# model_name = "facebook/opt-350m"  # You can change this to other models
# Load a better model - using TinyLlama as it's more capable but still lightweight
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto"
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
# Initialize the text generation pipeline with better parameters
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,  # Increased max length
    do_sample=True,
    temperature=0.3,  # Reduced temperature for more focused responses
    top_p=0.9,
    truncation=True,
    pad_token_id=tokenizer.eos_token_id
)

In [10]:
# Load sentence transformer for embedding
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Expanded knowledge base with more detailed information
knowledge_base = [
    "The capital of France is Paris. Paris is famous for the Eiffel Tower, which stands 324 meters tall and was completed in 1889. The city is also known for its art museums, cuisine, and fashion.",
    "Python is a high-level programming language created by Guido van Rossum in 1991. It emphasizes code readability with notable use of whitespace and is widely used in web development, data science, and artificial intelligence.",
    "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.",
    "The Great Wall of China stretches over 13,000 miles across China. Construction began more than 2,000 years ago during the Ming Dynasty.",
    "William Shakespeare was an English playwright who wrote Romeo and Juliet, along with many other famous plays and sonnets in the late 16th century.",
    "Presiden Negara Indonesia pada tahun 2024 adalah Bapak Prabowo Subianto. Dia akan memimpin sampai tahun 2029.",
]

In [11]:
# Create FAISS index for efficient similarity search
embedding_size = embedder.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_size)

# Convert documents to embeddings and add to FAISS index
embeddings = embedder.encode(knowledge_base)
index.add(embeddings.astype('float32'))



In [7]:
def retrieve_relevant_context(query, k=1):  # Reduced k to 1 for more focused context
    """Retrieve the k most relevant documents for the query."""
    query_embedding = embedder.encode([query])
    distances, indices = index.search(query_embedding.astype('float32'), k)
    relevant_docs = [knowledge_base[i] for i in indices[0]]
    return "\n".join(relevant_docs)

def clean_response(text):
    """Clean up the generated response."""
    # Remove repetitive question and context
    text = re.sub(r'Question:.*?Answer:', '', text, flags=re.DOTALL)
    # Remove the initial "Let me help you with that"
    text = re.sub(r'Let me help you with that\.?\s*', '', text)
    # Remove any trailing incomplete sentences
    text = re.sub(r'[^.!?]+$', '', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def generate_response(query, context):
    """Generate a response using the LLM with the retrieved context."""
    prompt = f"""<|im_start|>system
You are a helpful AI assistant. Use the provided context to answer questions accurately and concisely.
<|im_end|>
<|im_start|>user
Context: {context}

Question: {query}
<|im_end|>
<|im_start|>assistant
Let me answer based on the provided context."""

    try:
        response = generator(
            prompt,
            max_length=512,
            do_sample=True,
            temperature=0.3,
            num_return_sequences=1,
            return_full_text=True
        )

        generated_text = response[0]['generated_text']
        # Extract the assistant's response
        assistant_response = generated_text.split("<|im_start|>assistant")[-1]
        # Clean up the response
        cleaned_response = clean_response(assistant_response)
        return cleaned_response

    except Exception as e:
        return f"Error generating response: {str(e)}"

def ask_question(question):
    """Ask a question and get a response using the RAG system."""
    try:
        print("🔍 Retrieving relevant context...")
        context = retrieve_relevant_context(question)
        print("\nRelevant context found:")
        print(textwrap.fill(context, width=80))

        print("\n🤖 Generating response...")
        response = generate_response(question, context)

        print("\nFinal response:")
        print(textwrap.fill(response, width=80))

    except Exception as e:
        print(f"Error: {str(e)}")


inference

In [12]:
# Test the RAG system
print("Testing the improved RAG system...\n")
test_questions = [
    # "What is the capital of France and what is it known for?",
    # "Can you tell me about Python programming language and who created it?",
    # "What is machine learning?",
    # "How can I start learning machine learning?",
    # "Where is Cocos Island?",
    "Siapa presiden negara Indonesia?",
]

for question in test_questions:
    print("-" * 80)
    print(f"Question: {question}")
    ask_question(question)
    print("\n")

Testing the improved RAG system...

--------------------------------------------------------------------------------
Question: Siapa presiden negara Indonesia?
🔍 Retrieving relevant context...

Relevant context found:
Presiden Negara Indonesia pada tahun 2024 adalah Bapak Prabowo Subianto. Dia
akan memimpin sampai tahun 2029.

🤖 Generating response...

Final response:
Let me answer based on the provided context. Bapak Prabowo Subianto will be the
president of Indonesia until 2029.


