In [1]:
pip install langchain openai faiss-cpu tiktoken python-dotenv


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Sample corpus: four one-sentence lines wrapped in a leading and trailing newline.
text_data = (
    "\n"
    "AI Research Interns build ETL pipelines for LLMs.\n"
    "They use vector databases like FAISS and Chroma.\n"
    "RAG systems combine retrieval with generation.\n"
    "Chunking improves LLM context handling.\n"
)


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import FakeEmbeddings

# -----------------------------
# 1️⃣ EXTRACT
# -----------------------------
def extract_text():
    """Return the hard-coded sample corpus that feeds the pipeline.

    The value is a fixed in-memory string: a leading newline, four sentences
    each indented by four spaces and ended with a newline, then a final run
    of four spaces. Nothing is read from disk or the network.

    Returns:
        str: The sample text, whitespace included.
    """
    sample = (
        "\n"
        "    AI Research Interns build ETL pipelines for LLMs.\n"
        "    They use vector databases like FAISS and Chroma.\n"
        "    RAG systems combine retrieval with generation.\n"
        "    Chunking improves LLM context handling.\n"
        "    "
    )
    return sample

# -----------------------------
# 2️⃣ TRANSFORM (CHUNKING)
# -----------------------------
def chunk_text(text, chunk_size=80, chunk_overlap=10):
    """Split ``text`` into chunks using ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            80, the value that was previously hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The result of ``splitter.split_text(text)`` (a list of string chunks
        according to the LangChain documentation).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# -----------------------------
# 3️⃣ LOAD (VECTOR DATABASE)
# -----------------------------
def load_to_faiss(chunks, embeddings=None):
    """Embed ``chunks`` and index them in a FAISS vector store.

    Args:
        chunks: Iterable of text chunks to index. It is materialised into a
            list before being handed to ``FAISS.from_texts``.
        embeddings: Optional embedding model to use. When omitted, a
            ``FakeEmbeddings(size=384)`` instance is created, which is what
            this function always used before the parameter existed.

    Returns:
        The vector store produced by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` contains no items, since there would be
            nothing to index.
    """
    texts = list(chunks)
    if not texts:
        raise ValueError("load_to_faiss() needs at least one chunk to index")

    if embeddings is None:
        # NOTE(review): FakeEmbeddings is a testing placeholder; per the
        # LangChain docs its vectors are not derived from the text's meaning,
        # so similarity ranking is not semantically meaningful. Pass a real
        # embedding model via ``embeddings`` for genuine retrieval.
        embeddings = FakeEmbeddings(size=384)

    return FAISS.from_texts(texts, embeddings)

# -----------------------------
# 4️⃣ RETRIEVE (RAG - RETRIEVAL PART)
# -----------------------------
def retrieve_context(vector_store, question, k=2):
    """Fetch the chunks most similar to ``question`` and join them into one string.

    Args:
        vector_store: Object exposing ``similarity_search(query, k=...)`` whose
            results each have a ``page_content`` attribute.
        question: The query text passed to ``similarity_search``.
        k: Number of results requested from the store. Defaults to 2, the
            value that was previously hard-coded.

    Returns:
        str: The ``page_content`` of each returned document, separated by
        newlines (an empty string when the search returns nothing).
    """
    docs = vector_store.similarity_search(question, k=k)
    return "\n".join(doc.page_content for doc in docs)

# -----------------------------
# RUN PIPELINE
# -----------------------------
if __name__ == "__main__":
    # End-to-end demo of the pipeline: extract -> chunk -> index -> retrieve.
    text = extract_text()
    chunks = chunk_text(text)
    vector_store = load_to_faiss(chunks)

    question = "What is the purpose of chunking in LLMs?"
    context = retrieve_context(vector_store, question)

    # sep="" stops print() from inserting its default space after the newline,
    # which previously indented the first retrieved line by one character.
    print("Retrieved Context:\n", context, sep="")


Retrieved Context:
 Chunking improves LLM context handling.
RAG systems combine retrieval with generation.
