## 1. Data ingestion pipeline


In [61]:
from pathlib import Path
from langchain_classic.document_loaders import PyMuPDFLoader

In [62]:
# read all the pdfs inside the directory
def process_all_pdfs(directory):
    '''Load every PDF found under ``directory`` using PyMuPDF.

    The directory tree is searched recursively. Files are matched on a
    case-insensitive ``.pdf`` suffix and processed in sorted path order, so the
    order of the returned documents is reproducible between runs. A file that
    fails to load is reported and skipped; it never aborts the whole batch.

    Args:
        directory: Folder to scan (``str`` or ``Path``).

    Returns:
        A flat list holding whatever ``PyMuPDFLoader(...).load()`` returned for
        each readable PDF, in processing order. The list is empty when no PDF
        is found, including when ``directory`` is not an existing directory.
    '''
    all_documents = []
    pdf_dir = Path(directory)

    # Path.glob() yields entries in filesystem-dependent order, and a literal
    # '*.pdf' pattern misses 'REPORT.PDF' on case-sensitive filesystems. Filter
    # on the lower-cased suffix and sort so the document order is stable.
    candidates = pdf_dir.rglob('*') if pdf_dir.is_dir() else ()
    pdf_files = sorted(
        path for path in candidates
        if path.is_file() and path.suffix.lower() == '.pdf'
    )

    print(f"\n====== Found {len(pdf_files)} PDF files to process ======")

    for file in pdf_files:
        print(f"\nProcessing: {file.name} file")

        # Keep the try body minimal: only the load itself is expected to fail.
        try:
            documents = PyMuPDFLoader(str(file)).load()
        except Exception as e:
            # Deliberate best-effort: report the unreadable file and move on.
            print(f"‚ùå Error processing {file.name}: {e}")
            continue

        # .extend() adds the individual items, keeping the result flat
        all_documents.extend(documents)

        print(
            f"\n‚úÖ Successfully Loaded <{len(documents)}> pages from {file.name}")
        print("=" * 50)

    print(f"\n\nTotal documents loaded: <{len(all_documents)}>")
    return all_documents

In [63]:
# Load every PDF found (recursively) under data/pdfs into one flat list of documents.
all_pdf_docs = process_all_pdfs("data/pdfs")



Processing: Deep Learning 101.pdf file

‚úÖ Successfully Loaded <266> pages from Deep Learning 101.pdf

Processing: DeepSeek_OCR_paper.pdf file

‚úÖ Successfully Loaded <22> pages from DeepSeek_OCR_paper.pdf

Processing: mathematics-ML.pdf file

‚úÖ Successfully Loaded <266> pages from mathematics-ML.pdf

Processing: ML.pdf file

‚úÖ Successfully Loaded <169> pages from ML.pdf

Processing: pp_report_1.pdf file

‚úÖ Successfully Loaded <14> pages from pp_report_1.pdf

Processing: PP_REPORT_2.pdf file

‚úÖ Successfully Loaded <9> pages from PP_REPORT_2.pdf


Total documents loaded: <746>


In [64]:
# Inspect the first loaded document (its metadata and page_content).
all_pdf_docs[0]

Document(metadata={'producer': 'xdvipdfmx (20250205); modified using OpenPDF UNKNOWN', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-10-15T19:40:49+11:00', 'source': 'data\\pdfs\\Deep Learning 101.pdf', 'file_path': 'data\\pdfs\\Deep Learning 101.pdf', 'total_pages': 266, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-15T22:07:21+05:30', 'trapped': '', 'modDate': "D:20251015220721+05'30'", 'creationDate': "D:20251015194049+11'00'", 'page': 0}, page_content='')

## 2. Splitting documents into chunks


In [65]:
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter

In [66]:
def split_docs(documents, chunk_size=1000, chunk_overlap=200):
    '''Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Sized collection of documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to
            1000, the value this notebook previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 200, the previously hard-coded value.

    Returns:
        The list of chunked documents produced by the splitter.
    '''
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunked_documents = text_splitter.split_documents(documents)

    print("\n‚úÖDocument Chunked successfully!")
    print(
        f"Splitted <{len(documents)}> documents into <{len(chunked_documents)}> chunks.")
    print("=" * 50)

    return chunked_documents

In [67]:
# Chunk all loaded PDF documents; the chunks are what gets embedded below.
chunks = split_docs(all_pdf_docs)


‚úÖDocument Chunked successfully!
Splitted <746> documents into <1445> chunks.


## 3. Embeddings and vector store


In [68]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.vectorstores import FAISS

In [69]:
def embed_and_store(chunks, model_name="BAAI/bge-small-en-v1.5",
                    index_path="faiss_index", device="cpu"):
    '''Embed document chunks, build a FAISS vector store, and save it locally.

    Args:
        chunks: Chunked documents to embed (e.g. the output of ``split_docs``).
        model_name: Hugging Face embedding model id. Defaults to the model this
            notebook previously hard-coded.
        index_path: Folder passed to ``vectorstore.save_local``. Defaults to
            ``"faiss_index"``.
        device: Device string forwarded to the embedding model via
            ``model_kwargs``. Defaults to ``"cpu"``.

    Returns:
        The populated FAISS vector store, or ``None`` when ``chunks`` is empty
        or any step fails. Failures are printed rather than raised, so callers
        must check for ``None`` before using the result.
    '''
    # Nothing to index: bail out before paying for the embedding model load.
    if not chunks:
        print("‚ùå Error during embedding and storing: no chunks were provided")
        return None

    try:
        print("\nEmbedding Initiallizing...")
        print("=" * 50)

        embedding_model = HuggingFaceEmbeddings(
            model_name=model_name, show_progress=True,
            model_kwargs={
                'device': device
            },
            encode_kwargs={
                'batch_size': 32,
                'normalize_embeddings': True
            }

        )

        print("\nVectorStore Initializing...")
        print("=" * 50)

        # Creates a new FAISS index from scratch.
        # NOTE(review): embeddings are normalized above; confirm how
        # FAISS.from_documents interprets the plain string 'COSINE'.
        vectorstore = FAISS.from_documents(
            documents=chunks,
            embedding=embedding_model,
            distance_strategy='COSINE'  # Better for normalized embeddings
        )

        print("\n‚úÖ Embedding and Storing in FAISS Vectorstore successful!")

        print(f"\nVector dimension: {vectorstore.index.d}")
        print(f"üìä Total vectors: {vectorstore.index.ntotal}")
        print("=" * 50)

        # Memory usage (approximate): vectors * dimensions * 4 bytes,
        # i.e. assumes 4-byte components -- TODO confirm for other index types.
        memory_mb = (vectorstore.index.ntotal *
                     vectorstore.index.d * 4) / (1024 * 1024)
        print(f"üíæ Approximate memory: {memory_mb:.2f} MB")

        print(f"Total Vectors in the store: <{vectorstore.index.ntotal}>")
        print("=" * 50)

        # Save
        vectorstore.save_local(str(index_path))
        print("\n‚úÖ Successfully saved the FAISS index locally")

        return vectorstore

    except Exception as e:
        # Best-effort: report the failure and signal it with an explicit None.
        print(f"‚ùå Error during embedding and storing: {e}")
        return None

In [70]:
# Embed the chunks, build the FAISS store, and save it to the local "faiss_index" folder.
vectorstore = embed_and_store(chunks)


Embedding Initiallizing...

VectorStore Initializing...


Batches:   0%|          | 0/46 [00:00<?, ?it/s]


‚úÖ Embedding and Storing in FAISS Vectorstore successful!

Vector dimension: 384
üìä Total vectors: 1445
üíæ Approximate memory: 2.12 MB
Total Vectors in the store: <1445>

‚úÖ Successfully saved the FAISS index locally


## Reloading the saved index (optional)


In [71]:
# # Load later
# vectorstore = FAISS.load_local("faiss_index", embeddings=HuggingFaceEmbeddings(
#     model="BAAI/bge-small-en-v1.5"), allow_dangerous_deserialization=True)

# vectorstore.add_documents(new_chunks)

## 4. Create RAG pipeline


In [72]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_classic.prompts import ChatPromptTemplate
from dotenv import load_dotenv
load_dotenv()

True

In [73]:
def simple_rag(query, vectorstore, k=3, model_name="gemini-2.5-pro", verbose=True):
    '''Answer ``query`` with a minimal retrieve-then-generate pipeline.

    The ``k`` most similar chunks are fetched from ``vectorstore``, joined into
    one context string, placed in a prompt together with the question, and
    sent to a Google Generative AI chat model.

    Args:
        query: The user question.
        vectorstore: Object exposing ``similarity_search(query=..., k=...)``
            whose results carry a ``page_content`` attribute.
        k: Number of chunks to retrieve. Defaults to 3, the value previously
            hard-coded.
        model_name: Chat model identifier. Defaults to ``"gemini-2.5-pro"``,
            the value previously hard-coded.
        verbose: When true (the default, matching the earlier behavior), print
            the retrieved context after the model has responded.

    Returns:
        The ``content`` attribute of the chat model response.

    Raises:
        ValueError: If ``vectorstore`` is ``None`` (for example because
            building the index failed).
    '''
    # Fail with a clear message instead of an AttributeError on None.
    if vectorstore is None:
        raise ValueError(
            "vectorstore is None; build or load the vector store before calling simple_rag")

    # Retrieve similar documents
    similar_docs = vectorstore.similarity_search(
        query=query,
        k=k
    )

    # Initialize the Google Generative AI chat model
    chat_model = ChatGoogleGenerativeAI(
        model=model_name
    )

    # Combine the content of the retrieved documents into one context string
    context = "\n\n".join(doc.page_content for doc in similar_docs)

    prompt = ChatPromptTemplate.from_template(
        '''
    Using the following context to answer the question below. 
    If the context is insufficient, please try to generate the answer based on your own knowledge:
    <context>
    {context}
    </context>

    question: {query}
    '''
    )

    prompt = prompt.format_prompt(
        context=context,
        query=query
    )

    # Generate a response using the chat model
    response = chat_model.invoke(prompt)

    # Echo the retrieved context so the reader can judge retrieval quality.
    if verbose:
        print(context)

    return response.content

In [79]:
# Ask a sample question; simple_rag also prints the retrieved context it used.
query = "what is llm fine-tuning?"
rag_response = simple_rag(query, vectorstore)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[Intermediate]
7.1
Parameter Norm Penalties
[Intermediate]
Parameter norm penalties constrain model capacity by penalizing large weights.
7.1.1
Intuition: Shrinking the Model’s “Complexity”
Think of a model as a musical band with many instruments (parameters). If every instrument plays loudly (large weights), the
result can be noisy and overfit to the training song. Norm penalties are like asking the band to lower the volume uniformly
(L2) or mute many instruments entirely (L1) so the melody (true signal) stands out. This discourages memorization and
encourages simpler patterns that generalize.
7.1.2
L2 Regularization (Weight Decay)
Add squared L2 norm of weights to the loss:
$$\tilde{L}(\theta) = L(\theta) + \frac{\lambda}{2}\lVert w \rVert^2 \qquad (7.1)$$
91

12.2
Natural Language Processing
[Beginner]
12.2.1
Text Classification
Categorize text documents using pretrained transformers and task heads; fine-tuning is data-efficient and standard
Devlin2018; Prince2023; D2LChapterAttention.
• Sentiment analysis: Positive/

In [80]:
# Show the model's answer text returned by simple_rag.
print(rag_response)

Based on the provided context and general knowledge, here is an explanation of LLM fine-tuning:

LLM fine-tuning is the process of taking a large, pre-trained language model (like BERT, RoBERTa, or GPT) and further training it on a smaller, task-specific dataset. The goal is to adapt the model's general capabilities to excel at a particular task.

From the context:

*   **Application:** The context provides a specific example of fine-tuning for **text classification**. It describes a process where a "pretrained transformer" is adapted to categorize documents for tasks like sentiment analysis, spam detection, or topic classification.
*   **Method:** This is achieved by adding a "task head" (e.g., a Softmax head for classification) to the pre-trained model and then continuing the training process.
*   **Benefit:** The context highlights that this process is **"data-efficient and standard,"** meaning you don't need a massive dataset to achieve good performance, making it a practical appro