In [64]:
import os
import pdfplumber
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import markdown
from bs4 import BeautifulSoup
import re
import unicodedata
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fati1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Document Parsing

### Data Loading 

Set the path of the document to summarize here (PDF, Markdown, or TXT).

In [65]:
# Path of the document to process; parse_file picks the parser from its extension.
file_path = './problemStatement.md'

Get the file extension and parse the file accordingly.

In [66]:
def get_file_extension(file_path):
    """Return the lower-cased extension of ``file_path``, leading dot included.

    A path without an extension yields an empty string.
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower()


In [67]:
def parse_file(file_path):
    """Extract plain text from a PDF, Markdown, or TXT file.

    The parser is chosen from the (lower-cased) file extension:

    * ``.pdf``             -> text of every page, each followed by a newline
    * ``.md``/``.markdown`` -> Markdown rendered to HTML, then reduced to its text
    * ``.txt``             -> file contents read as UTF-8

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    file_extension = get_file_extension(file_path)

    if file_extension == '.pdf':
        with pdfplumber.open(file_path) as pdf:
            # A page without extractable text (e.g. a scanned image) can yield
            # None or an empty string depending on the pdfplumber release;
            # `or ''` keeps the concatenation from raising TypeError.
            return ''.join((page.extract_text() or '') + '\n' for page in pdf.pages)

    if file_extension in ('.md', '.markdown'):
        with open(file_path, "r", encoding="utf-8") as file:
            html = markdown.markdown(file.read())
        # Strip the HTML tags again so only the readable text remains.
        return BeautifulSoup(html, "html.parser").get_text()

    if file_extension == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    raise ValueError(f"Unsupported file type: {file_extension}")
    

    

In [68]:
# Parse the document at file_path into one raw text string.
raw_text = parse_file(file_path)

In [69]:
# Show the parsed text before any cleaning is applied.
print(raw_text)

Project Title: Document Summarization using Retrieval-Augmented
Generation (RAG)
Objective:
To develop a summarization system that combines retrieval-based context
selection with large language model (LLM) generation. The system should
accept a long document and generate a concise, coherent summary using
semantic chunking and RAG.

Project Tasks:
1. Document Ingestion
● Accept documents in PDF, TXT, or Markdown format.
● Split into semantically meaningful chunks using sliding windows or
semantic segmenters.
2. Embedding & Retrieval
● Convert chunks to vector embeddings using SentenceTransformers or
OpenAI API. ● Store in FAISS or Chroma vector DB.
● Perform semantic retrieval for a general summary query (e.g.,
"Summarize this document").
3. Summary Generation
● Use top-k retrieved chunks and pass them into a pre-trained LLM
(e.g., GPT, LLaMA, Mistral).
● Generate a final summary that is coherent, fluent, and accurate.
4. Output Presentation
● Display the retrieved context and the gener

### Preprocessing

Cleaning the text

In [70]:

def clean_text(text):
    """Normalize raw extracted document text before chunking.

    Steps, in order: normalize line endings and spacing, apply NFKD unicode
    normalization, terminate bullet items with a period and drop their
    markers, remove table fragments, figure/table captions, numeric
    footnotes, figure references in parentheses and URLs, re-join words
    hyphenated across lines, replace remaining non-ASCII characters with a
    space, and finally end every line with sentence punctuation.

    Args:
        text: Raw text as returned by the file parser.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    # Normalize line breaks and spaces
    text = re.sub(r'\r\n|\r', '\n', text)           # Convert \r\n or \r to \n
    text = re.sub(r'\n{2,}', '\n\n', text)          # Collapse many newlines into 2
    text = re.sub(r'[ \t]+', ' ', text)             # Remove extra spaces/tabs

    # NFKD splits accented letters into base letter + combining mark, so the
    # ASCII-only filter further down keeps the base letter.
    text = unicodedata.normalize("NFKD", text)

    def add_period_to_bullet(match):
        # Use only the item text (group 1). Using the whole match would keep
        # the original marker, and the marker-removal pass below strips just
        # one marker per line, leaving a stray "- " in the output.
        item = match.group(1).strip()
        if item and not item.endswith(('.', '!', '?', ':')):
            item += '.'
        return "- " + item

    # Terminate bullet items with a period (before removing the markers).
    # Only horizontal whitespace is allowed around the marker so that a match
    # can never swallow a newline and merge neighbouring lines.
    text = re.sub(r'(?m)^[ \t]*[-*+][ \t]+(.*)', add_period_to_bullet, text)
    # Remove common bullet points
    text = re.sub(
        r'[\u2022\u2023\u25E6\u2043\u2219\u25AA\u25AB\u25CB\u25CF\u25A0\u25B8\u29BE\u29BF]',
        '', text)

    # Remove markdown or ASCII-style tables
    text = re.sub(r'\|.*?\|', '', text)      # Remove markdown tables
    text = re.sub(r'[-=]{3,}', '', text)     # Remove underlines in tables
    text = re.sub(r'^\s*[\-\*+]\s+', '', text, flags=re.MULTILINE)  # Bulleted list lines

    # Remove figure/table/image captions
    text = re.sub(r'(Figure|Table|Image|Chart|Diagram)\s*\d+[\.:]?', '', text, flags=re.IGNORECASE)

    # Remove bracketed footnotes like [1], [12], (Fig. 3), etc.
    text = re.sub(r'\[\d+\]', '', text)
    # [^()]* keeps the match inside ONE pair of parentheses, and \b stops
    # words such as "configuration" from counting as a figure reference.
    text = re.sub(r'\([^()]*\bfig[^()]*\)', '', text, flags=re.IGNORECASE)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Fix line breaks and hyphens split across lines
    text = re.sub(r'-\n', '', text)  # Remove hyphenated line-breaks
    text = re.sub(r'\n+', '\n', text)  # Collapse newlines
    text = re.sub(r'[ \t]+', ' ', text)  # Normalize spaces

    # Strip remaining non-ASCII or odd symbols
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Before every \n add a period unless the line already ends with . ! ? or :
    text = re.sub(r'(?<![.!?:])\n', '. \n', text)

    return text.strip()


In [71]:
# Clean the raw text and print the result for inspection.
text = clean_text(raw_text)
print(text)

Project Title: Document Summarization using Retrieval-Augmented. 
Generation (RAG). 
Objective:
To develop a summarization system that combines retrieval-based context. 
selection with large language model (LLM) generation. The system should. 
accept a long document and generate a concise, coherent summary using. 
semantic chunking and RAG.
Project Tasks:
1. Document Ingestion. 
 Accept documents in PDF, TXT, or Markdown format.
 Split into semantically meaningful chunks using sliding windows or. 
semantic segmenters.
2. Embedding & Retrieval. 
 Convert chunks to vector embeddings using SentenceTransformers or. 
OpenAI API. Store in FAISS or Chroma vector DB.
 Perform semantic retrieval for a general summary query (e.g.,. 
"Summarize this document").
3. Summary Generation. 
 Use top-k retrieved chunks and pass them into a pre-trained LLM. 
(e.g., GPT, LLaMA, Mistral).
 Generate a final summary that is coherent, fluent, and accurate.
4. Output Presentation. 
 Display the retrieved conte

## Chunking

Load the embedding model from Hugging Face (downloaded on first use).

In [72]:
# Embedding model shared by the semantic chunker, the Chroma store and the query embedding below.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

We will use the `SemanticChunker` from LangChain.

In [73]:
# Semantic chunker driven by embedding_model, using the "percentile" breakpoint threshold type.
semantic_chunker = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")


In [74]:
# Split the cleaned text into semantic chunks.
chunks = semantic_chunker.split_text(text)

# One metadata record per chunk, in the same order as `chunks`.
metadatas = [
    {"source": file_path, "chunk_index": position, "length": len(chunk_body)}
    for position, chunk_body in enumerate(chunks)
]


In [75]:
# Report the chunk count, then each chunk's character length and content (numbered from 1).
print(f"Total number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1} length: {len(chunk)} characters")
    print(f"Chunk {i+1}:\n{chunk}\n")

Total number of chunks: 6

Chunk 1 length: 1208 characters
Chunk 1:
Project Title: Document Summarization using Retrieval-Augmented. Generation (RAG). Objective:
To develop a summarization system that combines retrieval-based context. selection with large language model (LLM) generation. The system should. accept a long document and generate a concise, coherent summary using. semantic chunking and RAG. Project Tasks:
1. Document Ingestion. Accept documents in PDF, TXT, or Markdown format. Split into semantically meaningful chunks using sliding windows or. semantic segmenters. 2. Embedding & Retrieval. Convert chunks to vector embeddings using SentenceTransformers or. OpenAI API. Store in FAISS or Chroma vector DB. Perform semantic retrieval for a general summary query (e.g.,. "Summarize this document"). 3. Summary Generation. Use top-k retrieved chunks and pass them into a pre-trained LLM. (e.g., GPT, LLaMA, Mistral). Generate a final summary that is coherent, fluent, and accurate. 4. 

## Vector Storage

In [76]:
# Chroma vector store configured with persist_directory="chroma_store";
# texts added to it are embedded with embedding_model.
db = Chroma(
    persist_directory="chroma_store",
    embedding_function=embedding_model
)

In [None]:
# De-duplicate the chunks before inserting them. A plain set() would lose the
# chunk order and break the pairing with `metadatas`, so keep the first
# occurrence of each chunk together with its own metadata record instead.
first_metadata_by_chunk = {}
for chunk_text, chunk_metadata in zip(chunks, metadatas):
    first_metadata_by_chunk.setdefault(chunk_text, chunk_metadata)

docs = list(first_metadata_by_chunk)
doc_metadatas = list(first_metadata_by_chunk.values())

db.add_texts(texts=docs, metadatas=doc_metadatas)

In [None]:
# Ask Chroma to persist the store (db was created with persist_directory="chroma_store").
# NOTE(review): presumably redundant or deprecated on newer Chroma releases — confirm against the installed version.
db.persist()

  db.persist()


## Query Embedding

Embed the query

In [None]:
# Question used for retrieval and, later, as the "question" of the RAG prompt.
query = "What is the project about?"
# NOTE(review): query_embedding is not referenced by any later cell visible here — confirm whether it is still needed.
query_embedding = embedding_model.embed_query(query)

## Similarity Search

In [None]:
# Expose the Chroma store as a retriever (as_retriever() is called with no arguments).
retriever = db.as_retriever()

In [None]:
# Perform the similarity search: fetch the documents the retriever returns for `query`.
results = retriever.get_relevant_documents(query)

In [None]:

# Show the query, then every retrieved document (numbered from 1) with its metadata.
print(f"Query: {query}\n")
for i, result in enumerate(results):
    print(f"Result {i+1}:\n{result.page_content}\n")
    print(f"Metadata: {result.metadata}\n")

Query: What is the project about?

Result 1:
encouraged to be creative and modular in design. We will evaluate your. effort and understanding, not just the final output. Submission: ZIP file containing code, a PDF report, and sample. results.

Metadata: {'source': './problemStatement.md', 'chunk_index': 3, 'length': 197}

Result 2:
Documentation. Max Points. 15. 15. 20. 20. 10. 10. 10. Evaluation Criteria. Clean loading, chunking, and formatting. Efficient use of vector DB and embeddings. Relevance of selected content to the document's core idea. Fluency, coverage, and accuracy of the summary. Clear, modular, and reproducible code. Display of retrieved content and generated results. ReadMe clarity, report explanation,and visual aids. Submission Requirements:
 Python code with requirements.txt or environment.yml. ReadMe with setup and usage guide. Sample summarization runs for at least 3 different documents. PDF report (2 pages max).

Metadata: {'source': './problemStatement.md', 'chunk

## LLM

### Summarization

In [None]:
# Load the summarization checkpoint exactly once; loading it a second time
# under another name only costs download time and memory.
summarization_model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer_sum = AutoTokenizer.from_pretrained(summarization_model_name)
model_sum = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_name)
# Alias kept so the previously defined name `summarization_model` still exists.
summarization_model = model_sum

# Generation pipeline for the summarizer; max_length is passed through to the pipeline.
pipe_sum = pipeline(
    "text2text-generation",
    model=model_sum,
    tokenizer=tokenizer_sum,
    max_length=1024
)

# Wrap it with LangChain
llm_sum = HuggingFacePipeline(pipeline=pipe_sum)


Device set to use cpu


In [None]:

def get_summary(vectorstore, llm_sum, k=5, max_chars=3000):
    """Summarize the stored document from its most central chunks.

    All chunk embeddings are read from the vector store, their centroid is
    computed, and the ``k`` chunks whose embeddings lie closest to that
    centroid are joined and passed to the summarization LLM.

    Args:
        vectorstore: Store exposing ``_collection.get(include=[...])`` that
            returns a mapping with ``"documents"`` and ``"embeddings"``.
        llm_sum: Object with an ``invoke(text)`` method returning the summary.
        k: Maximum number of chunks to feed to the model.
        max_chars: Character cap applied to the joined chunks before they are
            sent to the model; ``None`` disables the cap.

    Returns:
        Whatever ``llm_sum.invoke`` returns for the assembled input text.

    Raises:
        ValueError: If the store contains no embeddings.
    """
    # NOTE: `_collection` is a private attribute of the store wrapper; it is
    # used here because the raw embeddings are needed.
    collection = vectorstore._collection
    all_docs = collection.get(include=["documents", "embeddings"])

    if all_docs["embeddings"] is None or len(all_docs["embeddings"]) == 0:
        raise ValueError("No embeddings found in the vectorstore!")

    embeddings = np.array(all_docs["embeddings"])
    documents = all_docs["documents"]

    # Centroid of all embeddings, shaped (1, dim) so it broadcasts over rows.
    centroid = np.mean(embeddings, axis=0).reshape(1, -1)

    # Euclidean distance of every chunk to the centroid.
    distances = np.linalg.norm(embeddings - centroid, axis=1)

    # Smallest distance first: the chunks nearest the centroid are the most
    # representative ones. (Sorting in descending order would pick the
    # outliers instead.) A stable sort keeps ties in store order.
    central_indices = np.argsort(distances, kind="stable")[:k]
    central_chunks = [documents[i] for i in central_indices]

    # The model receives only the chunk text; no instruction prefix is sent.
    input_text = " ".join(central_chunks)
    if max_chars is not None and len(input_text) > max_chars:
        input_text = input_text[:max_chars]

    return llm_sum.invoke(input_text)

In [None]:
# Call the summarization helper by the name it is defined under (get_summary);
# no cell visible in this notebook defines get_summary_from_central_chunks,
# so the old call raises NameError on a fresh kernel.
print(
    get_summary(db, llm_sum, k=5)
)


 This is a simplified project focusing on the fundamentals of summarization using pre-trained models and vector search . The system should accept a long document and generate a concise, coherent summary using semantic chunking and RAG . We will evaluate your effort and understanding, not just the final output .


### General Query Answering

In [None]:
# Checkpoint used for question answering over the retrieved context.
model_name = "google/flan-t5-base"

# Load the tokenizer and seq2seq model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Generation pipeline for the QA model; max_length is passed through to the pipeline.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)

# Wrap it with LangChain
llm = HuggingFacePipeline(pipeline=pipe)


Device set to use cpu


Now generate the final prompt by combining the query with the retrieved chunks.

In [None]:
# Build the prompt context from the retrieved documents; the [:1] slice keeps only the first result.
context = "\n\n".join(doc.page_content for doc in results[:1])


In [None]:

# Prompt for the RAG answer step: {context} receives the retrieved text and
# {question} the user query. The template's leading indentation is part of the
# string that is sent to the model.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    Use the following context to answer the question at the end. 
    Even if the question is not directly answered in the context, say "It's not clearly mentioned but my best guess is"
    and use the context to provide a guess.
    Give a detailed answer based on the context provided.
    Context:
    {context}

    Question:
    {question}

    Answer:"""
)


In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from langchain.chains import LLMChain

# Chain that fills prompt_template and sends the result to `llm` (the flan-t5 pipeline wrapper).
rag_chain = LLMChain(
    llm=llm,
    prompt=prompt_template
)


In [None]:

# Run the chain with the retrieved context and the user query, then print the answer.
response = rag_chain.run({
    "context": context,
    "question": query
})

print(response)


We will evaluate your effort and understanding, not just the final output. Submission: ZIP file containing code, a PDF report, and sample. results.


In [None]:
# Drop the Chroma collection. NOTE(review): presumably cleanup so a re-run starts from an empty store — confirm.
db.delete_collection()