# Retrival Augmented generation(RAG) for DATA MINING Text Book

# Data Loading and Preprocessing

### Import Libraries

In [1]:
# import necessary libraries
import pymupdf
import os
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bdcalling123\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Data Acquisition

In [2]:
# load PDF files from the specified directory
pdf_path = 'Reference Book.pdf' # data mining textbook
doc = pymupdf.open(pdf_path)

In [3]:
import fitz  # PyMuPDF

pdf_path = "Reference Book.pdf"
with fitz.open(pdf_path) as doc:
    print(f"Num of Pages: {doc.page_count}")
    print(f"Metadata: {doc.metadata}")
    print(f"Table of Contents: {doc.get_toc()}")

    image_count = 0
    link_count = 0
    fonts = set()

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        image_count += len(page.get_images(full=True))
        link_count += len(page.get_links())
        # No official page.get_fonts() — so this part will error!

    embedded = doc.embedded_file_names() if hasattr(doc, "embedded_file_names") else []

    print(f"number of images on the document: {image_count}")
    print(f"number of links on the document: {link_count}")
    print(f"number of embedded files on the document: {len(embedded)}")


Num of Pages: 740
Metadata: {'format': 'PDF 1.6', 'title': 'Data Mining. Concepts and Techniques, 3rd Edition (The Morgan Kaufmann Series in Data Management Systems)', 'author': 'Jiawei Han, Micheline Kamber, Jian Pei', 'subject': 'Morgan Kaufmann 2011', 'keywords': '0123814790\r\n9780123814791', 'creator': '', 'producer': '', 'creationDate': "D:20151005145245+03'00'", 'modDate': "D:20151220163746+03'30'", 'trapped': '', 'encryption': None}
Table of Contents: [[1, 'Front Cover ', 1], [1, 'Data Mining: Concepts and Techniques', 6], [1, 'Copyright', 7], [1, 'Dedication', 8], [1, 'Table of Contents', 10], [1, 'Foreword', 20], [1, 'Foreword to Second Edition', 22], [1, 'Preface', 24], [1, 'Acknowledgments', 32], [1, 'About the Authors', 36], [1, 'Chapter 1. Introduction', 38], [2, '1.1 Why Data Mining?', 38], [2, '1.2 What Is Data Mining?', 42], [2, '1.3 What Kinds of Data Can Be Mined?', 45], [2, '1.4 What Kinds of Patterns Can Be Mined?', 52], [2, '1.5 Which Technologies Are Used?', 60],

### Parsing and Cleaning

In [4]:
import fitz
import unicodedata
import re
from collections import Counter, defaultdict
from nltk.tokenize import sent_tokenize

In [5]:
# Text and formula normalization helpers
def normalize_unicode(text):
    return unicodedata.normalize("NFKC", text)

def fix_hyphenation(text):
    text = re.sub(r'-\s*\n\s*', '', text)
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    return text

def strip_refs_and_pnums(text):
    text = re.sub(r'\[\s*\d+\s*\]', '', text)
    return text

def clean_text(text):
    text = normalize_unicode(text)
    text = fix_hyphenation(text)
    text = strip_refs_and_pnums(text)
    text = re.sub(r'[\x00-\x1f\x7f]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Heading/Section detection (font size and regex)
def is_heading(text, max_span, median_size, heading_scale):
    heading_like = median_size > 0 and max_span >= median_size * heading_scale
    regex_like = bool(re.match(r'^\s*((Chapter|CHAPTER|Section|SECTION)[\s\d.:]+|(\d+\.)+\s+\w+)', text))
    return heading_like or regex_like

def is_tabular(text):
    tabular_keywords = ['table', 'Tab', 'tabular']
    return (text.count('\t') > 2 or text.count('|') > 2 or len(re.findall(r'\s{4,}', text)) > 1
            or any(k.lower() in text.lower() for k in tabular_keywords))

# Formula helpers
_THEOREM_HEADING_RE = re.compile(r'^\s*(Theorem|Lemma|Proposition|Corollary|Definition|Claim|Remark|Proof)\b', re.IGNORECASE)
_LATEX_INLINE_RE = re.compile(r'(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\]|\\\(.+?\\\))', re.DOTALL)
_MATH_SYMBOLS = set(list("αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ∑∏∫√∞≤≥±≈⇒⇔∀∃∈∉∂×÷→←|=^_<>"))

def detect_formulas_in_text(text, symbol_threshold=3, frac_threshold=0.02):
    formulas = []
    for m in _LATEX_INLINE_RE.finditer(text):
        formulas.append(m.group(0).strip())
    candidates = re.split(r'(?<=[\.\;\:\n])\s+', text)
    for seg in candidates:
        seg = seg.strip()
        if not seg: continue
        sym_count = sum(1 for ch in seg if ch in _MATH_SYMBOLS)
        if sym_count >= symbol_threshold or (sym_count / max(1, len(seg)) > frac_threshold):
            if seg not in formulas:
                formulas.append(seg)
    return formulas

def _save_image_from_xref(doc, page, xref, out_dir, prefix="formula"):
    try: pix = fitz.Pixmap(doc, xref)
    except Exception: return None
    os.makedirs(out_dir, exist_ok=True)
    fn = f"{prefix}_p{page.number+1}_xref{xref}.png"
    out_path = os.path.join(out_dir, fn)
    try:
        if pix.n < 5: pix.save(out_path)
        else:
            pix0 = fitz.Pixmap(fitz.csRGB, pix)
            pix0.save(out_path)
            pix0 = None
        pix = None
    except Exception: return None
    return out_path

def _image_bbox_if_available(page, xref):
    if hasattr(page, "get_image_bbox"):
        try: return page.get_image_bbox(xref)
        except Exception: return None
    return None

def extract_pdf_structure(pdf_path, min_block_len=20, heading_scale=1.25, out_image_dir="formulas"):
    doc = fitz.open(pdf_path)
    top_cands, bot_cands = [], []
    page_blocks = []

    for pno in range(doc.page_count):
        page = doc.load_page(pno)
        d = page.get_text("dict")
        blocks = [] ; sizes = []
        for b in d["blocks"]:
            if b.get("type", 0) == 0:
                for line in b.get("lines", []):
                    for span in line.get("spans", []):
                        sizes.append(span.get("size", 0))
        median_size = (sorted(sizes)[len(sizes)//2] if sizes else 0)
        image_xrefs = []
        for img in page.get_images(full=True):
            xref = img[0]
            ibox = _image_bbox_if_available(page, xref)
            image_xrefs.append({"xref": xref, "bbox": ibox})

        for b in d["blocks"]:
            if b.get("type", 0) != 0: continue
            text = " ".join(span.get("text","") for line in b.get("lines", []) for span in line.get("spans", []))
            text = text.strip()
            if not text or len(text) < min_block_len: continue
            max_span = max(span.get("size", 0) for line in b.get("lines", []) for span in line.get("spans", [])) if b.get("lines") else 0
            fonts = [span.get("font","") for line in b.get("lines", []) for span in line.get("spans", [])]
            heading_candidate = is_heading(text, max_span, median_size, heading_scale)
            tabular_candidate = is_tabular(text)
            theorem_like = _THEOREM_HEADING_RE.match(text)
            formulas_in_block = detect_formulas_in_text(text)
            blocks.append({
                "text": text,
                "bbox": b.get("bbox"),
                "is_heading": heading_candidate,
                "is_tabular": tabular_candidate,
                "is_formula": bool(formulas_in_block),  # Not strict: just means math detected
                "formulas": formulas_in_block,
                "fonts": fonts,
                "theorem_like": bool(theorem_like),
                "max_span": max_span
            })

        sorted_blocks = sorted(blocks, key=lambda x: x["bbox"][1])
        if sorted_blocks:
            for b in sorted_blocks[:2]:
                if 0 < len(b["text"]) < 120: top_cands.append(b["text"])
            for b in sorted_blocks[-2:]:
                if 0 < len(b["text"]) < 120: bot_cands.append(b["text"])
        page_blocks.append({"pno": pno + 1, "blocks": blocks, "image_xrefs": image_xrefs})

    header = Counter(top_cands).most_common(1)
    footer = Counter(bot_cands).most_common(1)
    header_text = header[0][0] if header and header[0][1] > max(2, len(doc)//10) else None
    footer_text = footer[0][0] if footer and footer[0][1] > max(2, len(doc)//10) else None

    # Chunk by headings, attach tables/images/formulas, merge/split for optimal size
    chunks = []
    for page in page_blocks:
        current_heading = None
        accum = []
        bbox_accum = []
        tabular_blocks = []
        formulas_in_chunk = []
        formula_images_chunk = []
        for b in page["blocks"]:
            if header_text and b["text"] == header_text: continue
            if footer_text and b["text"] == footer_text: continue
            if b["is_heading"]:
                if accum:
                    raw = " ".join(accum)
                    cleaned = clean_text(raw)
                    if cleaned:
                        # Find formula images by proximity (like your previous proximity logic)
                        chunk_images = []
                        if bbox_accum:
                            xs = [ (bb[0]+bb[2])/2 for bb in bbox_accum ]
                            ys = [ (bb[1]+bb[3])/2 for bb in bbox_accum ]
                            cx = sum(xs)/len(xs); cy = sum(ys)/len(ys)
                            for xrefinfo in page["image_xrefs"]:
                                ib = xrefinfo.get("bbox")
                                if ib:
                                    icx = (ib[0]+ib[2])/2
                                    icy = (ib[1]+ib[3])/2
                                    dist = ((icx-cx)**2 + (icy-cy)**2)**0.5
                                    if dist<=150 or (icx>=min(xs)-10 and icx<=max(xs)+10):
                                        saved = _save_image_from_xref(doc, doc.load_page(page["pno"]-1), xrefinfo["xref"], out_image_dir, prefix="formula")
                                        if saved: chunk_images.append(saved)
                        chunks.append({
                            "source": pdf_path,
                            "pages": (page["pno"], page["pno"]),
                            "heading": current_heading,
                            "text": cleaned,
                            "bbox": bbox_accum,
                            "images": chunk_images,  # Only formula images near the chunk
                            "tables": tabular_blocks,
                            "formulas": formulas_in_chunk,
                            "confidence": 0.9
                        })
                    accum, bbox_accum, tabular_blocks, formulas_in_chunk, formula_images_chunk = [],[],[],[],[]
                current_heading = b["text"]
                continue
            # Main chunk population
            accum.append(b["text"])
            bbox_accum.append(b["bbox"])
            if b["is_tabular"]: tabular_blocks.append(b["text"])
            if b["is_formula"]: formulas_in_chunk.extend(b["formulas"])
        # flush page end
        if accum:
            raw = " ".join(accum)
            cleaned = clean_text(raw)
            if cleaned:
                # Formula image proximity
                chunk_images = []
                if bbox_accum:
                    xs = [ (bb[0]+bb[2])/2 for bb in bbox_accum ]
                    ys = [ (bb[1]+bb[3])/2 for bb in bbox_accum ]
                    cx = sum(xs)/len(xs); cy = sum(ys)/len(ys)
                    for xrefinfo in page["image_xrefs"]:
                        ib = xrefinfo.get("bbox")
                        if ib:
                            icx = (ib[0]+ib[2])/2
                            icy = (ib[1]+ib[3])/2
                            dist = ((icx-cx)**2 + (icy-cy)**2)**0.5
                            if dist<=150 or (icx>=min(xs)-10 and icx<=max(xs)+10):
                                saved = _save_image_from_xref(doc, doc.load_page(page["pno"]-1), xrefinfo["xref"], out_image_dir, prefix="formula")
                                if saved: chunk_images.append(saved)
                chunks.append({
                    "source": pdf_path,
                    "pages": (page["pno"], page["pno"]),
                    "heading": current_heading,
                    "text": cleaned,
                    "bbox": bbox_accum,
                    "images": chunk_images,
                    "tables": tabular_blocks,
                    "formulas": formulas_in_chunk,
                    "confidence": 0.85
                })

    # Postprocess: merge small, split large, preserve formulas/images
    merged = []
    i = 0
    while i < len(chunks):
        cur = chunks[i]
        text = cur["text"]
        if len(text) < 300 and i+1 < len(chunks) and chunks[i+1]["source"] == cur["source"]:
            nxt = chunks[i+1]
            merged.append({
                "source": cur["source"],
                "pages": (cur["pages"][0], nxt["pages"][1]),
                "heading": cur["heading"] or nxt["heading"],
                "text": clean_text(cur["text"] + " " + nxt["text"]),
                "bbox": cur["bbox"] + nxt["bbox"],
                "images": cur["images"] + nxt["images"],
                "tables": cur.get("tables",[]) + nxt.get("tables",[]),
                "formulas": cur.get("formulas",[]) + nxt.get("formulas",[]),
                "confidence": max(cur["confidence"], nxt["confidence"])
            })
            i += 2
        else:
            if len(text) > 1200:
                sents = sent_tokenize(text)
                cur_block = ""
                for s in sents:
                    if len(cur_block) + len(s) + 1 <= 800:
                        cur_block = (cur_block + " " + s).strip()
                    else:
                        merged.append({**cur, "text": cur_block})
                        cur_block = s
                if cur_block:
                    merged.append({**cur, "text": cur_block})
            else:
                merged.append(cur)
            i += 1

    return merged
chunks = extract_pdf_structure(pdf_path)

# Chunk Embedding and Indexing

In [7]:
# Convert to DataFrame
df = pd.DataFrame(chunks)
print(df.head())

               source   pages heading  \
0  Reference Book.pdf  (3, 3)    None   
1  Reference Book.pdf  (3, 3)    None   
2  Reference Book.pdf  (4, 4)    None   
3  Reference Book.pdf  (4, 4)    None   
4  Reference Book.pdf  (4, 4)    None   

                                                text  \
0                                                      
1  The Morgan Kaufmann Series in Data Management ...   
2                                                      
3  Data Modeling Essentials, 3 rd Edition Graeme ...   
4  Hall Joe Celko’s Data and Databases: Concepts ...   

                                                bbox images  \
0  [(118.4000015258789, 81.22136688232422, 401.36...     []   
1  [(118.4000015258789, 81.22136688232422, 401.36...     []   
2  [(148.39999389648438, 81.15653228759766, 487.1...     []   
3  [(148.39999389648438, 81.15653228759766, 487.1...     []   
4  [(148.39999389648438, 81.15653228759766, 487.1...     []   

                                     

In [8]:
#embedding chunks using sentence transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("paraphrase-MiniLM-L6-v2")  # or domain-specific model

# Embed the text of each chunk
df['embedding'] = df['text'].apply(lambda x: model.encode(x))


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [9]:
def chunk_context(row):
    context = row['text']
    if row['formulas']:
        context += " " + " ".join(row['formulas'])
    if row['tables']:
        context += " " + " ".join(row['tables'])
    return context

df['embedding'] = df.apply(chunk_context, axis=1).apply(lambda x: model.encode(x))


# index the embeddings

In [11]:
import numpy as np
import faiss

embeddings = np.stack(df['embedding'].to_list())
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

# Save the index for later
faiss.write_index(index, "ds_book_faiss.index")

In [13]:
query = "What is the data mining?"
query_emb = model.encode(query)

D, I = index.search(np.array([query_emb]), k=5)  # Get top-5 matches
for idx in I[0]:
    print(df.iloc[idx][['heading', 'pages', 'text']])


heading                                                 None
pages                                             (191, 191)
text       Yet according to this view, data mining covers...
Name: 518, dtype: object
heading         13.4.1  Ubiquitous and Invisible Data Mining
pages                                             (655, 655)
text       Data mining is present in many aspects of our ...
Name: 2050, dtype: object
heading    Steps 1 through 4 are different forms of data ...
pages                                               (45, 45)
text       8 Chapter 1 Introduction As a general technolo...
Name: 70, dtype: object
heading    Another challenge for both content-based and c...
pages                                             (655, 655)
text       618 Chapter 13 Data Mining Trends and Research...
Name: 2049, dtype: object
heading                            1.7.1  Mining Methodology
pages                                               (66, 66)
text       Mining various and new kinds of kn