## Data preparation for the RAG tool that will serve information to the model

Developer: Eliel Paes    
Last update: 2025-12-10

### Install

In [0]:
!pip install -r requirements.txt "unstructured[local-inference]"

### Imports

In [0]:
import io
import os
import pandas as pd
import json
import fitz

from llama_index.core.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.core import Document, set_global_tokenizer
from typing import Iterator
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf
from unstructured.partition.auto import partition
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from mlflow.deployments import get_deploy_client

# MLflow Deployments client for the "databricks" target.
# NOTE(review): not used in the cells visible here - presumably needed by later cells; confirm.
client = get_deploy_client("databricks")

### 1 Data preparation

#### 1.1 Extracting pdf raw text

In [0]:
# Source folder of the PDF files and coordinates of the target Delta table
articles_path = "/Volumes/workspace/default/study_files"
catalog = "workspace"
db_name = "default"
table_name = "pdf_raw_text"

# Read every file under articles_path (sub-folders included) as binary.
# The resulting DataFrame schema is:
#   |-- path: string (nullable = true)
#   |-- modificationTime: timestamp (nullable = true)
#   |-- length: long (nullable = true)
#   |-- content: binary (nullable = true)
df = (
    spark.read.format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(articles_path)
)

# NOTE: the raw binary DataFrame is not persisted at this point; the write
# below is intentionally left disabled.
# df.write.mode("overwrite").saveAsTable(f"{catalog}.{db_name}.{table_name}")

#### Functions to be applied to the content column in order to extract the PDF raw text

In [0]:
@pandas_udf("array<string>")
def get_pdf_raw_text(contents: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """
    Extract the text of every page of the PDF files held in a binary column.

    Args:
        contents: Batches of the ``content`` column of the DataFrame, where
            each value is the raw bytes of one PDF file.

    Yields:
        One Series per input batch. Each value is a list with one JSON string
        per page, shaped as ``{"page": <1-based page number>, "content": <page text>}``,
        or ``None`` when the file could not be read.
    """
    def extract_doc_text(col):
        """
        Receive a PDF (bytes) and return its pages as a list of JSON strings.

        Returns ``None`` when the value is missing or cannot be parsed.
        """
        if col is None:
            return None

        try:
            # The context manager guarantees the document is closed even if
            # text extraction fails half-way through.
            with fitz.open(stream=col, filetype="pdf") as pdf:
                return [
                    json.dumps({"page": page.number + 1, "content": page.get_text()})
                    for page in pdf.pages()
                ]
        except Exception as e:
            # Best effort: a single unreadable file must not fail the whole
            # Spark job, so the error is logged and the row yields null.
            print(f"Erro ao carregar arquivo: {e}")
            return None

    for batch in contents:
        yield batch.apply(extract_doc_text)

@pandas_udf("string")
def get_article_topics(pdf_pages: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """
    Return the text content of each serialized PDF page.

    Args:
        pdf_pages: Batches of JSON strings, each holding the keys ``"page"``
            and ``"content"`` (the format produced by ``get_pdf_raw_text``).

    Yields:
        One Series per input batch containing the value stored under
        ``"content"`` for every page.
    """
    def extract_content(col):
        """Parse one page JSON string and return its ``"content"`` value."""
        # No debug printing here: it would write the full text of every page
        # to the executor logs.
        return json.loads(col)["content"]

    for page in pdf_pages:
        yield page.apply(extract_content)


@pandas_udf("int")
def get_page_number(pdf_pages: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """
    Return the page number of each serialized PDF page.

    Args:
        pdf_pages: Batches of JSON strings, each holding the keys ``"page"``
            and ``"content"`` (the format produced by ``get_pdf_raw_text``).

    Yields:
        One Series per input batch containing the value stored under
        ``"page"`` for every page.
    """
    def extract_page_number(col):
        """Parse one page JSON string and return its ``"page"`` value."""
        # No debug printing here: it would write the full text of every page
        # to the executor logs.
        return json.loads(col)["page"]

    for page in pdf_pages:
        yield page.apply(extract_page_number)
        

In [0]:
# Build the page-level table in a single chained expression:
#   1. extract the per-page JSON strings from the binary content,
#   2. explode them so that each page becomes its own row,
#   3. split each JSON string into its text and page-number columns.
df_raw_text = (
    df.select("path", "content")
    .withColumn("pdf_raw_text", get_pdf_raw_text(F.col("content")))
    .drop("content")
    .withColumn("pdf_raw_text", F.explode(F.col("pdf_raw_text")))
    .withColumn("pdf_text_page_content", get_article_topics(F.col("pdf_raw_text")))
    .withColumn("page", get_page_number(F.col("pdf_raw_text")))
)

# Persist only the final columns, replacing any previous version of the table.
(
    df_raw_text.select("path", "page", "pdf_text_page_content")
    .write.mode("overwrite")
    .saveAsTable(f"{catalog}.{db_name}.{table_name}")
)

#### 1.2 Extracting pdf images

#### Functions to be applied to the content column in order to extract the images

In [0]:
def extract_images_from_pdf(content, output_folder="extracted_images"):
    """
    Extract the images embedded in a PDF with PyMuPDF and save them to a folder.

    Args:
        content: The PDF, either as raw bytes (for example a value of the
            ``content`` column read with the ``binaryFile`` format) or as a
            filesystem path (``str`` / ``os.PathLike``).
        output_folder: Directory the image files are written to. It is
            created when it does not exist.

    Returns:
        None. Progress and failures are reported through ``print``; a PDF
        that cannot be opened is reported and skipped.
    """
    # 1. Open the PDF (from a path or from an in-memory byte stream)
    try:
        if isinstance(content, (str, os.PathLike)):
            doc = fitz.open(content)
        else:
            doc = fitz.open(stream=content, filetype="pdf")
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return

    image_count = 0

    try:
        os.makedirs(output_folder, exist_ok=True)

        # Iterate through all pages
        for page in doc.pages():

            # 2. Get the list of image objects on the current page
            # 'get_images' returns a list of tuples with image details
            image_list = page.get_images(full=True)

            for img_index, img_info in enumerate(image_list, start=1):
                xref = img_info[0]  # The XREF is the internal reference number of the image object

                # 3. Base name; the extension is appended once it is known.
                # page.number is 0-based, file names use 1-based page numbers.
                filename = f"page{page.number + 1}_img{img_index}_{xref}"

                # Extraction and saving share one try block so that a single
                # broken image does not abort the remaining ones.
                try:
                    # 4. Extract the raw image data and its properties
                    image_data = doc.extract_image(xref)
                    image_bytes = image_data["image"]
                    image_ext = image_data["ext"]

                    filename = f"{filename}.{image_ext}"
                    image_path = os.path.join(output_folder, filename)

                    # 5. Save the image file
                    if image_ext in ("png", "jpg", "jpeg", "bmp"):
                        # Common raw formats are written as they are stored.
                        with open(image_path, "wb") as f:
                            f.write(image_bytes)
                    else:
                        # Other formats are rendered through a PyMuPDF Pixmap
                        # and saved as PNG; colorspaces with more than three
                        # components (e.g. CMYK) are converted to RGB first
                        # because PNG cannot store them.
                        pix = fitz.Pixmap(doc, xref)
                        if pix.n - pix.alpha > 3:
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        pix.save(image_path + ".png")
                        filename = filename + ".png"  # Update filename for logging

                    print(f"Successfully extracted: {filename}")
                    image_count += 1
                except Exception as e:
                    print(f"Failed to save image {filename}. Error: {e}")
    finally:
        # Always release the document, even when saving fails.
        doc.close()

    print(f"\n--- Extraction Complete. Total images extracted: {image_count} ---")


# --- Usage Example ---
# Replace 'your_file.pdf' with the actual path to your PDF file
pdf_file = "your_file.pdf"

# extract_images_from_pdf receives the PDF content, so the file is read as
# bytes first instead of handing over the path string.
if os.path.isfile(pdf_file):
    with open(pdf_file, "rb") as pdf_handle:
        extract_images_from_pdf(pdf_handle.read())
else:
    print(f"PDF file not found, skipping image extraction: {pdf_file}")