In [1]:
import os
import json
import pdfplumber

In [2]:
from pyspark.sql.datasource import DataSource, DataSourceReader
from pyspark.sql.types import StringType, StructField, StructType

class PdfDataSource(DataSource):
    """Custom Spark data source registered under the short name ``pdf``.

    Every column it exposes is a string; the actual reading is delegated to
    ``PdfDataSourceReader``.
    """

    @classmethod
    def name(cls):
        """Return the format name used with ``spark.read.format(...)``."""
        return "pdf"

    def schema(self):
        """Return the fixed five-column, all-string schema of the source."""
        column_names = ("file", "page", "text", "author", "title")
        fields = [StructField(column, StringType()) for column in column_names]
        return StructType(fields)

    def reader(self, schema: StructType):
        """Build the reader from the ``path`` option and the full option map.

        ``schema`` is accepted to satisfy the data source interface but is
        not consulted here.
        """
        source_options = self.options
        return PdfDataSourceReader(source_options.get("path"), source_options)

class PdfDataSourceReader(DataSourceReader):
    """Read every PDF in a directory, emitting one row per page (and per table).

    Rows are tuples laid out as ``(file, page, text, author, title)``.

    Options:
        path: Directory that contains the PDF files (required).
        max_pages: Maximum number of pages read from each file; ``-1``
            (the default) means no limit.
        extract_tables: ``"true"`` to additionally emit one row per table
            found on a page, with the table serialised as JSON in ``text``.

    Raises:
        ValueError: If no ``path`` is supplied, or ``max_pages`` is not an
            integer.
    """

    def __init__(self, path, options):
        # os.listdir(None) falls back to the current working directory, so a
        # missing path would otherwise scan the wrong place instead of failing.
        if not path:
            raise ValueError(
                "The 'pdf' data source requires a 'path' option pointing to "
                "a directory of PDF files."
            )
        self.path = path
        self.max_pages = int(options.get("max_pages", -1))
        # Accept "true" regardless of letter case or surrounding whitespace.
        flag = str(options.get("extract_tables", "false"))
        self.extract_tables = flag.strip().lower() == "true"

    def read(self, partition):
        """Yield rows for every PDF file directly inside ``self.path``.

        ``partition`` is unused: all files are read in a single pass.
        """
        # os.listdir returns entries in arbitrary order; sort so the row order
        # is reproducible from run to run.
        for file_name in sorted(os.listdir(self.path)):
            # Match the extension case-insensitively so "REPORT.PDF" is read too.
            if file_name.lower().endswith(".pdf"):
                yield from self._read_pdf(file_name)

    def _read_pdf(self, file_name):
        """Yield the page (and optional table) rows of a single PDF file."""
        full_path = os.path.join(self.path, file_name)
        with pdfplumber.open(full_path) as pdf:
            metadata = pdf.metadata or {}
            # `or` also covers keys that are present but empty or None.
            author = metadata.get("Author") or "Unknown"
            title = metadata.get("Title") or file_name

            for i, page in enumerate(pdf.pages):
                if self.max_pages != -1 and i >= self.max_pages:
                    break

                page_number = i + 1
                if self.extract_tables:
                    # Table rows come first, then the page's plain text.
                    tables = page.extract_tables()
                    for t_index, table in enumerate(tables, start=1):
                        yield (
                            file_name,
                            f"Page {page_number} Table {t_index}",
                            json.dumps(table),
                            author,
                            title,
                        )
                    page_label = f"Page {page_number} Text"
                else:
                    page_label = f"Page {page_number}"

                yield (file_name, page_label, page.extract_text(), author, title)



In [3]:
from pyspark.sql import SparkSession

# Attach to the active Spark session, or start one when none exists yet.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Make the custom source reachable through spark.read.format("pdf").
spark.dataSource.register(PdfDataSource)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/09 11:31:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read at most ten pages per PDF under data/, emitting table rows as well as
# the plain text of each page.
df = (
    spark.read.format("pdf")
    .options(path="data/", max_pages=10, extract_tables="true")
    .load()
)

df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+------------+--------------------+-------------------+--------------------+
|                file|        page|                text|             author|               title|
+--------------------+------------+--------------------+-------------------+--------------------+
|big-data-analytic...| Page 1 Text|AWS Whitepaper\nB...|Amazon Web Services|Big Data Analytic...|
|big-data-analytic...| Page 2 Text|Big Data Analytic...|Amazon Web Services|Big Data Analytic...|
|big-data-analytic...| Page 3 Text|Big Data Analytic...|Amazon Web Services|Big Data Analytic...|
|big-data-analytic...| Page 4 Text|Big Data Analytic...|Amazon Web Services|Big Data Analytic...|
|big-data-analytic...| Page 5 Text|Big Data Analytic...|Amazon Web Services|Big Data Analytic...|
|big-data-analytic...| Page 6 Text|Big Data Analytic...|Amazon Web Services|Big Data Analytic...|
|big-data-analytic...| Page 7 Text|Big Data Analytic...|Amazon Web Services|Big Data Analytic...|
|big-data-analytic..

                                                                                

In [5]:
# Collect the plain-text row(s) labelled as page 10. The list is empty when no
# PDF reaches that page, so guard the lookup instead of raising IndexError.
table_row = df.filter(df.page == "Page 10 Text").collect()
if table_row:
    print(table_row[0]["text"])
else:
    print('No row with page == "Page 10 Text" was found.')

[Stage 1:>                                                          (0 + 1) / 1]

Big Data Analytics Options on AWS AWS Whitepaper
The AWS advantage in big data analytics
Analyzing large datasets requires significant compute capacity that can vary in size, based on
the amount of input data and the type of analysis. This characteristic of big data workloads is
ideally suited to the pay-as-you-go cloud computing model, where applications can easily scale
up and down based on demand. As requirements change, you can easily resize your environment
(horizontally or vertically) on AWS to meet your needs, without having to wait for additional
hardware or over-investing to provision enough capacity.
For mission-critical applications on a more traditional infrastructure, system designers have no
choice but to over-provision, because a surge in additional data due to an increase in business
needs must be something the system can handle. By contrast, on AWS, you can provision more
capacity and compute in a matter of minutes, meaning that your big data applications grow and
shri

                                                                                