Restructure intro, getting started and tutorial #702

Merged · 34 commits · Jan 16, 2024

Commits
- 0ee4647 Restructure intro, getting started and tutorial (burnash, Oct 19, 2023)
- bec2624 Fix wording (burnash, Oct 19, 2023)
- 9491f08 Add a dlt source example to the intro (burnash, Oct 19, 2023)
- a406da1 Updated wording in the tutorial (burnash, Oct 23, 2023)
- eb45bd3 Fixed link (burnash, Oct 23, 2023)
- a70bb1c Fix intro snippets (burnash, Oct 24, 2023)
- ff8f10e Update intro snippets (burnash, Oct 24, 2023)
- 5cec470 Clean tutorial snippets (burnash, Oct 24, 2023)
- 747bd3a Fix typos and style (burnash, Oct 24, 2023)
- 3d42f35 Clean the snippets (burnash, Oct 24, 2023)
- cd112c0 Fix naming (burnash, Oct 24, 2023)
- 2ef3490 Rename how-tos (burnash, Oct 24, 2023)
- 00e57fa Update formatting and snippets (burnash, Oct 25, 2023)
- b02e679 Change wording (burnash, Oct 26, 2023)
- 67ff67e Fix one more title (burnash, Oct 26, 2023)
- 5f5cc40 Make title more specific (burnash, Oct 26, 2023)
- bfa8265 Reword the titles; capitalization (burnash, Oct 26, 2023)
- 2613020 Fix links in the tutorial (burnash, Oct 26, 2023)
- 5e2f473 Add missing example + fix typos (burnash, Oct 30, 2023)
- ecfb139 Extend _examples-header, fix path in transformers (burnash, Nov 19, 2023)
- 45f5253 Fix rebase errors (burnash, Jan 16, 2024)
- cceec59 add destination to pdf to fix the example header (burnash, Jan 16, 2024)
- 17b5b66 Update the weaviate adapter import path (burnash, Jan 16, 2024)
- a73ac3d Add the generated example file (burnash, Jan 16, 2024)
- e98dd18 Remove duplicated content (burnash, Jan 16, 2024)
- 7156f57 Fix pdf_to_weaviate snippet module import (burnash, Jan 16, 2024)
- 98d1200 Fix a typo (burnash, Jan 16, 2024)
- 56605bb Arrange files, remove duplicated content (burnash, Jan 16, 2024)
- 9422eb0 Add dispatch to multiple tables to the sidebar (burnash, Jan 16, 2024)
- 05ed976 Move pdf to weaviate assets (burnash, Jan 16, 2024)
- 5d4cdf4 Add steps to dispatch-to-multiple-tables guide (burnash, Jan 16, 2024)
- 23f9a90 Add PyPDF2 install instructions (burnash, Jan 16, 2024)
- 29ee711 Update the tutorial's TOC (burnash, Jan 16, 2024)
- 9cd4131 update wording (burnash, Jan 16, 2024)
61 changes: 61 additions & 0 deletions docs/examples/pdf_to_weaviate/pdf_to_weaviate.py
@@ -0,0 +1,61 @@
import os

import dlt
from dlt.destinations.impl.weaviate import weaviate_adapter
from PyPDF2 import PdfReader


@dlt.resource(selected=False)
def list_files(folder_path: str):
folder_path = os.path.abspath(folder_path)
for filename in os.listdir(folder_path):
file_path = os.path.join(folder_path, filename)
yield {
"file_name": filename,
"file_path": file_path,
"mtime": os.path.getmtime(file_path)
}


@dlt.transformer(primary_key="page_id", write_disposition="merge")
def pdf_to_text(file_item, separate_pages: bool = False):
if not separate_pages:
raise NotImplementedError()
# extract data from PDF page by page
reader = PdfReader(file_item["file_path"])
for page_no in range(len(reader.pages)):
# add page content to file item
page_item = dict(file_item)
page_item["text"] = reader.pages[page_no].extract_text()
page_item["page_id"] = file_item["file_name"] + "_" + str(page_no)
yield page_item

pipeline = dlt.pipeline(
pipeline_name='pdf_to_text',
destination='weaviate'
)

# this constructs a simple pipeline that: (1) reads files from the "assets/invoices"
# folder, (2) filters only those ending with ".pdf", and (3) sends them to the
# pdf_to_text transformer with the pipe (|) operator
pdf_pipeline = list_files("assets/invoices").add_filter(
lambda item: item["file_name"].endswith(".pdf")
) | pdf_to_text(separate_pages=True)

# set the name of the destination table to receive pages
# NOTE: in Weaviate, dlt's tables are mapped to classes
pdf_pipeline.table_name = "InvoiceText"

# use weaviate_adapter to tell destination to vectorize "text" column
load_info = pipeline.run(
weaviate_adapter(pdf_pipeline, vectorize="text")
)
row_counts = pipeline.last_trace.last_normalize_info
print(row_counts)
print("------")
print(load_info)

import weaviate

client = weaviate.Client("http://localhost:8080")
# get text of all the invoices in InvoiceText class we just created above
print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
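Not part of this diff, but worth noting for reviewers: the v3 `weaviate` client used above also supports filtered reads, so the loaded class can be spot-checked one document at a time. A minimal sketch — the file name is a placeholder:

```py
import weaviate

client = weaviate.Client("http://localhost:8080")

# a sketch of narrowing the query above to a single source PDF;
# "invoice_1.pdf" is an illustrative file name
result = (
    client.query.get("InvoiceText", ["text", "page_id"])
    .with_where({
        "path": ["file_name"],
        "operator": "Equal",
        "valueString": "invoice_1.pdf",
    })
    .do()
)
print(result)
```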
2 changes: 1 addition & 1 deletion docs/website/docs/dlt-ecosystem/destinations/athena.md
@@ -92,7 +92,7 @@ athena_work_group="my_workgroup"

## Data loading

-Data loading happens by storing parquet files in an s3 bucket and defining a schema on athena. If you query data via sql queries on athena, the returned data is read by
+Data loading happens by storing parquet files in an s3 bucket and defining a schema on athena. If you query data via SQL queries on athena, the returned data is read by
scanning your bucket and reading all relevant parquet files in there.

`dlt` internal tables are saved as Iceberg tables.
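A minimal sketch of the setup this paragraph implies (not from this PR): since Athena loads via parquet files staged in a bucket, the pipeline is typically configured with a filesystem staging destination. Names below are placeholders:

```py
import dlt

# assumes the s3 bucket and Athena credentials are set in .dlt/secrets.toml
pipeline = dlt.pipeline(
    pipeline_name="athena_example",  # illustrative name
    destination="athena",
    staging="filesystem",  # parquet files are staged in the s3 bucket
    dataset_name="athena_data",
)
```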
2 changes: 1 addition & 1 deletion docs/website/docs/dlt-ecosystem/destinations/mssql.md
@@ -42,7 +42,7 @@ or run:
```
pip install dlt[mssql]
```
-This will install dlt with **mssql** extra which contains all the dependencies required by the sql server client.
+This will install dlt with **mssql** extra which contains all the dependencies required by the SQL server client.

**3. Enter your credentials into `.dlt/secrets.toml`.**

6 changes: 3 additions & 3 deletions docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md
@@ -1,16 +1,16 @@
---
-title: Transforming data with dbt
+title: Transform the data with dbt
description: Transforming the data loaded by a dlt pipeline with dbt
keywords: [transform, dbt, runner]
---

-# Transforming data using dbt
+# Transform the data with dbt

[dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows simple structuring of your transformations into DAGs. The benefits of
using dbt include:

- End-to-end cross-db compatibility for dlt→dbt pipelines.
-- Easy to use by sql analysts, low learning curve.
+- Easy to use by SQL analysts, low learning curve.
- Highly flexible and configurable in usage, supports templating, can run backfills etc.
- Supports testing and accelerates troubleshooting.

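For context on the renamed page: dlt ships a dbt runner that executes a dbt package against the pipeline's destination. A minimal sketch, assuming a local dbt package at `./dbt_project` (path and names are illustrative):

```py
import dlt

pipeline = dlt.pipeline(
    pipeline_name="events",
    destination="duckdb",
    dataset_name="events_data",
)

# run all models in the dbt package and report their status
dbt = dlt.dbt.package(pipeline, "./dbt_project")
models = dbt.run_all()
for m in models:
    print(f"{m.model_name}: {m.status} in {m.time}")
```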
6 changes: 3 additions & 3 deletions docs/website/docs/dlt-ecosystem/transformations/pandas.md
@@ -1,10 +1,10 @@
---
-title: Transforming data with Pandas
-description: Transforming the data loaded by a dlt pipeline with Pandas
+title: Transform the data with Pandas
+description: Transform the data loaded by a dlt pipeline with Pandas
keywords: [transform, pandas]
---

-# Transforming the data using Pandas
+# Transform the data with Pandas

You can fetch results of any SQL query as a dataframe. If the destination is supporting that
natively (i.e. BigQuery and DuckDB), `dlt` uses the native method. Thanks to that, reading
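The paragraph above is the crux of the renamed page; a minimal sketch of fetching a query result as a dataframe through the pipeline's sql_client (table and column names are illustrative):

```py
import dlt

pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",
    destination="duckdb",
    dataset_name="chess_data",
)

# execute a query and read the result as a pandas dataframe
with pipeline.sql_client() as client:
    with client.execute_query("SELECT * FROM players_games") as cursor:
        df = cursor.df()

print(df.head())
```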
6 changes: 3 additions & 3 deletions docs/website/docs/dlt-ecosystem/transformations/sql.md
@@ -1,10 +1,10 @@
---
-title: Transforming data with SQL
-description: Transforming the data loaded by a dlt pipeline with SQL client
+title: Transform the data with SQL
+description: Transforming the data loaded by a dlt pipeline with the dlt SQL client
keywords: [transform, sql]
---

-# Transforming data using the `dlt` SQL client
+# Transform the data using the `dlt` SQL client

A simple alternative to dbt is to query the data using the `dlt` SQL client and then performing the
transformations using Python. The `execute_sql` method allows you to execute any SQL statement,
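To round out the `execute_sql` mention above, a minimal sketch of running a transformation directly on the destination (table and column names are illustrative):

```py
import dlt

pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",
    destination="duckdb",
    dataset_name="chess_data",
)

# run a SQL transformation on the destination with execute_sql
with pipeline.sql_client() as client:
    client.execute_sql(
        "INSERT INTO player_totals (player_id, total_games) "
        "SELECT player_id, COUNT(*) FROM players_games GROUP BY player_id"
    )
```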
5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/airtable.md
@@ -82,8 +82,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -137,7 +136,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `airtable`, you
may also use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline).
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/asana.md
@@ -71,8 +71,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -110,7 +109,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `asana`, you may also use any
custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/chess.md
@@ -51,8 +51,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source.md)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source.md).

### Add credentials

@@ -87,7 +86,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `chess_pipeline`, you may also
use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md
@@ -116,8 +116,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credential

@@ -174,7 +173,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `facebook_ads`, you may also
use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/github.md
@@ -82,8 +82,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -126,7 +125,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `github_reactions`, you may
also use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md
@@ -143,8 +143,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -230,7 +229,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is
`dlt_google_analytics_pipeline`, you may also use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md
@@ -231,8 +231,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -319,7 +318,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `google_sheets_pipeline`, you
may also use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline).
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Data types

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md
@@ -89,8 +89,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -131,7 +130,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `hubspot_pipeline`, you may
also use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/jira.md
@@ -66,8 +66,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -118,7 +117,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`, you may also
use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/matomo.md
@@ -59,8 +59,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credential

@@ -118,7 +117,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `matomo`, you may also
use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md
@@ -130,8 +130,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -190,7 +189,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `local_mongo`, you may also
use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/mux.md
@@ -61,8 +61,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).


### Add credentials
@@ -104,7 +103,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is
`mux`, you may also use any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/notion.md
@@ -65,8 +65,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -109,7 +108,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `notion`, you may also use any
custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources

5 changes: 2 additions & 3 deletions docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md
@@ -68,8 +68,7 @@ To get started with your data pipeline, follow these steps:
1. After running this command, a new directory will be created with the necessary files and
configuration settings to get started.

-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).

### Add credentials

@@ -109,7 +108,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
For example, the `pipeline_name` for the above pipeline example is `pipedrive`, you may also use
any custom name instead.

-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).

## Sources and resources
