# Weaviate + dlt

Data Ingestion: The project starts with the ingestion of unstructured data, specifically PDF files containing invoices. These PDF files are processed to extract text information.

Data Transformation: Once the text is extracted and stored in Weaviate, it is further processed to extract structured information such as recipient company name, invoice amount, invoice date, invoice number, and service description using a generative prompt.

Data Storage: The structured information is then stored in a Weaviate database, which allows for efficient querying and retrieval of this structured data.

Overall, this project aims to automate the extraction and querying of structured information from unstructured invoice PDFs, making it easier to work with and derive insights from this data.

In [40]:
!pip install -q "dlt[weaviate]"

In [41]:
!dlt --non-interactive init unstructured_weaviate weaviate --branch features/unstructured_weaviate

Looking up the init scripts in https://github.com/dlt-hub/verified-sources.git...
No files to update, exiting


In [42]:
!pip install PyPDF2 -q

In [56]:
import os
import weaviate

# Properties requested when querying the "PdfToText" class (the extracted PDF text rows).
pdf_to_text_properties = ['file_path', 'file_name', 'content_type', 'text', 'page_id']
# Properties requested when querying the structured-invoice class: the five fields asked
# for in the generative prompt, plus content_type.
unstructured_properties = ['recipient_company_name', 'invoice_amount', 'invoice_date', 'invoice_number', 'service_description',  'content_type']

def show_data(class_name, properties):
    """Query a Weaviate class and return the raw response.

    Connection details are read from the environment variables
    WEAVIATE_URL, WEAVIATE_API_KEY and WEAVIATE_OPENAI_KEY; a new client
    is created on every call.

    Args:
        class_name: Name of the Weaviate class to read from.
        properties: List of property names to request for each object.

    Returns:
        Whatever the query's ``.do()`` call returns. In this notebook's
        printed outputs that is a dict shaped like
        ``{'data': {'Get': {class_name: [...]}}}``.
    """
    endpoint = os.getenv("WEAVIATE_URL")
    api_key_auth = weaviate.AuthApiKey(api_key=os.getenv("WEAVIATE_API_KEY"))
    # The OpenAI key is forwarded to Weaviate as a request header.
    openai_headers = {"X-OpenAI-Api-Key": os.getenv("WEAVIATE_OPENAI_KEY")}

    weaviate_client = weaviate.Client(
        url=endpoint,
        auth_client_secret=api_key_auth,
        additional_headers=openai_headers,
    )
    return weaviate_client.query.get(class_name, properties).do()

In [64]:
import dlt
from dlt.destinations.weaviate import weaviate_adapter

from unstructured_weaviate import pdf_to_text


from unstructured_weaviate.local_folder import local_folder_resource

# Step 1 (ingestion): load the PDFs' text into Weaviate.
# configure the pipeline with your destination details
pipeline = dlt.pipeline(
    pipeline_name="pdf_to_text",
    destination="weaviate",
)

# Source of input files; presumably enumerates the files in the local "test_data" folder.
data_resource = local_folder_resource("test_data")

# Pipe the file resource through the pdf_to_text transformer. With
# separate_pages=True the loaded rows carry a per-page id (e.g. "<file>.pdf_0").
pdf_data_resource = data_resource | pdf_to_text(separate_pages=True)

# use weaviate_adapter to tell destination to vectorize "text" column
load_info = pipeline.run(
    weaviate_adapter(pdf_data_resource, vectorize="text")
)
# pretty print the information on data that was loaded
row_counts = pipeline.last_trace.last_normalize_info
print(row_counts)
print("------")
print(load_info)

            Please instead use the `client.batch.configure()` method to configure your batch and `client.batch` to enter the context manager.
            See https://weaviate.io/developers/weaviate/client-libraries/python for details.


Normalized data for the following tables:
- PdfToText: 2 row(s)

------
Pipeline pdf_to_text completed in 5.69 seconds
1 load package(s) were loaded to destination weaviate and into dataset None
The weaviate destination used https://demo-1-wvxjul5s.weaviate.network location to store data
Load package 1693576393.904332 is LOADED and contains no failed jobs


In [65]:

# Read back the rows loaded into the "PdfToText" class to check the ingestion step.
response = show_data("PdfToText", pdf_to_text_properties)
print(response)

{'data': {'Get': {'PdfToText': [{'content_type': 'application/pdf', 'file_name': 'DE353116862_AA007_2022-07.pdf', 'file_path': '/home/alenaastrakhantseva/dlthub/weaviate_demo/test_data/DE353116862_AA007_2022-07.pdf', 'page_id': 'DE353116862_AA007_2022-07.pdf_0', 'text': 'ScaleVector GmbH\nKrüllsstraße 6\n12435 Berlin\nUSt-ID: DE353116862\nSteuernummer: 37/505/50907Finom\nIBAN: DE05110101015401984723\nBIC/SWIFT: SOBKDEB2XXXE-Mail: anna@scalevector.ai\nTel.: +4917683020993\nSeite 1/1\nScaleVector GmbH  Krüllsstraße 6  12435 Berlin\nAn: Rasa Technologies GmbH\nSchönhauser Allee 175\n10119 Berlin\nUSt-ID: DE311844583\nRechnung Nr. 2022-07\nRechnungsdatum: 18.08.2022 Fälligkeitsdatum: 01.09.2022\nPayment terms: net 14 \nIm Inland nicht steuerbare sonstige Leistung gemäß § 3a UStG\nOther non-taxable services in Germany in accordance with Section 3a \nUStGNettobetrag8.160,00\xa0€\nUSt. (19%)1.550,40\xa0€\n9.710,40\xa0€Consulting Rasa Clients July 2022 51 Stunde 160,00\xa0€ 19 1.550,40\xa0€ 8.

In [66]:
import weaviate
import dlt
import json

# Prompt passed as `single_prompt` to Weaviate's generate step in
# unstructured_to_structured. Each "{text}" is presumably replaced by Weaviate
# with the object's `text` property -- confirm against the generative module docs.
# NOTE(review): the template below ends its last entry with a trailing comma and
# asks the model to answer "None"; neither is strict JSON, while the consumer
# parses the answer with json.loads. It worked in the recorded run, but a model
# that echoes either verbatim would make parsing fail.
generate_prompt = """
Generate a JSON object with the following keys and answer the questions:

{
  "recipient_company_name": "Who is the recipient of the invoice in this {text}? Just return the name. If you don't know, then return None",
  "invoice_amount": "What is the total amount of the invoice {text}? Just return the amount as decimal number, no currency or text. If you don't know, then return None",
  "invoice_date": "What is the date of the invoice {text}? Just return the date. If you don't know, then return None",
  "invoice_number": "What is the invoice number {text}? Just return the number. If you don't know, then return None",
  "service_description": "What is the description of the service that this invoice is for {text}? Just return the description. If you don't know, then return None",
}
"""

@dlt.resource(write_disposition="replace")
def unstructured_to_structured(credentials=dlt.secrets.value):
    """Yield one structured invoice record per object in the "PdfToText" class.

    Runs a generative query (``generate_prompt`` as the single prompt) over
    the "PdfToText" class, parses each generated answer as JSON and adds
    the source object's ``page_id``, ``file_path`` and ``content_type``.

    Args:
        credentials: Mapping providing ``url``, ``api_key`` and
            ``additional_headers`` for the Weaviate client. Defaults to
            ``dlt.secrets.value`` (presumably resolved by dlt from its
            secrets configuration -- confirm).

    Yields:
        dict: The parsed JSON answer, updated with the three provenance
        fields of the object it was generated from.
    """
    endpoint = credentials["url"]
    api_key_auth = weaviate.AuthApiKey(api_key=credentials["api_key"])
    weaviate_client = weaviate.Client(
        url=endpoint,
        auth_client_secret=api_key_auth,
        additional_headers=credentials["additional_headers"],
    )

    generative_query = (
        weaviate_client.query
        .get("PdfToText", pdf_to_text_properties)
        .with_generate(single_prompt=generate_prompt)
    )
    pages = generative_query.do()["data"]["Get"]["PdfToText"]

    for page in pages:
        # The generated answer arrives as a JSON string under _additional.generate.
        answer = page["_additional"]["generate"]["singleResult"]
        record = json.loads(answer)
        provenance = {
            "page_id": page["page_id"],
            "file_path": page["file_path"],
            "content_type": page["content_type"],
        }
        record.update(provenance)
        yield record


# Step 2 (transformation + storage): run the generative extraction and load
# the structured records back into Weaviate.
# configure the pipeline with your destination details
pipeline = dlt.pipeline(
    pipeline_name="unstructured_to_structured",
    destination="weaviate",
)

# use weaviate_adapter to tell destination to vectorize "service_description" column
load_info = pipeline.run(
    weaviate_adapter(unstructured_to_structured, vectorize="service_description")
)
# pretty print the information on data that was loaded
row_counts = pipeline.last_trace.last_normalize_info
print(row_counts)
print("------")
print(load_info)

            Please instead use the `client.batch.configure()` method to configure your batch and `client.batch` to enter the context manager.
            See https://weaviate.io/developers/weaviate/client-libraries/python for details.


Normalized data for the following tables:
- UnstructuredToStructured: 2 row(s)
- DltPipelineState: 1 row(s)
- _dlt_pipeline_state: 0 row(s)

------
Pipeline unstructured_to_structured completed in 10.78 seconds
1 load package(s) were loaded to destination weaviate and into dataset Data20230901012240
The weaviate destination used https://demo-1-wvxjul5s.weaviate.network location to store data
Load package 1693576432.78027 is LOADED and contains no failed jobs


In [69]:
# Read back the structured records together with their provenance columns.
# unstructured_properties already contains "content_type", so de-duplicate the
# combined list (dict.fromkeys keeps first-seen order) instead of requesting
# the same property twice.
# NOTE(review): the class name embeds the dataset name printed by the previous
# load ("Data20230901012240"); it presumably has to be updated after a re-run.
structured_columns = list(
    dict.fromkeys(unstructured_properties + ["page_id", "file_path", "content_type"])
)
response = show_data("Data20230901012240_UnstructuredToStructured", structured_columns)
print(response)

{'data': {'Get': {'Data20230901012240_UnstructuredToStructured': [{'content_type': 'application/pdf', 'file_path': '/home/alenaastrakhantseva/dlthub/weaviate_demo/test_data/invoice_1.pdf', 'invoice_amount': 11235, 'invoice_date': 'June 30, 2023', 'invoice_number': 'INV-549283', 'page_id': 'invoice_1.pdf_0', 'recipient_company_name': 'XYZ Corporation', 'service_description': 'Premium Widget Delivery and Installation Services'}, {'content_type': 'application/pdf', 'file_path': '/home/alenaastrakhantseva/dlthub/weaviate_demo/test_data/DE353116862_AA007_2022-07.pdf', 'invoice_amount': 9710.4, 'invoice_date': '18.08.2022', 'invoice_number': '2022-07', 'page_id': 'DE353116862_AA007_2022-07.pdf_0', 'recipient_company_name': 'Rasa Technologies GmbH', 'service_description': 'Consulting Rasa Clients July 2022'}]}}}


In [61]:
!pip install -q duckdb pandas --upgrade

In [70]:
# Step 3 (alternative storage): load the same structured records into DuckDB.
# configure the pipeline with your destination details
# With full_refresh=True the recorded run loaded into a timestamp-suffixed
# dataset ("data_20230901015549") rather than plain "data".
pipeline = dlt.pipeline(
    pipeline_name="unstructured_to_structured",
    destination="duckdb",
    dataset_name="data",
    full_refresh=True,
)

# run the resource as-is: no weaviate_adapter is involved for the duckdb destination
load_info = pipeline.run(unstructured_to_structured)
# pretty print the information on data that was loaded
row_counts = pipeline.last_trace.last_normalize_info
print(row_counts)
print("------")
print(load_info)

Normalized data for the following tables:
- _dlt_pipeline_state: 1 row(s)
- unstructured_to_structured: 2 row(s)

------
Pipeline unstructured_to_structured completed in 5.86 seconds
1 load package(s) were loaded to destination duckdb and into dataset data_20230901015549
The duckdb destination used duckdb:////home/alenaastrakhantseva/dlthub/weaviate_demo/unstructured_to_structured.duckdb location to store data
Load package 1693576555.252389 is LOADED and contains no failed jobs


In [71]:
import duckdb

# Open the DuckDB file named after the pipeline ("<pipeline_name>.duckdb").
conn = duckdb.connect(f"{pipeline.pipeline_name}.duckdb")
# Point unqualified table names at the dataset (schema) the pipeline loaded into.
conn.sql(f"SET search_path = '{pipeline.dataset_name}'")
# List the tables in the database file (`display` is the notebook's rich-output helper).
display(conn.sql("DESCRIBE"))
# Pull the structured invoice table into a pandas DataFrame and show it.
data_table = conn.sql("SELECT * FROM unstructured_to_structured").df()
display(data_table)

┌──────────────────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬───────────┐
│       database       │        schema        │         name         │ … │     column_types     │ temporary │
│       varchar        │       varchar        │       varchar        │   │      varchar[]       │  boolean  │
├──────────────────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼───────────┤
│ unstructured_to_st…  │ data_20230901010751  │ _dlt_loads           │ … │ [VARCHAR, VARCHAR,…  │ false     │
│ unstructured_to_st…  │ data_20230901010751  │ _dlt_pipeline_state  │ … │ [BIGINT, BIGINT, V…  │ false     │
│ unstructured_to_st…  │ data_20230901010751  │ _dlt_version         │ … │ [BIGINT, BIGINT, T…  │ false     │
│ unstructured_to_st…  │ data_20230901010751  │ unstructured_to_st…  │ … │ [VARCHAR, VARCHAR,…  │ false     │
│ unstructured_to_st…  │ data_20230901012240  │ _dlt_loads           │ … │ [VARCHAR, VARCHAR,…  │ false     │
│ unstruct

Unnamed: 0,service_description,recipient_company_name,invoice_amount,invoice_date,invoice_number,page_id,file_path,content_type,_dlt_load_id,_dlt_id
0,Consulting Rasa Clients July 2022,Rasa Technologies GmbH,9710.4,18.08.2022,2022-07,DE353116862_AA007_2022-07.pdf_0,/home/alenaastrakhantseva/dlthub/weaviate_demo...,application/pdf,1693576555.252389,l9RUh9OG8crd5Q
1,Premium Widget Delivery and Installation Services,XYZ Corporation,11235.0,"June 30, 2023",INV-549283,invoice_1.pdf_0,/home/alenaastrakhantseva/dlthub/weaviate_demo...,application/pdf,1693576555.252389,+I9lmBtvrzcmdw
