In [2]:
%pip install --upgrade langchain langchain-experimental langchain-openai langchain-community python-dotenv pyvis

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from neo4j import GraphDatabase
from dotenv import load_dotenv
import os

# Pull connection settings from the environment (and a local .env file, if any).
load_dotenv()

uri = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")
# os.getenv returns None when NEO4J_DATABASE is unset; that value is passed on
# unchanged to driver.session(database=...) by later cells.
database = os.getenv("NEO4J_DATABASE")

# Fail fast with a readable message instead of handing None to the driver.
if not uri:
    raise RuntimeError(
        "NEO4J_URI is not set; define it in the environment or in a .env file"
    )

driver = GraphDatabase.driver(uri, auth=(username, password))


In [4]:
def test_connection(driver, database):
    with driver.session(database=database) as session:
        result = session.run("RETURN 'Connected to Neo4j!' AS message")
        print(result.single()["message"])

# Run the connectivity check now; it prints the message returned by the query.
test_connection(driver, database)

Connected to Neo4j!


In [5]:
from dotenv import load_dotenv
import os
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Never echo the secret itself: cell output is saved with the notebook and is
# shared or committed along with it. Report only whether the key is available.
print("API key loaded:", bool(api_key))

API key: [REDACTED]


In [6]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
import nest_asyncio
import asyncio
import logging
import time

# Patch asyncio first. A later cell drives a coroutine with
# run_until_complete(), presumably from inside the notebook's running loop.
nest_asyncio.apply()

# Timestamped INFO-level logging to the stream handler (shown in cell output).
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Chat model and the transformer that uses it to produce graph documents.
llm = ChatOpenAI(model="o4-mini")
graph_transformer = LLMGraphTransformer(llm=llm)

In [7]:
import pandas as pd
from langchain_core.documents import Document

# Phone is read as text so the value is kept exactly as written in the CSV.
df = pd.read_csv("cars_reklama5.csv", dtype={"Phone": str})

# Render mileage as a whole number, or as an empty string when it is missing.
df['Kilometers'] = df['Kilometers'].apply(lambda x: str(int(x)) if pd.notnull(x) else "")

# Blank out missing values in every other column too. Without this, NaN cells
# are formatted into the text below as the literal word "nan". `df` itself is
# left untouched; only this text-rendering copy is changed.
rows_for_text = df.astype(object).where(df.notna(), "")

documents = []

# One Document per CSV row; the row's index label becomes the document id.
for idx, row in rows_for_text.iterrows():
    content = f"""
    Model: {row['Model']}
    Brand: {row['Brand']}
    Year: {row['Year']}
    Price: {row['Price']}
    Fuel: {row['Fuel']}
    Kilometers: {row['Kilometers']}
    Gearbox: {row['Gearbox']}
    Car body: {row['Car body']}
    Color: {row['Color']}
    Registration: {row['Registration']}
    Registered to: {row['Registered to']}
    Engine power: {row['Engine power']}
    Show class: {row['Show class']}
    Seller: {row['Seller']}
    Phone: {row['Phone']}
    Link: {row['Link']}
    Image URL: {row['Image_URL']}
    """
    documents.append(Document(page_content=content, metadata={"id": idx}))

logger.info(f"Total documents created: {len(documents)}")

# Group the per-car documents into larger documents of `batch_size` cars each,
# separating the individual listings with a '---' line.
batch_size = 50
batched_documents = []

for batch_number, offset in enumerate(range(0, len(documents), batch_size), start=1):
    chunk = documents[offset:offset + batch_size]
    merged_text = "\n---\n".join(member.page_content for member in chunk)
    # "batch_index" holds the offset of the chunk's first document, not a 0..N counter.
    batched_documents.append(
        Document(page_content=merged_text, metadata={"batch_index": offset})
    )
    logger.info(f"Created batch {batch_number} containing {len(chunk)} documents")

logger.info(f"Total batches created: {len(batched_documents)}")


2025-09-02 15:57:49,570 [INFO] Total documents created: 4173
2025-09-02 15:57:49,571 [INFO] Created batch 1 containing 50 documents
2025-09-02 15:57:49,572 [INFO] Created batch 2 containing 50 documents
2025-09-02 15:57:49,572 [INFO] Created batch 3 containing 50 documents
2025-09-02 15:57:49,573 [INFO] Created batch 4 containing 50 documents
2025-09-02 15:57:49,573 [INFO] Created batch 5 containing 50 documents
2025-09-02 15:57:49,573 [INFO] Created batch 6 containing 50 documents
2025-09-02 15:57:49,574 [INFO] Created batch 7 containing 50 documents
2025-09-02 15:57:49,574 [INFO] Created batch 8 containing 50 documents
2025-09-02 15:57:49,575 [INFO] Created batch 9 containing 50 documents
2025-09-02 15:57:49,575 [INFO] Created batch 10 containing 50 documents
2025-09-02 15:57:49,575 [INFO] Created batch 11 containing 50 documents
2025-09-02 15:57:49,576 [INFO] Created batch 12 containing 50 documents
2025-09-02 15:57:49,576 [INFO] Created batch 13 containing 50 documents
2025-09-02 1

In [8]:
# Rebuild the batches with 100 listings per document. This replaces any
# previously built `batched_documents`.
batch_size = 100
batched_documents = [
    Document(
        # Listings inside one batch are separated by a '---' line.
        page_content="\n---\n".join(
            member.page_content for member in documents[offset:offset + batch_size]
        ),
        # "batch_index" holds the offset of the batch's first document.
        metadata={"batch_index": offset},
    )
    for offset in range(0, len(documents), batch_size)
]


In [9]:
async def process_batches(batches):
    """Convert each batch document into graph documents, one batch at a time.

    Batches are awaited sequentially. A batch whose conversion raises is logged
    with its traceback and skipped, so the returned list may cover fewer
    batches than were passed in; a summary of skipped batches is logged at the
    end.

    Args:
        batches: Sequence of documents. Each one is sent on its own to
            ``graph_transformer.aconvert_to_graph_documents``.

    Returns:
        list: Graph documents from every batch that converted successfully,
        in input order.
    """
    results = []
    failed_batches = []
    total_batches = len(batches)

    for idx, batch in enumerate(batches, start=1):
        logger.info(f"Processing batch {idx}/{total_batches}...")
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments during a long run.
        batch_start = time.perf_counter()
        try:
            # convert batch to graph documents
            graph_doc = await graph_transformer.aconvert_to_graph_documents([batch])
            results.extend(graph_doc)
            logger.info(
                f"Finished batch {idx}/{total_batches} in {time.perf_counter() - batch_start:.2f}s"
            )
        except Exception as e:
            # Keep going (best effort), but record the traceback and remember
            # which batch was lost so the gap is visible afterwards.
            logger.exception(f"Error in batch {idx}: {e}")
            failed_batches.append(idx)

    if failed_batches:
        logger.warning(
            f"{len(failed_batches)}/{total_batches} batches failed and were skipped: {failed_batches}"
        )
    logger.info("All batches processed")
    return results

# Drive the coroutine to completion on the current event loop and keep the
# graph documents it returns.
event_loop = asyncio.get_event_loop()
conversion_job = process_batches(batched_documents)
graph_documents = event_loop.run_until_complete(conversion_job)

logger.info(f"Total graph documents returned: {len(graph_documents)}")

2025-09-02 15:58:00,342 [INFO] Processing batch 1/42...
2025-09-02 15:58:14,732 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-02 15:58:14,749 [INFO] Finished batch 1/42 in 14.41s
2025-09-02 15:58:14,750 [INFO] Processing batch 2/42...
2025-09-02 15:58:37,282 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-02 15:58:37,294 [INFO] Finished batch 2/42 in 22.54s
2025-09-02 15:58:37,294 [INFO] Processing batch 3/42...
2025-09-02 15:58:55,773 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-02 15:58:55,780 [INFO] Finished batch 3/42 in 18.49s
2025-09-02 15:58:55,780 [INFO] Processing batch 4/42...
2025-09-02 15:59:14,095 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-02 15:59:14,101 [INFO] Finished batch 4/42 in 18.32s
2025-09-02 15:59:14,102 [INFO] Processing batch 5/42...
2025-09-02 15:59:30,079 [INFO] H

In [11]:
def insert_car(tx, doc):
    """Write one car listing, its brand and its seller through ``tx``.

    The listing's fields are recovered by parsing the ``Key: value`` lines of
    ``doc.page_content``. Every value is passed on as the stripped text found
    there; a key that does not occur in the text is passed as ``None``.

    Four statements are issued in order: merge the Brand, merge the Seller,
    merge the Car and set its properties, then merge the BELONGS_TO and
    IS_SOLD_BY relationships.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        doc: Object with a ``page_content`` string and a ``metadata`` mapping
            holding an ``"id"`` entry, which is used as the Car node's id.
    """
    fields = {}
    for raw_line in doc.page_content.split("\n"):
        label, colon, remainder = raw_line.partition(":")
        if not colon:
            continue
        # Only the first colon separates key from value, so URLs stay intact.
        fields[label.strip()] = remainder.strip()

    brand_name = fields.get("Brand")
    seller_identity = {
        "seller": fields.get("Seller"),
        "phone": fields.get("Phone"),
    }

    tx.run("MERGE (b:Brand {name: $brand})", brand=brand_name)
    tx.run("MERGE (s:Seller {name: $seller, phone: $phone})", **seller_identity)

    car_id = doc.metadata["id"]

    # Cypher parameter name -> key as it appears in the document text.
    text_key_by_param = {
        "model": "Model",
        "year": "Year",
        "price": "Price",
        "fuel": "Fuel",
        "kilometers": "Kilometers",
        "gearbox": "Gearbox",
        "car_body": "Car body",
        "color": "Color",
        "registration": "Registration",
        "registered_to": "Registered to",
        "engine_power": "Engine power",
        "show_class": "Show class",
        "link": "Link",
        "image_url": "Image URL",
    }
    car_params = {"id": car_id}
    for param_name, text_key in text_key_by_param.items():
        car_params[param_name] = fields.get(text_key)

    tx.run(
        """
        MERGE (c:Car {id: $id})
        SET c.model = $model,
            c.year = $year,
            c.price = $price,
            c.fuel = $fuel,
            c.kilometers = $kilometers,
            c.gearbox = $gearbox,
            c.car_body = $car_body,
            c.color = $color,
            c.registration = $registration,
            c.registered_to = $registered_to,
            c.engine_power = $engine_power,
            c.show_class = $show_class,
            c.link = $link,
            c.image_url = $image_url
        """,
        **car_params
    )

    # Link the car to the brand and seller nodes merged above.
    tx.run(
        """
        MATCH (c:Car {id: $id})
        MATCH (b:Brand {name: $brand})
        MATCH (s:Seller {name: $seller, phone: $phone})
        MERGE (c)-[:BELONGS_TO]->(b)
        MERGE (c)-[:IS_SOLD_BY]->(s)
        """,
        id=car_id,
        brand=brand_name,
        **seller_identity
    )

# Insert every per-car document, each in its own managed write transaction.
with driver.session(database=database) as session:
    total_docs = len(documents)
    for i, doc in enumerate(documents, start=1):
        logger.info(f"Inserting document {i}/{total_docs}: ID={doc.metadata.get('id')}")
        # execute_write replaces write_transaction, which the driver reports
        # as deprecated; the (tx, *args) callback contract is the same.
        session.execute_write(insert_car, doc)

logger.info("All cars, brands, and sellers inserted with relationships")

2025-09-02 16:42:17,742 [INFO] Inserting document 1/4173: ID=0
  session.write_transaction(insert_car, doc)
2025-09-02 16:42:17,856 [INFO] Inserting document 2/4173: ID=1
2025-09-02 16:42:17,887 [INFO] Inserting document 3/4173: ID=2
2025-09-02 16:42:17,893 [INFO] Inserting document 4/4173: ID=3
2025-09-02 16:42:17,909 [INFO] Inserting document 5/4173: ID=4
2025-09-02 16:42:17,962 [INFO] Inserting document 6/4173: ID=5
2025-09-02 16:42:17,968 [INFO] Inserting document 7/4173: ID=6
2025-09-02 16:42:17,974 [INFO] Inserting document 8/4173: ID=7
2025-09-02 16:42:17,981 [INFO] Inserting document 9/4173: ID=8
2025-09-02 16:42:17,989 [INFO] Inserting document 10/4173: ID=9
2025-09-02 16:42:17,994 [INFO] Inserting document 11/4173: ID=10
2025-09-02 16:42:18,000 [INFO] Inserting document 12/4173: ID=11
2025-09-02 16:42:18,020 [INFO] Inserting document 13/4173: ID=12
2025-09-02 16:42:18,048 [INFO] Inserting document 14/4173: ID=13
2025-09-02 16:42:18,054 [INFO] Inserting document 15/4173: ID=14