In [1]:
# Mount Google Drive so the project files stored under "My Drive" are reachable.
# The import is guarded so the notebook can still be executed outside Colab,
# matching how the later cells treat a missing google.colab module.
try:
    from google.colab import drive
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
else:
    # force_remount=True asks Colab to mount again even if Drive is already mounted.
    drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


# Run `pip install -r requirements.txt` before continuing with the rest of the notebook

In [2]:
# Optional in-notebook installation of the required packages.
# Ideally these live in requirements.txt and are installed once per environment,
# but a notebook that is meant to be self-contained for easy sharing/running
# commonly keeps the command here. It is commented out; uncomment it if needed.
# !pip install trafilatura sentence-transformers torch pandas pyarrow duckdb scipy -q

import sys
import os
import warnings

# Silence the FutureWarning emitted from huggingface_hub.file_download
# (commonly triggered when using the sentence-transformers library).
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="huggingface_hub.file_download",
)

# Root of the WebKnoGraph project on the mounted Drive. Change this value if
# the notebook lives somewhere else relative to the 'src' folder.
project_root = "/content/drive/My Drive/WebKnoGraph"

# Put the project root at the front of the import search path exactly once,
# so that `from src...` imports resolve.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

print("Project root added to sys.path: " + project_root)
print("Current working directory:", os.getcwd())
print("sys.path:", sys.path)

# Mount Google Drive when running inside Colab.
# Outside Colab the import fails and the mount is skipped with a message;
# any other failure is reported but does not stop the notebook.
try:
    from google.colab import drive

    # An existing "My Drive" folder means Drive is already mounted.
    if os.path.exists("/content/drive/My Drive"):
        print("Google Drive already mounted.")
    else:
        drive.mount("/content/drive/")
        print("Google Drive mounted successfully.")
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")

# Libraries used by the pipeline and UI cells below
import gradio as gr  # UI components (Blocks, Textbox, Number, Button, Markdown)
import io  # io.StringIO buffer that collects log text for the UI
import duckdb  # counts the saved embeddings for the final summary
import pandas as pd  # not referenced by name in the visible cells; batches become DataFrames via to_pandas()
from tqdm.auto import tqdm  # progress bar around per-document text extraction
import traceback  # formats stack traces into the logged error messages

# Project modules that make up the embedding pipeline
from src.backend.config.embeddings_config import EmbeddingConfig
from src.backend.data.embedding_state_manager import EmbeddingStateManager
from src.backend.data.embeddings_loader import DataLoader
from src.backend.data.embeddings_saver import DataSaver
from src.backend.utils.text_processing import TextExtractor
from src.backend.utils.embedding_generation import EmbeddingGenerator
from src.backend.services.embeddings_service import EmbeddingPipeline
from src.shared.logging_config import (
    ConsoleAndGradioLogger,
)  # logger constructed with the StringIO stream whose contents are shown in the UI

print("All modules imported successfully!")

Project root added to sys.path: /content/drive/My Drive/WebKnoGraph
Current working directory: /content
sys.path: ['/content/drive/My Drive/WebKnoGraph', '/content', '/env/python', '/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/usr/local/lib/python3.11/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.11/dist-packages/IPython/extensions', '/usr/local/lib/python3.11/dist-packages/setuptools/_vendor', '/root/.ipython']
Google Drive already mounted.
All modules imported successfully!


In [3]:
# File: embeddings_ui.ipynb - Cell 3
def run_gradio_interface(input_path: str, output_path: str, batch_size: int):
    """Wire up all pipeline components, run the pipeline, and stream UI updates.

    Args:
        input_path: Path passed to the data loader as the input location.
        output_path: Path passed to the state manager and data saver; also the
            folder whose ``*.parquet`` files are counted for the final summary.
        batch_size: Number of pages per batch. Coerced to ``int`` because a
            Gradio number field can deliver a float (or ``None`` when cleared).

    Yields:
        ``(status, logs, summary_markdown)`` tuples. ``summary_markdown`` is an
        empty string for every intermediate update and is filled in only by
        the final yield.

    Raises:
        Exception: Any critical pipeline error is re-raised after its error
            status has been yielded.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream, logger_name="EmbeddingLogger")

    # Validate the batch size up front so a bad UI value produces a readable
    # status instead of failing somewhere inside the pipeline components.
    try:
        batch_size = int(batch_size)
    except (TypeError, ValueError, OverflowError):
        batch_size = 0
    if batch_size < 1:
        invalid_msg = (
            "Status: Invalid batch size. Please enter a whole number of at least 1."
        )
        logger.error(invalid_msg)
        yield invalid_msg, log_stream.getvalue(), ""
        return

    config = EmbeddingConfig(
        input_path=input_path, output_path=output_path, batch_size=batch_size
    )

    # Instantiate all pipeline components
    state_manager = EmbeddingStateManager(config.output_path, logger)
    data_loader = DataLoader(config.input_path, logger)
    text_extractor = TextExtractor()
    embedding_generator = EmbeddingGenerator(config.model_name, logger)
    data_saver = DataSaver(config.output_path, logger)

    # Local subclass that overrides run() so each batch is saved right after it
    # is embedded. The same change could instead be applied directly in
    # src/backend/services/embeddings_service.py.
    class FixedEmbeddingPipeline(EmbeddingPipeline):
        """EmbeddingPipeline whose run() saves every batch after embedding it."""

        def run(self):
            """Run the embedding pipeline batch by batch.

            Yields:
                Human-readable status strings for the Gradio UI.

            Raises:
                Exception: Re-raised after logging and yielding a critical
                    error status.
            """
            self.logger.info("Starting embedding pipeline...")
            yield "Status: Initializing Pipeline..."

            try:
                processed_urls = self.state_manager.get_processed_urls()
                self.logger.info(
                    f"Found {len(processed_urls)} URLs that have already been processed. They will be skipped."
                )
                yield f"Status: Resuming, skipping {len(processed_urls)} already processed URLs."

                self.logger.info("Querying for new pages to process...")
                data_iterator = self.data_loader.stream_unprocessed_data(
                    processed_urls=processed_urls, batch_size=self.config.batch_size
                )
                yield "Status: Loading new data..."

                total_processed_in_session = 0
                for batch_no, df_batch_arrow in enumerate(data_iterator, start=1):
                    if df_batch_arrow.num_rows == 0:
                        self.logger.info(f"Batch {batch_no} is empty, skipping.")
                        yield f"Status: Processed Batch {batch_no}: Empty."
                        continue

                    self.logger.info(
                        f"Processing Batch {batch_no} ({df_batch_arrow.num_rows} pages)..."
                    )

                    # Convert the Arrow batch to a pandas DataFrame so columns can be added
                    df_batch = df_batch_arrow.to_pandas()

                    # Extract clean text from the "Content" column
                    df_batch["clean_text"] = [
                        self.text_extractor.extract(html_content)
                        for html_content in tqdm(
                            df_batch["Content"],
                            desc="Extracting Text",
                            leave=False,
                            unit="docs",
                        )
                    ]

                    # Keep only rows with non-blank string text. The isinstance
                    # check also drops None/NaN results, which a plain
                    # `.str.strip().astype(bool)` would keep (NaN is truthy).
                    original_count = len(df_batch)
                    has_text = df_batch["clean_text"].map(
                        lambda text: isinstance(text, str) and bool(text.strip())
                    )
                    # .copy() gives an independent frame, so adding the
                    # "embedding" column below does not write into a slice.
                    df_batch = df_batch.loc[has_text.astype(bool)].copy()
                    filtered_count = original_count - len(df_batch)
                    if filtered_count > 0:
                        self.logger.warning(
                            f"Filtered out {filtered_count} pages with no extractable text in Batch {batch_no}."
                        )

                    if df_batch.empty:
                        self.logger.warning(
                            f"Batch {batch_no} resulted in no extractable text after filtering, skipping."
                        )
                        yield f"Status: Processed Batch {batch_no}: No valid text extracted."
                        continue

                    # Generate embeddings
                    self.logger.info(f"Generating Embeddings for Batch {batch_no}...")
                    try:
                        df_batch["embedding"] = self.embedding_generator.generate(
                            df_batch["clean_text"].tolist()
                        ).tolist()
                    except Exception as e:
                        # The logger's error() is called with a message only, so
                        # the traceback is embedded in the message text.
                        error_message = f"Error generating embeddings for Batch {batch_no}: {e}\n{traceback.format_exc()}"
                        self.logger.error(error_message)
                        # Surface the failure in the UI before moving on.
                        yield f"Status: Batch {batch_no} failed during embedding generation, skipping. Check logs."
                        continue

                    # Persist the embeddings immediately, so finished batches
                    # are already on disk if a later batch fails.
                    self.data_saver.save_embeddings_batch(df_batch)

                    # TODO: no explicit processed-URL state is written here.
                    # If EmbeddingStateManager gains an
                    # update_processed_urls(new_urls) method, call it with
                    # df_batch["URL"].tolist() at this point.
                    total_processed_in_session += len(df_batch)

                    yield f"Status: Processed Batch {batch_no} ({len(df_batch)} embeddings generated). Total in session: {total_processed_in_session}"

                self.logger.info("Embedding pipeline finished successfully.")
                yield "Status: Pipeline Finished Successfully!"

            except Exception as e:
                error_message = (
                    f"A critical pipeline error occurred: {e}\n{traceback.format_exc()}"
                )
                self.logger.error(error_message)
                yield f"Status: Critical Error! Check logs. Error: {e}"
                raise

    pipeline = FixedEmbeddingPipeline(
        config,
        logger,
        state_manager,
        data_loader,
        text_extractor,
        embedding_generator,
        data_saver,
    )

    final_status = "Initializing..."
    for status in pipeline.run():
        final_status = status
        # Stream the current status together with the full log content
        yield status, log_stream.getvalue(), ""

    # Build the final summary once the pipeline has finished
    try:
        # DuckDB expects forward slashes, even on Windows
        output_glob_path = os.path.join(output_path, "*.parquet").replace(os.sep, "/")
        # The path comes from a UI text box: double any single quote so it
        # cannot terminate the SQL string literal.
        escaped_glob_path = output_glob_path.replace("'", "''")
        total_embeddings = duckdb.query(
            f"SELECT COUNT(URL) FROM read_parquet('{escaped_glob_path}')"
        ).fetchone()[0]
        summary_md = f"### ✅ Pipeline Finished\n\n- **Final Status:** {final_status}\n- **Total embeddings saved:** {total_embeddings}\n- **Output location:** `{output_path}`"
    except Exception as e:
        logger.error(f"Could not generate final summary. Error: {e}")
        summary_md = (
            f"### Pipeline Finished\n\n- Could not generate summary. Error: {e}"
        )

    yield final_status, log_stream.getvalue(), summary_md

In [4]:
# File: embeddings_ui.ipynb - Cell 4
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Resumable Embedding Pipeline")
    gr.Markdown(
        "This tool reads HTML from Parquet files, cleans it, generates embeddings, and saves the results in batches. It can be stopped and resumed at any time."
    )

    with gr.Row():
        # Left column: user-editable configuration, pre-filled from EmbeddingConfig
        with gr.Column(scale=1):
            gr.Markdown("## 1. Configuration")
            input_path_box = gr.Textbox(
                label="Input Parquet Folder Path", value=EmbeddingConfig.input_path
            )
            output_path_box = gr.Textbox(
                label="Output Embeddings Directory Path",
                value=EmbeddingConfig.output_path,
            )
            batch_size_input = gr.Number(
                minimum=1,
                maximum=5,
                value=EmbeddingConfig.batch_size,
                step=1,
                # precision=0 makes Gradio round the value and pass it as an int,
                # matching the `batch_size: int` parameter of the callback.
                precision=0,
                label="Batch Size",
                info="How many pages to process in memory at a time (1/2 recommended).",
            )
            start_button = gr.Button(
                "🚀 Start/Resume Embedding Generation", variant="primary"
            )

        # Right column: live status, accumulated logs, and the final summary
        with gr.Column(scale=2):
            gr.Markdown("## 2. Status & Results")
            status_output = gr.Textbox(label="Current Status", interactive=False)
            log_output = gr.Textbox(
                label="Detailed Logs", interactive=False, lines=10, max_lines=20
            )
            summary_output = gr.Markdown("---")

    # run_gradio_interface is a generator: each yielded
    # (status, logs, summary) tuple refreshes the three outputs in order.
    start_button.click(
        fn=run_gradio_interface,
        inputs=[input_path_box, output_path_box, batch_size_input],
        outputs=[status_output, log_output, summary_output],
    )

In [None]:
# File: embeddings_ui.ipynb - Cell 5
# --- Launch the Application ---
if __name__ == "__main__":
    # Step 1: make sure Google Drive is available when running in Colab.
    # This is kept separate from the launch so that a missing google.colab
    # module (or a mount problem) is reported as such and does not prevent
    # the UI from starting.
    try:
        from google.colab import drive

        # An existing "My Drive" folder means Drive is already mounted.
        if not os.path.exists("/content/drive/My Drive"):
            drive.mount("/content/drive/")
            print("Google Drive mounted successfully.")
        else:
            print("Google Drive already mounted.")
    except ImportError:
        print("Not running in Google Colab environment. Skipping Google Drive mount.")
    except Exception as e:
        print(f"Error mounting Google Drive: {e}")

    # Step 2: launch the Gradio app.
    try:
        demo.launch(debug=True, share=True)
    except Exception as e:
        print("Could not launch Gradio demo in this environment.")
        print(e)
        # Show the full stack trace so the launch failure can be diagnosed.
        traceback.print_exc()

Google Drive already mounted.
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3d946586003c66fa05.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


2025-06-28 18:04:09,784 - INFO - Loading embedding model: nomic-ai/nomic-embed-text-v1.5...
INFO:EmbeddingLogger:Loading embedding model: nomic-ai/nomic-embed-text-v1.5...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_hf_nomic_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

2025-06-28 18:04:26,079 - INFO - Model loaded successfully.
INFO:EmbeddingLogger:Model loaded successfully.
2025-06-28 18:04:26,083 - INFO - Starting embedding pipeline...
INFO:EmbeddingLogger:Starting embedding pipeline...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2025-06-28 18:04:31,336 - INFO - Found 17 URLs that have already been processed. They will be skipped.
INFO:EmbeddingLogger:Found 17 URLs that have already been processed. They will be skipped.
2025-06-28 18:04:31,338 - INFO - Found 17 URLs that have already been processed. They will be skipped.
INFO:EmbeddingLogger:Found 17 URLs that have already been processed. They will be skipped.
2025-06-28 18:04:31,341 - INFO - Querying for new pages to process...
INFO:EmbeddingLogger:Querying for new pages to process...
2025-06-28 18:04:31,349 - INFO - Querying for new pages to process...
INFO:EmbeddingLogger:Querying for new pages to process...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2025-06-28 18:05:12,530 - INFO - Processing Batch 1 (1 pages)...
INFO:EmbeddingLogger:Processing Batch 1 (1 pages)...


Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:12,669 - INFO - Generating Embeddings for Batch 1...
INFO:EmbeddingLogger:Generating Embeddings for Batch 1...
2025-06-28 18:05:12,672 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:13,846 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180513_f6ec8218bb454c70ab29d28b289538a3.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180513_f6ec8218bb454c70ab29d28b289538a3.parquet
2025-06-28 18:05:13,893 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180513_f6ec8218bb454c70ab29d28b289538a3.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180513_f6ec8218bb454c70ab29d28b289538a3.parquet
2025

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:13,966 - INFO - Generating Embeddings for Batch 2...
INFO:EmbeddingLogger:Generating Embeddings for Batch 2...
2025-06-28 18:05:13,971 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:14,968 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180514_63e102befa864724894e3d33cbf5ef49.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180514_63e102befa864724894e3d33cbf5ef49.parquet
2025-06-28 18:05:14,987 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180514_63e102befa864724894e3d33cbf5ef49.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180514_63e102befa864724894e3d33cbf5ef49.parquet
2025

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:15,232 - INFO - Generating Embeddings for Batch 3...
INFO:EmbeddingLogger:Generating Embeddings for Batch 3...
2025-06-28 18:05:15,236 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:16,235 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180516_7ff5f9aa6542409c8133b3d25ce8237b.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180516_7ff5f9aa6542409c8133b3d25ce8237b.parquet
2025-06-28 18:05:16,253 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180516_7ff5f9aa6542409c8133b3d25ce8237b.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180516_7ff5f9aa6542409c8133b3d25ce8237b.parquet
2025

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:16,355 - INFO - Processing Batch 5 (1 pages)...
INFO:EmbeddingLogger:Processing Batch 5 (1 pages)...


Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:16,445 - INFO - Processing Batch 6 (1 pages)...
INFO:EmbeddingLogger:Processing Batch 6 (1 pages)...


Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:16,600 - INFO - Processing Batch 7 (1 pages)...
INFO:EmbeddingLogger:Processing Batch 7 (1 pages)...


Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:16,729 - INFO - Generating Embeddings for Batch 7...
INFO:EmbeddingLogger:Generating Embeddings for Batch 7...
2025-06-28 18:05:16,736 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:23,480 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180523_07611f4d761140d4bf877bc44dc5605b.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180523_07611f4d761140d4bf877bc44dc5605b.parquet
2025-06-28 18:05:23,499 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180523_07611f4d761140d4bf877bc44dc5605b.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180523_07611f4d761140d4bf877bc44dc5605b.parquet
2025

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:23,596 - INFO - Generating Embeddings for Batch 8...
INFO:EmbeddingLogger:Generating Embeddings for Batch 8...
2025-06-28 18:05:23,597 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:23,981 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180523_654f3368c6c648c9a75736ad7815fd92.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180523_654f3368c6c648c9a75736ad7815fd92.parquet
2025-06-28 18:05:23,997 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180523_654f3368c6c648c9a75736ad7815fd92.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180523_654f3368c6c648c9a75736ad7815fd92.parquet
2025

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:24,080 - INFO - Generating Embeddings for Batch 9...
INFO:EmbeddingLogger:Generating Embeddings for Batch 9...
2025-06-28 18:05:24,085 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:25,700 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180525_85f6646b260349169d4b256ee32da8fd.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180525_85f6646b260349169d4b256ee32da8fd.parquet
2025-06-28 18:05:25,717 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180525_85f6646b260349169d4b256ee32da8fd.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180525_85f6646b260349169d4b256ee32da8fd.parquet
2025

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:25,784 - INFO - Processing Batch 11 (1 pages)...
INFO:EmbeddingLogger:Processing Batch 11 (1 pages)...


Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:26,095 - INFO - Generating Embeddings for Batch 11...
INFO:EmbeddingLogger:Generating Embeddings for Batch 11...
2025-06-28 18:05:26,098 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:31,568 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180531_11009156611748cfa24ee3780e745372.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180531_11009156611748cfa24ee3780e745372.parquet
2025-06-28 18:05:31,585 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180531_11009156611748cfa24ee3780e745372.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180531_11009156611748cfa24ee3780e745372.parquet
20

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:31,726 - INFO - Generating Embeddings for Batch 12...
INFO:EmbeddingLogger:Generating Embeddings for Batch 12...
2025-06-28 18:05:31,732 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:37,204 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180537_82781a12169e48cc81996db0dae5d6ca.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180537_82781a12169e48cc81996db0dae5d6ca.parquet
2025-06-28 18:05:37,223 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180537_82781a12169e48cc81996db0dae5d6ca.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180537_82781a12169e48cc81996db0dae5d6ca.parquet
20

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:37,374 - INFO - Generating Embeddings for Batch 13...
INFO:EmbeddingLogger:Generating Embeddings for Batch 13...
2025-06-28 18:05:37,376 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:44,283 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180544_77b06cd2dc9a466eb5cb4596275d502d.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180544_77b06cd2dc9a466eb5cb4596275d502d.parquet
2025-06-28 18:05:44,300 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180544_77b06cd2dc9a466eb5cb4596275d502d.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180544_77b06cd2dc9a466eb5cb4596275d502d.parquet
20

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:44,406 - INFO - Generating Embeddings for Batch 14...
INFO:EmbeddingLogger:Generating Embeddings for Batch 14...
2025-06-28 18:05:44,408 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:05:46,548 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180546_08711b7330fb4845bf74b164ea6482a1.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180546_08711b7330fb4845bf74b164ea6482a1.parquet
2025-06-28 18:05:46,566 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180546_08711b7330fb4845bf74b164ea6482a1.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180546_08711b7330fb4845bf74b164ea6482a1.parquet
20

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:05:46,677 - INFO - Generating Embeddings for Batch 15...
INFO:EmbeddingLogger:Generating Embeddings for Batch 15...
2025-06-28 18:05:46,681 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:06:03,173 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180603_28cae06d49b346d998fe1a88b246110e.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180603_28cae06d49b346d998fe1a88b246110e.parquet
2025-06-28 18:06:03,200 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180603_28cae06d49b346d998fe1a88b246110e.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180603_28cae06d49b346d998fe1a88b246110e.parquet
20

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:06:03,302 - INFO - Processing Batch 17 (1 pages)...
INFO:EmbeddingLogger:Processing Batch 17 (1 pages)...


Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:06:03,411 - INFO - Generating Embeddings for Batch 17...
INFO:EmbeddingLogger:Generating Embeddings for Batch 17...
2025-06-28 18:06:03,414 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
2025-06-28 18:06:13,842 - INFO - Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180613_d8c8a471df124ce981b121d9ddbcb565.parquet
INFO:EmbeddingLogger:Saving new Parquet file: /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180613_d8c8a471df124ce981b121d9ddbcb565.parquet
2025-06-28 18:06:13,855 - INFO - Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180613_d8c8a471df124ce981b121d9ddbcb565.parquet
INFO:EmbeddingLogger:Saved batch of 1 embeddings to /content/drive/My Drive/WebKnoGraph/data/url_embeddings/embeddings_batch_20250628_180613_d8c8a471df124ce981b121d9ddbcb565.parquet
20

Extracting Text:   0%|          | 0/1 [00:00<?, ?docs/s]

2025-06-28 18:06:13,936 - INFO - Generating Embeddings for Batch 18...
INFO:EmbeddingLogger:Generating Embeddings for Batch 18...
2025-06-28 18:06:13,938 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...
