In [22]:
from pathlib import Path
from wikipedia_markdown.utils.yaml import load_yaml
from wikipedia_markdown.utils.database import get_rows_from_ids

# Project root, expressed relative to the notebook's working directory (one level up)
base_path = Path("../")

# Read the run configuration stored at the project root
config_path = base_path / "run_config.yaml"
config = load_yaml(config_path)

# Database location is <root>/<data_folder>/<db_file>; both parts come from the config
data_folder = base_path / config["data_folder"]
db_file = config["db_file"]
db_path = data_folder / db_file

# Show the resolved path so it can be checked by eye
print(f"Database path: {db_path}")

# Initialize the database only when the file is missing; otherwise just report it
if not db_path.exists():
    print("Database file does not exist. Initializing the database...")
    from wikipedia_markdown.utils.database import initialize_db
    initialize_db(db_path)
else:
    print("Database file exists.")

Database path: ../data/database.db
Database file exists.


In [39]:
# Id of the article to inspect. Other ids kept for reference:
#   61541 -> Bobby Robson
#   17    -> Adobe Illustrator
article_id = 1

rows = get_rows_from_ids(
    db_path=db_path,
    ids=[article_id],
)

# Fail with a readable message instead of a bare "list index out of range"
# when the lookup comes back empty (assumes an unknown id yields an empty
# result rather than an exception -- TODO confirm against get_rows_from_ids).
if len(rows) == 0:
    raise IndexError(f"No article with id {article_id} found in {db_path}")

# Only one id was requested, so the first row is the article
article = rows[0]
print(f"Article found: {article['title']}")

Article found: April


In [6]:
# print(article["markdown_text"])

In [40]:
from wikipedia_markdown.clean_markdown import clean_long_text, clean_text
from transformers import AutoTokenizer
from dotenv import load_dotenv
from os import getenv

# Pull variables from a local .env file into the process environment.
# The Hugging Face token is read right below; presumably the OpenRouter API
# key used by the cleaning helpers is loaded the same way -- TODO confirm.
load_dotenv()
huggingface_token = getenv("HUGGINGFACE_TOKEN")

# Model pairing: the OpenRouter model id and the Hugging Face repo whose
# tokenizer is loaded below. Other pairings kept for reference (swap both
# values together):
#   model_openrouter = "deepseek/deepseek-chat"
#   model_hf = "deepseek-ai/DeepSeek-V3"
#
#   model_openrouter = "mistralai/mistral-nemo"
#   model_hf = "mistralai/Mistral-Nemo-Instruct-2407"
model_hf = "Xenova/gpt-4o"
model_openrouter = "openai/gpt-4o-mini"

tokenizer = AutoTokenizer.from_pretrained(model_hf, token=huggingface_token)

# Prompt templates live in prompts.yaml at the project root
prompts_path = base_path / "prompts.yaml"
prompts = load_yaml(prompts_path)

In [41]:
# Token count recorded for this article; the cleaning cell compares it against
# its long-article threshold to decide which cleaning helper to call.
article["markdown_text_tokens"]

4246

In [42]:
# Decide the route first: articles above 5000 tokens go to clean_long_text,
# everything else goes to clean_text.
is_long_article = article["markdown_text_tokens"] > 5000

# Arguments that both cleaners receive
shared_args = {
    "model_openrouter": model_openrouter,
    "text": article["markdown_text"],
    "template": prompts["clean_markdown_2"],
}

if is_long_article:
    article_formatted = clean_long_text(
        **shared_args,
        tokenizer=tokenizer,
        max_tokens=7000,  # limit is 8k; the remainder is left for the prompt
    )
else:
    article_formatted = clean_text(**shared_args)

print(article_formatted)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1072b38e0>>
Traceback (most recent call last):
  File "/Users/fernando/Documents/GitHub/wikipedia-markdown/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [35]:
# Print the article's stored markdown (the same field passed to the cleaner as
# `text`) so it can be compared with the cleaned output.
print(article["markdown_text"])


# Adobe Illustrator

Adobe Illustrator is a computer program for making graphic design and illustrations. It is made by Adobe Systems. Pictures created in Adobe Illustrator can be made bigger or smaller, and look exactly the same at any size. It works well with the rest of the products with the Adobe name.
## History

It was first released in 1986 for the Apple Macintosh. The latest version is Adobe Illustrator 2024, part of Adobe Creative Cloud.
## Release history

<table><tr><td> Version </td><td> Platforms </td><td> Release date </td><td> Code name
</td></tr><tr><td> 1.0 </td><td> Mac OS </td><td> January 1987 </td><td>  Picasso
</td></tr><tr><td> 1.1 </td><td> Mac OS </td><td> 19 March 1987 </td><td> Inca
</td></tr><tr><td> 88 </td><td> Mac OS </td><td> March 1988 </td><td>
</td></tr><tr><td> 2.0 </td><td> Windows </td><td> January 1989 </td><td> Pinnacle
</td></tr><tr><td> 3 </td><td> Mac OS, NeXT, other Unices </td><td> October 1990 </td><td> Desert Moose
</td></tr><tr><td> 3.5 