In [16]:
from pathlib import Path
from wikipedia_markdown.utils.yaml import load_yaml
from wikipedia_markdown.utils.database import get_rows_from_ids

# All paths below are resolved relative to the parent of the current working directory.
base_path = Path("../")

# Read the run configuration file located directly under base_path.
config_path = base_path / "run_config.yaml"
config = load_yaml(config_path)

# The database location is <base_path>/<data_folder>/<db_file>, with both
# components taken from the loaded configuration.
data_folder = base_path / config["data_folder"]
db_file = config["db_file"]
db_path = data_folder / db_file

# Echo the resolved path, then report whether anything exists at that location.
print(f"Database path: {db_path}")
print("Database file exists." if db_path.exists() else "Database file does not exist.")

Database path: ../data/database.db
Database file exists.


In [17]:
# Article to inspect; swap in one of the commented ids to try a different page.
article_id = 61541 # Bobby Robson
# article_id = 17 # Adobe Illustrator
# article_id = 1

# Fetch the stored row(s) for the chosen id from the database.
rows = get_rows_from_ids(
    db_path=db_path,
    ids=[article_id],
)

# Fail with a descriptive message instead of a bare "list index out of range"
# when nothing comes back for the id. The exception type is still IndexError,
# i.e. the same type that indexing an empty result with rows[0] would raise.
# NOTE(review): assumes get_rows_from_ids returns an empty sequence for an
# unknown id rather than raising on its own — confirm against the helper.
if len(rows) == 0:
    raise IndexError(f"No article with id {article_id} found in {db_path}")

# Keep the first (and only requested) row for the cells that follow.
article = rows[0]
print(f"Article found: {article['title']}")

Article found: Bobby Robson


In [18]:
# Debug aid, currently disabled: uncomment to print the article's markdown_text field.
# print(article["markdown_text"])

In [19]:
from wikipedia_markdown.clean_markdown.openrouter import clean_long_text, clean_text
from transformers import AutoTokenizer
from dotenv import load_dotenv
from os import getenv

# Populate the process environment from a local .env file before any lookups.
# NOTE(review): the earlier comment here mentioned the OpenRouter API key, but
# only the Hugging Face token is read in this cell — presumably the cleaning
# helpers pick up the OpenRouter key from the environment themselves. Confirm.
load_dotenv()
huggingface_token = getenv("HUGGINGFACE_TOKEN")

# Active model pair: the OpenRouter model id, and the Hugging Face repo whose
# tokenizer is loaded below.
model_openrouter, model_hf = "deepseek/deepseek-chat", "deepseek-ai/DeepSeek-V3"

# Alternative pairs, kept for quick switching:
# model_openrouter, model_hf = "mistralai/mistral-nemo", "mistralai/Mistral-Nemo-Instruct-2407"
# model_openrouter, model_hf = "openai/gpt-4o-mini", "Xenova/gpt-4o"

# Tokenizer for the selected Hugging Face repo, authenticated with the token read above.
tokenizer = AutoTokenizer.from_pretrained(model_hf, token=huggingface_token)

# Prompt templates are read from prompts.yaml under base_path.
prompts = load_yaml(base_path / "prompts.yaml")

In [20]:
# Display the token count stored on the article row; the cleaning cell below branches on this value.
article["markdown_text_tokens"]

1057

In [21]:
# Articles whose stored token count exceeds 5000 go through the long-text
# helper; everything else is cleaned with a single clean_text call.
is_long = article["markdown_text_tokens"] > 5000

# Keyword arguments that both cleaning helpers receive.
shared_kwargs = {
    "model_openrouter": model_openrouter,
    "text": article["markdown_text"],
    "template": prompts["clean_markdown"],
}

# The long-text helper additionally takes the tokenizer and a token budget.
# The budget is 7000 rather than the 8k maximum to leave room for the prompt.
extra_kwargs = {"tokenizer": tokenizer, "max_tokens": 7000} if is_long else {}

cleaner = clean_long_text if is_long else clean_text
article_formatted = cleaner(**shared_kwargs, **extra_kwargs)

print(article_formatted)

# Bobby Robson

Sir Robert William "Bobby" Robson (18 February 1933 – 31 July 2009) was an English association footballer and football team manager. As a player, he played almost 600 games and scored over 100 goals. He also played for the England national team. He scored four goals in 20 games for them.

Robson managed Fulham F.C. and Ipswich Town F.C. (F.C. means football club). He then managed the England national football team for eight years, from 1982 to 1990. He led England to the semi-finals of the World Cup in 1990. He left England to manage other football teams in Europe. These included clubs from The Netherlands, Portugal, and Spain. Robson came back to England to manage Newcastle United F.C. in 1999.

Robson was diagnosed with lung cancer in 2007. It was described as terminal. With no hope of recovery, two years later, he died in the year 2009.

## Early life

Robson was born on 18 February 1933 in Sacriston, County Durham. He was the fourth of five brothers. As a young chil

In [22]:
# Print the original markdown_text for side-by-side comparison with the cleaned output printed above.
print(article["markdown_text"])


# Bobby Robson

Sir Robert William "Bobby" Robson  (18 February 1933 – 31 July 2009) was an English association footballer and football team manager. As a player, he played almost 600 games and scored over 100 goals. He also played for the England national team. He scored four goals in 20 games for them.

Robson managed Fulham F.C. and Ipswich Town F.C.. (F.C. means football club). He then managed the England national football team for eight years, from 1982 to 1990.  He led England to the semi-finals of the World Cup in 1990.  He left England to manage other football teams in Europe.  These included clubs from The Netherlands, Portugal and Spain.  Robson came back to England to manage Newcastle United F.C. in 1999.

Robson was diagnosed with lung cancer in 2007. It was described as terminal. With no hope of recovery, two years later, he died in the year 2009. 
## Early life

Robson was born on 18 February 1933 in Sacriston, County Durham.  He was the fourth of five brothers.  As a yo