In [None]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json

In [None]:
endpoint_url = "https://query.wikidata.org/sparql"

# SPARQL: Get all mushrooms (with any of P783–P789 properties) and their English Wikipedia article
query = """
SELECT DISTINCT ?item ?itemLabel ?article
WHERE {
  ?item (wdt:P783|wdt:P784|wdt:P785|wdt:P786|wdt:P787|wdt:P788|wdt:P789) [] .
  
  OPTIONAL {
    ?article schema:about ?item ;
             schema:isPartOf <https://en.wikipedia.org/> .
  }

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""

def get_results(endpoint_url, query):
    user_agent = f"WDQS-example Python/{sys.version_info[0]}.{sys.version_info[1]}"
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

try:
    results = get_results(endpoint_url, query)
    bindings = results["results"]["bindings"]
    print(f"Successfully fetched {len(bindings)} results.\n")

    # Extract mushroom name and article link
    data = []
    for result in bindings:
        name = result["itemLabel"]["value"]
        article = result.get("article", {}).get("value", None)
        if article:  # Only keep those with an article
            data.append({"mushroom": name, "article": article})

    # Display nicely
    df = pd.DataFrame(data)
    print(df.head(10))

except Exception as e:
    print(f"An error occurred: {e}")

# Save as json file
with open("mushroom_data.json", "w") as f:
    json.dump(data, f, indent=2)

# Save as csv file
df.to_csv("data/mushroom_data.csv", index=False)

# Save as json file
with open("data/mushroom_data.json", "w") as f:
    json.dump(data, f, indent=2)

Successfully fetched 1342 results.

                   mushroom                                            article
0          Russula maculata     https://en.wikipedia.org/wiki/Russula_maculata
1            Russula vinosa       https://en.wikipedia.org/wiki/Russula_vinosa
2  Gymnopilus pseudofulgens  https://en.wikipedia.org/wiki/Gymnopilus_pseud...
3    Gymnopilus josserandii  https://en.wikipedia.org/wiki/Gymnopilus_josse...
4  Albatrellus subrubescens  https://en.wikipedia.org/wiki/Albatrellus_subr...
5       Lactarius lignyotus  https://en.wikipedia.org/wiki/Lactarius_lignyotus
6             Amanita verna        https://en.wikipedia.org/wiki/Amanita_verna
7            lethal webcaps  https://en.wikipedia.org/wiki/Cortinarius_rube...
8         Russula betularum    https://en.wikipedia.org/wiki/Russula_betularum
9                  Galerina             https://en.wikipedia.org/wiki/Galerina


In [None]:
# Save as csv file
df.to_csv("data/mushroom_data.csv", index=False)

# Save as json file
with open("data/mushroom_data.json", "w") as f:
    json.dump(data, f, indent=2)


In [87]:
# Save as json file
import json
with open("mushroom_data.json", "w") as f:
    json.dump(data, f, indent=2)

In [90]:
# Print number of results
print(f"Number of results: {len(results['results']['bindings'])}")

Number of results: 1342


In [89]:
import os, time, requests, urllib.parse, json, re

API_URL = "https://en.wikipedia.org/w/api.php"
SAVE_DIR = "data/raw_articles_wikitext"
os.makedirs(SAVE_DIR, exist_ok=True)

# --- Load your Wikidata-derived JSON ---
with open("mushroom_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Number of entries in JSON file: {len(data)}")

# Extract titles from article URLs
def extract_title(article_url):
    """Extract the Wikipedia page title from a full URL."""
    return urllib.parse.unquote(article_url.split("/")[-1])

titles = []
for entry in data:
    article_url = entry.get("article")
    if article_url and article_url.startswith("https://en.wikipedia.org/wiki/"):
        titles.append(extract_title(article_url))
    else:
        print(f"⚠️ Skipping invalid or missing article for: {entry.get('mushroom', 'Unknown')}")

# --- User-Agent (Wikipedia requirement) ---
session = requests.Session()
session.headers.update({
    "User-Agent": "DTU-MushroomGraph/1.0 (contact: your.email@example.com)"
})

def get_wikitext(title):
    """Fetch the full wikitext of a Wikipedia page."""
    params = {
        "action": "query",
        "format": "json",
        "formatversion": "2",
        "redirects": "1",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "titles": title
    }
    r = session.get(API_URL, params=params, timeout=30)
    if r.status_code != 200:
        return None
    js = r.json()
    pages = js.get("query", {}).get("pages", [])
    if not pages or "missing" in pages[0]:
        return None
    return pages[0]["revisions"][0]["slots"]["main"]["content"]

def save_wikitext(title):
    """Download and save the wikitext for a Wikipedia article title."""
    safe = urllib.parse.unquote(title).replace("/", "_")
    path = os.path.join(SAVE_DIR, f"{safe}.txt")

    # Skip if file already exists
    if os.path.exists(path):
        print(f"⏭️  Skipping {title} (already exists)")
        return True

    text = get_wikitext(title)
    if not text:
        print(f"❌ Failed to get {title}")
        return False

    # Remove embedded image/file references
    text = re.sub(r"\[\[(?:File|Image):[^\]]+\]\]", "", text)

    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

    time.sleep(0.5)  # be polite to the API
    return True

failed = []
for i, title in enumerate(titles, start=1):
    ok = save_wikitext(title)
    status = "Saved" if ok else "Failed"
    print(f"[{i}/{len(titles)}] {status} {title}")
    if not ok:
        failed.append(title)

if failed:
    os.makedirs("data", exist_ok=True)
    with open("data/failed_wikitext.json", "w", encoding="utf-8") as f:
        json.dump(failed, f, indent=2)
    print(f"Completed with {len(failed)} failures.")
else:
    print("Completed successfully.")

Number of entries in JSON file: 1161
⏭️  Skipping Russula_maculata (already exists)
[1/1161] Saved Russula_maculata
⏭️  Skipping Russula_vinosa (already exists)
[2/1161] Saved Russula_vinosa
⏭️  Skipping Gymnopilus_pseudofulgens (already exists)
[3/1161] Saved Gymnopilus_pseudofulgens
⏭️  Skipping Gymnopilus_josserandii (already exists)
[4/1161] Saved Gymnopilus_josserandii
⏭️  Skipping Albatrellus_subrubescens (already exists)
[5/1161] Saved Albatrellus_subrubescens
⏭️  Skipping Lactarius_lignyotus (already exists)
[6/1161] Saved Lactarius_lignyotus
⏭️  Skipping Amanita_verna (already exists)
[7/1161] Saved Amanita_verna
⏭️  Skipping Cortinarius_rubellus (already exists)
[8/1161] Saved Cortinarius_rubellus
⏭️  Skipping Russula_betularum (already exists)
[9/1161] Saved Russula_betularum
⏭️  Skipping Galerina (already exists)
[10/1161] Saved Galerina
⏭️  Skipping Tricholoma_sulphureum (already exists)
[11/1161] Saved Tricholoma_sulphureum
⏭️  Skipping Amanita_muscaria_var._guessowii (al

In [91]:
import os
import re
import json

INPUT_DIR = "data/raw_articles_wikitext"
OUTPUT_PATH = "data/parsed_wikitext_data.json"

# --- Regex patterns ---
LINK_PATTERN = re.compile(r"\[\[([^\]|#]+)(?:\|[^\]]+)?\]\]")   # [[Target|Label]] or [[Target]]
CATEGORY_PATTERN = re.compile(r"\[\[Category:([^\]|#]+)")       # [[Category:Something]]
SECTION_PATTERN = re.compile(r"^==+\s*(.*?)\s*==+", re.MULTILINE)

def parse_wikitext(title, text):
    """
    Parse a single wikitext article into structured fields:
    links, categories, section titles, full text.
    """

    # --- Extract categories ---
    categories = CATEGORY_PATTERN.findall(text)

    # --- Extract links ---
    links = LINK_PATTERN.findall(text)
    # Remove duplicates, ignore File/Image prefixes
    links = list({
        link.strip()
        for link in links
        if not link.lower().startswith(("file:", "image:", "category:"))
    })

    # --- Extract section titles ---
    sections = SECTION_PATTERN.findall(text)

    # --- Clean categories and sections ---
    categories = [c.strip() for c in categories if c.strip()]
    sections = [s.strip() for s in sections if s.strip()]

    return {
        "title": title,
        "links": links,
        "categories": categories,
        "sections": sections,
        "text": text.strip()
    }


def main():
    parsed_data = []

    files = [f for f in os.listdir(INPUT_DIR) if f.endswith(".txt")]
    total = len(files)
    print(f"Parsing {total} wikitext files...")

    for i, filename in enumerate(files, start=1):
        path = os.path.join(INPUT_DIR, filename)
        with open(path, encoding="utf-8") as f:
            text = f.read()

        title = filename.replace(".txt", "")
        parsed = parse_wikitext(title, text)
        parsed_data.append(parsed)

        if i % 50 == 0 or i == total:
            print(f"[{i}/{total}] Processed {filename}")

    # --- Save consolidated JSON ---
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(parsed_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Done. Saved parsed data to: {OUTPUT_PATH}")
    print(f"Entries parsed: {len(parsed_data)}")

main()

Parsing 1161 wikitext files...
[50/1161] Processed Lactarius_vietus.txt
[100/1161] Processed Gymnopilus_subfulvus.txt
[150/1161] Processed Fomitopsis_pinicola.txt
[200/1161] Processed Gymnopilus_maritimus.txt
[250/1161] Processed Coprinopsis_nivea.txt
[300/1161] Processed Gymnopilus_bellulus.txt
[350/1161] Processed Ganoderma_sichuanense.txt
[400/1161] Processed Inocybe_geophylla.txt
[450/1161] Processed Gymnopilus_purpureonitens.txt
[500/1161] Processed Gymnopilus_panelloides.txt
[550/1161] Processed Gymnopilus_communis.txt
[600/1161] Processed Russula_risigallina.txt
[650/1161] Processed Thelephora_terrestris.txt
[700/1161] Processed Mycena_overholtsii.txt
[750/1161] Processed Clathrus_columnatus.txt
[800/1161] Processed Lactarius_affinis.txt
[850/1161] Processed Cantharellus_cascadensis.txt
[900/1161] Processed Hericium_cirrhatum.txt
[950/1161] Processed Russula_versicolor.txt
[1000/1161] Processed Agaricus_pattersoniae.txt
[1050/1161] Processed Cortinarius_delibutus.txt
[1100/1161]

In [55]:
import os

folder = "data/raw_articles_wikitext"
sizes = [(f, os.path.getsize(os.path.join(folder, f))) for f in os.listdir(folder)]
sizes.sort(key=lambda x: x[1])
print("Smallest 10 files:")
for f, s in sizes[:10]:
    print(f"{f}: {s} bytes")

print("\nLargest 10 files:")
for f, s in sizes[-10:]:
    print(f"{f}: {s} bytes")


Smallest 10 files:
Leccinellum_pseudoscabrum.txt: 838 bytes
Lactarius_azonites.txt: 843 bytes
Amanita_lividopallescens.txt: 882 bytes
Amanita_proxima.txt: 933 bytes
Ramaria_aurea.txt: 957 bytes
Lactarius_semisanguifluus.txt: 1052 bytes
Mycena_vinacea.txt: 1135 bytes
Mycena_kuurkacea.txt: 1235 bytes
Macrocybe_spectabilis.txt: 1317 bytes
Cantharellus_appalachiensis.txt: 1423 bytes

Largest 10 files:
Lactifluus_volemus.txt: 43243 bytes
Paxillus_involutus.txt: 49583 bytes
Suillus_luteus.txt: 49797 bytes
Phallus_indusiatus.txt: 50000 bytes
Suillellus_luridus.txt: 54957 bytes
Armillaria_gallica.txt: 55593 bytes
Psilocybe_semilanceata.txt: 57240 bytes
Amanita_phalloides.txt: 61751 bytes
Amanita_muscaria.txt: 82606 bytes
Boletus_edulis.txt: 86824 bytes


In [58]:
import os
files = [f for f in os.listdir("data/raw_articles_wikitext") if f.endswith(".txt")]
print(len(files))

# Print the ones that does not end with .txt
for f in os.listdir("data/raw_articles_wikitext"):
    if not f.endswith(".txt"):
        print(f)

531


In [59]:
# Print number of files in raw_articles_wikitext
print(f"Number of files in raw_articles_wikitext: {len(os.listdir('data/raw_articles_wikitext'))}")

Number of files in raw_articles_wikitext: 531
