In [None]:
import os
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

# ---- LOAD LINKS ----
# The links file holds one URL per line. Surrounding whitespace is trimmed
# and lines that are empty after trimming are ignored.
file_path = "../raw_data/html/links.txt"
with open(file_path, mode="r", encoding="utf-8") as links_file:
    links = []
    for raw_line in links_file:
        cleaned = raw_line.strip()
        if cleaned:
            links.append(cleaned)

# ---- SCRAPER → Markdown ----
def simple_markdown_from_url(url, timeout=30):
    """Download ``url`` and return the page converted to Markdown.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The Markdown text (ATX-style ``#`` headings) generated from the page
        after all ``<script>``, ``<style>`` and ``<noscript>`` elements have
        been removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that error pages are never converted and stored as real content.
        requests.RequestException: For connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on "404 Not Found" and friends instead of scraping the
    # error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")

    # Drop elements whose content is code or styling, not readable text.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    return markdownify(str(soup), heading_style="ATX")

# ---- OUTPUT FOLDER (one folder up from current directory) ----
output_dir = os.path.join("..", "transformed_data", "html_to_md")
os.makedirs(output_dir, exist_ok=True)


def _slug_from_url(url):
    """Return a file-name stem for ``url``: the last segment of its path.

    Only the path component is used, so a query string (``?a=b``) or a
    fragment (``#top``) never ends up in the file name. URLs that have no
    path fall back to the host name.
    """
    parts = urlsplit(url)
    last_segment = parts.path.rstrip("/").split("/")[-1]
    return last_segment or parts.netloc or "index"


# ---- PROCESS EACH LINK ----
used_slugs = set()   # stems already assigned during this run
failed_urls = []     # links that could not be downloaded

for url in links:
    # Two different URLs can share the same last path segment; add a numeric
    # suffix instead of silently overwriting the file written earlier.
    base_slug = _slug_from_url(url)
    slug = base_slug
    duplicate_no = 2
    while slug in used_slugs:
        slug = f"{base_slug}-{duplicate_no}"
        duplicate_no += 1
    used_slugs.add(slug)
    filename = f"{slug}.md"

    # A single unreachable page should not abort the whole batch: report it
    # and carry on with the remaining links.
    try:
        markdown_raw = simple_markdown_from_url(url)
    except requests.RequestException as exc:
        failed_urls.append(url)
        print(f"✖ Failed {url}: {exc}")
        continue

    with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as md_file:
        md_file.write(markdown_raw)

    print(f"✔ Saved {filename}")

if failed_urls:
    print(f"\n⚠ {len(failed_urls)} of {len(links)} links could not be downloaded.")

print("\n🎉 All done! Files created in ../transformed_data/")


✔ Saved master-data-science.md
✔ Saved data-science-b-sc.md
✔ Saved data-science-m-sc.md

🎉 All done! Files created in ../transformed_data/


In [None]:
import os
import pdfplumber

# Where the source PDFs are read from, and where the generated Markdown goes.
pdf_folder, output_folder = "../raw_data/pdfs", "../transformed_data/pdf_to_md"

# Create the destination folder (and any missing parents); this is a no-op
# when the folder already exists.
os.makedirs(name=output_folder, exist_ok=True)

def pdf_to_markdown(pdf_path):
    """Extract the text of a PDF and return it as one Markdown string.

    Each page that yields text becomes a section introduced by a
    ``# Page <n>`` heading, where ``<n>`` is the 1-based page number in the
    PDF. Pages without text (``extract_text()`` returns nothing, or only
    whitespace) are left out, so heading numbers may skip. Sections are
    separated by a blank line; a PDF without any text gives an empty string.
    """
    with pdfplumber.open(pdf_path) as document:
        # Pair every page number with its whitespace-trimmed text.
        numbered_text = (
            (number, (page.extract_text() or "").strip())
            for number, page in enumerate(document.pages, start=1)
        )
        # Build the sections while the document is still open.
        sections = [
            f"# Page {number}\n\n{body}"
            for number, body in numbered_text
            if body
        ]

    return "\n\n".join(sections)

# Process all PDFs in folder.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# run (and its printed log) reproducible.
for file in sorted(os.listdir(pdf_folder)):
    input_path = os.path.join(pdf_folder, file)

    # Only regular files with a .pdf extension are converted; this also skips
    # a sub-folder that happens to be named "something.pdf".
    if not file.lower().endswith(".pdf") or not os.path.isfile(input_path):
        continue

    base = os.path.splitext(file)[0]
    output_path = os.path.join(output_folder, f"{base}.md")

    md = pdf_to_markdown(input_path)
    if not md:
        # Nothing could be extracted from any page. The (empty) file is still
        # written, but the user is told so it does not go unnoticed.
        print(f"⚠️ {file}: no extractable text (scanned PDF?) - writing empty file")

    with open(output_path, "w", encoding="utf-8") as md_file:
        md_file.write(md)

    print(f"✔️ {file} → {base}.md")

print("\n✨ Done! Markdown saved in ../transformed_data/")


✔️ general_info1.pdf → general_info1.md
✔️ general_info2.pdf → general_info2.md
✔️ general_info3.pdf → general_info3.md
✔️ msc-datascience_faq.pdf → msc-datascience_faq.md

✨ Done! Markdown saved in ../transformed_data/
