In [33]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import re
import requests
import time

# Headless Chrome session shared by the scraping code below.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Site root (prefixed to the download links found in the table) and the
# listing page that gets crawled.
base_url = "https://lom.agc.gov.my"
main_url = "https://lom.agc.gov.my/principal.php?type=updated"

# Output layout: malaysian_acts/BM and malaysian_acts/EN, created up front
# so downloads can be written without further checks.
root_dir = "malaysian_acts"
bm_dir, en_dir = (os.path.join(root_dir, sub) for sub in ("BM", "EN"))
os.makedirs(bm_dir, exist_ok=True)
os.makedirs(en_dir, exist_ok=True)

# Explicit-wait helper bound to the driver (10-second timeout), then load
# the listing page.
wait = WebDriverWait(driver, 10)
driver.get(main_url)

def scrape_current_table(timeout=30):
    """Download every Act PDF linked from the table currently shown in the browser.

    Parses ``driver.page_source``, looks in each ``table tbody tr`` row for
    anchors with class ``event_kira_download_updated`` and saves each target
    into ``bm_dir`` or ``en_dir`` as ``Act_<number>_<lang>_<original name>``.
    Files that already exist on disk are skipped. A failure on one link is
    printed and does not stop the remaining links from being processed.

    Args:
        timeout: Seconds ``requests`` waits for the server before giving up
            on a single download. Without a timeout a stalled connection
            would block the whole crawl indefinitely.
    """
    # Compiled once instead of once per link.
    act_pattern = re.compile(r'(Act|Akta)[ _-]?(\d+)', re.IGNORECASE)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    for row in soup.select("table tbody tr"):
        for link in row.find_all("a", class_="event_kira_download_updated"):
            tmp_path = None
            try:
                href = link.get("href")
                if not href:
                    continue
                # Hrefs are site-relative; dropping ".." leaves a path that
                # can be appended to the site root.
                full_url = base_url + href.replace("..", "")
                raw_filename = os.path.basename(full_url)

                act_match = act_pattern.search(raw_filename)
                # "000" marks files whose name carries no recognisable number.
                act_number = act_match.group(2) if act_match else "000"

                lang = "BM" if "BM" in full_url or "Akta" in full_url else "EN"
                lang_folder = bm_dir if lang == "BM" else en_dir
                filename = f"Act_{act_number}_{lang}_{raw_filename}"
                save_path = os.path.join(lang_folder, filename)
                if os.path.exists(save_path):
                    print(f"✅ Already downloaded: {filename}")
                    continue

                print(f"⬇️ Downloading: {filename}")
                r = requests.get(full_url, timeout=timeout)
                r.raise_for_status()

                # Write to a sibling temp file and rename afterwards, so an
                # interrupted write never leaves a truncated PDF that a later
                # run would report as "Already downloaded".
                tmp_path = save_path + ".part"
                with open(tmp_path, "wb") as f:
                    f.write(r.content)
                os.replace(tmp_path, save_path)
            except Exception as e:
                # Best-effort crawl: report the failure and move on.
                print(f"❌ Error: {e}")
                if tmp_path is not None:
                    try:
                        os.remove(tmp_path)
                    except OSError:
                        pass  # temp file was never created or is already gone
                continue

# Walk every page of the paginated table, scraping each one, and make sure
# the browser is shut down even if scraping raises or the run is interrupted
# (otherwise the headless Chrome process is left running).
try:
    # Scrape first page
    scrape_current_table()

    while True:
        try:
            # Find the "Next" button by class
            next_btn = driver.find_element(By.CSS_SELECTOR, "span.paginate_button.next")
            next_btn_class = next_btn.get_attribute("class")
            if "disabled" in next_btn_class:
                break  # Done with all pages

            # Click through JavaScript rather than a native WebElement click.
            driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(2.2)  # Wait for DataTables to update table
            scrape_current_table()
        except Exception as e:
            print("Reached the last page or failed to click next:", e)
            break
finally:
    driver.quit()
print("✅ All Acts (0–868) downloaded across all paginated pages.")


⬇️ Downloading: Act_868_BM_Akta 868 - AKTA MAJLIS MEDIA MALAYSIA 2025.pdf
⬇️ Downloading: Act_868_EN_Act 868 - MALAYSIAN MEDIA COUNCIL ACT 2025.pdf
⬇️ Downloading: Act_867_BM_Akta 867-Akta Iltizam Kecekapan Perkhidmatan Kerajaan 2025.pdf
⬇️ Downloading: Act_867_EN_Act 867-Government Service Efficiency Commitment Act 2025.pdf
⬇️ Downloading: Act_866_BM_Akta 866-Akta Keselamatan Dalam Talian 2025.pdf
⬇️ Downloading: Act_866_EN_Act 866-Online Safety Act 2025.pdf
⬇️ Downloading: Act_865_BM_Akta 865 AKTA INSTITUT KOPERASI (EMERBADANAN)1968.pdf
⬇️ Downloading: Act_865_EN_Act 865 CO-OPERATIVE INSTITUTE (INCORPORATION) ACT 1968.pdf
⬇️ Downloading: Act_864_BM_Akta 864 - AKTA PERKONGSIAN DATA 2025.pdf
⬇️ Downloading: Act_864_EN_Act 864 - DATA SHARING ACT 2025.pdf
⬇️ Downloading: Act_863_EN_Act 863.pdf
⬇️ Downloading: Act_863_BM_Akta 863.pdf
⬇️ Downloading: Act_862_EN_Act 862 -FINANCE ACT 2024.pdf
⬇️ Downloading: Act_862_BM_Akta 862 - AKTA KEWANGAN 2024.pdf
⬇️ Downloading: Act_861_BM_Akta Kecekap