In [9]:
import os
import time
import glob
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import urllib.parse
import requests
import shutil
# --- SETTINGS ---
BASE_URL = "https://sinca.mma.gob.cl"
FROM_DATE = "180101"
TO_DATE   = "241231"
DOWNLOAD_DIR = r"C:\Users\black\Documents\SINCA"



In [15]:


os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# --- Chrome setup ---
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": DOWNLOAD_DIR,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 15)

# --- 1. Go to main page ---
driver.get(BASE_URL)

# --- 2. Collect region links (store hrefs as strings to avoid stale elements) ---
region_elements = driver.find_elements(By.CSS_SELECTOR, "a[href*='/index.php/region/index/id/']")
region_links = []
for r in region_elements:
    try:
        region_name = r.text.strip()
        region_href = r.get_attribute("href")
        region_links.append((region_name, region_href))
    except:
        continue

print(f"Found {len(region_links)} regions.")

# --- 3. Loop over each region ---
for region_name, region_href in region_links:
    print(f"\n=== Region: {region_name} ({region_href}) ===")
    driver.get(region_href)
    time.sleep(2)

    # --- Collect station links ---
    try:
        iframe_links = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.iframe"))
        )
        station_data = []
        for link in iframe_links:
            title = link.get_attribute("title") or "station"
            if "Red MMA" in title:
                continue
            href = link.get_attribute("href")
            if href.startswith("//"):
                href = "https:" + href
            station_data.append((href, title))
    except:
        print(" ⚠ No station links found in this region")
        continue

    print(f"  Found {len(station_data)} stations.")

    # --- Loop over each station ---
    for i, (href, title) in enumerate(station_data, start=1):
        print(f"   [{i}] {title}")

        # ✅ Only proceed if station title matches one of the target pollutants
        if not any(keyword in title for keyword in [
            "Material particulado MP 10 ",
            "Material particulado MP 2,5 ",
            "Ozono.- ",
            "Óxidos de nitrógeno "
        ]):
            print("      ⚠ Skipping (not a target pollutant)")
            continue

        safe_title = re.sub(r'[^A-Za-z0-9 _-]+', "", title).replace(" ", "_") + ".csv"
        dest_path = os.path.join(DOWNLOAD_DIR, safe_title)

        driver.get(href)
        time.sleep(2)

        try:
            # --- Switch to left frame, set dates & select 'registro horario' ---
            wait.until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "left")))
            from_box = wait.until(EC.presence_of_element_located((By.ID, "from")))
            to_box = driver.find_element(By.ID, "to")

            driver.execute_script(
                "arguments[0].value = arguments[1]; arguments[0].dispatchEvent(new Event('change'));",
                from_box, FROM_DATE
            )
            driver.execute_script(
                "arguments[0].value = arguments[1]; arguments[0].dispatchEvent(new Event('change'));",
                to_box, TO_DATE
            )

            # select registro horario
            select_elem = wait.until(EC.presence_of_element_located((By.ID, "ic")))
            options = select_elem.find_elements(By.TAG_NAME, "option")
            for opt in options:
                if "registro horario" in opt.text.lower():
                    driver.execute_script("""
                        arguments[0].value = arguments[1];
                        arguments[0].dispatchEvent(new Event('change'));
                    """, select_elem, opt.get_attribute("value"))
                    driver.execute_script("EnablePeriod();Open();")
                    break

            driver.switch_to.default_content()

            # --- Wait for right frame & download CSV ---
            wait.until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "right")))
            csv_link = wait.until(
                EC.presence_of_element_located((By.XPATH, "//a[contains(@href,\"Open('xcl')\")]"))
            )
            driver.execute_script("arguments[0].click();", csv_link)

            # handle rename safely (using before/after snapshot logic)
            before_files = set(glob.glob(os.path.join(DOWNLOAD_DIR, "*.csv")))
            file_path = None
            for _ in range(75):
                time.sleep(1)
                after_files = set(glob.glob(os.path.join(DOWNLOAD_DIR, "*.csv")))
                new_files = [f for f in (after_files - before_files) if not f.endswith(".crdownload")]
                if new_files:
                    file_path = max(new_files, key=os.path.getctime)
                    break

            if file_path:
                if os.path.exists(dest_path):
                    os.remove(dest_path)
                shutil.move(file_path, dest_path)
                print(f"      ✔ Saved as {dest_path}")
            else:
                print("      ✖ Download failed or timed out")

        except Exception as e:
            print(f"      ✖ Error on {title}: {e}")

        finally:
            driver.switch_to.default_content()


driver.quit()
print("All done.")

Found 16 regions.

=== Region:  (https://sinca.mma.gob.cl/index.php/region/index/id/XV) ===
  Found 1 stations.
   [1] Material particulado MP 2,5 | Arica
      ✔ Saved as C:\Users\black\Documents\SINCA\Material_particulado_MP_25__Arica.csv

=== Region:  (https://sinca.mma.gob.cl/index.php/region/index/id/I) ===
  Found 1 stations.
   [1] Material particulado MP 2,5 | Alto Hospicio
      ✔ Saved as C:\Users\black\Documents\SINCA\Material_particulado_MP_25__Alto_Hospicio.csv

=== Region:  (https://sinca.mma.gob.cl/index.php/region/index/id/II) ===
  Found 125 stations.
   [1] Material particulado MP 10 | Antofagasta
      ✔ Saved as C:\Users\black\Documents\SINCA\Material_particulado_MP_10__Antofagasta.csv
   [2] Material particulado MP 2,5 | Antofagasta
      ✔ Saved as C:\Users\black\Documents\SINCA\Material_particulado_MP_25__Antofagasta.csv
   [3] Dióxido de azufre | Antofagasta
      ⚠ Skipping (not a target pollutant)
   [4] Material particulado MP 10 | Oncologico
      ✔ Saved as

In [14]:
import os
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- Selenium setup ---
DOWNLOAD_DIR = os.path.expanduser("~/Documents/SINCA")
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 10)

BASE_URL = "https://sinca.mma.gob.cl"
REGIONS_URL = f"{BASE_URL}/index.php/region/index/id/"

records = []

region_codes = [
    "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
    "XI", "XII", "M", "XIV", "XV", "XVI"
]

for code in region_codes:
    url = f"{REGIONS_URL}{code}"
    print(f"=== Region {code} ({url}) ===")
    driver.get(url)
    
    # --- Collect station URLs first to avoid stale references ---
    station_elements = wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "a[href^='/index.php/estacion/index/id/']")
    ))
    station_links = [(st.text, st.get_attribute("href")) for st in station_elements]
    print(f"  Found {len(station_links)} stations.")

    for st_name, st_href in station_links:
        try:
            print(f"   -> {st_name} ({st_href})")
            driver.get(st_href)
            
            # --- Extract coordinates ---
            try:
                map_elem = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "div#map_canvas a[href*='maps.google.com/maps?ll='], iframe[src*='maps.google.com/maps?ll=']")
                    )
                )
                href = map_elem.get_attribute("href") or map_elem.get_attribute("src")
                match = re.search(r"ll=([-.\d]+),([-.\d]+)", href)
                if match:
                    lat, lon = float(match.group(1)), float(match.group(2))
                else:
                    lat, lon = None, None
            except:
                lat, lon = None, None

            # --- Extract general info ---
            general_info = {}
            try:
                rows = driver.find_elements(By.CSS_SELECTOR, "#tablaGeneral tbody tr")
                for row in rows:
                    key = row.find_element(By.TAG_NAME, "th").text.strip()
                    val = row.find_element(By.TAG_NAME, "td").text.strip()
                    general_info[key] = val
            except:
                pass

            # --- Extract contaminant parameters ---
            try:
                param_rows = driver.find_elements(By.CSS_SELECTOR, "#medicion tbody tr.master")
                if param_rows:
                    for row in param_rows:
                        param_name = row.find_element(By.CSS_SELECTOR, "th span").text.strip()
                        cols = row.find_elements(By.TAG_NAME, "td")
                        first_date = cols[0].text.strip() if len(cols) > 0 else ""
                        last_date = cols[1].text.strip() if len(cols) > 1 else ""
                        tecnica = cols[2].text.strip() if len(cols) > 2 else ""

                        record = {
                            "Estación": st_name,
                            "Región código": code,
                            "Parámetro": param_name,
                            "Fecha primer registro": first_date,
                            "Fecha último registro": last_date,
                            "Técnica de medición": tecnica,
                            "Latitud": lat,
                            "Longitud": lon,
                        }
                        record.update(general_info)
                        records.append(record)
                else:
                    record = {
                        "Estación": st_name,
                        "Región código": code,
                        "Latitud": lat,
                        "Longitud": lon,
                    }
                    record.update(general_info)
                    records.append(record)
            except:
                record = {
                    "Estación": st_name,
                    "Región código": code,
                    "Latitud": lat,
                    "Longitud": lon,
                }
                record.update(general_info)
                records.append(record)

        except Exception as e:
            print(f"   ⚠️ Error with station {st_name}: {e}")

# --- Save to Excel ---
df = pd.DataFrame(records)
out_file = os.path.join(DOWNLOAD_DIR, "stations_info.xlsx")
df.to_excel(out_file, index=False)
print(f"\n✅ Saved {len(df)} rows to {out_file}")

driver.quit()


=== Region I (https://sinca.mma.gob.cl/index.php/region/index/id/I) ===
  Found 2 stations.
   -> Alto Hospicio (https://sinca.mma.gob.cl/index.php/estacion/index/id/157)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/157)
=== Region II (https://sinca.mma.gob.cl/index.php/region/index/id/II) ===
  Found 68 stations.
   -> Antofagasta (https://sinca.mma.gob.cl/index.php/estacion/index/id/259)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/259)
   -> Oncológico (https://sinca.mma.gob.cl/index.php/estacion/index/id/154)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/154)
   -> Playa Blanca (https://sinca.mma.gob.cl/index.php/estacion/index/id/70)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/70)
   -> Rendic (https://sinca.mma.gob.cl/index.php/estacion/index/id/33)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/33)
   -> Sur (https://sinca.mma.gob.cl/index.php/estacion/index/id/46)
   ->  (https://sinca.mma.gob.cl/inde