In [15]:
import os
import time
import glob
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import urllib.parse
import requests
# --- SETTINGS ---
BASE_URL = "https://sinca.mma.gob.cl"
FROM_DATE = "180101"
TO_DATE   = "241231"
DOWNLOAD_DIR = r"C:\Users\black\Documents\SINCA"



In [24]:


os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# --- Chrome setup ---
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": DOWNLOAD_DIR,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 15)

# --- 1. Go to main page ---
driver.get(BASE_URL)

# --- 2. Collect region links (store hrefs as strings to avoid stale elements) ---
region_elements = driver.find_elements(By.CSS_SELECTOR, "a[href*='/index.php/region/index/id/']")
region_links = []
for r in region_elements:
    try:
        region_name = r.text.strip()
        region_href = r.get_attribute("href")
        region_links.append((region_name, region_href))
    except:
        continue

print(f"Found {len(region_links)} regions.")

# --- 3. Loop over each region ---
for region_name, region_href in region_links:
    print(f"\n=== Region: {region_name} ({region_href}) ===")
    driver.get(region_href)
    time.sleep(2)

    # --- Collect station links ---
    try:
        iframe_links = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.iframe"))
        )
        station_data = []
        for link in iframe_links:
            title = link.get_attribute("title") or "station"
            if "Red MMA" in title:
                continue
            href = link.get_attribute("href")
            if href.startswith("//"):
                href = "https:" + href
            station_data.append((href, title))
    except:
        print(" ⚠ No station links found in this region")
        continue

    print(f"  Found {len(station_data)} stations.")

    # --- Loop over each station ---
    for i, (href, title) in enumerate(station_data, start=1):
        print(f"   [{i}] {title}")

        safe_title = re.sub(r'[^A-Za-z0-9 _-]+', "", title).replace(" ", "_") + ".csv"
        dest_path = os.path.join(DOWNLOAD_DIR, safe_title)

        driver.get(href)
        time.sleep(2)

        try:
            # --- Switch to left frame and set dates ---
            wait.until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "left")))
            from_box = wait.until(EC.presence_of_element_located((By.ID, "from")))
            to_box = driver.find_element(By.ID, "to")

            driver.execute_script(
                "arguments[0].value = arguments[1]; arguments[0].dispatchEvent(new Event('change'));",
                from_box, FROM_DATE
            )
            driver.execute_script(
                "arguments[0].value = arguments[1]; arguments[0].dispatchEvent(new Event('change'));",
                to_box, TO_DATE
            )

            driver.execute_script("Open();")  # trigger right frame reload
            driver.switch_to.default_content()

            # --- Wait for right frame and CSV link ---
            wait.until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "right")))
            csv_link = wait.until(
                EC.presence_of_element_located((By.XPATH, "//a[contains(@href,\"Open('xcl')\")]"))
            )

            # --- JS click to bypass overlay ---
            driver.execute_script("arguments[0].click();", csv_link)

            # --- Wait for download to finish ---
            file_path = None
            for _ in range(60):  # up to 60 seconds
                time.sleep(1)
                files = glob.glob(os.path.join(DOWNLOAD_DIR, "*.csv"))
                if files:
                    latest_file = max(files, key=os.path.getctime)
                    if not latest_file.endswith(".crdownload"):
                        file_path = latest_file
                        break

            if file_path:
                if os.path.exists(dest_path):
                    os.remove(dest_path)
                os.rename(file_path, dest_path)
                print(f"      ✔ Saved as {dest_path}")
            else:
                print("      ✖ Download failed or timed out")

        except Exception as e:
            print(f"      ✖ Error on {title}: {e}")

        finally:
            driver.switch_to.default_content()

driver.quit()
print("All done.")

Found 16 regions.

=== Region:  (https://sinca.mma.gob.cl/index.php/region/index/id/XV) ===
  Found 1 stations.
   [1] Material particulado MP 2,5 | Arica
      ✔ Saved as C:\Users\black\Documents\SINCA\Material_particulado_MP_25__Arica.csv

=== Region:  (https://sinca.mma.gob.cl/index.php/region/index/id/I) ===
  Found 1 stations.
   [1] Material particulado MP 2,5 | Alto Hospicio
      ✔ Saved as C:\Users\black\Documents\SINCA\Material_particulado_MP_25__Alto_Hospicio.csv

=== Region:  (https://sinca.mma.gob.cl/index.php/region/index/id/II) ===
  Found 125 stations.
   [1] Material particulado MP 10 | Antofagasta
      ✔ Saved as C:\Users\black\Documents\SINCA\Material_particulado_MP_10__Antofagasta.csv
   [2] Material particulado MP 2,5 | Antofagasta
      ✔ Saved as C:\Users\black\Documents\SINCA\Material_particulado_MP_25__Antofagasta.csv
   [3] Dióxido de azufre | Antofagasta
      ✔ Saved as C:\Users\black\Documents\SINCA\Dixido_de_azufre__Antofagasta.csv
   [4] Material particu

In [None]:
import os
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://sinca.mma.gob.cl"
REGIONS_URL = f"{BASE_URL}/index.php/region/index/id/"

# We'll store results here
records = []



for code in region_codes:
    url = f"{REGIONS_URL}{code}"
    print(f"=== Region {code} ({url}) ===")

    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Find station links
    stations = soup.select("a[href^='/index.php/estacion/index/id/']")
    print(f"  Found {len(stations)} stations.")

    for st in stations:
        st_name = st.get_text(strip=True)
        st_href = BASE_URL + st["href"]
        print(f"   -> {st_name} ({st_href})")

        st_resp = requests.get(st_href)
        st_resp.raise_for_status()
        st_soup = BeautifulSoup(st_resp.text, "html.parser")

        # --- Información general ---
        general_info = {}
        info_table = st_soup.select_one("#tablaGeneral")
        if info_table:
            for row in info_table.select("tbody tr"):
                key = row.select_one("th").get_text(strip=True)
                val = row.select_one("td").get_text(strip=True)
                general_info[key] = val

        # --- Parámetros contaminantes ---
        contaminantes = st_soup.select_one("#medicion")
        if contaminantes:
            for row in contaminantes.select("tbody tr.master"):
                cols = row.find_all("td")
                param_name = row.select_one("th span").get_text(strip=True)

                first_date = cols[0].get_text(strip=True) if len(cols) > 0 else ""
                last_date = cols[1].get_text(strip=True) if len(cols) > 1 else ""
                tecnica = cols[2].get_text(strip=True) if len(cols) > 2 else ""

                record = {
                    "Estación": st_name,
                    "Región código": code,
                    "Parámetro": param_name,
                    "Fecha primer registro": first_date,
                    "Fecha último registro": last_date,
                    "Técnica de medición": tecnica,
                }

                # Add general info columns
                record.update(general_info)

                records.append(record)
        else:
            # If no contaminantes table, still keep the general info
            record = {"Estación": st_name, "Región código": code}
            record.update(general_info)
            records.append(record)

# Convert to DataFrame
df = pd.DataFrame(records)

# Save to Excel
out_file = os.path.expanduser("~/Documents/SINCA/stations_info.xlsx")
os.makedirs(os.path.dirname(out_file), exist_ok=True)
df.to_excel(out_file, index=False)

print(f"\n✅ Saved {len(df)} rows to {out_file}")


=== Region I (https://sinca.mma.gob.cl/index.php/region/index/id/I) ===
  Found 2 stations.
   -> Alto Hospicio (https://sinca.mma.gob.cl/index.php/estacion/index/id/157)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/157)
=== Region II (https://sinca.mma.gob.cl/index.php/region/index/id/II) ===
  Found 68 stations.
   -> Antofagasta (https://sinca.mma.gob.cl/index.php/estacion/index/id/259)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/259)
   -> Oncológico (https://sinca.mma.gob.cl/index.php/estacion/index/id/154)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/154)
   -> Playa Blanca (https://sinca.mma.gob.cl/index.php/estacion/index/id/70)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/70)
   -> Rendic (https://sinca.mma.gob.cl/index.php/estacion/index/id/33)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/33)
   -> Sur (https://sinca.mma.gob.cl/index.php/estacion/index/id/46)
   ->  (https://sinca.mma.gob.cl/inde

In [None]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://sinca.mma.gob.cl"
REGIONS_URL = f"{BASE_URL}/index.php/region/index/id/"

# Resultados
records = []

# There are 16 regions + "XV" (Arica y Parinacota)
region_codes = [
    "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
    "XI", "XII", "XIII", "XIV", "XV", "XVI"
]

for code in region_codes:
    url = f"{REGIONS_URL}{code}"
    print(f"=== Region {code} ({url}) ===")

    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    stations = soup.select("a[href^='/index.php/estacion/index/id/']")
    print(f"  Found {len(stations)} stations.")

    for st in stations:
        st_name = st.get_text(strip=True)
        st_href = BASE_URL + st["href"]
        print(f"   -> {st_name} ({st_href})")

        st_resp = requests.get(st_href)
        st_resp.raise_for_status()
        st_soup = BeautifulSoup(st_resp.text, "html.parser")

        # --- Coordenadas desde enlace Google Maps ---
        lat, lon = None, None
        map_link = st_soup.select_one("div#map_canvas a[href*='maps.google.com/maps?ll=']")
        if map_link:
            href = map_link["href"]
            if "ll=" in href:
                coords = href.split("ll=")[1].split("&")[0]
                try:
                    lat, lon = coords.split(",")
                    lat, lon = float(lat), float(lon)  # convertir a float
                except ValueError:
                    pass




        # --- Información general ---
        general_info = {}
        info_table = st_soup.select_one("#tablaGeneral")
        if info_table:
            for row in info_table.select("tbody tr"):
                key = row.select_one("th").get_text(strip=True)
                val = row.select_one("td").get_text(strip=True)
                general_info[key] = val

        # --- Parámetros contaminantes ---
        contaminantes = st_soup.select_one("#medicion")
        if contaminantes:
            for row in contaminantes.select("tbody tr.master"):
                cols = row.find_all("td")
                param_name = row.select_one("th span").get_text(strip=True)

                first_date = cols[0].get_text(strip=True) if len(cols) > 0 else ""
                last_date = cols[1].get_text(strip=True) if len(cols) > 1 else ""
                tecnica = cols[2].get_text(strip=True) if len(cols) > 2 else ""

                record = {
                    "Estación": st_name,
                    "Región código": code,
                    "Parámetro": param_name,
                    "Fecha primer registro": first_date,
                    "Fecha último registro": last_date,
                    "Técnica de medición": tecnica,
                    "Latitud": lat,
                    "Longitud": lon,
                }
                record.update(general_info)
                records.append(record)
        else:
            record = {
                "Estación": st_name,
                "Región código": code,
                "Latitud": lat,
                "Longitud": lon,
            }
            record.update(general_info)
            records.append(record)

# Guardar en Excel
df = pd.DataFrame(records)
out_file = os.path.expanduser("~/Documents/SINCA/stations_info.xlsx")
os.makedirs(os.path.dirname(out_file), exist_ok=True)
df.to_excel(out_file, index=False)

print(f"\n✅ Saved {len(df)} rows to {out_file}")


=== Region I (https://sinca.mma.gob.cl/index.php/region/index/id/I) ===
  Found 2 stations.
   -> Alto Hospicio (https://sinca.mma.gob.cl/index.php/estacion/index/id/157)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/157)

✅ Saved 2 rows to C:\Users\black/Documents/SINCA/stations_info.xlsx


In [14]:
import os
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- Selenium setup ---
DOWNLOAD_DIR = os.path.expanduser("~/Documents/SINCA")
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 10)

BASE_URL = "https://sinca.mma.gob.cl"
REGIONS_URL = f"{BASE_URL}/index.php/region/index/id/"

records = []

region_codes = [
    "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
    "XI", "XII", "M", "XIV", "XV", "XVI"
]

for code in region_codes:
    url = f"{REGIONS_URL}{code}"
    print(f"=== Region {code} ({url}) ===")
    driver.get(url)
    
    # --- Collect station URLs first to avoid stale references ---
    station_elements = wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, "a[href^='/index.php/estacion/index/id/']")
    ))
    station_links = [(st.text, st.get_attribute("href")) for st in station_elements]
    print(f"  Found {len(station_links)} stations.")

    for st_name, st_href in station_links:
        try:
            print(f"   -> {st_name} ({st_href})")
            driver.get(st_href)
            
            # --- Extract coordinates ---
            try:
                map_elem = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "div#map_canvas a[href*='maps.google.com/maps?ll='], iframe[src*='maps.google.com/maps?ll=']")
                    )
                )
                href = map_elem.get_attribute("href") or map_elem.get_attribute("src")
                match = re.search(r"ll=([-.\d]+),([-.\d]+)", href)
                if match:
                    lat, lon = float(match.group(1)), float(match.group(2))
                else:
                    lat, lon = None, None
            except:
                lat, lon = None, None

            # --- Extract general info ---
            general_info = {}
            try:
                rows = driver.find_elements(By.CSS_SELECTOR, "#tablaGeneral tbody tr")
                for row in rows:
                    key = row.find_element(By.TAG_NAME, "th").text.strip()
                    val = row.find_element(By.TAG_NAME, "td").text.strip()
                    general_info[key] = val
            except:
                pass

            # --- Extract contaminant parameters ---
            try:
                param_rows = driver.find_elements(By.CSS_SELECTOR, "#medicion tbody tr.master")
                if param_rows:
                    for row in param_rows:
                        param_name = row.find_element(By.CSS_SELECTOR, "th span").text.strip()
                        cols = row.find_elements(By.TAG_NAME, "td")
                        first_date = cols[0].text.strip() if len(cols) > 0 else ""
                        last_date = cols[1].text.strip() if len(cols) > 1 else ""
                        tecnica = cols[2].text.strip() if len(cols) > 2 else ""

                        record = {
                            "Estación": st_name,
                            "Región código": code,
                            "Parámetro": param_name,
                            "Fecha primer registro": first_date,
                            "Fecha último registro": last_date,
                            "Técnica de medición": tecnica,
                            "Latitud": lat,
                            "Longitud": lon,
                        }
                        record.update(general_info)
                        records.append(record)
                else:
                    record = {
                        "Estación": st_name,
                        "Región código": code,
                        "Latitud": lat,
                        "Longitud": lon,
                    }
                    record.update(general_info)
                    records.append(record)
            except:
                record = {
                    "Estación": st_name,
                    "Región código": code,
                    "Latitud": lat,
                    "Longitud": lon,
                }
                record.update(general_info)
                records.append(record)

        except Exception as e:
            print(f"   ⚠️ Error with station {st_name}: {e}")

# --- Save to Excel ---
df = pd.DataFrame(records)
out_file = os.path.join(DOWNLOAD_DIR, "stations_info.xlsx")
df.to_excel(out_file, index=False)
print(f"\n✅ Saved {len(df)} rows to {out_file}")

driver.quit()


=== Region I (https://sinca.mma.gob.cl/index.php/region/index/id/I) ===
  Found 2 stations.
   -> Alto Hospicio (https://sinca.mma.gob.cl/index.php/estacion/index/id/157)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/157)
=== Region II (https://sinca.mma.gob.cl/index.php/region/index/id/II) ===
  Found 68 stations.
   -> Antofagasta (https://sinca.mma.gob.cl/index.php/estacion/index/id/259)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/259)
   -> Oncológico (https://sinca.mma.gob.cl/index.php/estacion/index/id/154)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/154)
   -> Playa Blanca (https://sinca.mma.gob.cl/index.php/estacion/index/id/70)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/70)
   -> Rendic (https://sinca.mma.gob.cl/index.php/estacion/index/id/33)
   ->  (https://sinca.mma.gob.cl/index.php/estacion/index/id/33)
   -> Sur (https://sinca.mma.gob.cl/index.php/estacion/index/id/46)
   ->  (https://sinca.mma.gob.cl/inde

In [25]:
import pandas as pd
import glob, os, re

folder_path = r"C:\Users\black\Dropbox\SINCA"      # change if needed
out_path = r"C:\Users\black\Documents\SINCA\Data_Pollution_cleaned.csv"

all_files = glob.glob(os.path.join(folder_path, "*.csv"))
dfs = []

def merge_split_decimals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Merge adjacent 'Unnamed:*' column into the left neighbor when left looks like integer
    and the unnamed right looks like a fractional part (digits only).
    """
    cols = list(df.columns)
    to_drop = []
    # Work as strings
    for c in cols:
        df[c] = df[c].astype(str)
    df = df.replace({"": pd.NA, "nan": pd.NA, "None": pd.NA})
    for i, c in enumerate(cols):
        if re.match(r"^Unnamed: ?\d+$", str(c)) and i > 0:
            left = cols[i-1]
            left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
            right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
            mask = left_is_int & right_is_frac
            if mask.any():
                # preserve leading zeros on fractional part
                df.loc[mask, left] = df.loc[mask, left].astype(str) + "." + df.loc[mask, c].astype(str)
                to_drop.append(c)
    if to_drop:
        df = df.drop(columns=list(dict.fromkeys(to_drop)), errors="ignore")
    return df

def fix_decimal_cell(val):
    """Turn '13,4399' -> '13.4399', '69 9047' -> '69.9047', leave others unchanged."""
    if pd.isna(val):
        return val
    s = str(val).strip()
    # comma decimal
    if re.fullmatch(r"-?\d+,\d+", s):
        return s.replace(",", ".")
    # space decimal like '69 9047' (one or more spaces)
    if re.fullmatch(r"-?\d+\s+\d+", s):
        return re.sub(r"\s+", ".", s)
    return s

for file in all_files:
    fname = os.path.basename(file).replace(".csv", "")
    # split filename into medida and centro (keeps everything after first '__' as centro)
    medida, centro = fname.split("__", 1)

    # Read as strings to avoid early coercion
    df = pd.read_csv(file, sep=";", dtype=str, engine="python")

    # If CSV was ragged and created Unnamed columns, try to merge split decimals
    df = merge_split_decimals(df)

    # Trim whitespace and normalize empties
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df = df.replace({"": pd.NA})

    # Fix decimal patterns in every column where it's needed
    for col in df.columns:
        # quick check: only act if there is evidence of comma-decimal or spaced-decimal
        col_sample = df[col].dropna().astype(str)
        if col_sample.empty:
            continue
        if col_sample.str.contains(r",").any() or col_sample.str.contains(r"\d+\s+\d+").any():
            df[col] = df[col].apply(fix_decimal_cell)
            # try to convert to numeric (if appropriate)
            df[col] = pd.to_numeric(df[col], errors="ignore")

    # Attach metadata
    df["Medida"] = medida
    df["Centro"] = centro.replace("_", " ")

    dfs.append(df)

# Concatenate all files
combined_df = pd.concat(dfs, ignore_index=True)

# Convert FECHA (YYMMDD) to datetime (Excel will read it fine)
if "FECHA (YYMMDD)" in combined_df.columns:
    combined_df["FECHA (YYMMDD)"] = pd.to_datetime(
        combined_df["FECHA (YYMMDD)"].astype(str).str.strip(), format="%y%m%d", errors="coerce"
    )

# Drop unwanted columns
combined_df = combined_df.drop(columns=["HORA (HHMM)", "Unnamed: 5"], errors="ignore")

# Normalize and convert 'Registros validados' to numeric (handle comma/space decimals if any remain)
if "Registros validados" in combined_df.columns:
    combined_df["Registros validados"] = (
        combined_df["Registros validados"].astype(str)
        .str.replace(",", ".", regex=False)
        .str.replace(r"\s+", ".", regex=True)
    )
    combined_df["Registros validados"] = pd.to_numeric(combined_df["Registros validados"], errors="coerce")

    # Drop Medida+Centro groups where all 'Registros validados' are null
    combined_df = combined_df.groupby(["Medida", "Centro"], group_keys=False).filter(
        lambda g: not g["Registros validados"].isna().all()
    )

# Reset index for a clean output
combined_df = combined_df.reset_index(drop=True)

# Save cleaned output
combined_df.to_csv(out_path, index=False)
print("Saved cleaned file to:", out_path)




  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  df[col] = pd.to_numeric(df[col], errors="ignore")
  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_fr

KeyboardInterrupt: 