# Taller básico sobre WebScraping y MONGOdb
----

0.   Trabajar con google drive
1.   Instalar librerías para webscraping y lectura de PDFs
2.   Crear doom inicial (conf y utilidades)
3.   Crear JSON a partir de los PDFs extraidos según recorrido del DOM
4.   Cargar json a mongoAtlas



# 0.trabajar con google Drive


In [None]:
# habilitamos drive de google desde colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1.Instalar librerías

In [None]:
!pip install requests beautifulsoup4 lxml
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


# 2.Crear doom inicial (conf y utilidades)

In [None]:
import os, re, time, json, hashlib
from urllib.parse import urljoin
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

# Carpeta destino en tu Drive (se crea si no existe)
DEST_DIR = "/content/drive/MyDrive/Big data/BasesDatos/Taller 1"
os.makedirs(DEST_DIR, exist_ok=True)

BASE_URL = "https://www.mininterior.gov.co/normatividad/?filter=true&page={page}"
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; colab-scraper/1.0)"}
TIMEOUT = 25
PAUSE = 1.2  # ser amable con el servidor

def norm(s: str) -> str:
    return re.sub(r"\s+"," ", s or "").strip()

def sha1(s: str) -> str:
    import hashlib
    return hashlib.sha1(s.encode('utf-8')).hexdigest()

def fetch_html(url: str) -> BeautifulSoup:
    r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    r.raise_for_status()
    return BeautifulSoup(r.text, "lxml")


# 3.Crear JSON a partir de los PDFs extraidos según recorrido del DOM


---


In [None]:
import os, re, time, json, hashlib
from urllib.parse import urljoin
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# === CONFIGURACIÓN ===

def collect_pdf_links(pages=(1,2)):
    found, seen = [], set()
    for p in pages:
        url = BASE_URL.format(page=p)
        soup = fetch_html(url)
        anchors = soup.select('a.dmach-acf-value.et_pb_button[href]')
        anchors += [a for a in soup.select('a[href]') if a.get_text(strip=True).lower() == "documento"]
        for a in anchors:
            href = a.get("href", "")
            if not href:
                continue
            abs_url = urljoin(url, href)
            if abs_url.lower().endswith(".pdf") and abs_url not in seen:
                seen.add(abs_url)
                found.append(abs_url)
        time.sleep(PAUSE)
    return found

def download_pdf(url: str, dest_dir: str) -> str:
    fname = os.path.basename(url.split("?")[0]) or sha1(url) + ".pdf"
    local_path = os.path.join(dest_dir, fname)
    if not os.path.exists(local_path):
        print(f"  ↓ Descargando {fname}")
        with requests.get(url, headers=HEADERS, timeout=TIMEOUT, stream=True, verify=False) as r:
            r.raise_for_status()
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    if chunk:
                        f.write(chunk)
    else:
        print(f"  ✓ Ya existe {fname}")
    return local_path

def extract_pdf_text(local_path: str) -> str:
    try:
        txt = extract_text(local_path) or ""
        return norm(txt)
    except Exception as e:
        print(f"[WARN] No se pudo extraer texto de {os.path.basename(local_path)}: {e}")
        return ""

# === FLUJO PRINCIPAL ===
pdf_links = collect_pdf_links()
print(f"Se encontraron {len(pdf_links)} enlaces PDF.")
pdf_links = pdf_links[:MAX_DOCS]

records = []
for i, pdf_url in enumerate(pdf_links, 1):
    print(f"\n[{i}/{len(pdf_links)}] {pdf_url}")
    local_path = download_pdf(pdf_url, DEST_DIR)
    text = extract_pdf_text(local_path)

    rec = {
        "_id": sha1(pdf_url),
        "fuente": "Ministerio del Interior - Normatividad",
        "pdf_url": pdf_url,
        "archivo": os.path.basename(local_path),
        "ruta_local": local_path,
        "extraido_en": datetime.utcnow().isoformat() + "Z",
        "texto": text
    }
    records.append(rec)
    time.sleep(PAUSE)

# === GUARDAR JSONS ===
for rec in records:
    out_path = os.path.join(DEST_DIR, os.path.splitext(rec["archivo"])[0] + ".json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(rec, f, ensure_ascii=False, indent=2)

index_path = os.path.join(DEST_DIR, "index.json")
with open(index_path, "w", encoding="utf-8") as f:
    json.dump({"total": len(records), "documentos": records}, f, ensure_ascii=False, indent=2)

print(f"\n✅ Listo. Se descargaron y procesaron {len(records)} PDFs.")
print(f"Los archivos y JSON se guardaron en:\n{DEST_DIR}")


Se encontraron 12 enlaces PDF.

[1/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/directiva-005-de-2024.pdf
  ✓ Ya existe directiva-005-de-2024.pdf


  "extraido_en": datetime.utcnow().isoformat() + "Z",



[2/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/oficio-no.-p-1159-2025-09-23.pdf
  ✓ Ya existe oficio-no.-p-1159-2025-09-23.pdf

[3/10] https://www.mininterior.gov.co/wp-content/uploads/2025/10/1512-1.pdf
  ✓ Ya existe 1512-1.pdf

[4/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/27_autoqueadmite_202500231autoadmisor_0_20250813111030442.pdf
  ✓ Ya existe 27_autoqueadmite_202500231autoadmisor_0_20250813111030442.pdf

[5/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/resolucion-numero-rpu02322025-del-05-de-septiembre-de-2025.pdf
  ✓ Ya existe resolucion-numero-rpu02322025-del-05-de-septiembre-de-2025.pdf

[6/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/resolucion-numero-rpu02462025-del-08-de-septiembre-de-2025.pdf
  ✓ Ya existe resolucion-numero-rpu02462025-del-08-de-septiembre-de-2025.pdf

[7/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/resolucion-numero-rpu02582025-del-15-de-septiembre-de-2025.pdf
  ↓ Des

In [None]:
# ================================================================
#  Descarga de 10 PDFs del MinInterior + extracción de texto (OCR)
#  Guarda .pdf y .json en Google Drive
# ================================================================

!apt-get -qq install poppler-utils tesseract-ocr
!pip install -q requests beautifulsoup4 lxml pdfminer.six pytesseract pdf2image

import os, re, time, json, hashlib
from urllib.parse import urljoin
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import pytesseract
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# === CONFIGURACIÓN ===

def collect_pdf_links(pages=(1,2)):
    found, seen = [], set()
    for p in pages:
        url = BASE_URL.format(page=p)
        soup = fetch_html(url)
        anchors = soup.select('a.dmach-acf-value.et_pb_button[href]')
        anchors += [a for a in soup.select('a[href]') if a.get_text(strip=True).lower() == "documento"]
        for a in anchors:
            href = a.get("href", "")
            if not href:
                continue
            abs_url = urljoin(url, href)
            if abs_url.lower().endswith(".pdf") and abs_url not in seen:
                seen.add(abs_url)
                found.append(abs_url)
        time.sleep(PAUSE)
    return found

def download_pdf(url: str, dest_dir: str) -> str:
    fname = os.path.basename(url.split("?")[0]) or sha1(url) + ".pdf"
    local_path = os.path.join(dest_dir, fname)
    if not os.path.exists(local_path):
        print(f"  ↓ Descargando {fname}")
        with requests.get(url, headers=HEADERS, timeout=TIMEOUT, stream=True, verify=False) as r:
            r.raise_for_status()
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    if chunk:
                        f.write(chunk)
    else:
        print(f"  ✓ Ya existe {fname}")
    return local_path

def extract_pdf_text(local_path: str) -> str:
    try:
        txt = extract_text(local_path) or ""
        if txt.strip():
            return re.sub(r"\s+", " ", txt.strip())

        # --- OCR fallback ---
        print(f"[OCR] Extrayendo texto (imagen) → {os.path.basename(local_path)}")
        images = convert_from_path(local_path, dpi=200)
        ocr_text = ""
        for img in images:
            ocr_text += pytesseract.image_to_string(img, lang="spa") + "\n"
        return re.sub(r"\s+", " ", ocr_text.strip())
    except Exception as e:
        print(f"[WARN] No se pudo extraer texto de {os.path.basename(local_path)}: {e}")
        return ""

# === FLUJO PRINCIPAL ===
pdf_links = collect_pdf_links()
print(f"Se encontraron {len(pdf_links)} enlaces PDF.")
pdf_links = pdf_links[:MAX_DOCS]

records = []
for i, pdf_url in enumerate(pdf_links, 1):
    print(f"\n[{i}/{len(pdf_links)}] {pdf_url}")
    local_path = download_pdf(pdf_url, DEST_DIR)
    text = extract_pdf_text(local_path)

    rec = {
        "_id": sha1(pdf_url),
        "fuente": "Ministerio del Interior - Normatividad",
        "pdf_url": pdf_url,
        "archivo": os.path.basename(local_path),
        "ruta_local": local_path,
        "extraido_en": datetime.utcnow().isoformat() + "Z",
        "texto": text
    }
    records.append(rec)
    time.sleep(PAUSE)

# === GUARDAR JSONs ===
for rec in records:
    out_path = os.path.join(DEST_DIR, os.path.splitext(rec["archivo"])[0] + ".json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(rec, f, ensure_ascii=False, indent=2)

index_path = os.path.join(DEST_DIR, "index.json")
with open(index_path, "w", encoding="utf-8") as f:
    json.dump({"total": len(records), "documentos": records}, f, ensure_ascii=False, indent=2)

print(f"\n✅ Listo. Se descargaron y procesaron {len(records)} PDFs (con OCR si fue necesario).")
print(f"Archivos y JSON guardados en:\n{DEST_DIR}")


Selecting previously unselected package poppler-utils.
(Reading database ... 126675 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.11_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.11) ...
Setting up poppler-utils (22.02.0-2ubuntu0.11) ...
Processing triggers for man-db (2.10.2-1) ...
Se encontraron 12 enlaces PDF.

[1/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/directiva-005-de-2024.pdf
  ↓ Descargando directiva-005-de-2024.pdf
[OCR] Extrayendo texto (imagen) → directiva-005-de-2024.pdf
[WARN] No se pudo extraer texto de directiva-005-de-2024.pdf: (1, 'Error opening data file /usr/share/tesseract-ocr/4.00/tessdata/spa.traineddata Please make sure the TESSDATA_PREFIX environment variable is set to your "tessdata" directory. Failed loading language \'spa\' Tesseract couldn\'t load any languages! Could not initialize tesseract.')


  "extraido_en": datetime.utcnow().isoformat() + "Z",



[2/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/oficio-no.-p-1159-2025-09-23.pdf
  ↓ Descargando oficio-no.-p-1159-2025-09-23.pdf
[OCR] Extrayendo texto (imagen) → oficio-no.-p-1159-2025-09-23.pdf
[WARN] No se pudo extraer texto de oficio-no.-p-1159-2025-09-23.pdf: (1, 'Error opening data file /usr/share/tesseract-ocr/4.00/tessdata/spa.traineddata Please make sure the TESSDATA_PREFIX environment variable is set to your "tessdata" directory. Failed loading language \'spa\' Tesseract couldn\'t load any languages! Could not initialize tesseract.')


  "extraido_en": datetime.utcnow().isoformat() + "Z",



[3/10] https://www.mininterior.gov.co/wp-content/uploads/2025/10/1512-1.pdf
  ↓ Descargando 1512-1.pdf
[OCR] Extrayendo texto (imagen) → 1512-1.pdf
[WARN] No se pudo extraer texto de 1512-1.pdf: (1, 'Error opening data file /usr/share/tesseract-ocr/4.00/tessdata/spa.traineddata Please make sure the TESSDATA_PREFIX environment variable is set to your "tessdata" directory. Failed loading language \'spa\' Tesseract couldn\'t load any languages! Could not initialize tesseract.')


  "extraido_en": datetime.utcnow().isoformat() + "Z",



[4/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/27_autoqueadmite_202500231autoadmisor_0_20250813111030442.pdf
  ↓ Descargando 27_autoqueadmite_202500231autoadmisor_0_20250813111030442.pdf

[5/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/resolucion-numero-rpu02322025-del-05-de-septiembre-de-2025.pdf
  ↓ Descargando resolucion-numero-rpu02322025-del-05-de-septiembre-de-2025.pdf

[6/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/resolucion-numero-rpu02462025-del-08-de-septiembre-de-2025.pdf
  ↓ Descargando resolucion-numero-rpu02462025-del-08-de-septiembre-de-2025.pdf

[7/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/resolucion-numero-rpu02582025-del-15-de-septiembre-de-2025.pdf
  ↓ Descargando resolucion-numero-rpu02582025-del-15-de-septiembre-de-2025.pdf

[8/10] https://www.mininterior.gov.co/wp-content/uploads/2025/09/resolucion-numero-rpu02592025-del-15-de-septiembre-de-2025.pdf
  ↓ Descargando resolucion-numero-rpu0259

# 4. cargar json a mongoAtlas

## 4.1 librerias

In [None]:
!pip install pymongo
!pip install py2neo

Collecting pymongo
  Downloading pymongo-4.15.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.8.0-py3-none-any.whl.metadata (5.7 kB)
Downloading pymongo-4.15.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.8.0-py3-none-any.whl (331 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.1/331.1 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.8.0 pymongo-4.15.3
Collecting py2neo
  Downloading py2neo-2021.2.4-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting interchange~=2021.0.4 (from py2neo)
  Downloading interchange-2021.0.4-py2.py3-none-any.whl.metadata (1.9 kB)
Colle

## 4.2 Establecer la conexión a mongoAtlas

In [None]:
from pymongo import MongoClient

#reemplazar el <db_password>
uri = "mongodb+srv://emoram2_password:123@emoram2.eea7az1.mongodb.net/?retryWrites=true&w=majority&appName=emoram2"
client = MongoClient(uri)
client.stats

Database(MongoClient(host=['ac-tf6ot4i-shard-00-01.eea7az1.mongodb.net:27017', 'ac-tf6ot4i-shard-00-02.eea7az1.mongodb.net:27017', 'ac-tf6ot4i-shard-00-00.eea7az1.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='emoram2', authsource='admin', replicaset='atlas-uwfx1w-shard-0', tls=True), 'stats')

## 4.3 Crear una base de datos (minInterior)

In [None]:
db_name='minInterior'
db = client[db_name]  #crear una base de datos

#crear una coleccion
collection_name='normatividad'
collection = db[collection_name]

print(f" base de datps {db_name}, coleccion {collection_name} creadas exitosamente")

 base de datps minInterior, coleccion normatividad creadas exitosamente


## 4.4 cargar archivos json a coleccion

In [None]:
import os
import json
from pymongo.errors import PyMongoError

json_input_dir = '/content/drive/MyDrive/Big data/BasesDatos/Taller 1'

# Get list of JSON files
json_files = [f for f in os.listdir(json_input_dir) if f.endswith('.json')]

# Counter for successfully loaded files
loaded_count = 0
failed_files = []

print(f"Starting to load {len(json_files)} JSON files into MongoDB...")

for json_file in json_files:
    json_path = os.path.join(json_input_dir, json_file)
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Insert the data into the collection
        insert_result = collection.insert_one(data)

        if insert_result.inserted_id:
            print(f"Successfully loaded {json_file}. Inserted ID: {insert_result.inserted_id}")
            loaded_count += 1
        else:
            print(f"Failed to insert data from {json_file}. No inserted_id returned.")
            failed_files.append(json_file)

    except FileNotFoundError:
        print(f"Error: File not found at {json_path}")
        failed_files.append(json_file)
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in file {json_file}")
        failed_files.append(json_file)
    except PyMongoError as e:
        print(f"MongoDB error while loading {json_file}: {e}")
        failed_files.append(json_file)
    except Exception as e:
        print(f"An unexpected error occurred while processing {json_file}: {e}")
        failed_files.append(json_file)

print(f"\nFinished loading JSON files.")
print(f"Successfully loaded {loaded_count} files.")
if failed_files:
    print(f"Failed to load {len(failed_files)} files: {failed_files}")

Starting to load 11 JSON files into MongoDB...
Successfully loaded directiva-005-de-2024.json. Inserted ID: a13a680b0f9e7cb1c9bcd6fcf2ebd46bdba252cf
Successfully loaded oficio-no.-p-1159-2025-09-23.json. Inserted ID: f7a497794670d5fca2c3941c94bda9586f5f7d2f
Successfully loaded 1512-1.json. Inserted ID: 168d01f3dbbc8966df4cbc295795a7a7833d9291
Successfully loaded 27_autoqueadmite_202500231autoadmisor_0_20250813111030442.json. Inserted ID: f212ac11e48f92b2509d1d79281c57d1d8c77f02
Successfully loaded resolucion-numero-rpu02322025-del-05-de-septiembre-de-2025.json. Inserted ID: c6f78ae16bbd5b0b37da45b9d363a2fb034aef38
Successfully loaded resolucion-numero-rpu02462025-del-08-de-septiembre-de-2025.json. Inserted ID: 49f3c323b89d33d9fa34fded1f78800acb5932fb
Successfully loaded resolucion-numero-rpu02582025-del-15-de-septiembre-de-2025.json. Inserted ID: bd21f6eb62d6b2f467c1d24120bb739eb450eda2
Successfully loaded resolucion-numero-rpu02592025-del-15-de-septiembre-de-2025.json. Inserted ID: 17

#