In [1]:
import json
import fitz
import re
import nltk
import sklearn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

In [3]:
# Load the PDF and concatenate the text of every page. The context manager
# closes the document handle as soon as extraction is finished.
with fitz.open("constitucio-politica-colombia-1991.pdf") as doc:
    texto_total = "".join(pagina.get_text() + "\n" for pagina in doc)

# Split on article headings. Because the pattern is a capturing group,
# re.split keeps each heading, so the result alternates:
# [text before first heading, heading, body, heading, body, ...].
# NOTE(review): the pattern is unanchored and case-insensitive, so it also
# matches in-text mentions such as "artículo 15" — TODO confirm whether
# headings should be anchored to the start of a line.
articulos = re.split(r'(Artículo\s+\d+\.?)', texto_total, flags=re.IGNORECASE)

# Pair each heading (odd index) with the body that follows it (even index).
estructura = [
    {"titulo": titulo.strip(), "contenido": cuerpo.strip()}
    for titulo, cuerpo in zip(articulos[1::2], articulos[2::2])
]

# Save the structured articles as JSON (UTF-8, accents kept unescaped).
with open("articulos.json", "w", encoding="utf-8") as f:
    json.dump(estructura, f, ensure_ascii=False, indent=4)


In [4]:
# Read the article list back from the JSON file written earlier.
with open('articulos.json', 'r', encoding='utf-8') as f:
    constitucion = json.loads(f.read())

In [5]:
# Download the NLTK resources used by the cleaning step.
nltk.download('punkt')
# NLTK 3.9 and later load the Punkt tokenizer data from 'punkt_tab' rather
# than the pickled 'punkt' package; without it word_tokenize raises
# LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\juan
[nltk_data]     esteban\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\juan
[nltk_data]     esteban\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Spanish-language resources shared by the cleaning function: the stop-word
# set (a set gives fast membership tests) and the Snowball stemmer.
stop_words = {*stopwords.words('spanish')}
stemmer = SnowballStemmer('spanish')

In [7]:
# Text-cleaning helper
def limpiar_texto(texto: str) -> str:
    """Normalize a text: lower-case, strip punctuation, drop stop words, stem.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, and the result is tokenized with
    ``word_tokenize(..., language='spanish')``. Tokens present in the
    module-level ``stop_words`` set are discarded and the remaining ones are
    stemmed with the module-level ``stemmer``.

    Args:
        texto: Raw text to clean.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives.
    """
    texto = texto.lower()
    # Replace punctuation with a space rather than deleting it, so words that
    # are separated only by punctuation ("derechos,deberes") are not glued
    # into a single bogus token.
    texto = re.sub(r'[^\w\s]', ' ', texto)
    tokens = word_tokenize(texto, language='spanish')
    return ' '.join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )

In [8]:
# Map each article title to the cleaned version of its body. If two entries
# share a title, the later one overwrites the earlier one.
articulos_limpios = {
    articulo["titulo"]: limpiar_texto(articulo["contenido"])
    for articulo in constitucion
}


In [9]:
from sentence_transformers import SentenceTransformer

# Load the pretrained multilingual sentence-embedding model (used here on
# Spanish text).
modelo = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Original (uncleaned) body of every article, keyed by its title.
textos_originales = {
    articulo["titulo"]: articulo["contenido"] for articulo in constitucion
}

# Embedding of every article's original body, keyed by the same title.
vectores_articulos = {
    articulo["titulo"]: modelo.encode(articulo["contenido"])
    for articulo in constitucion
}

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
