# Leer documentos de la colección

In [229]:
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from gensim import corpora, models, similarities

import numpy as np

from pathlib import Path


# Spanish stop-word list from NLTK; consulted when cleaning documents.
stop_words = stopwords.words('spanish')
# Snowball stemmer configured for Spanish; applied to every kept term.
lematizador = SnowballStemmer('spanish')

# Cleaned documents (lists of terms) keyed by file name.
coleccion = {}
# Sorted list of the distinct terms seen across all stored documents.
lexicon = []

# Directory, relative to the working directory, that holds the raw documents.
ruta_base = 'coleccion/'

In [230]:
def procesador_documentos():
    """Load, clean and index every document found under ``ruta_base``.

    Each regular file in the directory is read with ``recuperar_documento``,
    normalised with ``limpiar_documento``, stored in the global collection
    via ``guardar_documento`` and its terms merged into the global lexicon
    via ``guardar_lexicon``.
    """
    # Path objects are no longer context managers (support was removed in
    # Python 3.13), so the directory is iterated directly. Entries are sorted
    # because iterdir() yields them in arbitrary order, which would otherwise
    # make the document order differ between runs.
    for ruta_documento in sorted(Path(ruta_base).iterdir()):
        if not ruta_documento.is_file():
            # Sub-directories cannot be opened as documents; skip them.
            continue
        documento = recuperar_documento(str(ruta_documento))
        documento = limpiar_documento(documento)
        guardar_documento(ruta_documento.name, documento)
        guardar_lexicon(documento)

In [231]:
def recuperar_documento(ruta, lineas_omitidas=500, max_caracteres=5000):
    """Return an excerpt of the text file at ``ruta``.

    The first ``lineas_omitidas`` lines are skipped and up to
    ``max_caracteres`` characters are read from that point on.

    Args:
        ruta: Path of the file to read.
        lineas_omitidas: Number of leading lines to discard (default 500).
        max_caracteres: Maximum number of characters to return (default 5000).

    Returns:
        The excerpt as a string; empty when the file has no content after
        the skipped lines.
    """
    # NOTE(review): no encoding is given, so the platform default is used;
    # confirm the collection's encoding and pass it explicitly if needed.
    with open(ruta, 'r') as documento_handler:
        for _ in range(lineas_omitidas):
            if not documento_handler.readline():
                # Reached end of file early; nothing left to skip.
                break
        return documento_handler.read(max_caracteres)

In [232]:
def limpiar_documento(documento):
    """Tokenise, filter, stem and sort the terms of a raw document.

    Args:
        documento: Raw document text.

    Returns:
        Sorted list of stemmed, lower-cased terms. Stop words and tokens of
        two characters or fewer are dropped; repeated terms are kept.
    """
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    palabras_vacias = set(stop_words)

    terminos = []
    for token in wordpunct_tokenize(documento):
        # Only tokens longer than two characters are kept.
        if len(token) <= 2:
            continue
        termino = token.lower()
        if termino not in palabras_vacias:
            terminos.append(termino)

    # Reduce every surviving term to its stem and return them in order.
    return sorted(lematizador.stem(termino) for termino in terminos)

In [233]:
def guardar_documento(nombre, documento):
    """Store ``documento`` in the global collection under the key ``nombre``.

    An existing entry with the same name is overwritten.
    """
    coleccion.update({nombre: documento})

In [234]:
def guardar_lexicon(documento):
    """Merge the terms of ``documento`` into the global sorted ``lexicon``.

    Args:
        documento: Iterable of hashable terms from one cleaned document.
    """
    # A set difference replaces a linear scan of the lexicon for every term.
    terminos_nuevos = set(documento).difference(lexicon)
    # Mutate in place: the lexicon is a module-level list shared with the
    # rest of the notebook, so it must not be rebound.
    lexicon.extend(terminos_nuevos)
    lexicon.sort()

In [235]:
def generar_tabla_frecuencias():
    """Build the term-frequency table for the stored collection.

    Returns:
        A 2-D integer array (dtype ``'i'``) with one row per document in
        ``coleccion`` (in insertion order) and one column per entry of
        ``lexicon``; each cell holds how many times that term occurs in
        that document.
    """
    from collections import Counter

    documentos = list(coleccion.values())
    # Start from zeros: terms absent from a document stay at 0.
    tabla_frecuencias = np.zeros((len(documentos), len(lexicon)), dtype='i')
    for fila, documento in enumerate(documentos):
        # Count every term once per document instead of rescanning the
        # document for each lexicon entry; missing terms count as 0.
        conteo = Counter(documento)
        tabla_frecuencias[fila] = [conteo[termino] for termino in lexicon]
    return tabla_frecuencias

In [236]:
def calcular_idf():
    """Print the frequency table one term at a time.

    The table from ``generar_tabla_frecuencias`` is transposed so that each
    printed row corresponds to one term and each column to one document.

    NOTE(review): despite its name, this function does not compute or return
    any IDF value yet; it only prints the per-term rows.
    """
    for fila_termino in generar_tabla_frecuencias().T:
        print(fila_termino)

In [237]:
# Build the collection and lexicon from the files on disk, then print the
# per-term rows of the resulting frequency table.
procesador_documentos()
calcular_idf()

[0 0 0 0 0 0 2 0 0 1 0]
[0 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0 0]
[1 0 1 0 0 0 0 0 0 1 0]
[2 0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 1 4 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 0 0 0 1 0]
[2 0 0 0 0 0 2 0 0 0 0]
[ 0  0  0  0  0  0  3  0 22  3  0]
[0 0 0 0 0 0 0 0 8 0 0]
[0 0 0 0 0 0 0 0 7 0 0]
[1 0 0 0 0 2 2 0 4 5 0]
[0 0 0 0 0 0 0 0 0 1 0]
[0 0 0 3 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 1 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0]
[1 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 7 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 1 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 1 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 2 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 2 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 1 

[0 3 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 4 0]
[0 0 1 1 0 1 0 0 0 0 0]
[0 5 0 1 0 0 0 1 0 0 1]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0 0]
[0 0 0 1 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0]
[1 0 0 0 0 1 0 0 0 1 0]
[0 0 0 0 0 0 2 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[2 0 3 0 2 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0 0]
[0 0 1 0 0 0 1 0 0 1 0]
[0 0 1 0 0 1 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0]
[0 0 0 1 0 0 0 0 0 0 0]
[3 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 1 0 0 0 0 0 1]
[0 2 0 0 0 0 0 0 0 0 0]
[0 5 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 0 0 1 0 0]
[0 5 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 2 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 0 0 0 1 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0]
[0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0

[0 0 0 0 0 0 0 0 0 1 0]
[0 0 3 0 0 0 0 2 0 0 0]
[1 0 0 0 0 0 0 1 0 1 0]
[0 0 0 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 1 0 0 0 0 0]
[1 0 0 0 1 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 0 0 1 0 0 0 0 0]
[0 0 0 3 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 1]
[0 0 0 0 0 0 0 0 0 0 2]
[0 0 0 0 0 0 0 0 1 0 0]
[1 0 1 0 0 0 1 0 0 0 0]
[0 0 0 0 0 0 0 0 2 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 1 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 1 0 0 0 0 1 0 0 0]
[0 0 0 1 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0]
[1 0 0 2 0 0 1 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0 0]
[0 0 0 0 0 0 1 0 1 2 0]
[1 0 1 0 0 0 1 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 0 0 0 0 0 1 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 2 1 0 0 0 0 0 2]
[0 0 0 1 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0]
[0 0 0 0 0 4 0 0 0 0 0]
[0 0 2 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1]
[0 0 0 0 0 0 0 0 0 0 1]
[1 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 1 0 3 0 0 0]
[0 0 0 0 0 0 0 0