In [35]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import docx

In [30]:
# Abre el documento de Word
doc_ciencia = docx.Document("ExportacionWordCREA_Ciencia.doc")
doc_deporte = docx.Document("ExportacionWordCREA_Deporte.doc")
doc_viaje = docx.Document("ExportacionWordCREA_Viaje.doc")

archivos = [doc_ciencia, doc_deporte, doc_viaje]

# Inicializa listas para los títulos y textos
titulos = []
documentos = []

# Inicializa una bandera para determinar si estamos en una sección de título o texto
en_titulo = True

# Recorre el contenido del documento
for doc in archivos:
    for paragraph in doc.paragraphs[2:]:
        if paragraph.text:  # Verifica si el párrafo no está vacío
            if en_titulo:
                titulos.append(paragraph.text)
            else:
                documentos.append(paragraph.text)
            # Cambia la bandera en_titulo en cada iteración
            en_titulo = not en_titulo

print(f"Numero de documentos: {len(documentos)}")

Numero de documentos: 491


In [37]:

# Define custom preprocessing function
def custom_preprocess(text):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('spanish'))
    
    # Tokenization, lowercase, stopword removal, and stemming
    words = text.lower().split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

# Create a CountVectorizer with custom preprocessing
count_vectorizer = CountVectorizer(preprocessor=custom_preprocess)

# Fit and transform the documents using the vectorizer
count_matrix = count_vectorizer.fit_transform(documentos)

# Convert the term frequency matrix to a Pandas DataFrame
term_frequency_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())

# Print the term frequency matrix
print(term_frequency_df)

     000  10  100  11  112  12  120  124  13  1300  ...  óseo  última   
0      0   0    0   0    0   0    0    0   0     0  ...     0       0  \
1      0   0    0   0    0   0    0    0   0     0  ...     0       0   
2      0   0    0   0    0   0    0    0   0     0  ...     0       0   
3      0   0    0   0    0   0    0    0   0     0  ...     0       0   
4      0   0    0   0    0   0    0    0   0     0  ...     0       0   
..   ...  ..  ...  ..  ...  ..  ...  ...  ..   ...  ...   ...     ...   
486    0   0    0   0    0   0    0    0   0     0  ...     0       1   
487    0   0    0   0    0   0    0    0   0     0  ...     0       0   
488    0   0    0   0    0   0    0    0   0     0  ...     0       0   
489    0   0    0   0    0   0    0    0   0     0  ...     0       0   
490    0   0    0   0    0   0    0    0   0     0  ...     0       0   

     últimament  últimamente  último  últimos  única  únicament  único  útil  
0             0            0       0        

In [28]:

# Define custom preprocessing function
def custom_preprocess(text):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    
    # Tokenization, lowercase, stopword removal, and stemming
    words = text.lower().split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

# Create a TfidfVectorizer with custom preprocessing
tfidf_vectorizer = TfidfVectorizer(preprocessor=custom_preprocess)

# Fit and transform the documents using the vectorizer
tfidf_matrix = tfidf_vectorizer.fit_transform(documentos)

# Convert the TF-IDF matrix to a Pandas DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Print the TF-IDF matrix
print(tfidf_df)

     000   10  100   11  112   12  120  124   13  1300  ...  óseo    última   
0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000  \
1    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000   
2    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000   
3    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000   
4    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...   ...       ...   
486  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.132116   
487  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000   
488  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000   
489  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000   
490  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  ...   0.0  0.000000   

     últimament  últimamente  último  últimos     ú