In [8]:
import os
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Define the path to the directory containing the text files
CORPUS_DIR = "reuters/training"
documents = {}

# Apertura de archivo de Stopwords
Se abre el archivo del cual se va a tomar la lista de stopwords para eliminarlas del diccionario generado a partir de los archivos

In [5]:
with open('reuters/stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = set(word.strip() for word in file.readlines())

# Limpieza de texto
Se realiza la eliminacion de las stopwords del diccionario ademas de realizar la respectiva tokenizacion y el proceso de stemming

In [6]:
def clean_text(text):
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = cleaned_text.lower()
    tokens = cleaned_text.split()
    # Aplicar stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # Eliminar stopwords
    cleaned_tokens = [token for token in stemmed_tokens if token not in stop_words]
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

In [9]:
for filename in os.listdir(CORPUS_DIR):
    if filename.endswith(".txt"):
        filepath = os.path.join(CORPUS_DIR, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            cleaned_text = clean_text(text)
            documents[filename] = cleaned_text

# Aplicacion de Bag of Words

In [9]:
# Convertir el corpus a una lista de textos
corpus = list(documents.values())
# Vectorización usando Bag of Words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Aplicacion de TF - IDF

In [10]:
# Convertir el corpus a una lista de textos
corpus = list(documents.values())

# Vectorización usando TF-IDF
vectorizer = TfidfVectorizer()
Y = vectorizer.fit_transform(corpus)

# Creacion de Dataframes
Creacion de los dataframes que seran la base para realizar las busquedas y guardar los indices inversos

In [11]:
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=documents.keys())
df_bow

Unnamed: 0,000,0006913,0006916,0007050,0007100,0007150,001,0015,0020,0025,...,zorinski,zseven,zuccherifici,zuckerman,zulia,zurich,zurichbas,zuyuan,zverev,zzzz
1.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10000.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9992.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9993.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9994.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_tf_idf = pd.DataFrame(Y.toarray(), columns=vectorizer.get_feature_names_out(), index=documents.keys())
df_tf_idf

Unnamed: 0,000,0006913,0006916,0007050,0007100,0007150,001,0015,0020,0025,...,zorinski,zseven,zuccherifici,zuckerman,zulia,zurich,zurichbas,zuyuan,zverev,zzzz
1.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9992.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9993.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9994.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
def create_inverted_index(df):
    inverted_index = {}
    
    for column in df.columns:
        for index, value in df[column].items():
            if value != 0:  # Si el valor no es 0, significa que el término está presente en el documento
                if column not in inverted_index:
                    inverted_index[column] = []
                inverted_index[column].append((index, value))
    
    return inverted_index

In [16]:
inverted_index_bow = create_inverted_index(df_bow)
inverted_index_tf_idf = create_inverted_index(df_tf_idf)

In [None]:
def save_inverted_index_to_txt(inverted_index, directory, filename):
    # Crear la carpeta si no existe
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    filepath = os.path.join(directory, filename)
    
    with open(filepath, 'w', encoding='utf-8') as file:
        for term, docs in inverted_index.items():
            file.write(f"Term: {term}\n")
            for doc in docs:
                file.write(f"  Document: {doc[0]}, Weight: {doc[1]}\n")
            file.write("\n")  # Añadir una línea en blanco entre términos para mayor claridad

In [None]:
save_inverted_index_to_txt(inverted_index_bow,'results', 'inverted_index_bow.txt')
save_inverted_index_to_txt(inverted_index_tf_idf, 'results' ,'inverted_index_tf_idf.txt')

# Opcion1

In [2]:
def load_inverted_index_from_txt(filepath):
    inverted_index = {}
    
    with open(filepath, 'r', encoding='utf-8') as file:
        current_term = None
        for line in file:
            line = line.strip()
            if line.startswith("Term:"):
                current_term = line.split("Term: ")[1]
                inverted_index[current_term] = []
            elif line.startswith("Document:"):
                doc_info = line.split("Document: ")[1]
                doc_name, weight = doc_info.split(", Weight: ")
                inverted_index[current_term].append((doc_name, float(weight)))
    
    return inverted_index

# Ejemplo de uso
inverted_index_bow_loaded = load_inverted_index_from_txt('results/inverted_index_bow.txt')
inverted_index_tf_idf_loaded = load_inverted_index_from_txt('results/inverted_index_tf_idf.txt')

In [10]:
def process_query(query):
    cleaned_query = clean_text(query)
    return cleaned_query.split()

def jaccard_similarity(query_tokens, document_tokens):
    intersection = len(set(query_tokens) & set(document_tokens))
    union = len(set(query_tokens) | set(document_tokens))
    return intersection / union if union != 0 else 0  # Avoid division by zero

def search_with_bow(query, inverted_index_bow, documents):
    query_tokens = process_query(query)
    scores = {}
    for term in query_tokens:
        if term in inverted_index_bow:
            for doc_id, bow_count in inverted_index_bow[term]:
                if doc_id in scores:
                    scores[doc_id] += bow_count
                else:
                    scores[doc_id] = bow_count
    
    ranked_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_results

def search_with_tfidf(query, inverted_index_tfidf, documents):
    query_tokens = process_query(query)
    scores = {}
    for term in query_tokens:
        if term in inverted_index_tfidf:
            for doc_id, tfidf_score in inverted_index_tfidf[term]:
                if doc_id in scores:
                    scores[doc_id] += tfidf_score
                else:
                    scores[doc_id] = tfidf_score
    
    ranked_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_results

# Ejemplo de uso
query = "tea"

# Búsqueda con Bag of Words
results_bow = search_with_bow(query, inverted_index_bow_loaded, documents)
print("Resultados con Bag of Words:")
for doc_id, score in results_bow:  # Mostrar los 5 documentos más relevantes
    print(f"Documento: {doc_id}, Score: {score}")

# Búsqueda con TF-IDF
results_tfidf = search_with_tfidf(query, inverted_index_tf_idf_loaded, documents)
print("\nResultados con TF-IDF:")
for doc_id, score in results_tfidf:  # Mostrar los 5 documentos más relevantes
    print(f"Documento: {doc_id}, Score: {score}")


Resultados con Bag of Words:
Documento: 12754.txt, Score: 11.0
Documento: 275.txt, Score: 9.0
Documento: 12907.txt, Score: 5.0
Documento: 6440.txt, Score: 2.0
Documento: 7545.txt, Score: 2.0
Documento: 10268.txt, Score: 1.0
Documento: 10375.txt, Score: 1.0
Documento: 10406.txt, Score: 1.0
Documento: 11882.txt, Score: 1.0
Documento: 11949.txt, Score: 1.0
Documento: 1723.txt, Score: 1.0
Documento: 235.txt, Score: 1.0
Documento: 4637.txt, Score: 1.0
Documento: 6338.txt, Score: 1.0
Documento: 6436.txt, Score: 1.0
Documento: 6447.txt, Score: 1.0
Documento: 6465.txt, Score: 1.0
Documento: 8149.txt, Score: 1.0
Documento: 9044.txt, Score: 1.0
Documento: 9153.txt, Score: 1.0
Documento: 9327.txt, Score: 1.0

Resultados con TF-IDF:
Documento: 12754.txt, Score: 0.5177027514753223
Documento: 6440.txt, Score: 0.43134082511897803
Documento: 275.txt, Score: 0.4304149392608616
Documento: 9044.txt, Score: 0.3371384946580558
Documento: 12907.txt, Score: 0.2910566514974324
Documento: 4637.txt, Score: 0.16

# opcion 3

In [13]:
# Funciones de búsqueda y Jaccard
def process_query(query):
    cleaned_query = clean_text(query)
    return cleaned_query.split()

def jaccard_similarity(query_tokens, document_tokens):
    intersection = len(set(query_tokens) & set(document_tokens))
    union = len(set(query_tokens) | set(document_tokens))
    return intersection / union if union != 0 else 0  # Avoid division by zero

def search_with_bow(query, inverted_index_bow, documents):
    query_tokens = process_query(query)
    document_tokens = {doc_id: documents[doc_id].split() for doc_id in documents}
    scores = {}
    for term in query_tokens:
        if term in inverted_index_bow:
            for doc_id, bow_count in inverted_index_bow[term]:
                if doc_id not in scores:
                    scores[doc_id] = 0
                scores[doc_id] += bow_count
    results = []
    for doc_id in scores:
        similarity = jaccard_similarity(query_tokens, document_tokens[doc_id])
        results.append((doc_id, similarity))
    ranked_results = sorted(results, key=lambda x: x[1], reverse=True)
    return ranked_results

def search_with_tfidf(query, inverted_index_tfidf, documents):
    query_tokens = process_query(query)
    document_tokens = {doc_id: documents[doc_id].split() for doc_id in documents}
    scores = {}
    for term in query_tokens:
        if term in inverted_index_tfidf:
            for doc_id, tfidf_score in inverted_index_tfidf[term]:
                if doc_id not in scores:
                    scores[doc_id] = 0
                scores[doc_id] += tfidf_score
    results = []
    for doc_id in scores:
        similarity = jaccard_similarity(query_tokens, document_tokens[doc_id])
        results.append((doc_id, similarity))
    ranked_results = sorted(results, key=lambda x: x[1], reverse=True)
    return ranked_results

# Ejemplo de uso
query = "tea"

# Búsqueda con Bag of Words
results_bow = search_with_bow(query, inverted_index_bow_loaded, documents)
print("Bag of Words Jaccard Similarities:")
print(pd.DataFrame(results_bow, columns=['document', 'similarity']))

# Búsqueda con TF-IDF
results_tfidf = search_with_tfidf(query, inverted_index_tf_idf_loaded, documents)

print(documents)
print("\nTF-IDF Jaccard Similarities:")
print(pd.DataFrame(results_tfidf, columns=['document', 'similarity']))

Bag of Words Jaccard Similarities:
     document  similarity
0    6440.txt    0.090909
1    9044.txt    0.071429
2    4637.txt    0.031250
3    6436.txt    0.021277
4    9153.txt    0.021277
5    8149.txt    0.017544
6    1723.txt    0.017241
7    6465.txt    0.016393
8    6447.txt    0.015385
9    9327.txt    0.011905
10  12754.txt    0.010526
11  11949.txt    0.010204
12    275.txt    0.009709
13  12907.txt    0.009524
14  10406.txt    0.009009
15    235.txt    0.006849
16  11882.txt    0.005952
17  10375.txt    0.004950
18   6338.txt    0.004310
19  10268.txt    0.004065
20   7545.txt    0.003984
{'1.txt': 'bahia cocoa review shower continu week bahia cocoa zone allevi drought sinc earli januari improv prospect temporao normal humid level restor comissaria smith weekli review dri period temporao late thi year arriv week end februari 22 155221 bag 60 kilo make cumul total season 593 mln 581 stage year cocoa deliv earlier consign wa includ arriv figur comissaria smith doubt crop cocoa

# Opcion4

In [40]:
import re
from nltk.stem import PorterStemmer

def preprocess_query(query):
    cleaned_query = re.sub(r'[^\w\s]', '', query)
    cleaned_query = cleaned_query.lower()
    tokens = cleaned_query.split()
    # Aplicar stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # Eliminar stopwords
    cleaned_tokens = [token for token in stemmed_tokens if token not in stop_words]
    return cleaned_tokens

def jaccard_similarity(query_tokens, document_tokens):
    intersection = len(set(query_tokens).intersection(set(document_tokens)))
    union = len(set(query_tokens).union(set(document_tokens)))
    return intersection / union if union != 0 else 0

def search_index(query, inverted_index):
    query_tokens = preprocess_query(query)
    document_scores = {}
    for token in query_tokens:
        if token in inverted_index:
            for document, score in inverted_index[token]:
                if document not in document_scores:
                    document_scores[document] = 0
                document_scores[document] += score * jaccard_similarity(query_tokens, df_bow.loc[document].to_numpy().nonzero()[0])
    sorted_documents = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_documents

query = "tea"
results_bow = search_index(query, inverted_index_bow)
results_tfidf = search_index(query, inverted_index_tf_idf)

print("Resultados de la búsqueda usando Bag of Words:")
for document, score in results_bow:
    print(f"Documento: {document}, Puntaje de similitud: {score}")

print("\nResultados de la búsqueda usando TF-IDF:")
for document, score in results_tfidf:
    print(f"Documento: {document}, Puntaje de similitud: {score}")



Resultados de la búsqueda usando Bag of Words:
Documento: 10268.txt, Puntaje de similitud: 0.0
Documento: 10375.txt, Puntaje de similitud: 0.0
Documento: 10406.txt, Puntaje de similitud: 0.0
Documento: 11882.txt, Puntaje de similitud: 0.0
Documento: 11949.txt, Puntaje de similitud: 0.0
Documento: 12754.txt, Puntaje de similitud: 0.0
Documento: 12907.txt, Puntaje de similitud: 0.0
Documento: 1723.txt, Puntaje de similitud: 0.0
Documento: 235.txt, Puntaje de similitud: 0.0
Documento: 275.txt, Puntaje de similitud: 0.0
Documento: 4637.txt, Puntaje de similitud: 0.0
Documento: 6338.txt, Puntaje de similitud: 0.0
Documento: 6436.txt, Puntaje de similitud: 0.0
Documento: 6440.txt, Puntaje de similitud: 0.0
Documento: 6447.txt, Puntaje de similitud: 0.0
Documento: 6465.txt, Puntaje de similitud: 0.0
Documento: 7545.txt, Puntaje de similitud: 0.0
Documento: 8149.txt, Puntaje de similitud: 0.0
Documento: 9044.txt, Puntaje de similitud: 0.0
Documento: 9153.txt, Puntaje de similitud: 0.0
Document