We began by downloading the data corpus and reading it into memory.

In [None]:
# Read the whole raw corpus file into a single string.
with open('/content/sample_file.txt', mode='r', encoding='utf-8') as file:
    text_data = file.read()

# Splitting the main file

In [None]:
import re
import os


# A new document starts wherever a line begins with "REVISION <number>";
# the matched text is used as the delimiter and is removed by re.split().
pattern = r'^REVISION \d+'

# Directory that receives one text file per split document.
output_directory = r'/content/documents'
os.makedirs(output_directory, exist_ok=True)

with open('/content/sample_file.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# re.MULTILINE makes '^' match at the start of every line, not only at the
# start of the whole string.
documents = re.split(pattern, text_data, flags=re.MULTILINE)

# Trim each chunk once and drop the chunks that are empty after trimming.
stripped_documents = (doc.strip() for doc in documents)
documents = [doc for doc in stripped_documents if doc]

# Keep at most this many documents.
max_documents = 1000
documents = documents[:max_documents]

for i, doc in enumerate(documents):
    # Create the file path for each document (file names are 1-based)
    file_path = os.path.join(output_directory, f'document_{i+1}.txt')
    with open(file_path, 'w', encoding='utf-8') as doc_file:
        doc_file.write(doc)
    print(f"Saved: {file_path}")


Saved: /content/documents/document_1.txt
Saved: /content/documents/document_2.txt
Saved: /content/documents/document_3.txt
Saved: /content/documents/document_4.txt
Saved: /content/documents/document_5.txt
Saved: /content/documents/document_6.txt
Saved: /content/documents/document_7.txt
Saved: /content/documents/document_8.txt
Saved: /content/documents/document_9.txt
Saved: /content/documents/document_10.txt
Saved: /content/documents/document_11.txt
Saved: /content/documents/document_12.txt
Saved: /content/documents/document_13.txt
Saved: /content/documents/document_14.txt
Saved: /content/documents/document_15.txt
Saved: /content/documents/document_16.txt
Saved: /content/documents/document_17.txt
Saved: /content/documents/document_18.txt
Saved: /content/documents/document_19.txt
Saved: /content/documents/document_20.txt
Saved: /content/documents/document_21.txt
Saved: /content/documents/document_22.txt
Saved: /content/documents/document_23.txt
Saved: /content/documents/document_24.txt
S

After that, we moved on to preparing the text documents for phase one. This involved a preprocessing step in which we performed the following tasks:
- Converting words to lowercase
- Removing HTML tags
- Removing URLs
- Removing special characters
- Removing extra whitespace
We combined these tasks into a function called "preprocess_text".

# Data Preprocessing

In [None]:
import re
import os
from bs4 import BeautifulSoup

# Directory that receives the cleaned copy of every split document.
preprocessed_directory = r'/content/preprocessed_documents'


# exist_ok=True keeps re-running this cell from failing when the directory already exists.
os.makedirs(preprocessed_directory, exist_ok=True)


def preprocess_text(text):
    """Return a cleaned, lowercase, single-spaced version of *text*.

    The steps run in this order: lowercase, strip HTML markup, delete URLs,
    replace every run of non-word characters with one space, then collapse
    whitespace and trim both ends.
    """
    lowered = text.lower()

    # Remove HTML tags, keeping only the text content
    without_tags = BeautifulSoup(lowered, "html.parser").get_text()

    # Remove URLs
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', without_tags, flags=re.MULTILINE)

    # Remove special characters (underscores count as word characters and are kept)
    words_only = re.sub(r'\W+', ' ', without_urls)

    # Remove extra whitespaces
    return re.sub(r'\s+', ' ', words_only).strip()


In order to process each document in the corpus, we will iterate through the documents using a loop, and then save the preprocessed document.

In [None]:
# Clean every split document with preprocess_text and save the result under
# the same base name with a "_preprocessed" suffix.
for filename in os.listdir(output_directory):
    if not filename.endswith('.txt'):
        continue

    with open(os.path.join(output_directory, filename), 'r', encoding='utf-8') as source:
        preprocessed_doc = preprocess_text(source.read())

    preprocessed_file_path = os.path.join(
        preprocessed_directory,
        filename.replace('.txt', '_preprocessed.txt'),
    )
    with open(preprocessed_file_path, 'w', encoding='utf-8') as target:
        target.write(preprocessed_doc)
        print(f"Preprocessed document saved to {preprocessed_file_path}")


Preprocessed document saved to /content/preprocessed_documents/document_969_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_765_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_35_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_767_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_686_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_116_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_888_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_537_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_499_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_103_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document

  text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags


Preprocessed document saved to /content/preprocessed_documents/document_376_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_525_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_181_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_246_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_821_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_313_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_266_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_590_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_14_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_13_preprocessed.txt
Preprocessed document saved to /content/preprocessed_documents/document_

# Tokenization

In the tokenization step, we utilized two primary libraries: os and nltk. The os library provides functions for interacting with the operating system to manage directories and file paths. The second library, nltk (Natural Language Toolkit), was used to split the text into individual words or tokens.

In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize


# Fetch the 'punkt' tokenizer data before word_tokenize is used.
nltk.download('punkt')


# Source of the cleaned documents and destination for the token files.
input_directory = r'/content/preprocessed_documents'
tokenized_directory = r'/content/tokenized_documents'

# exist_ok=True keeps re-running this cell from failing when the directory already exists.
os.makedirs(tokenized_directory, exist_ok=True)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import os
import nltk

The implementation started by specifying two directories: the input_directory for preprocessed documents and the tokenized_directory where the tokenized documents would be saved.

In [None]:
# Tokenize every preprocessed document and store its tokens one per line.
for filename in os.listdir(input_directory):
    if not filename.endswith('_preprocessed.txt'):
        continue

    with open(os.path.join(input_directory, filename), 'r', encoding='utf-8') as source:
        tokens = word_tokenize(source.read())

    tokenized_file_path = os.path.join(
        tokenized_directory,
        filename.replace('_preprocessed.txt', '_tokenized.txt'),
    )
    with open(tokenized_file_path, 'w', encoding='utf-8') as target:
        target.writelines(token + '\n' for token in tokens)
    print(f"Tokenized document saved to {tokenized_file_path}")


Tokenized document saved to /content/tokenized_documents/document_380_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_702_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_948_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_988_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_874_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_235_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_911_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_613_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_16_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_398_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_441_tokenized.txt
Tokenized document saved to /content/tokenized_documents/document_877_tokeniz

Next, the script read each preprocessed document from the input directory by iterating over the files listed in os.listdir(input_directory). It filtered out files that did not end with '_preprocessed.txt', ensuring that only preprocessed documents were processed.
For each valid file, the script opened it in read mode, read its contents into a variable named text_data, and then used word_tokenize(text_data) from the nltk library to split the text into tokens. Finally, the tokenized data was saved to a new file.

# Stemming

In the stemming step, we use the os and nltk libraries, specifically utilizing the PorterStemmer from the nltk.stem module. As we have studied, stemming is the process of reducing words to their root forms. This helps standardize words for analysis by removing affixes, thereby reducing the variation of word forms within the data corpus.

The stemming process begins by defining the directories for input and output. The tokenized_directory contains the previously tokenized documents, while the stemmed_directory is designated to store the stemmed versions of these documents.

In [None]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Source of the one-token-per-line files and destination for their stemmed versions.
tokenized_directory = r'/content/tokenized_documents'
stemmed_directory = r'/content/stemmed_documents'

# exist_ok=True keeps re-running this cell from failing when the directory already exists.
os.makedirs(stemmed_directory, exist_ok=True)


We start by initializing a PorterStemmer object using stemmer = PorterStemmer(). The script then iterates over each file in the tokenized_directory using os.listdir(tokenized_directory). Similar to the tokenization step, it filters the files to include only those that end with '_tokenized.txt', ensuring that only tokenized documents are processed.

For each valid file, the script opens it in read mode, reads each line (representing a token), and stores these tokens in a list. This step converts the text data into a structured list format suitable for stemming. The script then applies the stemming process to each token using a list comprehension: stemmed_tokens = [stemmer.stem(token) for token in tokens].

Finally, the stemmed tokens are saved into a new file. Each file is named with the suffix '_stemmed.txt' to indicate that the tokens have been stemmed.

In [None]:
# One Porter stemmer instance is shared by all documents.
stemmer = PorterStemmer()

# Stem every token file line by line and write the stems one per line.
for filename in os.listdir(tokenized_directory):
    if not filename.endswith('_tokenized.txt'):
        continue

    with open(os.path.join(tokenized_directory, filename), 'r', encoding='utf-8') as source:
        # Each line holds exactly one token; strip() removes the trailing newline.
        stemmed_tokens = [stemmer.stem(line.strip()) for line in source]

    stemmed_file_path = os.path.join(
        stemmed_directory,
        filename.replace('_tokenized.txt', '_stemmed.txt'),
    )
    with open(stemmed_file_path, 'w', encoding='utf-8') as target:
        target.writelines(stem + '\n' for stem in stemmed_tokens)
    print(f"Stemmed document saved to {stemmed_file_path}")


Stemmed document saved to /content/stemmed_documents/document_456_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_238_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_594_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_212_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_430_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_788_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_488_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_927_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_953_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_768_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_889_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_684_stemmed.txt
Stemmed document saved to /content/stemmed_documents/document_97

# constructing the inverted index

In constructing the inverted index, we use the os library and the defaultdict class from Python’s collections module. The defaultdict class allows us to create dictionaries that automatically initialize default values, making it ideal for counting term frequencies and storing document identifiers associated with each term.

In [None]:
import os
from collections import defaultdict


# Directory holding the stemmed, one-token-per-line documents to index.
stemmed_directory = r'/content/stemmed_documents'
# doc_id -> number of tokens in that document
document_lengths = defaultdict(int)
# term -> list of (doc_id, position) postings
inverted_index = defaultdict(list)
# doc_id -> maximum term frequency (per the name; intended use — confirm against the indexing cell)
doc_max_term_freq = defaultdict(int)
# doc_id -> (term, frequency) for the most / least frequent term of the document
most_frequent_term_per_doc = {}
least_frequent_term_per_doc = {}

The process begins by defining the stemmed_directory, which contains the stemmed documents to be processed. Several containers (defaultdict objects and plain dictionaries) are initialized to track various aspects of the documents:

document_lengths: Tracks the number of tokens in each document.
inverted_index: Maps each term to a list of tuples, with each tuple containing a document ID and the term's position in the document.
doc_max_term_freq: Tracks the maximum term frequency for each document.
most_frequent_term_per_doc and least_frequent_term_per_doc: Track the most and least frequent terms within each document, respectively.

In [None]:
# Process each stemmed document
# Filter first and sort, so that doc_ids are contiguous and reproducible:
# os.listdir() returns entries in arbitrary order, and numbering before the
# suffix check would leave gaps for unrelated files.
stemmed_filenames = sorted(
    name for name in os.listdir(stemmed_directory) if name.endswith('_stemmed.txt')
)

for doc_id, filename in enumerate(stemmed_filenames):
    file_path = os.path.join(stemmed_directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        # One token per line
        tokens = [line.strip() for line in file]

    document_lengths[doc_id] = len(tokens)
    token_freq = defaultdict(int)

    # Count term frequencies and record each occurrence's own position.
    # enumerate() yields the real position of every occurrence; list.index()
    # would always return the first occurrence and rescan the list each time.
    for position, token in enumerate(tokens):
        token_freq[token] += 1
        inverted_index[token].append((doc_id, position))

    # Find most and least frequent terms for the document (skipped for empty documents)
    if token_freq:
        most_frequent_term = max(token_freq, key=token_freq.get)
        least_frequent_term = min(token_freq, key=token_freq.get)
        most_frequent_term_per_doc[doc_id] = (most_frequent_term, token_freq[most_frequent_term])
        least_frequent_term_per_doc[doc_id] = (least_frequent_term, token_freq[least_frequent_term])
        # Maximum term frequency of this document
        doc_max_term_freq[doc_id] = token_freq[most_frequent_term]



To build the inverted_index, the script iterates over each token, updating token_freq and adding the token's occurrence to the inverted_index. Each token is mapped to a list of tuples containing the document ID and the token's position.

After processing all tokens, the most and least frequent terms are identified using the max() and min() functions on the token_freq dictionary. These terms, along with their frequencies, are stored in most_frequent_term_per_doc and least_frequent_term_per_doc.  


In [None]:
import json
# Report the corpus size and the per-document extreme terms.
print(f"Total number of documents: {len(document_lengths)}")

for doc_id, most_common in most_frequent_term_per_doc.items():
    term, freq = most_common
    print(f"Document ID {doc_id}: Most frequent term is '{term}' with frequency {freq}")

for doc_id, least_common in least_frequent_term_per_doc.items():
    term, freq = least_common
    print(f"Document ID {doc_id}: Least frequent term is '{term}' with frequency {freq}")

# Persist the inverted index as JSON; the (doc_id, position) tuples are written as JSON arrays.
with open('inverted_index.json', mode='w', encoding='utf-8') as outfile:
    json.dump(inverted_index, outfile, indent=4)


Total number of documents: 1000
Document ID 0: Most frequent term is 'element_cel' with frequency 134
Document ID 1: Most frequent term is 'element_cel' with frequency 134
Document ID 2: Most frequent term is 'brion_vibb' with frequency 8
Document ID 3: Most frequent term is 'maveric149' with frequency 11
Document ID 4: Most frequent term is 'steffenb' with frequency 12
Document ID 5: Most frequent term is 'canadian_c' with frequency 3
Document ID 6: Most frequent term is 'femto' with frequency 27
Document ID 7: Most frequent term is 'maveric149' with frequency 37
Document ID 8: Most frequent term is 'maveric149' with frequency 5
Document ID 9: Most frequent term is 'maveric149' with frequency 38
Document ID 10: Most frequent term is 'femto' with frequency 50
Document ID 11: Most frequent term is 'femto' with frequency 28
Document ID 12: Most frequent term is 'free_link' with frequency 2
Document ID 13: Most frequent term is 'templat' with frequency 121
Document ID 14: Most frequent te


The script prints the total number of processed documents and displays the most and least frequent terms for each document. Finally, the inverted_index is saved to a JSON file named inverted_index.json using Python's json module.


In [None]:
# Remaining steps:
#   - Save the metadata in a JSON file to use it in the next step (still not sure of the most suitable way to store it).
#   - The last code cell also needs to be adjusted to calculate the maximum term frequency for <<each>> document. >> DONE
#   - Compute the length of the weight vector for each document; in order to finish this, we first need to construct the VSM.

# Phase II

   # Building VSM

The main idea behind the Vector Space Model (VSM) is to quantify the similarity between documents and a search query by representing them as term-weighted vectors in a high-dimensional space and then measuring their similarity with a metric such as cosine similarity.

In [None]:
import os
import math
from sklearn.feature_extraction.text import TfidfVectorizer


The script initializes two lists: doc_nam to store the names of the text files, and docs to store their content. It then iterates through all files in the specified directory; for each text file, it reads the content and appends it to the docs list while adding the filename to the doc_nam list. This approach systematically collects and organizes the names and contents of the text files from the given directory.

In [None]:
# Despite its name, output_file is the path of the directory holding the stemmed documents.
output_file='/content/stemmed_documents'
doc_nam = []  # file names, index-aligned with docs
docs = []     # file contents

for filename in os.listdir(output_file):
    print(filename)
    if filename.endswith('.txt'):
        # The stemmed files were written as UTF-8, so read them back as UTF-8
        # rather than with the platform's default encoding.
        with open(os.path.join(output_file, filename), encoding='utf-8') as f:
            docs.append(f.read())
        doc_nam.append(filename)


document_669_stemmed.txt
document_686_stemmed.txt
document_882_stemmed.txt
document_119_stemmed.txt
document_349_stemmed.txt
document_955_stemmed.txt
document_528_stemmed.txt
document_270_stemmed.txt
document_107_stemmed.txt
document_207_stemmed.txt
document_462_stemmed.txt
document_428_stemmed.txt
document_908_stemmed.txt
document_784_stemmed.txt
document_777_stemmed.txt
document_439_stemmed.txt
document_756_stemmed.txt
document_560_stemmed.txt
document_5_stemmed.txt
document_466_stemmed.txt
document_344_stemmed.txt
document_938_stemmed.txt
document_886_stemmed.txt
document_993_stemmed.txt
document_599_stemmed.txt
document_969_stemmed.txt
document_668_stemmed.txt
document_627_stemmed.txt
document_84_stemmed.txt
document_799_stemmed.txt
document_873_stemmed.txt
document_971_stemmed.txt
document_519_stemmed.txt
document_988_stemmed.txt
document_697_stemmed.txt
document_877_stemmed.txt
document_812_stemmed.txt
document_186_stemmed.txt
document_476_stemmed.txt
document_716_stemmed.txt
doc

In natural language processing, TF-IDF (Term Frequency-Inverse Document Frequency) is a popular weighting scheme used to evaluate the importance of words in documents.

We employed TfidfVectorizer() from sklearn.feature_extraction.text to convert the documents into TF-IDF vectors. Each row in the resulting matrix corresponds to a document, and each column represents a word.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer (default settings) on the loaded documents.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)
# tfidf_matrix represents the VSM for the documents: row i belongs to docs[i].

In [None]:
# Euclidean length of every document's TF-IDF row, keyed by file name.
# Each row is converted to a dense array before its squared weights are summed.
# NOTE(review): the recorded outputs are all ~1.0, which is what TfidfVectorizer's
# default L2 row normalisation would produce - confirm whether the length of the
# unnormalised weight vector was intended instead.
vector_len = {
    name: math.sqrt(sum(weight ** 2 for weight in tfidf_matrix[row].toarray()[0]))
    for row, name in enumerate(doc_nam)
}


In [None]:
# Bare expression: shows the file-name -> vector-length dictionary as the cell output.
vector_len

{'document_669_stemmed.txt': 0.9999999999999996,
 'document_686_stemmed.txt': 0.9999999999999996,
 'document_882_stemmed.txt': 1.0000000000000002,
 'document_119_stemmed.txt': 0.9999999999999999,
 'document_349_stemmed.txt': 1.0,
 'document_955_stemmed.txt': 1.0000000000000002,
 'document_528_stemmed.txt': 1.0,
 'document_270_stemmed.txt': 1.0000000000000002,
 'document_107_stemmed.txt': 0.9999999999999999,
 'document_207_stemmed.txt': 1.0,
 'document_462_stemmed.txt': 0.9999999999999998,
 'document_428_stemmed.txt': 0.9999999999999997,
 'document_908_stemmed.txt': 1.0000000000000002,
 'document_784_stemmed.txt': 0.9999999999999992,
 'document_777_stemmed.txt': 1.0000000000000004,
 'document_439_stemmed.txt': 0.9999999999999998,
 'document_756_stemmed.txt': 1.0000000000000004,
 'document_560_stemmed.txt': 1.0,
 'document_5_stemmed.txt': 0.9999999999999998,
 'document_466_stemmed.txt': 0.9999999999999993,
 'document_344_stemmed.txt': 1.0,
 'document_938_stemmed.txt': 1.0000000000000002,

# Query processing

In [None]:
import sys
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

We have developed a search engine interface using the command line, which allows users to enter a query and receive relevant document results. The system begins by loading documents from a specified directory using the load_documents_from_directory(directory_path) function. This function reads each text file in the directory, stores its content, and collects metadata such as the document ID, title (based on the filename), and file URL.

For query preprocessing, we implemented a tokenize_and_normalize(text) function. This function handles tokenization, normalization, and stemming.

These steps ensure that both the documents and user queries are processed consistently, facilitating accurate comparison.

To represent the documents in numerical form, we use the initialize_vectorizer(documents) function. This function initializes a TF-IDF  vectorizer and fits it on the preprocessed documents, creating a sparse matrix document_vectors where each row corresponds to a document and each column corresponds to a term. The matrix's values represent the TF-IDF scores, which indicate the importance of terms in the documents relative to the entire corpus.

In [None]:
import sys
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


# Ensure NLTK resources are downloaded
import nltk
nltk.download('punkt')

# 1. Load documents from the directory
def load_documents_from_directory(directory_path):
    """Load every ``.txt`` file in *directory_path*.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A ``(documents, metadata)`` tuple of two index-aligned lists:
        ``documents`` holds the file contents and ``metadata`` holds one dict
        per file with the keys ``'id'`` (1-based), ``'title'`` (the file name)
        and ``'url'`` (``file://`` followed by the file path).

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    documents = []
    metadata = []

    # os.listdir() returns entries in arbitrary order. Sorting makes the
    # document IDs reproducible between runs, and filtering before numbering
    # keeps the IDs contiguous when the directory holds other files.
    text_filenames = sorted(
        name for name in os.listdir(directory_path) if name.endswith(".txt")
    )

    for doc_id, filename in enumerate(text_filenames, start=1):
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        documents.append(content)
        metadata.append({
            'id': doc_id,
            'title': filename,  # Using the filename as the title for now
            'url': f'file://{file_path}'  # Assuming a file URL, adjust as needed
        })

    return documents, metadata

# Preprocessing function
def tokenize_and_normalize(text):
    """Normalise a query string the same way the documents were normalised.

    Args:
        text: Raw query text.

    Returns:
        The lowercased, tokenized and Porter-stemmed terms joined by single
        spaces (an empty string when no term is left).
    """
    # Mirror the document pipeline (preprocess_text): lowercase, then replace
    # every run of non-word characters with one space. The earlier
    # str.isalnum() filter discarded any token containing an underscore, yet
    # underscore-joined terms such as 'element_cel' are present in the stemmed
    # documents, so those queries could never match.
    normalized = re.sub(r'\W+', ' ', text.lower()).strip()

    # Tokenization
    tokens = word_tokenize(normalized)

    # Stemming
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in tokens)

# Initialize the TF-IDF vectorizer and fit it on the documents
def initialize_vectorizer(documents):
    """Fit a default TfidfVectorizer on *documents*.

    Returns:
        A ``(vectorizer, document_vectors)`` tuple: the fitted vectorizer and
        the matrix produced by ``fit_transform`` for the given documents.
    """
    tfidf = TfidfVectorizer()
    return tfidf, tfidf.fit_transform(documents)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


When a user submits a search query via the command line, the on_submit() function is called. This function prompts the user to input a search query and passes it to the search_query function. There, the query is preprocessed and transformed into a TF-IDF vector using the previously fitted vectorizer.

To find relevant documents, the system calculates the cosine similarity between the query vector and each document vector. This similarity score is a measure of how closely the query matches each document.


In [None]:
def search_query(query, vectorizer, document_vectors, metadata, top_n=3):
    """Rank the documents against *query* by cosine similarity.

    Args:
        query: Raw query text; it is passed through tokenize_and_normalize.
        vectorizer: Fitted vectorizer used to transform the processed query.
        document_vectors: Document matrix, row-aligned with *metadata*.
        metadata: One dict per document with the keys 'id', 'title' and 'url'.
        top_n: Maximum number of results to return (expected to be
            non-negative). Pass ``None`` to return every matching document.

    Returns:
        A list of dicts with the keys 'id', 'title', 'url' and 'score', sorted
        by descending score. Documents with a score of 0 or less are omitted.
    """
    processed_query = tokenize_and_normalize(query)
    query_vector = vectorizer.transform([processed_query])
    similarity_scores = cosine_similarity(query_vector, document_vectors).flatten()

    # Combine metadata with similarity scores, keeping only documents that match at all
    results = [
        {
            'id': metadata[i]['id'],
            'title': metadata[i]['title'],
            'url': metadata[i]['url'],
            'score': score,
        }
        for i, score in enumerate(similarity_scores)
        if score > 0
    ]

    results.sort(key=lambda x: x['score'], reverse=True)

    # Honour top_n: the previous `results[:]` returned every match and ignored it.
    if top_n is None:
        return results
    return results[:top_n]

def calculate_precision_recall(ground_truth, retrieved_docs):
    """Compute set-based precision and recall.

    Args:
        ground_truth: Iterable of relevant document IDs.
        retrieved_docs: Iterable of retrieved document IDs.

    Returns:
        A ``(precision, recall)`` tuple. Duplicates are ignored because both
        inputs are converted to sets. A measure whose denominator would be
        zero is returned as 0.
    """
    relevant = set(ground_truth)
    retrieved = set(retrieved_docs)

    hits = len(relevant.intersection(retrieved))

    # true positives + false positives is the size of the retrieved set;
    # true positives + false negatives is the size of the relevant set.
    if retrieved:
        precision = hits / len(retrieved)
    else:
        precision = 0

    if relevant:
        recall = hits / len(relevant)
    else:
        recall = 0

    return precision, recall

# Handle user input and search execution
def on_submit():
    """Prompt for one query, print the ranked results, then precision and recall.

    Reads the module-level ``vectorizer``, ``document_vectors`` and
    ``metadata``. Precision and recall are measured against a hard-coded
    placeholder set of relevant document IDs.
    """
    ground_truth_ids = {1, 3, 5}  # Replace with actual relevant document IDs

    # Ask for the query and run the search
    results = search_query(
        input("Enter your search query: "),
        vectorizer,
        document_vectors,
        metadata,
    )

    # Score the retrieved document IDs against the ground truth
    precision, recall = calculate_precision_recall(
        ground_truth_ids,
        {result['id'] for result in results},
    )

    print("\nSearch Results:")
    for result in results:
        print(f"Document ID: {result['id']}, Title: {result['title']}, URL: {result['url']}, Score: {result['score']:.4f}")

    print(f"\nPrecision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

# Main function
# Script entry point. The names bound here (metadata, vectorizer,
# document_vectors) are module-level globals that on_submit() reads.
if __name__ == "__main__":
    # Specify the directory containing your text files
    directory_path = "/content/stemmed_documents"

    # Load the documents from the specified directory
    documents, metadata = load_documents_from_directory(directory_path)

    # Initialize the TF-IDF vectorizer and fit it on the documents
    vectorizer, document_vectors = initialize_vectorizer(documents)

    # Run the search application (prompts once for a query)
    on_submit()



Enter your search query: wikipedia

Search Results:
Document ID: 192, Title: document_726_stemmed.txt, URL: file:///content/stemmed_documents/document_726_stemmed.txt, Score: 0.2560
Document ID: 84, Title: document_720_stemmed.txt, URL: file:///content/stemmed_documents/document_720_stemmed.txt, Score: 0.2516
Document ID: 598, Title: document_719_stemmed.txt, URL: file:///content/stemmed_documents/document_719_stemmed.txt, Score: 0.2477
Document ID: 443, Title: document_721_stemmed.txt, URL: file:///content/stemmed_documents/document_721_stemmed.txt, Score: 0.2453
Document ID: 533, Title: document_723_stemmed.txt, URL: file:///content/stemmed_documents/document_723_stemmed.txt, Score: 0.2422
Document ID: 760, Title: document_722_stemmed.txt, URL: file:///content/stemmed_documents/document_722_stemmed.txt, Score: 0.2405
Document ID: 627, Title: document_725_stemmed.txt, URL: file:///content/stemmed_documents/document_725_stemmed.txt, Score: 0.2380
Document ID: 239, Title: document_724_s