In [13]:
import os
import math
import re
from collections import defaultdict

# Preprocess the document (case folding, non-word character removal, tokenization; no stop word removal is performed)
def preprocess(text):
    """Turn raw text into a list of lower-cased word tokens.

    The text is lower-cased, every run of non-word characters (anything
    other than letters, digits and underscore) is collapsed into a single
    space, and the result is split on whitespace.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of token strings in order of appearance; empty when the text
        contains no word characters.
    """
    # Lower-case first, then replace separator runs, then split: one pipeline.
    return re.sub(r'\W+', ' ', text.lower()).split()

# Build the document collection
def build_document_collection(directory):
    """Load and tokenize every ``.txt`` file found directly in a directory.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A dict mapping each matching file name to the token list produced by
        ``preprocess`` for that file's UTF-8 decoded contents.
    """
    collection = {}
    for entry in os.listdir(directory):
        # Only entries whose name ends in .txt are part of the collection.
        if not entry.endswith('.txt'):
            continue
        path = os.path.join(directory, entry)
        with open(path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
        collection[entry] = preprocess(raw_text)
    return collection

# Build the inverted index with tf
def build_index(documents):
    """Build an inverted index and per-document vector lengths.

    Args:
        documents: Mapping of document id to its list of tokens.

    Returns:
        A tuple ``(index, doc_lengths, N)`` where

        * ``index`` is a ``defaultdict(list)`` mapping each term to a
          postings list of ``(doc_id, raw term frequency)`` pairs,
        * ``doc_lengths`` is a ``defaultdict(float)`` mapping each document
          that has at least one token to the Euclidean length of its
          log-weighted (``1 + log10(tf)``) term vector,
        * ``N`` is the number of documents in ``documents``.
    """
    postings = defaultdict(list)
    squared_norms = {}

    for doc_id, terms in documents.items():
        # Raw term frequencies for this document, in first-occurrence order.
        counts = {}
        for term in terms:
            counts[term] = counts.get(term, 0) + 1

        for term, freq in counts.items():
            postings[term].append((doc_id, freq))
            # Sum of squared log-tf weights; the square root is taken below.
            squared_norms[doc_id] = (
                squared_norms.get(doc_id, 0.0) + (1 + math.log10(freq)) ** 2
            )

    doc_lengths = defaultdict(float)
    for doc_id, total in squared_norms.items():
        doc_lengths[doc_id] = math.sqrt(total)

    return postings, doc_lengths, len(documents)

# Compute IDF for terms in the index
def compute_idf(index, N):
    """Compute the inverse document frequency of every indexed term.

    Args:
        index: Mapping of term to its postings list; the list length is used
            as the term's document frequency.
        N: Total number of documents in the collection.

    Returns:
        A dict mapping each term to ``log10(N / df)``.
    """
    return {
        term: math.log10(N / len(postings))
        for term, postings in index.items()
    }

# Compute tf-idf weights for the query
def compute_query_weights(query, index, idf):
    """Compute log-tf * idf weights for the terms of a query.

    The query is tokenized with ``preprocess``. Terms that do not occur in
    ``index`` are left out of the result.

    Args:
        query: Raw query string.
        index: Inverted index; only membership of terms is checked here.
        idf: Mapping of term to its inverse document frequency.

    Returns:
        A ``defaultdict(float)`` mapping each indexed query term to
        ``(1 + log10(tf_in_query)) * idf[term]``.
    """
    # Count how often each term appears in the query.
    counts = {}
    for term in preprocess(query):
        counts[term] = counts.get(term, 0) + 1

    weights = defaultdict(float)
    for term, freq in counts.items():
        if term not in index:
            continue
        weights[term] = (1 + math.log10(freq)) * idf[term]  # log tf * idf
    return weights

# Compute cosine similarity
def cosine_similarity(query_weights, index, doc_lengths, idf):
    """Score documents against a weighted query and rank them.

    Document term weights are ``1 + log10(tf)`` (no idf on the document
    side). Each document's accumulated dot product is divided by its
    entry in ``doc_lengths``; the query vector itself is not normalized.

    Args:
        query_weights: Mapping of query term to its weight.
        index: Mapping of term to a postings list of ``(doc_id, tf)`` pairs.
        doc_lengths: Mapping of document id to its vector length.
        idf: Not used by this function; kept in the signature for callers.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score.
        Documents with equal scores keep the order in which they were first
        reached through the postings lists.
    """
    dot_products = defaultdict(float)

    for term, q_weight in query_weights.items():
        if term not in index:
            continue
        for doc_id, tf in index[term]:
            # lnc document weighting: log tf, no idf.
            dot_products[doc_id] += q_weight * (1 + math.log10(tf))

    # Length-normalize each accumulated dot product.
    ranking = [
        (doc_id, dot / doc_lengths[doc_id])
        for doc_id, dot in dot_products.items()
    ]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

# Main function to execute VSM retrieval
def vsm_retrieval(query, directory):
    """Run the full vector-space retrieval pipeline for one query.

    The collection is loaded from ``directory`` and the index is rebuilt on
    every call before the query is weighted and the documents are scored.

    Args:
        query: Raw query string.
        directory: Directory holding the ``.txt`` documents to search.

    Returns:
        The ranked list of ``(doc_id, score)`` tuples produced by
        ``cosine_similarity``.
    """
    corpus = build_document_collection(directory)
    postings, lengths, doc_count = build_index(corpus)
    term_idf = compute_idf(postings, doc_count)
    weighted_query = compute_query_weights(query, postings, term_idf)
    return cosine_similarity(weighted_query, postings, lengths, term_idf)

# Example Usage
if __name__ == "__main__":
    # Documents are read from this directory, relative to the working dir.
    directory = "documents"

    # Prompt for the query and rank the collection against it.
    query = input("Enter your search query: ")
    results = vsm_retrieval(query, directory)

    # Report at most the five best-scoring documents.
    print("\nTop-ranked documents:")
    if not results:
        print("No relevant documents found.")
    else:
        for doc_id, score in results[:5]:
            print(f"{doc_id}: {score:.4f}")


Top-ranked documents:
Dell.txt: 0.5411
samsung.txt: 0.3267
apple.txt: 0.2149
HP.txt: 0.2052
steam.txt: 0.1748
