In [1]:
import os
import math
from collections import defaultdict
from nltk.stem import PorterStemmer
import re
import numpy as np

# Directory holding the raw .txt documents to index.
folder_path = "./data"

# Load the stop-word list, one word per line. The encoding is pinned to UTF-8
# (matching every other open() in this notebook) so the result does not depend
# on the platform's default locale encoding.
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    # Strip stray whitespace and skip blank lines so padded entries still match
    # the whitespace-free tokens produced by preprocess().
    stop_words = {line.strip() for line in file if line.strip()}

def preprocess(text):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Steps, in order: lowercase; delete underscores; replace every character
    that is neither a word character nor whitespace with a space; split on
    whitespace; drop tokens found in the module-level ``stop_words`` set;
    Porter-stem what is left.
    """
    stemmer = PorterStemmer()

    # Underscores count as word characters for the regex below, so they are
    # removed explicitly first (deleted, not replaced by a space).
    normalized = text.lower().replace('_', '')
    normalized = re.sub(r'[^\w\s]', ' ', normalized)

    stemmed = []
    for word in normalized.split():
        if word in stop_words:
            continue
        stemmed.append(stemmer.stem(word))
    return stemmed

# sort files by number
def natural_key(filename):
    """Build a sort key that orders embedded numbers numerically.

    The name is split into alternating non-digit and digit runs; digit runs
    become ``int`` so that e.g. ``"2.txt"`` sorts before ``"10.txt"``.

    Args:
        filename: File name (or any string) to derive a key from.

    Returns:
        list: Pieces of ``filename`` with every digit run converted to int.
    """
    # str.isdecimal() accepts the same characters the regex digit class
    # matches, so only the runs captured by the split are converted.
    # str.isdigit() is also true for characters such as a superscript two,
    # which the regex does not match and int() rejects, so a name starting
    # with one used to raise ValueError.
    return [
        int(piece) if piece.isdecimal() else piece
        for piece in re.split(r'(\d+)', filename)
    ]

def read_documents(folder_path):
    """Load and preprocess every ``.txt`` file in ``folder_path``.

    Files are visited in the order given by ``natural_key``. Returns a dict
    mapping each file name without its ``.txt`` suffix to the token list
    produced by ``preprocess``.
    """
    txt_names = [
        name
        for name in sorted(os.listdir(folder_path), key=natural_key)
        if name.endswith(".txt")
    ]

    documents = {}
    for name in txt_names:
        with open(os.path.join(folder_path, name), "r", encoding="utf-8") as handle:
            raw_text = handle.read()
        # Drop the 4-character ".txt" suffix to get the document id.
        documents[name[:-4]] = preprocess(raw_text)
    return documents

In [2]:
# Calculate df
def calculate_df(documents):
    """Compute document frequency: for each term, how many documents contain it.

    ``documents`` maps a document id to its token list. The result is a
    ``defaultdict(int)`` keyed by term.
    """
    df_by_term = defaultdict(int)
    # set() collapses repeats so a term counts once per document.
    for term in (t for doc_tokens in documents.values() for t in set(doc_tokens)):
        df_by_term[term] += 1
    return df_by_term

In [3]:
# Save dictionary to dictionary.txt
def save_dictionary_to_file(doc_count, output_filename="dictionary.txt"):
    """Write the term dictionary to disk and return the term -> index map.

    Terms are sorted by plain string order and numbered from 1. The file is
    tab-separated: a header row, then one ``t_index<TAB>term<TAB>df`` line per
    term.
    """
    ordered_terms = sorted(doc_count.items(), key=lambda pair: pair[0])

    term_to_index = {}
    rows = ["t_index\tterm\tdf\n"]
    for position, (term, df) in enumerate(ordered_terms, start=1):
        term_to_index[term] = position
        rows.append(f"{position}\t{term}\t{df}\n")

    with open(output_filename, "w", encoding="utf-8") as out:
        out.writelines(rows)

    print(f"Dictionary saved to {output_filename}")

    return term_to_index

In [4]:
# Calculate tf-idf
def calculate_tfidf(documents):
    """Build a unit-length tf-idf vector for every document.

    For each document: tf is the term count divided by the document's token
    count, idf is ``log10(N / (1 + df))`` with ``N`` the number of documents,
    and the resulting weights are scaled to Euclidean length 1 (left unscaled
    if that length is 0).

    Returns a dict mapping doc id -> {term: weight}.
    """
    df_by_term = calculate_df(documents)
    n_docs = len(documents)

    def idf(term):
        # The +1 in the denominator makes idf zero for a term found in all but
        # one document and negative for a term found in every document.
        return math.log10(n_docs / (1 + df_by_term[term]))

    vectors = {}
    for doc_id, tokens in documents.items():
        # Raw term counts, in order of first occurrence.
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

        n_tokens = len(tokens)
        weights = {
            term: (count / n_tokens) * idf(term)
            for term, count in counts.items()
        }

        # Scale to unit length; an all-zero (or empty) vector is kept as is.
        length = math.sqrt(sum(w ** 2 for w in weights.values()))
        if length > 0:
            weights = {term: w / length for term, w in weights.items()}
        vectors[doc_id] = weights

    return vectors

In [5]:
# Save tf-idf unit vector to output/
def save_tfidf_to_files(tfidf_vectors, term_to_index, output_folder="./output"):
    """Write each tf-idf vector to ``<output_folder>/<n>.txt``.

    Files are numbered 1, 2, ... in the iteration order of ``tfidf_vectors``;
    the document id itself is not part of the file name.

    File layout: line 1 is the number of entries that follow, line 2 is the
    header ``t_index<TAB>tf-idf``, then one ``t_index<TAB>weight`` line per
    term, sorted by ascending term index.

    Args:
        tfidf_vectors: Mapping of doc id -> {term: weight}.
        term_to_index: Mapping of term -> integer index; terms missing from
            it are left out of the file.
        output_folder: Destination directory, created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for file_number, vector in enumerate(tfidf_vectors.values(), start=1):
        entries = [
            (term_to_index[term], weight)
            for term, weight in vector.items()
            if term in term_to_index
        ]
        entries.sort(key=lambda entry: entry[0])

        output_filename = os.path.join(output_folder, f"{file_number}.txt")
        with open(output_filename, "w", encoding="utf-8") as file:
            # Report the number of rows actually written, so the count stays
            # correct even when some terms were filtered out above.
            file.write(f"{len(entries)}\n")
            file.write("t_index\ttf-idf\n")
            for t_index, tfidf_value in entries:
                file.write(f"{t_index}\t{tfidf_value}\n")


In [6]:
# Cosine similarity

def read_tfidf_file(filename):
    """Load a tf-idf vector file into a ``{t_index: weight}`` dict.

    The first two lines (entry count and column header) are skipped; each
    remaining non-blank line must be ``t_index<TAB>weight``.

    Args:
        filename: Path of the vector file to read.

    Returns:
        dict: Integer term index -> float tf-idf weight.

    Raises:
        ValueError: If a non-blank data line does not have exactly two
            tab-separated fields, or a field is not numeric.
    """
    tfidf_vector = {}

    with open(filename, "r", encoding="utf-8") as file:
        # Stream the file rather than loading every line into memory first.
        for line_number, line in enumerate(file):
            # Lines 0 and 1 are the entry count and the column header.
            if line_number < 2:
                continue
            row = line.strip()
            # Tolerate blank lines (e.g. extra trailing newlines) instead of
            # failing on the tuple unpack below.
            if not row:
                continue
            t_index, tfidf_value = row.split('\t')
            tfidf_vector[int(t_index)] = float(tfidf_value)

    return tfidf_vector

def cosine(Docx, Docy, output_folder="./output"):
    """Cosine similarity between two saved tf-idf vectors.

    Args:
        Docx: File stem of the first vector file, i.e. the vector is read
            from ``<output_folder>/<Docx>.txt``.
        Docy: File stem of the second vector file.
        output_folder: Directory holding the vector files. Defaults to
            ``"./output"``, the default destination of ``save_tfidf_to_files``.

    Returns:
        float: dot(x, y) / (|x| * |y|), or 0.0 if either vector has zero norm.
    """
    # Read TF-IDF vectors for the two documents
    tfidf_vector_x = read_tfidf_file(os.path.join(output_folder, f"{Docx}.txt"))
    tfidf_vector_y = read_tfidf_file(os.path.join(output_folder, f"{Docy}.txt"))

    # Only indices present in both vectors contribute to the dot product.
    common_indices = set(tfidf_vector_x.keys()) & set(tfidf_vector_y.keys())
    dot_product = sum(tfidf_vector_x[i] * tfidf_vector_y[i] for i in common_indices)

    # Norms are recomputed from the file contents rather than assumed to be 1.
    norm_x = math.sqrt(sum(value ** 2 for value in tfidf_vector_x.values()))
    norm_y = math.sqrt(sum(value ** 2 for value in tfidf_vector_y.values()))

    # Guard against division by zero for empty or all-zero vectors.
    if norm_x == 0 or norm_y == 0:
        return 0.0

    return dot_product / (norm_x * norm_y)

In [7]:
# Main execution: build the index end to end, then compare two documents.
# Tokenize every .txt file under folder_path and count document frequencies.
documents = read_documents(folder_path)
doc_count = calculate_df(documents)

# Writes dictionary.txt (the default output name) and returns term -> t_index.
term_to_index = save_dictionary_to_file(doc_count)

# calculate_tfidf recomputes the document frequencies internally from `documents`.
tfidf_vectors = calculate_tfidf(documents)

# Writes one numbered vector file per document into ./output (the default folder).
save_tfidf_to_files(tfidf_vectors, term_to_index)

# Calculate cosine similarity
# cosine() reads ./output/1.txt and ./output/2.txt, i.e. the first two files written above.
similarity = cosine(1, 2)
print(f"Cosine Similarity between Document 1 and Document 2 is: {similarity}")

Dictionary saved to dictionary.txt
Cosine Similarity between Document 1 and Document 2 is: 0.1881508600743812
