In [1]:
from nltk.stem import PorterStemmer 
import os  
import numpy as np  
import math
from collections import defaultdict  

# Input locations, relative to the notebook's working directory
document_folder = "./data"  # Folder holding the numbered document files (1.txt, 2.txt, ...)
stopword_file = './stopwords.txt'  # Text file in which each non-blank line is one stopword

# Load the stopwords into a set for constant-time membership tests.
# The encoding is pinned to UTF-8, the same encoding used to read the documents,
# so the result does not depend on the platform's default locale encoding.
with open(stopword_file, 'r', encoding='utf-8') as f:
    stop_words = {line.strip().lower() for line in f if line.strip()}  # Lowercased, blank lines skipped

# Function for tokenization, lowercasing, and stemming
def tokenization(text):
    """Turn raw text into a list of Porter-stemmed tokens.

    The text is split on whitespace and every piece is lowercased. A token is
    kept only when it is purely alphanumeric (so a token with punctuation
    attached, such as "end.", is dropped entirely) and is not present in the
    module-level ``stop_words`` set. Each kept token is then stemmed.

    Returns the stems as a list, in the order the tokens appear in ``text``.
    """
    stemmer = PorterStemmer()
    stems = []
    for raw_token in text.split():
        candidate = raw_token.lower()
        # Skip tokens containing non-alphanumeric characters, and stopwords
        if not candidate.isalnum() or candidate in stop_words:
            continue
        stems.append(stemmer.stem(candidate))
    return stems

# Build the corpus: read 1.txt .. 1095.txt in numeric order and tokenize each one
documents = []
for doc_number in range(1, 1096):
    source_path = os.path.join(document_folder, f"{doc_number}.txt")
    with open(source_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    # documents[k] holds the token list of file (k + 1).txt
    documents.append(tokenization(raw_text))


In [2]:
# Document frequency: for every term, the number of documents it occurs in
term_df = defaultdict(int)
for tokens in documents:
    # set() collapses repeats, so one document adds at most 1 to any term's count
    for stem in set(tokens):
        term_df[stem] += 1

# (term, document frequency) pairs, ordered by term
sorted_terms = [(stem, term_df[stem]) for stem in sorted(term_df)]

# Dictionary rows: (1-based term index, term, document frequency)
dictionary_entries = [
    (position, stem, doc_freq)
    for position, (stem, doc_freq) in enumerate(sorted_terms, start=1)
]

# Where the dictionary is saved
dictionary_file = './dictionary.txt'

# Write a header row, then one tab-separated row per term: index, term, document frequency
with open(dictionary_file, 'w', encoding='utf-8') as out:
    out.write("t_index\tterm\tdf\n")
    out.writelines(
        f"{t_index}\t{term_text}\t{doc_freq}\n"
        for t_index, term_text, doc_freq in dictionary_entries
    )



In [3]:
 # Calculate IDF value
# Collection size, taken from the loaded corpus rather than a hard-coded 1095 so it
# cannot drift out of sync with the number of documents actually read.
N = len(documents)

# Map each term to (term index, idf), where idf = log10(N / df)
idf_dict = {}
for idx, term, df in dictionary_entries:
    # Every dictionary term was seen in at least one document, so df >= 1 here;
    # the df > 0 guard only protects against a malformed entry.
    idf_value = math.log10(N / df) if df > 0 else 0
    idf_dict[term] = (idx, idf_value)


# One unit-length TF-IDF vector per document, stored sparsely as {term index: weight}
tfidf_results = []

for tokens in documents:
    # Raw occurrence count of every term in this document
    occurrences = defaultdict(int)
    for stem in tokens:
        occurrences[stem] += 1

    doc_length = len(tokens)  # Total number of tokens in the document

    # Weight = (count / document length) * idf, keyed by the term's dictionary index
    weights = {}
    for stem, hits in occurrences.items():
        tf = hits / doc_length
        term_index, idf = idf_dict.get(stem, (0, 0))
        weights[term_index] = tf * idf

    # Scale to unit Euclidean length; an all-zero (or empty) vector is left as is
    norm = np.linalg.norm(list(weights.values()))
    if norm > 0:
        weights = {term_index: weight / norm for term_index, weight in weights.items()}

    # Store the entries ordered by ascending term index
    tfidf_results.append(dict(sorted(weights.items())))
# Save each document's vector to ./output/<document number>.txt
output_folder = "./output/"
os.makedirs(output_folder, exist_ok=True)
for doc_number, vector in enumerate(tfidf_results, start=1):
    target_path = os.path.join(output_folder, f"{doc_number}.txt")
    with open(target_path, 'w', encoding='utf-8') as out:
        out.write("t_index\ttf_idf\n")  # Header row
        # One tab-separated "term index, weight" row per non-zero entry
        out.writelines(f"{term_index}\t{weight}\n" for term_index, weight in vector.items())

In [4]:
def cosine(Docx, Docy, vectors=None):
    """Return the cosine similarity between two documents.

    Parameters
    ----------
    Docx, Docy : int
        1-based document numbers.
    vectors : sequence of dict, optional
        One sparse ``{term index: weight}`` mapping per document, where entry
        ``k`` belongs to document ``k + 1``. Defaults to the module-level
        ``tfidf_results``.

    Returns
    -------
    The dot product of the two vectors. This is the cosine similarity only if
    every vector has unit length; the vectors in ``tfidf_results`` are scaled
    to unit length when they are built. No normalization is done here.

    Raises
    ------
    IndexError
        If either document number is outside ``1..len(vectors)``. Without this
        check, 0 or a negative number would silently wrap around to a document
        at the end of the list through Python's negative indexing.
    """
    if vectors is None:
        vectors = tfidf_results

    # Reject out-of-range document numbers instead of letting them wrap around
    for doc_id in (Docx, Docy):
        if not 1 <= doc_id <= len(vectors):
            raise IndexError(
                f"document number {doc_id} is out of range 1..{len(vectors)}"
            )

    tfidf_x = vectors[Docx - 1]
    tfidf_y = vectors[Docy - 1]

    # Align both sparse vectors on the union of their term indices (missing -> 0)
    keys = set(tfidf_x.keys()).union(set(tfidf_y.keys()))
    vector_x = np.array([tfidf_x.get(key, 0) for key in keys])
    vector_y = np.array([tfidf_y.get(key, 0) for key in keys])

    return np.dot(vector_x, vector_y)

# Demo: compare the first two documents of the collection
similarity = cosine(Docx=1, Docy=2)
print(f"Cosine similarity between Document 1 and Document 2: {similarity}")


Cosine similarity between Document 1 and Document 2: 0.1672930057087566
