In [14]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
import numpy as np
nltk.download('wordnet')
# Load your subset dataframe
file_path = r"C:\Users\PC\Documents\Symmester 4\Big Data\Assignment 2/subset.csv"
#subset_df = pd.read_csv(file_path)
#to avoid dtype warning reads all as str
subset_df = pd.read_csv(file_path, dtype=str) 


# Initialize WordNet Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        # Tokenize the text
        words = word_tokenize(text)
        # Remove stop words and perform lemmatization
        filtered_words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalnum() and word.lower() not in stop_words]
        return filtered_words
    else:
        return []


# Create a corpus
corpus = OrderedDict()

# Iterate through the rows to build the corpus
for index, row in subset_df.iterrows():
    article_text = row['SECTION_TEXT']
    preprocessed_words = preprocess_text(article_text)
    for word in preprocessed_words:
        if word not in corpus:
            corpus[word] = len(corpus)

# Print all words in the vocabulary along with their index
# for index, word in corpus.items():
#     print(f"({index}, '{word}'), ", end="")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:

# Initialize a dictionary to store TF for each article
tf_dict = {}

# Iterate through the rows to calculate TF
for index, row in subset_df.iterrows():
    article_text = row['SECTION_TEXT']
    preprocessed_words = preprocess_text(article_text)
    article_tf = {idx: 0 for idx in corpus.values()}
    for word in preprocessed_words:
        if word in corpus:
            article_tf[corpus[word]] += 1
    article_tf = {idx: freq for idx, freq in article_tf.items() if freq > 0}
    tf_dict[index] = article_tf


# Print TF for each article
# for article_id, tf in tf_dict.items():
#     print(f"\n{article_id}")
#     for term_idx, tf_value in tf.items():
#         print(f"({term_idx}, {tf_value}), ", end="")


In [16]:

# Initialize an array to store the document frequency (DF) for each word
df_array = np.zeros(len(corpus))

# Iterate through the TF dictionary to calculate DF
for tf in tf_dict.values():
    for word_idx in tf.keys():
        df_array[word_idx] += 1

# Print index and DF for each word
# for idx, df_value in enumerate(df_array):
#     print(f"({idx}, {df_value}), ", end="")


In [17]:
# Compute TF/DF weights
tf_df_weights = {}
for article_id, article_tf in tf_dict.items():
    tf_df_weights[article_id] = {term_idx: tf_value * df_array[term_idx] for term_idx, tf_value in article_tf.items()}

# Print TF/DF for each document
# for article_id, tfdf in tf_df_weights.items():
#     print(f"{article_id}")
#     for term_idx, tfdf_value in tfdf.items():
#         print(f"({term_idx}, {tfdf_value}), ", end="")
#     print()

In [18]:
# Initialize a 2D array to store document vectors
document_vectors = []

# Iterate through the TF dictionary to create vectors for each document
for tfidf in tf_dict.values():
    document_vector = [0] * len(corpus)
    for word_idx, tfidf_value in tfidf.items():
        document_vector[word_idx] = tfidf_value
    document_vectors.append(document_vector)



# Print the first two document vectors to check
# for i in range(2):
#     print(f"Document Vector {i+1}: {document_vectors[i]}")


In [19]:
# Function to calculate relevance between query vector and document vector
def calculate_relevance_sparse(query_vector, document_vector):
    relevance = 0
    for term_idx in query_vector:
        if term_idx in document_vector:
            relevance += query_vector[term_idx] * document_vector[term_idx]
    return relevance

# Function to calculate relevance between query vector and document vector
def calculate_relevance(query_vector, document_vector):
    relevance = sum(qi * di for qi, di in zip(query_vector, document_vector))
    return relevance

# Get user input for the query text
query_text = input("Enter your query text: ")

# Preprocess the query text
preprocessed_query = preprocess_text(query_text)

# Initialize a dictionary to store TF for the query
query_tf = {idx: 0 for idx in corpus.values()}

# Calculate TF for the query
for word in preprocessed_query:
    if word in corpus:
        query_tf[corpus[word]] += 1

# Create the query vector (sparse representation)
query_vector_sparse = {idx: tf_value for idx, tf_value in query_tf.items() if tf_value != 0}

# Initialize a dictionary to store relevance scores along with document index
document_relevance_sparse = {}

# Initialize document_vectors_sparse dictionary
document_vectors_sparse = {}

# Iterate through the TF dictionary to create sparse vectors for each document
for idx, doc_vector in enumerate(document_vectors):
    doc_sparse_vector = {term_idx: tfidf_value for term_idx, tfidf_value in enumerate(doc_vector) if tfidf_value != 0}
    document_vectors_sparse[idx] = doc_sparse_vector

# Calculate relevance between query vector and each document vector (using sparse representation)
for doc_id, doc_vector in document_vectors_sparse.items():
    relevance_sparse = calculate_relevance_sparse(query_vector_sparse, doc_vector)
    document_relevance_sparse[doc_id] = relevance_sparse


# print("Query Vector:", query_vector_sparse)

In [21]:

# Calculate relevance between query vector and each document vector (using full representation)
document_relevance = []

for doc_vector in document_vectors:
    relevance = calculate_relevance(list(query_vector_sparse.values()), doc_vector)
    document_relevance.append(relevance)

# Sort the document relevance dictionaries by relevance score
sorted_document_relevance_sparse = sorted(document_relevance_sparse.items(), key=lambda x: x[1], reverse=True)
sorted_document_relevance = sorted(enumerate(document_relevance), key=lambda x: x[1], reverse=True)

# Display the top 10 relevant documents
print("Top 10 relevant documents:")
for i, (doc_id, relevance) in enumerate(sorted_document_relevance_sparse[:10], start=1):
    print(f"doc {doc_id + 1}: {relevance:.2f}")


Top 10 relevant documents:
doc 212: 5.00
doc 236: 5.00
doc 1063: 5.00
doc 3: 4.00
doc 6: 3.00
doc 51: 3.00
doc 402: 3.00
doc 489: 3.00
doc 618: 3.00
doc 690: 3.00
