Reading Original Data Set and Creating Sample 

In [None]:
# import pandas as pd

# original_df = pd.read_csv("dataset.csv")

# # Selecting only ARTICLE_ID and SECTION_TEXT columns
# subset_df = original_df[['ARTICLE_ID', 'SECTION_TEXT']]

# # Taking a subset of 2000 rows from the original dataset
# subset_df = subset_df.head(2000)

# subset_df.to_csv("subset.csv", index=False)

Reading and Preprocessing Data from the Sample (subset.csv)

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import OrderedDict

# Load the sample CSV into a DataFrame. The cells below read its
# 'SECTION_TEXT' column one row at a time.
subset_df = pd.read_csv("subset.csv")

def preprocess_text(text):
    """Tokenize ``text`` and return its alphanumeric tokens in lowercase.

    Tokens for which ``str.isalnum()`` is false (punctuation, and anything
    containing a non-alphanumeric character) are dropped. Stopwords are NOT
    removed here; only the alphanumeric filter is applied.

    Args:
        text: The raw text to process. ``None`` or a float NaN (how pandas
            represents an empty CSV cell) is treated as empty text.

    Returns:
        A list of lowercase tokens in their original order; ``[]`` when the
        text is missing.
    """
    # A missing cell would make word_tokenize raise a TypeError, so treat it
    # as a document with no words. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Tokenizing the text
    words = word_tokenize(text)
    # Keep alphanumeric tokens only, lowercased
    return [word.lower() for word in words if word.isalnum()]


# Vocabulary: maps every distinct token to an integer id, numbered in the
# order the tokens are first encountered.
corpus = OrderedDict()

# Scan each article's text and register the tokens not seen before.
for section_text in subset_df['SECTION_TEXT']:
    for token in preprocess_text(section_text):
        # setdefault leaves an existing id untouched; a new token receives
        # the next free id (the current vocabulary size).
        corpus.setdefault(token, len(corpus))


# print("Corpus with assigned indices:")
# for idx, word in corpus.items():
#     print(f"({word}, '{idx}')")

Calculating Term Frequency

In [None]:
# Corpus-wide term frequency: word index -> total occurrences over all articles.
term_frequency = {word_index: 0 for word_index in corpus.values()}

# Walk every article once, counting occurrences per article and corpus-wide.
for row_label, row in subset_df.iterrows():
    article_text = row['SECTION_TEXT']
    preprocessed_words = preprocess_text(article_text)

    # Per-article counts hold only the words that actually occur in the
    # article, instead of a zero-filled entry for the whole vocabulary.
    tf_article = {}

    for word in preprocessed_words:
        if word in corpus:
            word_index = corpus[word]
            tf_article[word_index] = tf_article.get(word_index, 0) + 1
            term_frequency[word_index] += 1

    # Every stored count is already non-zero; sort by word index so the
    # mapping is ordered by vocabulary id.
    non_zero_tf_article = dict(sorted(tf_article.items()))
#     print(f"Article {row['ARTICLE_ID']} TF: {non_zero_tf_article}")

# term_frequency = {index: tf for index, tf in term_frequency.items() if tf > 0}


# print("\nNon-zero Term Frequencies:")
# print(term_frequency)

Calculating Document Frequency (used for IDF)

In [None]:
import numpy as np

# Document frequency per vocabulary id: the number of articles in which each
# word occurs. The IDF itself is derived from these counts where the TF-IDF
# weights are computed.
df_array = np.zeros(len(corpus))

# An article adds at most 1 to a word's count, however often the word repeats.
for _, record in subset_df.iterrows():
    tokens = preprocess_text(record['SECTION_TEXT'])

    # De-duplicate the article's tokens so each word is counted once.
    for token in set(tokens):
        if token in corpus:
            df_array[corpus[token]] += 1

# Print the document frequency for each word with its index
# print("Inverse Document Frequency (IDF):")
# for word_idx, doc_freq in enumerate(df_array):
#     print(f"({word_idx}, {doc_freq})")

TF-IDF Weights

In [None]:
# TF-IDF weights per article: row label -> {word index: rounded tf * idf}.
weights = {}

num_documents = len(subset_df)

for index, row in subset_df.iterrows():
    article_text = row['SECTION_TEXT']
    preprocessed_words = preprocess_text(article_text)

    # Count every in-vocabulary word in a single pass. Calling
    # list.count() once per distinct word rescans the whole article each
    # time, which is quadratic in the article length.
    term_counts = {}
    for word in preprocessed_words:
        if word in corpus:
            word_index = corpus[word]
            term_counts[word_index] = term_counts.get(word_index, 0) + 1

    tfidf_article = {}
    for word_index, tf in term_counts.items():
        # The denominator adds 1 to the document frequency, so a word that
        # occurs in every article gets a slightly negative idf.
        idf = np.log(num_documents / (1 + df_array[word_index]))

        # Round to nearest integer.
        # NOTE(review): rounding turns small weights into 0, while the query
        # vector is left unrounded - confirm this is intended.
        tfidf_article[word_index] = round(tf * idf)

    weights[index] = tfidf_article


# Print TF/IDF weights

# print("TF/IDF Weights:")

# for article_id, tfidf in weights.items():

#     print(f"Article {article_id}: {tfidf}")

Vector Space Model

In [None]:
import math


def calculate_tf_idf_query(query, corpus, df_array, num_documents):
    """Build the TF-IDF vector of a query from the corpus document frequencies.

    Args:
        query: Raw query text; it is tokenized with ``preprocess_text``.
        corpus: Mapping of word -> word index.
        df_array: Indexable by word index; gives the number of documents
            that contain the word.
        num_documents: Total number of documents in the collection.

    Returns:
        A dict of word index -> ``tf * idf`` for the query words found in
        ``corpus``, where ``idf = log(num_documents / (1 + df))``. Words not
        in ``corpus`` are ignored, so the dict may be empty.
    """
    # Count only the terms that occur in the query, rather than creating a
    # zero entry for every word of the vocabulary.
    term_counts = {}
    for word in preprocess_text(query):
        if word in corpus:
            word_index = corpus[word]
            term_counts[word_index] = term_counts.get(word_index, 0) + 1

    # Emit the entries in ascending word-index order so the result does not
    # depend on the order of words in the query.
    tf_idf_query = {}
    for word_index in sorted(term_counts):
        idf = np.log(num_documents / (1 + df_array[word_index]))
        tf_idf_query[word_index] = term_counts[word_index] * idf

    return tf_idf_query


def calculate_cosine_similarity(doc_vector, query_vector):
    """Return the cosine similarity of two sparse vectors.

    Args:
        doc_vector: Dict of word index -> weight for a document.
        query_vector: Dict of word index -> weight for the query.

    Returns:
        ``dot(doc, query) / (|doc| * |query|)`` as a float, or ``0.0`` when
        either vector has zero magnitude (including an empty vector).
    """
    doc_vector_magnitude = math.sqrt(
        sum(val ** 2 for val in doc_vector.values()))
    query_vector_magnitude = math.sqrt(
        sum(val ** 2 for val in query_vector.values()))

    # A zero-length vector has no direction; report no similarity rather
    # than dividing by zero.
    if doc_vector_magnitude == 0 or query_vector_magnitude == 0:
        return 0.0

    # Only indices present in both vectors contribute to the dot product,
    # so walking the shorter vector is sufficient.
    if len(query_vector) < len(doc_vector):
        shorter, longer = query_vector, doc_vector
    else:
        shorter, longer = doc_vector, query_vector
    dot_product = sum(weight * longer.get(word_index, 0)
                      for word_index, weight in shorter.items())

    return dot_product / (doc_vector_magnitude * query_vector_magnitude)


# Read the query interactively and turn it into a TF-IDF vector.
query = input("Enter your query statement: ")
tf_idf_query = calculate_tf_idf_query(query, corpus, df_array, num_documents)

# Cosine similarity of every article vector against the query vector.
relevance_scores = {
    doc_id: calculate_cosine_similarity(doc_vector, tf_idf_query)
    for doc_id, doc_vector in weights.items()
}

# Highest score first; the sort is stable, so tied articles keep their
# original relative order.
ranked_docs = sorted(relevance_scores.items(),
                     key=lambda pair: pair[1], reverse=True)

print("\nRanked Documents (by relevance):")
for ranked_entry in ranked_docs:
    doc_id, score = ranked_entry
    print(f"Document {doc_id}: {score:.2f}")