In [4]:
import math
import re
import csv
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load one text per CSV row into `documents`.
# NOTE(review): column index 4 is presumably the address text, and if the file
# has a header row it is loaded as a document too -- confirm against the CSV.
# NOTE(review): the file is decoded with the platform default encoding; pass an
# explicit `encoding=` once the file's real encoding is known.
documents = []
# newline="" hands line-ending handling to the csv module, so quoted fields that
# contain embedded line breaks are parsed as one field.
with open("UnionAddressTable_All.csv", "r", newline="") as file:
    reader = csv.reader(file)
    documents.extend(row[4] for row in reader)

# User query topics: each list is one query, scored as a whole against every
# document by find_best_match.
query_topics_1 = [
    "freedom",
    "freedom of speech",
    "freedom of press",
]
query_topics_2 = [
    "security",
    "peace",
    "reestablishment of peace",
    "preservation of peace",
]

def preprocess(text):
    """Normalize raw text into a space-separated string of stemmed content words.

    The text is lowercased, punctuation is stripped, English stop words are
    dropped and every remaining word is reduced to its Porter stem.

    Args:
        text: The raw string to normalize.

    Returns:
        A single string of stemmed words joined by single spaces (empty when
        nothing survives the filtering).
    """
    # Replace punctuation with a space instead of deleting it: deleting fuses
    # words that are separated only by punctuation ("peace--and" -> "peaceand",
    # "well-being" -> "wellbeing") and turns "nation's" into "nations". Splitting
    # leaves fragments such as "s" and "t", which NLTK's English stop list covers.
    cleaned = re.sub(r'[^\w\s]', ' ', text.lower())
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # One pass: filter stop words first, then stem the survivors.
    return ' '.join(
        stemmer.stem(word) for word in cleaned.split() if word not in stop_words
    )

def tokenize(text):
    """Split *text* on runs of whitespace and return the resulting word list."""
    tokens = text.split(sep=None)
    return tokens

def count_term_frequency(documents):
    """Count how often each token occurs in each document.

    Args:
        documents: Iterable of document strings.

    Returns:
        A list with one Counter per document, in input order, mapping each
        token produced by tokenize() to its number of occurrences.
    """
    return [Counter(tokenize(text)) for text in documents]

def calculate_df(documents, topics):
    """Count, for each topic, the number of documents that contain it.

    A topic is compared as a whole string against the set of tokens of each
    document, so it only matches when it equals one token exactly.
    NOTE(review): tokenize() splits on whitespace, so a topic that itself
    contains spaces can never equal a token -- confirm this is intended.

    Args:
        documents: Iterable of document strings.
        topics: Iterable of topic strings to look up.

    Returns:
        A Counter mapping each matched topic to its document frequency; topics
        that never match are absent (and read back as 0).
    """
    doc_counts = Counter()
    for text in documents:
        vocabulary = set(tokenize(text))
        # Each topic found in this document adds one to its count.
        doc_counts.update(topic for topic in topics if topic in vocabulary)
    return doc_counts

def calculate_tfidf(tf, df, num_documents):
    """Weight one document's term counts by inverse document frequency.

    Args:
        tf: Mapping of term -> occurrence count within a single document.
        df: Mapping of term -> number of documents containing the term. Terms
            missing from this mapping are treated as having frequency 0, so a
            plain dict works as well as a Counter.
        num_documents: Total number of documents in the collection.

    Returns:
        A dict mapping every term of *tf* to ``count * log(num_documents / df)``,
        or to 0.0 when the term has no positive document frequency.
    """
    weights = {}
    for term, count in tf.items():
        # .get avoids a KeyError when `df` is a plain dict lacking the term.
        doc_freq = df.get(term, 0)
        if doc_freq > 0:
            weights[term] = count * math.log(num_documents / doc_freq)
        else:
            # No usable document frequency: the term carries no weight.
            weights[term] = 0.0
    return weights

def construct_document_vectors(documents, topics):
    """Build one TF-IDF weight dict per document.

    Term counts come from count_term_frequency() and document frequencies from
    calculate_df(), which only counts the given topics.

    Args:
        documents: Sequence of document strings.
        topics: Topic strings whose document frequencies are computed.

    Returns:
        A list of dicts (term -> weight), one per document, in input order.
    """
    total = len(documents)
    per_document_counts = count_term_frequency(documents)
    topic_doc_freq = calculate_df(documents, topics)
    return [
        calculate_tfidf(counts, topic_doc_freq, total)
        for counts in per_document_counts
    ]

def calculate_similarity(document_vector, query_topics):
    """Score a document by summing its weights for the query topics.

    Args:
        document_vector: Mapping of term -> weight for one document.
        query_topics: Iterable of topic strings; topics absent from the
            vector contribute 0.

    Returns:
        The sum of the weights of all query topics.
    """
    score = 0
    for topic in query_topics:
        score += document_vector.get(topic, 0)
    return score

def find_best_match(documents, query_topics):
    """Return the index of the document that scores highest for the query.

    Documents and topics are used exactly as given; no normalization (such as
    the preprocess() helper in this file) is applied here.

    Args:
        documents: Sequence of document strings.
        query_topics: Topic strings making up the query.

    Returns:
        The 0-based index of the first document with the highest similarity,
        or None when *documents* is empty.
    """
    vectors = construct_document_vectors(documents, query_topics)
    winner, top_score = None, -1
    for index, vector in enumerate(vectors):
        score = calculate_similarity(vector, query_topics)
        # Strict comparison: on ties the earliest document stays the winner.
        if score > top_score:
            winner, top_score = index, score
    return winner

# Report the best-matching address for each query.
# find_best_match returns None when `documents` is empty, so guard before
# adding 1 (which converts the 0-based index to a 1-based document number).
best_match_1 = find_best_match(documents, query_topics_1)
if best_match_1 is None:
    print("State of the Union Address addressing freedom-related topics: no documents to search")
else:
    print("State of the Union Address addressing freedom-related topics: Document", best_match_1 + 1)

best_match_2 = find_best_match(documents, query_topics_2)
if best_match_2 is None:
    print("State of the Union Address addressing security-related topics: no documents to search")
else:
    print("State of the Union Address addressing security-related topics: Document", best_match_2 + 1)

State of the Union Address addressing freedom-related topics: Document 225
State of the Union Address addressing security-related topics: Document 216
